diff --git a/go/README_cn.md b/go/README_cn.md
deleted file mode 100644
index 040540e939bc3a0993e7c963b281ad91fbfe1ffc..0000000000000000000000000000000000000000
--- a/go/README_cn.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Paddle Inference golang API
-
-## Installation
-First, build with `-DON_INFER=ON` passed to cmake. The build directory will then contain `paddle_inference_c_install_dir`; move that directory into the current directory and rename it to `paddle_c`.
-
-## Running Paddle inference from Go
-First, create the inference config:
-``` go
-config := paddle.NewAnalysisConfig()
-config.SetModel(model_file, params_file)
-config.SwitchUseFeedFetchOps(false)
-config.SwitchSpecifyInputNames(true)
-```
-
-Create the predictor:
-``` go
-predictor := paddle.NewPredictor(config)
-```
-
-Get the input and output tensors:
-``` go
-inputs := predictor.GetInputTensors()
-outputs := predictor.GetOutputTensors()
-```
-
-Set the input data (assuming a single input):
-``` go
-input := inputs[0]
-input.SetValue(data)
-input.Reshape([]int32{1, 3, 300, 300})
-```
-
-Run inference:
-``` go
-predictor.ZeroCopyRun()
-```
-
-Read back the value of the output tensor:
-``` go
-output := outputs[0]
-predictor.GetZeroCopyOutput(output)
-value := reflect.ValueOf(output.Value())
-shape, dtype := paddle.ShapeAndTypeOf(value)
-output_data := value.Interface().([][]float32)
-```
-
-## Example
-See [mobilenet](./demo/mobilenet.go) for the full source.
-
-Download the [data](https://paddle-inference-dist.cdn.bcebos.com/mobilenet-test-model-data.tar.gz) and unpack it into the current directory.
-
-Run:
-```bash
-go mod init github.com/paddlepaddle
-export LD_LIBRARY_PATH=`pwd`/paddle_c/paddle/lib:$LD_LIBRARY_PATH
-go run ./demo/mobilenet.go
-```
diff --git a/go/demo/mobilenet.go b/go/demo/mobilenet.go
deleted file mode 100644
index c1ca2e967f72dc6646a6785d86ba59c709bfe25c..0000000000000000000000000000000000000000
--- a/go/demo/mobilenet.go
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-package main
-
-import "github.com/paddlepaddle/paddle"
-import "strings"
-import "io/ioutil"
-import "strconv"
-import "reflect"
-
-func main() {
-	config := paddle.NewAnalysisConfig()
-	config.SetModel("data/model/__model__", "data/model/__params__")
-	config.DisableGlogInfo()
-	config.SwitchUseFeedFetchOps(false)
-	config.SwitchSpecifyInputNames(true)
-
-	predictor := paddle.NewPredictor(config)
-
-	println("============== paddle inference ==============")
-	println("input num: ", predictor.GetInputNum())
-	println("input name: ", predictor.GetInputNames()[0])
-	println("output num: ", predictor.GetOutputNum())
-	println("output name: ", predictor.GetOutputNames()[0])
-	println("============== run inference =================")
-
-	input := predictor.GetInputTensors()[0]
-	output := predictor.GetOutputTensors()[0]
-
-	filename := "data/data.txt"
-	data := ReadData(filename)
-	input.SetValue(data[:1*3*300*300])
-	input.Reshape([]int32{1, 3, 300, 300})
-
-	predictor.SetZeroCopyInput(input)
-	predictor.ZeroCopyRun()
-	predictor.GetZeroCopyOutput(output)
-
-	println("============= parse output ===================")
-	output_val := output.Value()
-	value := reflect.ValueOf(output_val)
-	shape, dtype := paddle.ShapeAndTypeOf(value)
-	switch dtype {
-	case paddle.PaddleDType(paddle.FLOAT32):
-		v := value.Interface().([][]float32)
-		println("v: ", v[0][0], v[0][1], "...")
-	case paddle.PaddleDType(paddle.UINT8):
-		v := value.Interface().([][]uint8)
-		println("v: ", v[0][0], v[0][1], "...")
-	case paddle.PaddleDType(paddle.INT32):
-		v := value.Interface().([][]int32)
-		println("v: ", v[0][0], v[0][1], "...")
-	case paddle.PaddleDType(paddle.INT64):
-		v := value.Interface().([][]int64)
-		println("v: ", v[0][0], v[0][1], "...")
-	}
-	println(shape[0], shape[1])
-	println(output.Shape()[0])
-}
-
-func ReadData(filename string) []float32 {
-	file_bytes, _ := ioutil.ReadFile(filename)
-	data_slice := strings.Split(string(file_bytes), " ")
-	var result []float32
-	for _, n := range data_slice {
-		r, _ := strconv.ParseFloat(n, 32)
-		result = append(result, float32(r))
-	}
-	return result
-}
diff --git a/go/demo/mobilenet_c.cc b/go/demo/mobilenet_c.cc
deleted file mode 100644
index 6a5cc683c9f9a9c88f73a3ca5ebac274210f3b7a..0000000000000000000000000000000000000000
--- a/go/demo/mobilenet_c.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <paddle_c_api.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-void SetConfig(PD_AnalysisConfig *);
-void ReadData(float *data, int size);
-
-int main(int argc, char *argv[]) {
-  PD_AnalysisConfig *config = PD_NewAnalysisConfig();
-  SetConfig(config);
-  PD_Predictor *predictor = PD_NewPredictor(config);
-
-  int input_num = PD_GetInputNum(predictor);
-  printf("Input num: %d\n", input_num);
-  int output_num = PD_GetOutputNum(predictor);
-  printf("Output num: %d\n", output_num);
-
-  PD_ZeroCopyTensor input;
-  PD_InitZeroCopyTensor(&input);
-  input.name = const_cast<char *>(PD_GetInputName(predictor, 0));  // NOLINT
-  input.data.capacity = sizeof(float) * 1 * 3 * 300 * 300;
-  input.data.length = input.data.capacity;
-  input.data.data = malloc(input.data.capacity);
-  int shape[] = {1, 3, 300, 300};
-  input.shape.data = static_cast<void *>(shape);
-  input.shape.capacity = sizeof(shape);
-  input.shape.length = sizeof(shape);
-  input.dtype = PD_FLOAT32;
-  ReadData((float *)input.data.data, 1 * 3 * 300 * 300);  // NOLINT
-  float *data = (float *)input.data.data;  // NOLINT
-  PD_SetZeroCopyInput(predictor, &input);
-  int *shape_ptr = (int *)input.shape.data;  // NOLINT
-
-  PD_ZeroCopyRun(predictor);
-  PD_ZeroCopyTensor output;
-  PD_InitZeroCopyTensor(&output);
-  output.name = const_cast<char *>(PD_GetOutputName(predictor, 0));  // NOLINT
-  PD_GetZeroCopyOutput(predictor, &output);
-
-  PD_DestroyZeroCopyTensor(&output);
-
-  PD_DeleteAnalysisConfig(config);
-  PD_DeletePredictor(predictor);
-  return 0;
-}
-
-void SetConfig(PD_AnalysisConfig *config) {
-  PD_SetModel(config, "data/model/__model__", "data/model/__params__");
-  PD_SwitchUseFeedFetchOps(config, false);
-  PD_SwitchSpecifyInputNames(config, true);
-  PD_DisableGlogInfo(config);
-  // PD_SwitchIrOptim(config, false);
-}
-
-void ReadData(float *data, int n) {
-  FILE *fp = fopen("data/data.txt", "r");
-  for (int i = 0; i < n; i++) {
-    fscanf(fp, "%f", &data[i]);
-  }
-  fclose(fp);
-}
diff --git a/go/demo/mobilenet_c_exp.cc b/go/demo/mobilenet_c_exp.cc
deleted file mode 100644
index b4f42dab6790bfb6dd33860a8ada704166bb74ac..0000000000000000000000000000000000000000
--- a/go/demo/mobilenet_c_exp.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <pd_inference_api.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-void ReadData(float* data, int size);
-
-int main(int argc, char* argv[]) {
-  PD_Config* config = PD_ConfigCreate();
-  PD_ConfigSetModel(config, "data/model/__model__", "data/model/__params__");
-  PD_ConfigDisableGlogInfo(config);
-
-  PD_Predictor* predictor = PD_PredictorCreate(config);
-  // config has been destroyed in PD_PredictorCreate
-  config = NULL;
-
-  int input_num = PD_PredictorGetInputNum(predictor);
-  printf("Input num: %d\n", input_num);
-  int output_num = PD_PredictorGetOutputNum(predictor);
-  printf("Output num: %d\n", output_num);
-
-  PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor);
-  PD_Tensor* input_tensor =
-      PD_PredictorGetInputHandle(predictor, input_names->data[0]);
-  PD_OneDimArrayCstrDestroy(input_names);
-  input_names = NULL;
-
-  int32_t shape[] = {1, 3, 300, 300};
-  float* data = (float*)malloc(sizeof(float) * 1 * 3 * 300 * 300);  // NOLINT
-  ReadData(data, 1 * 3 * 300 * 300);  // NOLINT
-  PD_TensorReshape(input_tensor, 4, shape);
-  PD_TensorCopyFromCpuFloat(input_tensor, data);
-  free(data);
-  data = NULL;
-  PD_PredictorRun(predictor);
-
-  PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor);
-  PD_Tensor* output_tensor =
-      PD_PredictorGetOutputHandle(predictor, output_names->data[0]);
-  PD_OneDimArrayCstrDestroy(output_names);
-  output_names = nullptr;
-
-  PD_OneDimArrayInt32* out_shape = PD_TensorGetShape(output_tensor);
-  int32_t size = 1;
-  for (size_t index = 0; index < out_shape->size; ++index) {
-    size = size * out_shape->data[index];
-  }
-  PD_OneDimArrayInt32Destroy(out_shape);
-  out_shape = NULL;
-
-  data = (float*)malloc(sizeof(float) * size);  // NOLINT
-  PD_TensorCopyToCpuFloat(output_tensor, data);
-  free(data);
-  data = NULL;
-
-  PD_TensorDestroy(output_tensor);
-  output_tensor = NULL;
-  PD_TensorDestroy(input_tensor);
-  input_tensor = NULL;
-  PD_PredictorDestroy(predictor);
-  predictor = NULL;
-
-  return 0;
-}
-
-void ReadData(float* data, int n) {
-  FILE* fp = fopen("data/data.txt", "r");
-  for (int i = 0; i < n; i++) {
-    fscanf(fp, "%f", &data[i]);
-  }
-  fclose(fp);
-}
diff --git a/go/demo/mobilenet_cxx.cc b/go/demo/mobilenet_cxx.cc
deleted file mode 100644
index 7bdd6b2b03b24e2393e746edde754f763e9dd986..0000000000000000000000000000000000000000
--- a/go/demo/mobilenet_cxx.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <paddle_inference_api.h>
-#include <fstream>
-#include <iostream>
-
-void SetConfig(paddle::AnalysisConfig *);
-
-int main(int argc, char *argv[]) {
-  paddle::AnalysisConfig config;
-  SetConfig(&config);
-  auto predictor = paddle::CreatePaddlePredictor(config);
-  auto input_name = predictor->GetInputNames()[0];
-  auto input = predictor->GetInputTensor(input_name);
-  std::cout << predictor->GetOutputNames()[0] << std::endl;
-  std::vector<int> shape{1, 3, 300, 300};
-  input->Reshape(std::move(shape));
-  std::vector<float> data(1 * 300 * 300 * 3);
-  std::ifstream fin("data/data.txt");
-  for (int i = 0; i < data.size(); i++) {
-    fin >> data[i];
-  }
-
-  input->copy_from_cpu(data.data());
-  predictor->ZeroCopyRun();
-  auto output_name = predictor->GetOutputNames()[0];
-  auto output = predictor->GetOutputTensor(output_name);
-  return 0;
-}
-
-void SetConfig(paddle::AnalysisConfig *config) {
-  config->SetModel("data/model/__model__", "data/model/__params__");
-  config->SwitchUseFeedFetchOps(false);
-  config->SwitchSpecifyInputNames(true);
-  config->SwitchIrOptim(false);
-}
diff --git a/go/paddle/config.go b/go/paddle/config.go
deleted file mode 100644
index 68a31230997bed73fbab1c1d1c7af123e353cf97..0000000000000000000000000000000000000000
--- a/go/paddle/config.go
+++ /dev/null
@@ -1,211 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package paddle
-
-// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
-// #include <stdbool.h>
-// #include "paddle_c_api.h"
-// #include <stdlib.h>
-import "C"
-
-import "runtime"
-import "unsafe"
-
-type Precision C.Precision
-
-const (
-	Precision_FLOAT32 Precision = C.kFloat32
-	Precision_INT8    Precision = C.kInt8
-	Precision_HALF    Precision = C.kHalf
-)
-
-type AnalysisConfig struct {
-	c *C.PD_AnalysisConfig
-}
-
-func NewAnalysisConfig() *AnalysisConfig {
-	c_config := C.PD_NewAnalysisConfig()
-	config := &AnalysisConfig{c: c_config}
-	runtime.SetFinalizer(config, (*AnalysisConfig).finalize)
-	return config
-}
-
-func (config *AnalysisConfig) finalize() {
-	C.PD_DeleteAnalysisConfig(config.c)
-}
-
-func (config *AnalysisConfig) SetModel(model, params string) {
-	//C.printString((*C.char)(unsafe.Pointer(&s[0])))
-	c_model := C.CString(model)
-	defer C.free(unsafe.Pointer(c_model))
-	var c_params *C.char
-	if params == "" {
-		c_params = nil
-	} else {
-		c_params = C.CString(params)
-		defer C.free(unsafe.Pointer(c_params))
-	}
-
-	C.PD_SetModel(config.c, c_model, c_params)
-}
-
-func (config *AnalysisConfig) ModelDir() string {
-	return C.GoString(C.PD_ModelDir(config.c))
-}
-
-func (config *AnalysisConfig) ProgFile() string {
-	return C.GoString(C.PD_ProgFile(config.c))
-}
-
-func (config *AnalysisConfig) ParamsFile() string {
-	return C.GoString(C.PD_ParamsFile(config.c))
-}
-
-func (config *AnalysisConfig) EnableUseGpu(memory_pool_init_size_mb int, device_id int) {
-	C.PD_EnableUseGpu(config.c, C.int(memory_pool_init_size_mb), C.int(device_id))
-}
-
-func (config *AnalysisConfig) DisableGpu() {
-	C.PD_DisableGpu(config.c)
-}
-
-func (config *AnalysisConfig) UseGpu() bool {
-	return ConvertCBooleanToGo(C.PD_UseGpu(config.c))
-}
-
-func (config *AnalysisConfig) GpuDeviceId() int {
-	return int(C.PD_GpuDeviceId(config.c))
-}
-
-func (config *AnalysisConfig) MemoryPoolInitSizeMb() int {
-	return int(C.PD_MemoryPoolInitSizeMb(config.c))
-}
-
-func (config *AnalysisConfig) FractionOfGpuMemoryForPool() float32 {
-	return float32(C.PD_FractionOfGpuMemoryForPool(config.c))
-}
-
-func (config *AnalysisConfig) EnableCudnn() {
-	C.PD_EnableCUDNN(config.c)
-}
-
-func (config *AnalysisConfig) CudnnEnabled() bool {
-	return ConvertCBooleanToGo(C.PD_CudnnEnabled(config.c))
-}
-
-func (config *AnalysisConfig) SwitchIrOptim(x bool) {
-	C.PD_SwitchIrOptim(config.c, C.bool(x))
-}
-
-func (config *AnalysisConfig) IrOptim() bool {
-	return ConvertCBooleanToGo(C.PD_IrOptim(config.c))
-}
-
-func (config *AnalysisConfig) SwitchUseFeedFetchOps(x bool) {
-	C.PD_SwitchUseFeedFetchOps(config.c, C.bool(x))
-}
-
-func (config *AnalysisConfig) UseFeedFetchOpsEnabled() bool {
-	return ConvertCBooleanToGo(C.PD_UseFeedFetchOpsEnabled(config.c))
-}
-
-func (config *AnalysisConfig) SwitchSpecifyInputNames(x bool) {
-	C.PD_SwitchSpecifyInputNames(config.c, C.bool(x))
-}
-
-func (config *AnalysisConfig) SpecifyInputName() bool {
-	return ConvertCBooleanToGo(C.PD_SpecifyInputName(config.c))
-}
-
-func (config *AnalysisConfig) EnableTensorRtEngine(workspace_size int, max_batch_size int, min_subgraph_size int, precision Precision, use_static bool, use_calib_mode bool) {
-	C.PD_EnableTensorRtEngine(config.c, C.int(workspace_size), C.int(max_batch_size), C.int(min_subgraph_size), C.Precision(precision), C.bool(use_static), C.bool(use_calib_mode))
-}
-
-func (config *AnalysisConfig) TensorrtEngineEnabled() bool {
-	return ConvertCBooleanToGo(C.PD_TensorrtEngineEnabled(config.c))
-}
-
-func (config *AnalysisConfig) SwitchIrDebug(x bool) {
-	C.PD_SwitchIrDebug(config.c, C.bool(x))
-}
-
-func (config *AnalysisConfig) EnableMkldnn() {
-	C.PD_EnableMKLDNN(config.c)
-}
-
-func (config *AnalysisConfig) MkldnnEnabled() bool {
-	return ConvertCBooleanToGo(C.PD_MkldnnEnabled(config.c))
-}
-
-func (config *AnalysisConfig) SetCpuMathLibraryNumThreads(n int) {
-	C.PD_SetCpuMathLibraryNumThreads(config.c, C.int(n))
-}
-
-func (config *AnalysisConfig) CpuMathLibraryNumThreads() int {
-	return int(C.PD_CpuMathLibraryNumThreads(config.c))
-}
-
-func (config *AnalysisConfig) EnableMkldnnQuantizer() {
-	C.PD_EnableMkldnnQuantizer(config.c)
-}
-
-func (config *AnalysisConfig) EnableMkldnnBfloat16() {
-	C.PD_EnableMkldnnBfloat16(config.c)
-}
-
-func (config *AnalysisConfig) MkldnnQuantizerEnabled() bool {
-	return ConvertCBooleanToGo(C.PD_MkldnnQuantizerEnabled(config.c))
-}
-
-func (config *AnalysisConfig) MkldnnBfloat16Enabled() bool {
-	return ConvertCBooleanToGo(C.PD_MkldnnBfloat16Enabled(config.c))
-}
-
-// SetModelBuffer
-// ModelFromMemory
-
-func (config *AnalysisConfig) EnableMemoryOptim() {
-	C.PD_EnableMemoryOptim(config.c)
-}
-
-func (config *AnalysisConfig) MemoryOptimEnabled() bool {
-	return ConvertCBooleanToGo(C.PD_MemoryOptimEnabled(config.c))
-}
-
-func (config *AnalysisConfig) EnableProfile() {
-	C.PD_EnableProfile(config.c)
-}
-
-func (config *AnalysisConfig) ProfileEnabled() bool {
-	return ConvertCBooleanToGo(C.PD_ProfileEnabled(config.c))
-}
-
-func (config *AnalysisConfig) DisableGlogInfo() {
-	C.PD_DisableGlogInfo(config.c)
-}
-
-func (config *AnalysisConfig) DeletePass(pass string) {
-	c_pass := C.CString(pass)
-	defer C.free(unsafe.Pointer(c_pass))
-	C.PD_DeletePass(config.c, c_pass)
-}
-
-func (config *AnalysisConfig) SetInValid() {
-	C.PD_SetInValid(config.c)
-}
-
-func (config *AnalysisConfig) IsValid() bool {
-	return ConvertCBooleanToGo(C.PD_IsValid(config.c))
-}
diff --git a/go/paddle/predictor.go b/go/paddle/predictor.go
deleted file mode 100644
index 5f2b2c81a60549dfdbf22dd31a98560e7e3a8cee..0000000000000000000000000000000000000000
--- a/go/paddle/predictor.go
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package paddle
-
-// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
-// #include <stdbool.h>
-// #include "paddle_c_api.h"
-import "C"
-
-import "reflect"
-import "runtime"
-import "unsafe"
-
-type Predictor struct {
-	c *C.PD_Predictor
-}
-
-func NewPredictor(config *AnalysisConfig) *Predictor {
-	c_predictor := C.PD_NewPredictor((*config).c)
-	predictor := &Predictor{c: c_predictor}
-	runtime.SetFinalizer(predictor, (*Predictor).finalize)
-	return predictor
-}
-
-func (predictor *Predictor) finalize() {
-	C.PD_DeletePredictor(predictor.c)
-}
-
-func DeletePredictor(predictor *Predictor) {
-	C.PD_DeletePredictor(predictor.c)
-}
-
-func (predictor *Predictor) GetInputNum() int {
-	return int(C.PD_GetInputNum(predictor.c))
-}
-
-func (predictor *Predictor) GetOutputNum() int {
-	return int(C.PD_GetOutputNum(predictor.c))
-}
-
-func (predictor *Predictor) GetInputName(n int) string {
-	return C.GoString(C.PD_GetInputName(predictor.c, C.int(n)))
-}
-
-func (predictor *Predictor) GetOutputName(n int) string {
-	return C.GoString(C.PD_GetOutputName(predictor.c, C.int(n)))
-}
-
-func (predictor *Predictor) GetInputTensors() [](*ZeroCopyTensor) {
-	var result [](*ZeroCopyTensor)
-	for i := 0; i < predictor.GetInputNum(); i++ {
-		tensor := NewZeroCopyTensor()
-		tensor.c.name = C.PD_GetInputName(predictor.c, C.int(i))
-		result = append(result, tensor)
-	}
-	return result
-}
-
-func (predictor *Predictor) GetOutputTensors() [](*ZeroCopyTensor) {
-	var result [](*ZeroCopyTensor)
-	for i := 0; i < predictor.GetOutputNum(); i++ {
-		tensor := NewZeroCopyTensor()
-		tensor.c.name = C.PD_GetOutputName(predictor.c, C.int(i))
-		result = append(result, tensor)
-	}
-	return result
-}
-
-func (predictor *Predictor) GetInputNames() []string {
-	names := make([]string, predictor.GetInputNum())
-	for i := 0; i < len(names); i++ {
-		names[i] = predictor.GetInputName(i)
-	}
-	return names
-}
-
-func (predictor *Predictor) GetOutputNames() []string {
-	names := make([]string, predictor.GetOutputNum())
-	for i := 0; i < len(names); i++ {
-		names[i] = predictor.GetOutputName(i)
-	}
-	return names
-}
-
-func (predictor *Predictor) SetZeroCopyInput(tensor *ZeroCopyTensor) {
-	C.PD_SetZeroCopyInput(predictor.c, tensor.c)
-}
-
-func (predictor *Predictor) GetZeroCopyOutput(tensor *ZeroCopyTensor) {
-	C.PD_GetZeroCopyOutput(predictor.c, tensor.c)
-	tensor.name = C.GoString(tensor.c.name)
-	var shape []int32
-	shape_hdr := (*reflect.SliceHeader)(unsafe.Pointer(&shape))
-	shape_hdr.Data = uintptr(unsafe.Pointer(tensor.c.shape.data))
-	shape_hdr.Len = int(tensor.c.shape.length / C.sizeof_int)
-	shape_hdr.Cap = int(tensor.c.shape.length / C.sizeof_int)
-	tensor.Reshape(shape)
-}
-
-func (predictor *Predictor) ZeroCopyRun() {
-	C.PD_ZeroCopyRun(predictor.c)
-}
diff --git a/go/paddle/tensor.go b/go/paddle/tensor.go
deleted file mode 100644
index 6fbcf039f88a7cc43a5d28f0433c9feb965566f0..0000000000000000000000000000000000000000
--- a/go/paddle/tensor.go
+++ /dev/null
@@ -1,255 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package paddle
-
-// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
-// #include <stdbool.h>
-// #include "paddle_c_api.h"
-// #include <stdlib.h>
-// #include <string.h>
-import "C"
-
-import "runtime"
-import "reflect"
-import "unsafe"
-import (
-	"bytes"
-	"encoding/binary"
-)
-
-type PaddleDType C.PD_DataType
-
-const (
-	FLOAT32  PaddleDType = C.PD_FLOAT32
-	INT32    PaddleDType = C.PD_INT32
-	INT64    PaddleDType = C.PD_INT64
-	UINT8    PaddleDType = C.PD_UINT8
-	UNKDTYPE PaddleDType = C.PD_UNKDTYPE
-)
-
-var types = []struct {
-	gotype reflect.Type
-	dtype  PaddleDType
-}{
-	{reflect.TypeOf(float32(0)), FLOAT32},
-	{reflect.TypeOf(int32(0)), INT32},
-	{reflect.TypeOf(int64(0)), INT64},
-	{reflect.TypeOf(uint8(0)), UINT8},
-}
-
-func TypeOfShape(dtype PaddleDType, shape []int32) reflect.Type {
-	var ret reflect.Type
-	for _, t := range types {
-		if dtype == PaddleDType(t.dtype) {
-			ret = t.gotype
-			break
-		}
-	}
-
-	if ret == nil {
-		panic(bug("Data %v type is not supported", dtype))
-	}
-
-	for range shape {
-		ret = reflect.SliceOf(ret)
-	}
-	return ret
-}
-
-type ZeroCopyTensor struct {
-	c     *C.PD_ZeroCopyTensor
-	name  string
-	shape []int32
-}
-
-func NewZeroCopyTensor() *ZeroCopyTensor {
-	c_tensor := C.PD_NewZeroCopyTensor()
-
-	tensor := &ZeroCopyTensor{c: c_tensor}
-	runtime.SetFinalizer(tensor, (*ZeroCopyTensor).finalize)
-	return tensor
-}
-
-func (tensor *ZeroCopyTensor) finalize() {
-	C.PD_DeleteZeroCopyTensor(tensor.c)
-}
-
-func (tensor *ZeroCopyTensor) Shape() []int32 {
-	return tensor.shape
-}
-
-func (tensor *ZeroCopyTensor) Name() string {
-	return C.GoString(tensor.c.name)
-}
-
-func (tensor *ZeroCopyTensor) Rename(name string) {
-	tensor.name = name
-	tensor.c.name = (*C.char)(unsafe.Pointer(tensor.c.name))
-	//tensor.c.name = C.CString(tensor.name)
-	//defer C.free(unsafe.Pointer(tensor.c.name))
-}
-
-func (tensor *ZeroCopyTensor) Reshape(shape []int32) {
-	tensor.shape = make([]int32, len(shape))
-	copy(tensor.shape, shape)
-	length := C.sizeof_int * C.size_t(len(shape))
-	if tensor.c.shape.capacity < C.size_t(length) {
-		if tensor.c.shape.capacity != C.size_t(0) {
-			C.free(tensor.c.shape.data)
-		}
-		tensor.c.shape.data = C.malloc(length)
-		tensor.c.shape.capacity = length
-	}
-	tensor.c.shape.length = length
-	C.memcpy(tensor.c.shape.data, unsafe.Pointer(&shape[0]), length)
-}
-
-func (tensor *ZeroCopyTensor) DataType() PaddleDType {
-	return PaddleDType(tensor.c.dtype)
-}
-
-func (tensor *ZeroCopyTensor) SetValue(value interface{}) {
-	val := reflect.ValueOf(value)
-	shape, dtype := ShapeAndTypeOf(val)
-	tensor.Reshape(shape)
-	num := numel(shape)
-	length := C.size_t(SizeofDataType(dtype) * num)
-	if tensor.c.data.capacity < length {
-		if tensor.c.data.capacity != C.size_t(0) {
-			C.free(tensor.c.data.data)
-		}
-		tensor.c.data.data = C.malloc(length)
-		tensor.c.data.capacity = length
-	}
-	tensor.c.data.length = length
-
-	switch dtype {
-	case PaddleDType(UINT8):
-		data := val.Interface().([]uint8)
-		C.memcpy(tensor.c.data.data, unsafe.Pointer(&data[0]), length)
-	case PaddleDType(INT32):
-		data := val.Interface().([]int32)
-		C.memcpy(tensor.c.data.data, unsafe.Pointer(&data[0]), length)
-	case PaddleDType(INT64):
-		data := val.Interface().([]int64)
-		C.memcpy(tensor.c.data.data, unsafe.Pointer(&data[0]), length)
-	case PaddleDType(FLOAT32):
-		data := val.Interface().([]float32)
-		C.memcpy(tensor.c.data.data, unsafe.Pointer(&data[0]), length)
-	}
-	tensor.c.dtype = C.PD_DataType(dtype)
-}
-
-func TypeOf(dtype PaddleDType, shape []int32) reflect.Type {
-	var ret reflect.Type
-	for _, t := range types {
-		if t.dtype == dtype {
-			ret = t.gotype
-			break
-		}
-	}
-
-	for range shape {
-		ret = reflect.SliceOf(ret)
-	}
-	return ret
-}
-
-func (tensor *ZeroCopyTensor) Value() interface{} {
-	t := TypeOf(PaddleDType(tensor.c.dtype), tensor.shape)
-	value := reflect.New(t)
-	c_bytes := tensor.c.data.data
-	length := tensor.c.data.length
-	var slice []byte
-	if unsafe.Sizeof(unsafe.Pointer(nil)) == 8 {
-		slice = (*[1<<50 - 1]byte)(unsafe.Pointer(c_bytes))[:length:length]
-	} else {
-		slice = (*[1 << 30]byte)(unsafe.Pointer(c_bytes))[:length:length]
-	}
-	r := bytes.NewReader(slice)
-	DecodeTensor(r, tensor.Shape(), t, value)
-	return reflect.Indirect(value).Interface()
-}
-
-func Endian() binary.ByteOrder {
-	buf := [2]byte{}
-	*(*uint16)(unsafe.Pointer(&buf[0])) = uint16(0xABCD)
-
-	var endian binary.ByteOrder
-
-	switch buf {
-	case [2]byte{0xCD, 0xAB}:
-		endian = binary.LittleEndian
-	case [2]byte{0xAB, 0xCD}:
-		endian = binary.BigEndian
-	default:
-		panic("Could not determine native endianness.")
-	}
-	return endian
-}
-
-func DecodeTensor(r *bytes.Reader, shape []int32, t reflect.Type, ptr reflect.Value) {
-	switch t.Kind() {
-	case reflect.Uint8, reflect.Int32, reflect.Int64, reflect.Float32:
-		binary.Read(r, Endian(), ptr.Interface())
-	case reflect.Slice:
-		value := reflect.Indirect(ptr)
-		value.Set(reflect.MakeSlice(t, int(shape[0]), int(shape[0])))
-		if len(shape) == 1 && value.Len() > 0 {
-			switch value.Index(0).Kind() {
-			case reflect.Uint8, reflect.Int32, reflect.Int64, reflect.Float32:
-				binary.Read(r, Endian(), value.Interface())
-				return
-			}
-		}
-
-		for i := 0; i < value.Len(); i++ {
-			DecodeTensor(r, shape[1:], t.Elem(), value.Index(i).Addr())
-		}
-	}
-}
-
-func SizeofDataType(dtype PaddleDType) int32 {
-	switch dtype {
-	case UINT8:
-		return int32(C.sizeof_uchar)
-	case INT32:
-		return int32(C.sizeof_int)
-	case INT64:
-		return int32(C.sizeof_longlong)
-	case FLOAT32:
-		return int32(C.sizeof_float)
-	}
-	return -1
-}
-
-func ShapeAndTypeOf(val reflect.Value) (shape []int32, dt PaddleDType) {
-	gotype := val.Type()
-	for gotype.Kind() == reflect.Array || gotype.Kind() == reflect.Slice {
-		shape = append(shape, int32(val.Len()))
-		if val.Len() > 0 {
-			val = val.Index(0)
-		}
-		gotype = gotype.Elem()
-	}
-
-	for _, t := range types {
-		if gotype.Kind() == t.gotype.Kind() {
-			return shape, PaddleDType(t.dtype)
-		}
-	}
-	return shape, dt
-}
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index 43306b79fabf60caedb6c6c54417d3b7c98ab127..313cbfb7c786e967611c6d99ebbf1e843973e9a0 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -104,6 +104,8 @@ DataType Tensor::type() const {
     return DataType::INT32;
   } else if (type == paddle::framework::proto::VarType::UINT8) {
     return DataType::UINT8;
+  } else if (type == paddle::framework::proto::VarType::INT8) {
+    return DataType::INT8;
   }
   return DataType::FLOAT32;
 }
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 2bbd4bb837a22f672e5aa625f299424b6f0c5b88..a547aa1b857300b62e119a668295c04138699fed 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -31,6 +31,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle_infer_declare.h"  // NOLINT

 /*! \file */
@@ -177,6 +178,26 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   void DisableGpu();

+  ///
+  /// \brief Turn on XPU.
+  ///
+  /// \param l3_workspace_size The size of the video memory allocated by the l3
+  /// cache, the maximum is 16M.
+  /// \param locked Whether the allocated L3 cache can be locked. If false,
+  /// it means that the L3 cache is not locked, and the allocated L3
+  /// cache can be shared by multiple models, and multiple models
+  /// sharing the L3 cache will be executed sequentially on the card.
+  /// \param autotune Whether to autotune the conv operator in the model. If
+  /// true, when the conv operator of a certain dimension is executed
+  /// for the first time, it will automatically search for a better
+  /// algorithm to improve the performance of subsequent conv operators
+  /// of the same dimension.
+  /// \param autotune_file Specify the path of the autotune file. If
+  /// autotune_file is specified, the algorithm specified in the
+  /// file will be used and autotune will not be performed again.
+  /// \param precision Calculation accuracy of multi_encoder.
+  /// \param adaptive_seqlen Whether the input of multi_encoder is variable length.
+  ///
   void EnableXpu(int l3_workspace_size = 0xfffc00, bool locked = false,
                  bool autotune = true, const std::string& autotune_file = "",
                  const std::string& precision = "int16",
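The expanded `EnableXpu` signature documented above is mirrored one-to-one in the C and Go wrappers that follow. A minimal sketch of driving it from the Go side, using the goapi bindings added later in this patch (the model paths and parameter choices are illustrative assumptions that simply echo the documented defaults):

```go
package main

// Sketch only: assumes the goapi module from this patch is on the module path.
import paddle "github.com/paddlepaddle/paddle/paddle/fluid/inference/goapi"

func main() {
	config := paddle.NewConfig()
	// Hypothetical model paths, borrowed from the demos in this patch.
	config.SetModel("data/model/__model__", "data/model/__params__")
	// ~16 MB of L3 workspace, unlocked so other models may share it,
	// conv autotune on with no autotune file, int16 multi_encoder,
	// fixed-length multi_encoder input.
	config.EnableXpu(0xfffc00, false, true, "", "int16", false)
	predictor := paddle.NewPredictor(config)
	_ = predictor
}
```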
diff --git a/paddle/fluid/inference/capi_exp/pd_common.h b/paddle/fluid/inference/capi_exp/pd_common.h
index 4b70ed7fbad297efdf1863317e3af2b69bed702b..e7f7ac88687e7c64cb554c24eb6c6b496d63326b 100644
--- a/paddle/fluid/inference/capi_exp/pd_common.h
+++ b/paddle/fluid/inference/capi_exp/pd_common.h
@@ -71,5 +71,5 @@ PD_ENUM(PD_PlaceType){PD_PLACE_UNK = -1, PD_PLACE_CPU, PD_PLACE_GPU,
 PD_ENUM(PD_DataType){
     PD_DATA_UNK = -1, PD_DATA_FLOAT32, PD_DATA_INT32,
-    PD_DATA_INT64,    PD_DATA_UINT8,
+    PD_DATA_INT64,    PD_DATA_UINT8,    PD_DATA_INT8,
 };
diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc
index c45454e86bdaac5e8f054da91410eab7e2b873a2..e9104ef52376cd8f36358dba005c636f9f435a3d 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.cc
+++ b/paddle/fluid/inference/capi_exp/pd_config.cc
@@ -14,6 +14,8 @@
 #include "paddle/fluid/inference/capi_exp/pd_config.h"

 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/capi_exp/pd_types.h"
+#include "paddle/fluid/inference/capi_exp/utils_internal.h"
 #include "paddle/fluid/platform/enforce.h"

 #define CHECK_NULL_POINTER_PARM(param) \
@@ -125,10 +127,14 @@ PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) {
 }

 void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
-                        int32_t l3_workspace_size) {
+                        int32_t l3_workspace_size, PD_Bool locked,
+                        PD_Bool autotune, const char* autotune_file,
+                        const char* precision, PD_Bool adaptive_seqlen) {
   CHECK_AND_CONVERT_PD_CONFIG;
-  config->EnableXpu(l3_workspace_size);
+  config->EnableXpu(l3_workspace_size, locked, autotune, autotune_file,
+                    precision, adaptive_seqlen);
 }
+
 PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) {
   CHECK_AND_CONVERT_PD_CONFIG;
   return config->use_xpu();
@@ -378,5 +384,24 @@ void PD_ConfigPartiallyRelease(__pd_keep PD_Config* pd_config) {
   CHECK_AND_CONVERT_PD_CONFIG;
   config->PartiallyRelease();
 }
+void PD_ConfigDeletePass(__pd_keep PD_Config* pd_config, const char* pass) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->pass_builder()->DeletePass(pass);
+}
+void PD_ConfigInsertPass(__pd_keep PD_Config* pd_config, size_t idx,
+                         const char* pass) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->pass_builder()->InsertPass(idx, pass);
+}
+void PD_ConfigAppendPass(__pd_keep PD_Config* pd_config, const char* pass) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->pass_builder()->AppendPass(pass);
+}
+__pd_give PD_OneDimArrayCstr* PD_ConfigAllPasses(
+    __pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::vector<std::string> passes = config->pass_builder()->AllPasses();
+  return paddle_infer::CvtVecToOneDimArrayCstr(passes);
+}

 }  // extern "C"
diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h
index e44983e24484eae930afa6b84db397ac3aad8f08..a47ca5d27687f710aa1c0bb6db4bf830492175aa 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.h
+++ b/paddle/fluid/inference/capi_exp/pd_config.h
@@ -25,6 +25,7 @@
 #pragma once

 #include "pd_common.h"  // NOLINT
+#include "pd_types.h"  // NOLINT

 typedef struct PD_Config PD_Config;

@@ -154,10 +155,27 @@ PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu(
 /// \brief Turn on XPU.
 ///
 /// \param[in] pd_onfig config
-/// \param[in] l3_workspace_size l3 workspace size.
+/// \param l3_workspace_size The size of the video memory allocated by the l3
+/// cache, the maximum is 16M.
+/// \param locked Whether the allocated L3 cache can be locked. If false,
+/// it means that the L3 cache is not locked, and the allocated L3
+/// cache can be shared by multiple models, and multiple models
+/// sharing the L3 cache will be executed sequentially on the card.
+/// \param autotune Whether to autotune the conv operator in the model. If
+/// true, when the conv operator of a certain dimension is executed
+/// for the first time, it will automatically search for a better
+/// algorithm to improve the performance of subsequent conv operators
+/// of the same dimension.
+/// \param autotune_file Specify the path of the autotune file. If
+/// autotune_file is specified, the algorithm specified in the
+/// file will be used and autotune will not be performed again.
+/// \param precision Calculation accuracy of multi_encoder.
+/// \param adaptive_seqlen Whether the input of multi_encoder is variable length.
 ///
 PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
-    __pd_keep PD_Config* pd_config, int32_t l3_workspace_size);
+    __pd_keep PD_Config* pd_config, int32_t l3_workspace_size, PD_Bool locked,
+    PD_Bool autotune, const char* autotune_file, const char* precision,
+    PD_Bool adaptive_seqlen);
 ///
 /// \brief A boolean state telling whether the XPU is turned on.
 ///
@@ -565,6 +583,35 @@ PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIsValid(
 ///
 PADDLE_CAPI_EXPORT extern void PD_ConfigPartiallyRelease(
     __pd_keep PD_Config* pd_config);
+///
+/// \brief Delete all passes that have a certain type 'pass'.
+///
+/// \param[in] pass the certain pass type to be deleted.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigDeletePass(
+    __pd_keep PD_Config* pd_config, const char* pass);
+///
+/// \brief Insert a pass to a specific position.
+///
+/// \param[in] idx the position to insert.
+/// \param[in] pass the new pass.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigInsertPass(
+    __pd_keep PD_Config* pd_config, size_t idx, const char* pass);
+///
+/// \brief Append a pass to the end of the passes.
+///
+/// \param[in] pass the new pass.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigAppendPass(
+    __pd_keep PD_Config* pd_config, const char* pass);
+///
+/// \brief Get information of passes.
+///
+/// \return Return list of the passes.
+///
+PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* PD_ConfigAllPasses(
+    __pd_keep PD_Config* pd_config);

 #ifdef __cplusplus
 }  // extern "C"
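The new pass hooks are also wrapped by the Go API introduced further down in this patch (`DeletePass` and `AppendPass` appear at the end of `goapi/config.go`). A small sketch of adjusting the pass pipeline from Go; the pass name is an illustrative assumption, not a recommendation:

```go
package main

import paddle "github.com/paddlepaddle/paddle/paddle/fluid/inference/goapi"

func main() {
	config := paddle.NewConfig()
	// Hypothetical model paths, borrowed from the demos in this patch.
	config.SetModel("data/model/__model__", "data/model/__params__")
	// Remove a fusion pass that misbehaves for this particular model
	// ("fc_fuse_pass" is only an example name), then re-append it at the
	// end of the pipeline.
	config.DeletePass("fc_fuse_pass")
	config.AppendPass("fc_fuse_pass")
	predictor := paddle.NewPredictor(config)
	_ = predictor
}
```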
diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.cc b/paddle/fluid/inference/capi_exp/pd_predictor.cc
index f5287a5152957f5cda0db9dee82a7689267cd3d2..5ca58b0e4138b274c67cbd988388acc30a0368ae 100644
--- a/paddle/fluid/inference/capi_exp/pd_predictor.cc
+++ b/paddle/fluid/inference/capi_exp/pd_predictor.cc
@@ -106,4 +106,9 @@ void PD_PredictorDestroy(__pd_take PD_Predictor* pd_predictor) {
   delete pd_predictor;
 }

+const char* PD_GetVersion() {
+  static std::string version = paddle_infer::GetVersion();
+  return version.c_str();
+}
+
 }  // extern "C"
diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.h b/paddle/fluid/inference/capi_exp/pd_predictor.h
index d4542d0b6d394d2ebc67e6f63b0b52cefb5939b3..33d5160bc3e0d1b1f14c2e9e34e1885ee8ae4f72 100644
--- a/paddle/fluid/inference/capi_exp/pd_predictor.h
+++ b/paddle/fluid/inference/capi_exp/pd_predictor.h
@@ -143,6 +143,13 @@ PADDLE_CAPI_EXPORT extern uint64_t PD_PredictorTryShrinkMemory(
 PADDLE_CAPI_EXPORT extern void PD_PredictorDestroy(
     __pd_take PD_Predictor* pd_predictor);

+///
+/// \brief Get version info.
+///
+/// \return version
+///
+PADDLE_CAPI_EXPORT extern const char* PD_GetVersion();
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/paddle/fluid/inference/capi_exp/pd_utils.cc b/paddle/fluid/inference/capi_exp/pd_utils.cc
index 2e762619f5567c3fce05272815f9a8a0f17d267c..94362b8784bb3501d38799296f88bbfaa05bb176 100644
--- a/paddle/fluid/inference/capi_exp/pd_utils.cc
+++ b/paddle/fluid/inference/capi_exp/pd_utils.cc
@@ -196,6 +196,8 @@ DataType CvtToCxxDatatype(PD_DataType data_type) {
       return DataType::INT32;
     case PD_DATA_UINT8:
       return DataType::UINT8;
+    case PD_DATA_INT8:
+      return DataType::INT8;
     default:
       PADDLE_THROW(paddle::platform::errors::InvalidArgument(
           "Unsupport paddle data type %d.", data_type));
diff --git a/paddle/fluid/inference/goapi/README.md b/paddle/fluid/inference/goapi/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..272a5a6108eee4579889a504f361cbce3905fed3
--- /dev/null
+++ b/paddle/fluid/inference/goapi/README.md
@@ -0,0 +1,107 @@
+# Paddle Inference golang API
+
+The Paddle Inference golang API is implemented on top of [capi](../capi_exp) and cgo; you need to prepare the C inference library in advance.
+
+## Installation
+
+1. Confirm the CommitId of the Paddle you are using
+
+You can confirm the CommitId of your Paddle checkout with `git log -1`.
+
+2. Fetch the golang paddle api with `go get`
+
+```
+# Use the CommitId recorded in the previous step, assumed here to be 76e5724
+COMMITID=76e5724
+go get -d -v github.com/paddlepaddle/paddle/paddle/fluid/inference/goapi@${COMMITID}
+```
+
+3. Download the C inference library
+
+You can either download the prebuilt [paddle_inference_c](https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/docs/user_guides/download_lib.md) library directly, or install it by building from source as described in the official documentation; note that `-DON_INFER=ON` must be enabled when running cmake, which produces `paddle_inference_c_install_dir` in the build directory.

+4. Create the symlink
+
+go1.15 added the `GOMODCACHE` environment variable, and `go get` downloads code into the `GOMODCACHE` directory by default. You can check this path with `go env | grep GOMODCACHE`; in the docker images published on the official site it usually defaults to `/root/gopath/pkg/mod`. Enter the golang api code path and create a symlink that names the C inference library `paddle_inference_c`.
+
+```bash
+eval $(go env | grep GOMODCACHE)
+# Adjust the trailing goapi version number as needed
+cd ${GOMODCACHE}/github.com/paddlepaddle/paddle/paddle/fluid/inference/goapi\@v0.0.0-20210517084506-76e5724c16a5/
+ln -s ${PADDLE_C_DOWNLOAD_DIR}/paddle_inference_c_install_dir paddle_inference_c
+```
+
+5. Run the unit tests to verify
+
+```
+bash test.sh
+```
+
+## Running Paddle inference from Go
+
+First, create the inference config:
+```go
+config := paddle.NewConfig()
+config.SetModel(model_file, params_file)
+```
+
+Create the predictor:
+```go
+predictor := paddle.NewPredictor(config)
+```
+
+Get the input and output handles:
+```go
+inNames := predictor.GetInputNames()
+inHandle := predictor.GetInputHandle(inNames[0])
+
+outNames := predictor.GetOutputNames()
+outHandle := predictor.GetOutputHandle(outNames[0])
+```
+
+Set the input data (assuming a single input):
+```go
+data := make([]float32, 1*3*224*224)
+for i := 0; i < len(data); i++ {
+	data[i] = float32(i%255) * 0.1
+}
+inHandle.Reshape([]int32{1, 3, 224, 224})
+inHandle.CopyFromCpu(data)
+```
+
+Set the LoD:
+```go
+lod := make([][]uint, 2)
+for i := 0; i < len(lod); i++ {
+	lod[i] = make([]uint, 2)
+	// set the inputs ...
+	lod[i][0] = 0
+	lod[i][1] = 10
+}
+inHandle.SetLod(lod)
+```
+
+Run inference:
+```go
+predictor.Run()
+```
+
+Read back the output tensor:
+```go
+func numElements(shape []int32) int32 {
+	n := int32(1)
+	for _, v := range shape {
+		n *= v
+	}
+	return n
+}
+
+outData := make([]float32, numElements(outHandle.Shape()))
+outHandle.CopyToCpu(outData)
+fmt.Println(outHandle.Lod())
+```
+
+## Example
+
+See [Paddle-Inference-Demo](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/go) for demos.
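Putting the README walkthrough together, a minimal end-to-end sketch composed from the snippets above (the model paths and input shape are assumptions borrowed from the demos, and error handling is omitted):

```go
package main

import (
	"fmt"

	paddle "github.com/paddlepaddle/paddle/paddle/fluid/inference/goapi"
)

// numElements multiplies out a shape, as in the README snippet.
func numElements(shape []int32) int32 {
	n := int32(1)
	for _, v := range shape {
		n *= v
	}
	return n
}

func main() {
	config := paddle.NewConfig()
	// Hypothetical combined-model paths.
	config.SetModel("data/model/__model__", "data/model/__params__")
	predictor := paddle.NewPredictor(config)

	// Fill the first input with a synthetic image-shaped tensor.
	inNames := predictor.GetInputNames()
	inHandle := predictor.GetInputHandle(inNames[0])
	data := make([]float32, 1*3*224*224)
	for i := range data {
		data[i] = float32(i%255) * 0.1
	}
	inHandle.Reshape([]int32{1, 3, 224, 224})
	inHandle.CopyFromCpu(data)

	predictor.Run()

	// Copy the first output back to host memory and print a few values
	// (assumes the output has at least ten elements).
	outNames := predictor.GetOutputNames()
	outHandle := predictor.GetOutputHandle(outNames[0])
	outData := make([]float32, numElements(outHandle.Shape()))
	outHandle.CopyToCpu(outData)
	fmt.Println(outData[:10])
}
```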
diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go
new file mode 100644
index 0000000000000000000000000000000000000000..9200de3d08f71c54f3778e324865712f97eafc9b
--- /dev/null
+++ b/paddle/fluid/inference/goapi/config.go
@@ -0,0 +1,735 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package paddle
+
+// #include "pd_config.h"
+// #include "pd_common.h"
+// #include "pd_types.h"
+// #include "pd_utils.h"
+// #include <stdbool.h>
+// #include <stdlib.h>
+import "C"
+import (
+	"unsafe"
+)
+
+type Precision C.PD_PrecisionType
+
+const (
+	PrecisionFloat32 Precision = C.PD_PRECISION_FLOAT32
+	PrecisionInt8    Precision = C.PD_PRECISION_INT8
+	PrecisionHalf    Precision = C.PD_PRECISION_HALF
+)
+
+type Config struct {
+	c *C.PD_Config
+}
+
+///
+/// \brief Create a new config.
+///
+func NewConfig() *Config {
+	cConfig := C.PD_ConfigCreate()
+	config := &Config{c: cConfig}
+	return config
+}
+
+///
+/// \brief Set the combined model with two specific paths for program and
+/// parameters.
+///
+/// \param model model file path of the combined model.
+/// \param params params file path of the combined model.
+/// +func (config *Config) SetModel(model, params string) { + cmodel := C.CString(model) + cparams := C.CString(params) + C.PD_ConfigSetModel(config.c, cmodel, cparams) + defer func() { + C.free(unsafe.Pointer(cmodel)) + C.free(unsafe.Pointer(cparams)) + }() +} + +/// +/// \brief Set the no-combined model dir path. +/// +/// \param modelDir model dir path. +/// +func (config *Config) SetModelDir(modelDir string) { + cmodel := C.CString(modelDir) + C.PD_ConfigSetModelDir(config.c, cmodel) + defer C.free(unsafe.Pointer(cmodel)) +} + +/// +/// \brief Set the model file path of a combined model. +/// +/// \param x model file path. +/// +func (config *Config) SetProgFile(model string) { + cmodel := C.CString(model) + C.PD_ConfigSetProgFile(config.c, cmodel) + defer C.free(unsafe.Pointer(cmodel)) +} + +/// +/// \brief Set the params file path of a combined model. +/// +/// \param x params file path. +/// +func (config *Config) SetParamsFile(params string) { + cparams := C.CString(params) + C.PD_ConfigSetParamsFile(config.c, cparams) + defer C.free(unsafe.Pointer(cparams)) +} + +/// +/// \brief Set the path of optimization cache directory. +/// +/// \param cacheDir the path of optimization cache directory. +/// +func (config *Config) SetOptimCacheDir(cacheDir string) { + ccacheDir := C.CString(cacheDir) + C.PD_ConfigSetOptimCacheDir(config.c, ccacheDir) + defer C.free(unsafe.Pointer(ccacheDir)) +} + +/// +/// \brief Get the model directory path. +/// +/// \return string The model directory path. +/// +func (config *Config) ModelDir() string { + return C.GoString(C.PD_ConfigGetModelDir(config.c)) +} + +/// +/// \brief Get the program file path. +/// +/// \return string The program file path. +/// +func (config *Config) ProgFile() string { + return C.GoString(C.PD_ConfigGetProgFile(config.c)) +} + +/// +/// \brief Get the combined parameters file. +/// +/// \return string The combined parameters file. +/// +func (config *Config) ParamsFile() string { + return C.GoString(C.PD_ConfigGetParamsFile(config.c)) +} + +/// +/// \brief Turn off FC Padding. +/// +func (config *Config) DisableFCPadding() { + C.PD_ConfigDisableFCPadding(config.c) +} + +/// +/// \brief A boolean state telling whether fc padding is used. +/// +/// \return bool Whether fc padding is used. +/// +func (config *Config) UseFcPadding() bool { + return cvtPDBoolToGo(C.PD_ConfigUseFcPadding(config.c)) +} + +/// +/// \brief Turn on GPU. +/// +/// \param memorySize initial size of the GPU memory pool in MB. +/// \param deviceId the GPU card to use. +/// +func (config *Config) EnableUseGpu(memorySize uint64, deviceId int32) { + C.PD_ConfigEnableUseGpu(config.c, C.uint64_t(memorySize), C.int32_t(deviceId)) +} + +/// +/// \brief Turn on XPU. +/// +/// \param l3_workspace_size The size of the video memory allocated by the l3 cache, the maximum is 16M. +/// \param locked Whether the allocated L3 cache can be locked. If false, it means that the L3 cache is not locked, and the allocated L3 cache can be shared by multiple models, and multiple models sharing the L3 cache will be executed sequentially on the card. +/// \param autotune Whether to autotune the conv operator in the model. If true, when the conv operator of a certain dimension is executed for the first time, it will automatically search for a better algorithm to improve the performance of subsequent conv operators of the same dimension. +/// \param autotune_file Specify the path of the autotune file. 
If autotune_file is specified, the algorithm specified in the file will be used and autotune will not be performed again. +/// \param precision Calculation accuracy of multi_encoder +/// \param adaptive_seqlen Is the input of multi_encoder variable length +/// +func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune bool, autotuneFile string, precision string, adaptiveSeqlen bool) { + cAutotuneFile := C.CString(autotuneFile) + cPrecision := C.CString(precision) + defer func() { + C.free(unsafe.Pointer(cAutotuneFile)) + C.free(unsafe.Pointer(cPrecision)) + }() + C.PD_ConfigEnableXpu(config.c, C.int32_t(l3WorkspaceSize), cvtGoBoolToPD(locked), cvtGoBoolToPD(autotune), + cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen)) +} + +/// +/// \brief A boolean state telling whether the GPU is turned on. +/// +/// \return bool Whether the GPU is turned on. +/// +func (config *Config) UseGpu() bool { + return cvtPDBoolToGo(C.PD_ConfigUseGpu(config.c)) +} + +/// +/// \brief A boolean state telling whether the XPU is turned on. +/// +/// \return bool Whether the XPU is turned on. +/// +func (config *Config) UseXpu() bool { + return cvtPDBoolToGo(C.PD_ConfigUseXpu(config.c)) +} + +/// +/// \brief Get the GPU device id. +/// +/// \return int32 The GPU device id. +/// +func (config *Config) GpuDeviceId() int32 { + return int32(C.PD_ConfigGpuDeviceId(config.c)) +} + +/// +/// \brief Get the XPU device id. +/// +/// \return int32 The XPU device id. +/// +func (config *Config) XpuDeviceId() int32 { + return int32(C.PD_ConfigXpuDeviceId(config.c)) +} + +/// +/// \brief Get the initial size in MB of the GPU memory pool. +/// +/// \return int32 The initial size in MB of the GPU memory pool. +/// +func (config *Config) MemoryPoolInitSizeMb() int32 { + return int32(C.PD_ConfigMemoryPoolInitSizeMb(config.c)) +} + +/// +/// \brief Get the proportion of the initial memory pool size compared to the +/// device. +/// +/// \return float32 The proportion of the initial memory pool size. +/// +func (config *Config) FractionOfGpuMemoryForPool() float32 { + return float32(C.PD_ConfigFractionOfGpuMemoryForPool(config.c)) +} + +/// +/// \brief Control whether to perform IR graph optimization. +/// If turned off, the AnalysisConfig will act just like a NativeConfig. +/// +/// \param x Whether the ir graph optimization is actived. +/// +func (config *Config) SwitchIrOptim(x bool) { + C.PD_ConfigSwitchIrOptim(config.c, cvtGoBoolToPD(x)) +} + +/// +/// \brief A boolean state telling whether the ir graph optimization is +/// actived. +/// +/// \return bool Whether to use ir graph optimization. +/// +// bool ir_optim() const { return enable_ir_optim_; } +func (config *Config) IrOptim() bool { + return cvtPDBoolToGo(C.PD_ConfigIrOptim(config.c)) +} + +/// +/// \brief Turn on the TensorRT engine. +/// The TensorRT engine will accelerate some subgraphes in the original Fluid +/// computation graph. In some models such as resnet50, GoogleNet and so on, +/// it gains significant performance acceleration. +/// +/// \param workspaceSize The memory size(in byte) used for TensorRT +/// workspace. +/// \param maxBatchSize The maximum batch size of this prediction task, +/// better set as small as possible for less performance loss. +/// \param minSubgraphSize The minimum TensorRT subgraph size needed, if a +/// subgraph is smaller than this, it will not be transferred to TensorRT +/// engine. +/// \param precision The precision used in TensorRT. 
+/// \param useStatic Serialize optimization information to disk for reusing. +/// \param useCalibMode Use TRT int8 calibration(post training +/// quantization). +/// +func (config *Config) EnableTensorRtEngine(workspaceSize int32, maxBatchSize int32, minSubgraphSize int32, + precision Precision, useStatic bool, useCalibMode bool) { + C.PD_ConfigEnableTensorRtEngine(config.c, C.int32_t(workspaceSize), C.int32_t(maxBatchSize), C.int32_t(minSubgraphSize), C.int32_t(precision), cvtGoBoolToPD(useStatic), cvtGoBoolToPD(useCalibMode)) +} + +/// +/// \brief A boolean state telling whether the TensorRT engine is used. +/// +/// \return bool Whether the TensorRT engine is used. +/// +func (config *Config) TensorRtEngineEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigTensorRtEngineEnabled(config.c)) +} + +/// +/// \brief Set min, max, opt shape for TensorRT Dynamic shape mode. +/// \param minInputShape The min input shape of the subgraph input. +/// \param maxInputShape The max input shape of the subgraph input. +/// \param optimInputShape The opt input shape of the subgraph input. +/// \param disableTrtPluginFp16 Setting this parameter to true means that +/// TRT plugin will not run fp16. +/// +func (config *Config) SetTRTDynamicShapeInfo(minInputShape map[string][]int32, maxInputShape map[string][]int32, + optimInputShape map[string][]int32, disableTrtPluginFp16 bool) { + + tensorNum := uint(len(minInputShape)) + names := make([](*C.char), tensorNum) + goNames := make([]string, tensorNum) + var shapeNum []uint + + idx := 0 + for n := range minInputShape { + char := C.CString(n) + defer C.free(unsafe.Pointer(char)) + names[idx] = (*C.char)(unsafe.Pointer(char)) + goNames[idx] = n + shapeNum = append(shapeNum, uint(len(minInputShape[n]))) + idx++ + } + + cMinInputShape := make([]*C.int32_t, len(goNames)) + cMaxInputShape := make([]*C.int32_t, len(goNames)) + cOptInputShape := make([]*C.int32_t, len(goNames)) + for i, n := range goNames { + pMin := (*C.int32_t)(C.malloc(C.size_t(C.sizeof_int32_t * len(minInputShape[n])))) + cMinInputShape[i] = pMin + + // A []C.int32_t slice backed by C memory. + // See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices + // Using [1<<27] instead of [1<<30] so it works on 32-bit architecture + pMinData := (*[1 << 27]C.int32_t)(unsafe.Pointer(pMin)) + for j, v := range minInputShape[n] { + (*pMinData)[j] = C.int32_t(v) + } + defer C.free(unsafe.Pointer(pMin)) + + pMax := (*C.int32_t)(C.malloc(C.size_t(C.sizeof_int32_t * len(maxInputShape[n])))) + cMaxInputShape[i] = pMax + pMaxData := (*[1 << 27]C.int32_t)(unsafe.Pointer(pMax)) + for j, v := range maxInputShape[n] { + (*pMaxData)[j] = C.int32_t(v) + } + defer C.free(unsafe.Pointer(pMax)) + + pOpt := (*C.int32_t)(C.malloc(C.size_t(C.sizeof_int32_t * len(optimInputShape[n])))) + cOptInputShape[i] = pOpt + pOptData := (*[1 << 27]C.int32_t)(unsafe.Pointer(pOpt)) + for j, v := range optimInputShape[n] { + (*pOptData)[j] = C.int32_t(v) + } + defer C.free(unsafe.Pointer(pOpt)) + } + + C.PD_ConfigSetTrtDynamicShapeInfo(config.c, C.size_t(tensorNum), (**C.char)(unsafe.Pointer(&names[0])), + (*C.size_t)(unsafe.Pointer(&shapeNum[0])), + (**C.int32_t)(unsafe.Pointer(&cMinInputShape[0])), + (**C.int32_t)(unsafe.Pointer(&cMaxInputShape[0])), + (**C.int32_t)(unsafe.Pointer(&cOptInputShape[0])), + cvtGoBoolToPD(disableTrtPluginFp16)) +} + +/// +/// \brief Prevent ops running in Paddle-TRT +/// NOTE: just experimental, not an official stable API, easy to be broken. 
+/// +func (config *Config) DisableTensorRtOPs(ops []string) { + num := uint(len(ops)) + var buf = make([]*C.char, num+1) + for i, _ := range ops { + char := C.CString(ops[i]) + defer C.free(unsafe.Pointer(char)) + buf[i] = (*C.char)(unsafe.Pointer(char)) + } + + C.PD_ConfigDisableTensorRtOPs(config.c, C.size_t(num), (**C.char)(unsafe.Pointer(&buf[0]))) +} + +/// +/// \brief Replace some TensorRT plugins to TensorRT OSS( +/// https://github.com/NVIDIA/TensorRT), with which some models's inference +/// may be more high-performance. Libnvinfer_plugin.so greater than +/// V7.2.1 is needed. +/// +func (config *Config) EnableTensorRtOSS() { + C.PD_ConfigEnableTensorRtOSS(config.c) +} + +/// +/// \brief A boolean state telling whether to use the TensorRT OSS. +/// +/// \return bool Whether to use the TensorRT OSS. +/// +func (config *Config) TensorrtOssEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigTensorRtOssEnabled(config.c)) +} + +/// +/// \brief Enable TensorRT DLA +/// \param dlaCore ID of DLACore, which should be 0, 1, +/// ..., IBuilder.getNbDLACores() - 1 +/// +func (config *Config) EnableTensorRtDLA(dlaCore int32) { + C.PD_ConfigEnableTensorRtDla(config.c, C.int32_t(dlaCore)) +} + +/// +/// \brief A boolean state telling whether to use the TensorRT DLA. +/// +/// \return bool Whether to use the TensorRT DLA. +/// +func (config *Config) TensorrtDlaEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigTensorRtDlaEnabled(config.c)) +} + +/// +/// \brief Turn on the usage of Lite sub-graph engine. +/// +/// \param precision Precion used in Lite sub-graph engine. +/// \param zeroCopy Set the zero copy mode. +/// \param passesFilter Set the passes used in Lite sub-graph engine. +/// \param opsFilter Operators not supported by Lite. +/// +func (config *Config) EnableLiteEngine(precision Precision, zeroCopy bool, passesFilter []string, opsFilter []string) { + passesFilterNum := uint(len(passesFilter)) + var passesFilterBuf = make([]*C.char, passesFilterNum+1) + for i, _ := range passesFilter { + char := C.CString(passesFilter[i]) + defer C.free(unsafe.Pointer(char)) + passesFilterBuf[i] = (*C.char)(unsafe.Pointer(char)) + } + + opsFilterNum := uint(len(opsFilter)) + var opsFilterBuf = make([]*C.char, passesFilterNum+1) + for i, _ := range opsFilter { + char := C.CString(opsFilter[i]) + defer C.free(unsafe.Pointer(char)) + opsFilterBuf[i] = (*C.char)(unsafe.Pointer(char)) + } + + C.PD_ConfigEnableLiteEngine(config.c, C.int32_t(precision), cvtGoBoolToPD(zeroCopy), C.size_t(passesFilterNum), (**C.char)(unsafe.Pointer(&passesFilterBuf[0])), C.size_t(opsFilterNum), (**C.char)(unsafe.Pointer(&opsFilterBuf[0]))) +} + +/// +/// \brief A boolean state indicating whether the Lite sub-graph engine is +/// used. +/// +/// \return bool whether the Lite sub-graph engine is used. +/// +func (config *Config) LiteEngineEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigLiteEngineEnabled(config.c)) +} + +/// +/// \brief Control whether to debug IR graph analysis phase. +/// This will generate DOT files for visualizing the computation graph after +/// each analysis pass applied. +/// +/// \param x whether to debug IR graph analysis phase. +/// +func (config *Config) SwitchIrDebug(x bool) { + C.PD_ConfigSwitchIrDebug(config.c, cvtGoBoolToPD(x)) +} + +/// +/// \brief Turn on MKLDNN. +/// +func (config *Config) EnableMKLDNN() { + C.PD_ConfigEnableMKLDNN(config.c) +} + +/// +/// \brief Set the cache capacity of different input shapes for MKLDNN. +/// Default value 0 means not caching any shape. 
+/// Please see MKL-DNN Data Caching Design Document:
+/// https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/mkldnn/caching/caching.md
+///
+/// \param capacity The cache capacity.
+///
+func (config *Config) SetMkldnnCacheCapacity(capacity int32) {
+    C.PD_ConfigSetMkldnnCacheCapacity(config.c, C.int32_t(capacity))
+}
+
+///
+/// \brief A boolean state telling whether MKLDNN is used.
+///
+/// \return bool Whether MKLDNN is used.
+///
+func (config *Config) MkldnnEnabled() bool {
+    return cvtPDBoolToGo(C.PD_ConfigMkldnnEnabled(config.c))
+}
+
+///
+/// \brief Set the number of CPU math library threads.
+///
+/// \param mathThreadsNum The number of CPU math library
+/// threads.
+///
+func (config *Config) SetCpuMathLibraryNumThreads(mathThreadsNum int) {
+    C.PD_ConfigSetCpuMathLibraryNumThreads(config.c, C.int32_t(mathThreadsNum))
+}
+
+///
+/// \brief An int state telling how many threads are used in the CPU math
+/// library.
+///
+/// \return int The number of threads used in the CPU math library.
+///
+func (config *Config) CpuMathLibraryNumThreads() int32 {
+    return int32(C.PD_ConfigGetCpuMathLibraryNumThreads(config.c))
+}
+
+///
+/// \brief Transform the AnalysisConfig to NativeConfig.
+/// Not yet exposed in the Go API; kept here to mirror the C++ interface.
+///
+/// \return NativeConfig The NativeConfig transformed.
+///
+// NativeConfig ToNativeConfig() const;
+
+///
+/// \brief Specify the operator type list to use MKLDNN acceleration.
+///
+/// \param opList The operator type list.
+///
+func (config *Config) SetMKLDNNOp(opList []string) {
+    num := uint(len(opList))
+    // Add one in case num is zero.
+    var buf = make([]*C.char, num+1)
+    for i := range opList {
+        char := C.CString(opList[i])
+        defer C.free(unsafe.Pointer(char))
+        buf[i] = (*C.char)(unsafe.Pointer(char))
+    }
+
+    C.PD_ConfigSetMkldnnOp(config.c, C.size_t(num), (**C.char)(unsafe.Pointer(&buf[0])))
+}
+
+///
+/// \brief Turn on MKLDNN quantization.
+///
+func (config *Config) EnableMkldnnQuantizer() {
+    C.PD_ConfigEnableMkldnnQuantizer(config.c)
+}
+
+///
+/// \brief Turn on MKLDNN bfloat16.
+///
+func (config *Config) EnableMkldnnBfloat16() {
+    C.PD_ConfigEnableMkldnnBfloat16(config.c)
+}
+
+///
+/// \brief A boolean state telling whether MKLDNN bfloat16 is used.
+///
+/// \return bool Whether MKLDNN bfloat16 is used.
+///
+func (config *Config) MkldnnBfloat16Enabled() bool {
+    return cvtPDBoolToGo(C.PD_ConfigMkldnnBfloat16Enabled(config.c))
+}
+
+///
+/// \brief Specify the operator type list to use bfloat16 acceleration.
+///
+/// \param opList The operator type list.
+///
+func (config *Config) SetBfloat16Op(opList []string) {
+    num := uint(len(opList))
+    // Add one in case num is zero.
+    var buf = make([]*C.char, num+1)
+    for i := range opList {
+        char := C.CString(opList[i])
+        defer C.free(unsafe.Pointer(char))
+        buf[i] = (*C.char)(unsafe.Pointer(char))
+    }
+
+    C.PD_ConfigSetBfloat16Op(config.c, C.size_t(num), (**C.char)(unsafe.Pointer(&buf[0])))
+}
+
+///
+/// \brief A boolean state telling whether the thread local CUDA stream is
+/// enabled.
+///
+/// \return bool Whether the thread local CUDA stream is enabled.
+///
+func (config *Config) ThreadLocalStreamEnabled() bool {
+    return cvtPDBoolToGo(C.PD_ConfigThreadLocalStreamEnabled(config.c))
+}
+
+///
+/// \brief A boolean state telling whether the MKLDNN quantization is enabled.
+///
+/// \return bool Whether the MKLDNN quantization is enabled.
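+/// Illustrative check (sketch): quantizerOn := config.MkldnnQuantizerEnabled()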
+///
+func (config *Config) MkldnnQuantizerEnabled() bool {
+    return cvtPDBoolToGo(C.PD_ConfigMkldnnQuantizerEnabled(config.c))
+}
+
+///
+/// \brief Specify the memory buffers of the program and the parameters.
+/// Used when the model and the params are loaded directly from memory.
+///
+/// \param prog The memory buffer of the program.
+/// \param params The memory buffer of the combined parameters file.
+///
+func (config *Config) SetModelBuffer(prog, params string) {
+    cProg := C.CString(prog)
+    cParams := C.CString(params)
+    defer func() {
+        C.free(unsafe.Pointer(cProg))
+        C.free(unsafe.Pointer(cParams))
+    }()
+
+    C.PD_ConfigSetModelBuffer(config.c, cProg, C.size_t(len(prog)), cParams, C.size_t(len(params)))
+}
+
+///
+/// \brief A boolean state telling whether the model is set from the CPU
+/// memory.
+///
+/// \return bool Whether the model and the params are loaded directly from
+/// memory.
+///
+func (config *Config) ModelFromMemory() bool {
+    return cvtPDBoolToGo(C.PD_ConfigModelFromMemory(config.c))
+}
+
+///
+/// \brief Turn on memory optimization.
+/// NOTE: still in development.
+///
+func (config *Config) EnableMemoryOptim() {
+    C.PD_ConfigEnableMemoryOptim(config.c)
+}
+
+///
+/// \brief A boolean state telling whether the memory optimization is
+/// activated.
+///
+/// \return bool Whether the memory optimization is activated.
+///
+func (config *Config) MemoryOptimEnabled() bool {
+    return cvtPDBoolToGo(C.PD_ConfigMemoryOptimEnabled(config.c))
+}
+
+///
+/// \brief Turn on the profiling report.
+/// If not turned on, no profiling report will be generated.
+///
+func (config *Config) EnableProfile() {
+    C.PD_ConfigEnableProfile(config.c)
+}
+
+///
+/// \brief A boolean state telling whether the profiler is activated.
+///
+/// \return bool Whether the profiler is activated.
+///
+func (config *Config) ProfileEnabled() bool {
+    return cvtPDBoolToGo(C.PD_ConfigProfileEnabled(config.c))
+}
+
+///
+/// \brief Mute all logs in Paddle inference.
+///
+func (config *Config) DisableGlogInfo() {
+    C.PD_ConfigDisableGlogInfo(config.c)
+}
+
+///
+/// \brief A boolean state telling whether logs in Paddle inference are muted.
+///
+/// \return bool Whether logs in Paddle inference are muted.
+///
+func (config *Config) GlogInfoDisabled() bool {
+    return cvtPDBoolToGo(C.PD_ConfigGlogInfoDisabled(config.c))
+}
+
+///
+/// \brief A boolean state telling whether the AnalysisConfig is valid.
+///
+/// \return bool Whether the AnalysisConfig is valid.
+///
+func (config *Config) IsValid() bool {
+    return cvtPDBoolToGo(C.PD_ConfigIsValid(config.c))
+}
+
+///
+/// \brief Enable the GPU multi-computing stream feature.
+/// NOTE: The current behavior of this interface is to bind the computation
+/// stream to the thread, and this behavior may be changed in the future.
+///
+func (config *Config) EnableGpuMultiStream() {
+    C.PD_ConfigEnableGpuMultiStream(config.c)
+}
+
+///
+/// \brief Delete all passes that have a certain type 'pass'.
+///
+/// \param[in] pass the certain pass type to be deleted.
+///
+func (config *Config) DeletePass(pass string) {
+    cPass := C.CString(pass)
+    C.PD_ConfigDeletePass(config.c, cPass)
+    C.free(unsafe.Pointer(cPass))
+}
+
+///
+/// \brief Append a pass to the end of the pass list.
+///
+/// \param[in] pass the new pass.
+///
+func (config *Config) AppendPass(pass string) {
+    cPass := C.CString(pass)
+    C.PD_ConfigAppendPass(config.c, cPass)
+    C.free(unsafe.Pointer(cPass))
+}
+
+///
+/// \brief Insert a pass at a specific position.
+///
+/// \param[in] idx the position to insert.
+/// \param[in] pass the new pass.
+///
+func (config *Config) InsertPass(idx uint64, pass string) {
+    cPass := C.CString(pass)
+    C.PD_ConfigInsertPass(config.c, C.size_t(idx), cPass)
+    C.free(unsafe.Pointer(cPass))
+}
+
+///
+/// \brief Get information of passes.
+///
+/// \return The list of passes.
+///
+func (config *Config) AllPasses() []string {
+    cPasses := C.PD_ConfigAllPasses(config.c)
+    num := int(cPasses.size)
+    passes := cvtToGoSliceString(num, cPasses.data)
+    C.PD_OneDimArrayCstrDestroy(cPasses)
+    return passes
+}
diff --git a/paddle/fluid/inference/goapi/config_test.go b/paddle/fluid/inference/goapi/config_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..e7b2c956a924ae201be3cbc9a8a299ab053d8142
--- /dev/null
+++ b/paddle/fluid/inference/goapi/config_test.go
@@ -0,0 +1,122 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package paddle
+
+import "testing"
+
+func TestNewConfig(t *testing.T) {
+    config := NewConfig()
+    config.SetProgFile("model")
+    config.SetParamsFile("params")
+
+    config.SetOptimCacheDir("cache")
+
+    config.DisableFCPadding()
+    t.Logf("UseFcPadding:%+v", config.UseFcPadding())
+
+    // This would fail without an XPU environment, so it stays commented out.
+    // config.EnableXpu(100)
+    // t.Logf("EnableXpu, UseXpu:%+v ", config.UseXpu())
+
+    config.SwitchIrOptim(true)
+    t.Logf("IrOptim:%+v", config.IrOptim())
+
+    config.EnableUseGpu(100, 0)
+    t.Logf("use_gpu:%+v, gpu_id:%+v", config.UseGpu(), config.GpuDeviceId())
+    t.Logf("MemoryPoolInitSizeMb:%+v, FractionOfGpuMemoryForPool:%+v", config.MemoryPoolInitSizeMb(), config.FractionOfGpuMemoryForPool())
+
+    config.EnableTensorRtEngine(1024, 16, 3, PrecisionFloat32, false, false)
+    t.Logf("TensorRtEngineEnabled:%+v", config.TensorRtEngineEnabled())
+
+    minInputShape := map[string][]int32{
+        "image": []int32{-1, 3, 100, 100},
+        "shape": []int32{-1, 2},
+    }
+    maxInputShape := map[string][]int32{
+        "image": []int32{-1, 3, 608, 608},
+        "shape": []int32{-1, 2},
+    }
+    optInputShape := map[string][]int32{
+        "image": []int32{-1, 3, 406, 406},
+        "shape": []int32{-1, 2},
+    }
+    config.SetTRTDynamicShapeInfo(minInputShape, maxInputShape, optInputShape, false)
+
+    config.EnableTensorRtOSS()
+    t.Logf("TensorrtOssEnabled:%+v", config.TensorrtOssEnabled())
+
+    config.EnableTensorRtDLA(0)
+    t.Logf("TensorrtDlaEnabled:%+v", config.TensorrtDlaEnabled())
+
+    config.DisableTensorRtOPs([]string{"mul", "fc"})
+
+    config.EnableGpuMultiStream()
+    t.Logf("ThreadLocalStreamEnabled:%+v", config.ThreadLocalStreamEnabled())
+
+    config.SwitchIrDebug(false)
+
+    config.EnableMKLDNN()
+
+    config.EnableMemoryOptim()
+    t.Logf("MemoryOptimEnabled:%+v", config.MemoryOptimEnabled())
+
+    config.EnableProfile()
+    t.Logf("ProfileEnabled:%+v", config.ProfileEnabled())
+
+    config.DisableGlogInfo()
+    t.Logf("GlogInfoDisabled:%+v", config.GlogInfoDisabled())
+
+    t.Logf("IsValid:%+v", config.IsValid())
+
+    config.AppendPass("test_pass")
+    t.Logf("After AppendPass, AllPasses:%+v", config.AllPasses())
+
+    config.DeletePass("test_pass")
+    t.Logf("After DeletePass, AllPasses:%+v", config.AllPasses())
+}
+
+func TestLite(t *testing.T) {
+    config := NewConfig()
+    config.SetModel("model", "params")
+    t.Log(config.ProgFile())
+    t.Log(config.ParamsFile())
+
+    config.EnableLiteEngine(PrecisionFloat32, true, []string{}, []string{})
+    t.Logf("LiteEngineEnabled:%+v", config.LiteEngineEnabled())
+}
+
+func TestMkldnn(t *testing.T) {
+    config := NewConfig()
+    config.SetModelDir("modelDir")
+    t.Log(config.ModelDir())
+
+    config.EnableMKLDNN()
+    t.Logf("MkldnnEnabled:%+v", config.MkldnnEnabled())
+
+    config.SetMkldnnCacheCapacity(4)
+
+    config.SetCpuMathLibraryNumThreads(4)
+    t.Logf("CpuMathLibraryNumThreads:%+v", config.CpuMathLibraryNumThreads())
+
+    config.SetMKLDNNOp([]string{"fc", "conv"})
+
+    config.EnableMkldnnQuantizer()
+    t.Logf("MkldnnQuantizerEnabled:%+v", config.MkldnnQuantizerEnabled())
+
+    config.EnableMkldnnBfloat16()
+    t.Logf("MkldnnBfloat16Enabled:%+v", config.MkldnnBfloat16Enabled())
+
+    config.SetBfloat16Op([]string{"fc", "mul"})
+}
diff --git a/paddle/fluid/inference/goapi/go.mod b/paddle/fluid/inference/goapi/go.mod
new file mode 100644
index 0000000000000000000000000000000000000000..1036a2e3281901ae4cb813dcd1017caa7dacc4e1
--- /dev/null
+++ b/paddle/fluid/inference/goapi/go.mod
@@ -0,0 +1,3 @@
+module github.com/jiweibo/paddle/paddle/fluid/inference/goapi
+
+go 1.15
diff --git a/paddle/fluid/inference/goapi/lib.go b/paddle/fluid/inference/goapi/lib.go
new file mode 100644
index 0000000000000000000000000000000000000000..b87561577714fe97a62b74645a7f7cfbb14dce06
--- /dev/null
+++ b/paddle/fluid/inference/goapi/lib.go
@@ -0,0 +1,19 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package paddle
+
+// #cgo CFLAGS: -I${SRCDIR}/paddle_inference_c/paddle/include
+// #cgo LDFLAGS: -L${SRCDIR}/paddle_inference_c/paddle/lib -lpaddle_inference_c
import "C"
diff --git a/paddle/fluid/inference/goapi/predictor.go b/paddle/fluid/inference/goapi/predictor.go
new file mode 100644
index 0000000000000000000000000000000000000000..fb8c8892b6676e210e6304ed6db076a3c20178d8
--- /dev/null
+++ b/paddle/fluid/inference/goapi/predictor.go
@@ -0,0 +1,166 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package paddle
+
+// #include "pd_predictor.h"
+// #include "pd_tensor.h"
+// #include "pd_common.h"
+// #include "pd_types.h"
+// #include "pd_utils.h"
+// #include <stdio.h>
+// #include <stdlib.h>
+import "C"
+import (
+    "runtime"
+    "unsafe"
+)
+
+type Predictor struct {
+    c *C.PD_Predictor
+}
+
+///
+/// \brief Create a new Predictor
+///
+/// \param[in] Config config
+/// \return new predictor.
+///
+func NewPredictor(config *Config) *Predictor {
+    cPredictor := C.PD_PredictorCreate(config.c)
+    predictor := &Predictor{c: cPredictor}
+    runtime.SetFinalizer(predictor, func(predictor *Predictor) {
+        C.PD_PredictorDestroy(predictor.c)
+    })
+    return predictor
+}
+
+///
+/// \brief Clone a new Predictor
+///
+/// \return new predictor.
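+/// A common pattern (sketch, not prescribed by this API): keep one base
+/// predictor and give each worker its own clone, e.g.
+///
+///     worker := basePredictor.Clone()
+///     worker.Run()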
+///
+func (p *Predictor) Clone() *Predictor {
+    cPredictor := C.PD_PredictorClone(p.c)
+    predictor := &Predictor{c: cPredictor}
+    runtime.SetFinalizer(predictor, func(predictor *Predictor) {
+        C.PD_PredictorDestroy(predictor.c)
+    })
+    return predictor
+}
+
+///
+/// \brief Get the input number
+///
+/// \return input number
+///
+func (p *Predictor) GetInputNum() uint {
+    return uint(C.PD_PredictorGetInputNum(p.c))
+}
+
+///
+/// \brief Get the output number
+///
+/// \return output number
+///
+func (p *Predictor) GetOutputNum() uint {
+    return uint(C.PD_PredictorGetOutputNum(p.c))
+}
+
+///
+/// \brief Get the input names
+///
+/// \return input names
+///
+func (p *Predictor) GetInputNames() []string {
+    cNames := C.PD_PredictorGetInputNames(p.c)
+    numNames := int(cNames.size)
+    names := cvtToGoSliceString(numNames, cNames.data)
+    C.PD_OneDimArrayCstrDestroy(cNames)
+    return names
+}
+
+///
+/// \brief Get the output names
+///
+/// \return output names
+///
+func (p *Predictor) GetOutputNames() []string {
+    cNames := C.PD_PredictorGetOutputNames(p.c)
+    numNames := int(cNames.size)
+    names := cvtToGoSliceString(numNames, cNames.data)
+    C.PD_OneDimArrayCstrDestroy(cNames)
+    return names
+}
+
+///
+/// \brief Get the Input Tensor object
+///
+/// \param[in] name input name
+/// \return input tensor
+///
+func (p *Predictor) GetInputHandle(name string) *Tensor {
+    cName := C.CString(name)
+    cHandle := C.PD_PredictorGetInputHandle(p.c, cName)
+    C.free(unsafe.Pointer(cName))
+    handle := &Tensor{c: cHandle}
+    runtime.SetFinalizer(handle, func(handle *Tensor) {
+        C.PD_TensorDestroy(handle.c)
+    })
+    return handle
+}
+
+///
+/// \brief Get the Output Tensor object
+///
+/// \param[in] name output name
+/// \return output tensor
+///
+func (p *Predictor) GetOutputHandle(name string) *Tensor {
+    cName := C.CString(name)
+    cHandle := C.PD_PredictorGetOutputHandle(p.c, cName)
+    C.free(unsafe.Pointer(cName))
+    handle := &Tensor{c: cHandle}
+    runtime.SetFinalizer(handle, func(handle *Tensor) {
+        C.PD_TensorDestroy(handle.c)
+    })
+    return handle
+}
+
+///
+/// \brief Run the prediction engine
+///
+func (p *Predictor) Run() {
+    C.PD_PredictorRun(p.c)
+}
+
+///
+/// \brief Clear the intermediate tensors of the predictor
+///
+func (p *Predictor) ClearIntermediateTensor() {
+    C.PD_PredictorClearIntermediateTensor(p.c)
+}
+
+///
+/// \brief Release all temporary tensors to compress the size of the memory
+/// pool. The memory pool is considered to be composed of a list of chunks;
+/// if a chunk is not occupied, it can be released.
+/// NOTE: The underlying C++ API reports the number of bytes released, but
+/// this Go wrapper does not return it.
+///
+func (p *Predictor) TryShrinkMemory() {
+    C.PD_PredictorTryShrinkMemory(p.c)
+}
diff --git a/paddle/fluid/inference/goapi/predictor_test.go b/paddle/fluid/inference/goapi/predictor_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..a5df1048ca2a56901dd8203affbed3ed36b2a075
--- /dev/null
+++ b/paddle/fluid/inference/goapi/predictor_test.go
@@ -0,0 +1,115 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package paddle
+
+import (
+    "io/ioutil"
+    "os"
+    "testing"
+)
+
+func TestNewPredictor(t *testing.T) {
+    t.Logf("Version:\n%+v", Version())
+    config := NewConfig()
+    config.SetModel("./mobilenetv1/inference.pdmodel", "./mobilenetv1/inference.pdiparams")
+    config.EnableUseGpu(100, 0)
+    predictor := NewPredictor(config)
+    inNames := predictor.GetInputNames()
+    t.Logf("InputNames:%+v", inNames)
+    outNames := predictor.GetOutputNames()
+    t.Logf("OutputNames:%+v", outNames)
+
+    inHandle := predictor.GetInputHandle(inNames[0])
+    inHandle.Reshape([]int32{1, 3, 224, 224})
+    t.Logf("inHandle name:%+v, shape:%+v", inHandle.Name(), inHandle.Shape())
+
+    var lod [][]uint
+    lod = append(lod, []uint{0, 1, 2})
+    lod = append(lod, []uint{1, 2, 3, 4})
+    inHandle.SetLod(lod)
+    t.Logf("inHandle Lod:%+v", inHandle.Lod())
+    data := make([]float32, numElements([]int32{1, 3, 224, 224}))
+    for i := 0; i < int(numElements([]int32{1, 3, 224, 224})); i++ {
+        data[i] = float32(i%255) * 0.1
+    }
+    inHandle.CopyFromCpu(data)
+    t.Logf("inHandle Type:%+v", inHandle.Type())
+
+    predictor.Run()
+
+    outHandle := predictor.GetOutputHandle(outNames[0])
+    t.Logf("outHandle name:%+v", outHandle.Name())
+
+    outShape := outHandle.Shape()
+    t.Logf("outHandle Shape:%+v", outShape)
+    outData := make([]float32, numElements(outShape))
+    outHandle.CopyToCpu(outData)
+    t.Log(outData)
+
+    cloned := predictor.Clone()
+    t.Logf("InputNum:%+v", cloned.GetInputNum())
+    t.Logf("OutputNum:%+v", cloned.GetOutputNum())
+    cloned.ClearIntermediateTensor()
+}
+
+func TestFromBuffer(t *testing.T) {
+    modelFile, err := os.Open("./mobilenetv1/inference.pdmodel")
+    if err != nil {
+        t.Fatal(err)
+    }
+    // Close each file as soon as it is successfully opened, so the model
+    // file is not leaked if opening the params file fails below.
+    defer modelFile.Close()
+    paramsFile, err := os.Open("./mobilenetv1/inference.pdiparams")
+    if err != nil {
+        t.Fatal(err)
+    }
+    defer paramsFile.Close()
+
+    model, err := ioutil.ReadAll(modelFile)
+    if err != nil {
+        t.Fatal(err)
+    }
+    params, err := ioutil.ReadAll(paramsFile)
+    if err != nil {
+        t.Fatal(err)
+    }
+    config := NewConfig()
+    config.SetModelBuffer(string(model), string(params))
+
+    predictor := NewPredictor(config)
+    inNames := predictor.GetInputNames()
+    outNames := predictor.GetOutputNames()
+    inHandle := predictor.GetInputHandle(inNames[0])
+    inHandle.Reshape([]int32{1, 3, 224, 224})
+    data := make([]float32, numElements([]int32{1, 3, 224, 224}))
+    for i := 0; i < int(numElements([]int32{1, 3, 224, 224})); i++ {
+        data[i] = float32(i%255) * 0.1
+    }
+    inHandle.CopyFromCpu(data)
+    predictor.Run()
+    outHandle := predictor.GetOutputHandle(outNames[0])
+    outShape := outHandle.Shape()
+    t.Logf("outHandle Shape:%+v", outShape)
+    outData := make([]float32, numElements(outShape))
+    outHandle.CopyToCpu(outData)
+    t.Log(outData)
+}
+
+func numElements(shape []int32) int32 {
+    n := int32(1)
+    for _, v := range shape {
+        n *= v
+    }
+    return n
+}
diff --git a/paddle/fluid/inference/goapi/tensor.go b/paddle/fluid/inference/goapi/tensor.go
new file mode 100644
index 0000000000000000000000000000000000000000..b4ad1d8f766c7596d6fc767040428ba468736649
--- /dev/null
+++ b/paddle/fluid/inference/goapi/tensor.go
@@ -0,0 +1,240 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package paddle
+
+// #include "pd_tensor.h"
+// #include "pd_utils.h"
+// #include "pd_types.h"
+// #include "pd_common.h"
+// #include "stdlib.h"
+import "C"
+import (
+    "fmt"
+    "reflect"
+    "unsafe"
+)
+
+type DataType C.PD_DataType
+
+const (
+    Unk     DataType = C.PD_DATA_UNK
+    Float32 DataType = C.PD_DATA_FLOAT32
+    Int32   DataType = C.PD_DATA_INT32
+    Int64   DataType = C.PD_DATA_INT64
+    Uint8   DataType = C.PD_DATA_UINT8
+    Int8    DataType = C.PD_DATA_INT8
+)
+
+type PlaceType C.PD_PlaceType
+
+const (
+    UnkPlace PlaceType = C.PD_PLACE_UNK
+    CpuPlace PlaceType = C.PD_PLACE_CPU
+    GpuPlace PlaceType = C.PD_PLACE_GPU
+    XpuPlace PlaceType = C.PD_PLACE_XPU
+)
+
+type Tensor struct {
+    c *C.PD_Tensor
+}
+
+///
+/// \brief Reset the shape of the tensor.
+/// Generally it's only used for the input tensor.
+///
+/// \param[in] shape The shape to set.
+///
+func (t *Tensor) Reshape(shape []int32) {
+    C.PD_TensorReshape(t.c, C.size_t(len(shape)), (*C.int32_t)(unsafe.Pointer(&shape[0])))
+}
+
+///
+/// \brief Get the tensor shape
+///
+/// \return The tensor shape.
+///
+func (t *Tensor) Shape() []int32 {
+    cData := C.PD_TensorGetShape(t.c)
+    length := int(cData.size)
+    defer C.PD_OneDimArrayInt32Destroy(cData)
+    return cvtToGoSliceInt32(length, cData.data)
+}
+
+///
+/// \brief Set the tensor lod information
+///
+/// \param[in] lod lod information.
+///
+func (t *Tensor) SetLod(lod [][]uint) {
+    cLod := (*C.struct_PD_TwoDimArraySize)(C.malloc(C.size_t(C.sizeof_struct_PD_TwoDimArraySize)))
+    length := len(lod)
+    cLod.size = C.size_t(uint(length))
+    // Allocate one extra element so &lodList[0] is valid even for empty lod.
+    var lodList = make([]*C.struct_PD_OneDimArraySize, length+1)
+
+    for i, v := range lod {
+        oneDimArray := (*C.struct_PD_OneDimArraySize)(C.malloc(C.size_t(C.sizeof_struct_PD_OneDimArraySize)))
+        defer C.free(unsafe.Pointer(oneDimArray))
+        tmpLength := len(v)
+        oneDimArray.size = C.size_t(uint(tmpLength))
+
+        tmpC := (*C.size_t)(C.malloc(C.size_t(C.sizeof_size_t * tmpLength)))
+        defer C.free(unsafe.Pointer(tmpC))
+        tmpSlice := (*[1 << 27]C.size_t)(unsafe.Pointer(tmpC))[:tmpLength:tmpLength]
+        for j, w := range v {
+            tmpSlice[j] = C.size_t(w)
+        }
+        oneDimArray.data = tmpC
+
+        lodList[i] = oneDimArray
+    }
+    cLod.data = (**C.struct_PD_OneDimArraySize)(unsafe.Pointer(&lodList[0]))
+    C.PD_TensorSetLod(t.c, cLod)
+    // The inner arrays are freed by the deferred C.free calls above, so only
+    // the top-level struct is released here (not PD_TwoDimArraySizeDestroy,
+    // which would free them a second time).
+    C.free(unsafe.Pointer(cLod))
+}
+
+///
+/// \brief Get the tensor lod information
+///
+/// \return the lod information.
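+/// For example (sketch): after SetLod([][]uint{{0, 1, 2}}), Lod() returns
+/// [][]uint{{0, 1, 2}}.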
+///
+func (t *Tensor) Lod() [][]uint {
+    cLod := C.PD_TensorGetLod(t.c)
+    length := int(cLod.size)
+    res := make([][]uint, length)
+    if length == 0 {
+        return res
+    }
+    cLodSlice := (*[1 << 27]*C.struct_PD_OneDimArraySize)(unsafe.Pointer(cLod.data))[:length:length]
+
+    for i := 0; i < length; i++ {
+        size := uint(cLodSlice[i].size)
+        lod := make([]uint, size)
+
+        tmpSlice := (*[1 << 27]C.size_t)(unsafe.Pointer(cLodSlice[i].data))[:size:size]
+        for j, v := range tmpSlice {
+            lod[j] = uint(v)
+        }
+
+        res[i] = lod
+    }
+
+    C.PD_TwoDimArraySizeDestroy(cLod)
+    return res
+}
+
+///
+/// \brief Get the tensor data type
+///
+/// \return the tensor data type.
+///
+func (t *Tensor) Type() DataType {
+    cDtype := C.PD_TensorGetDataType(t.c)
+    return DataType(cDtype)
+}
+
+///
+/// \brief Get the tensor name
+///
+/// \return the tensor name.
+///
+func (t *Tensor) Name() string {
+    return C.GoString(C.PD_TensorGetName(t.c))
+}
+
+///
+/// \brief Copy the host memory to the tensor data.
+/// It's usually used to set the input tensor data.
+///
+/// \param[in] value The data to copy; must be a slice of float32, int32,
+/// int64, uint8, or int8.
+///
+func (t *Tensor) CopyFromCpu(value interface{}) {
+    val := reflect.ValueOf(value)
+    dtype, _ := dataTypeOf(val)
+
+    switch dtype {
+    case Float32:
+        data := val.Interface().([]float32)
+        C.PD_TensorCopyFromCpuFloat(t.c, (*C.float)(unsafe.Pointer(&data[0])))
+    case Int32:
+        data := val.Interface().([]int32)
+        C.PD_TensorCopyFromCpuInt32(t.c, (*C.int32_t)(unsafe.Pointer(&data[0])))
+    case Int64:
+        data := val.Interface().([]int64)
+        C.PD_TensorCopyFromCpuInt64(t.c, (*C.int64_t)(unsafe.Pointer(&data[0])))
+    case Uint8:
+        data := val.Interface().([]uint8)
+        C.PD_TensorCopyFromCpuUint8(t.c, (*C.uint8_t)(unsafe.Pointer(&data[0])))
+    case Int8:
+        data := val.Interface().([]int8)
+        C.PD_TensorCopyFromCpuInt8(t.c, (*C.int8_t)(unsafe.Pointer(&data[0])))
+    }
+}
+
+///
+/// \brief Copy the tensor data to the host memory.
+/// It's usually used to get the output tensor data.
+///
+/// \param[out] value The destination slice; the caller must allocate it with
+/// the right length before calling (see Shape()).
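+/// A minimal usage sketch (numElements is the small product-of-dims helper
+/// defined in predictor_test.go in this change):
+///
+///     out := make([]float32, numElements(t.Shape()))
+///     t.CopyToCpu(out)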
+///
+func (t *Tensor) CopyToCpu(value interface{}) {
+    val := reflect.ValueOf(value)
+    dtype, _ := dataTypeOf(val)
+
+    switch dtype {
+    case Float32:
+        data := val.Interface().([]float32)
+        C.PD_TensorCopyToCpuFloat(t.c, (*C.float)(unsafe.Pointer(&data[0])))
+    case Int32:
+        data := val.Interface().([]int32)
+        C.PD_TensorCopyToCpuInt32(t.c, (*C.int32_t)(unsafe.Pointer(&data[0])))
+    case Int64:
+        data := val.Interface().([]int64)
+        C.PD_TensorCopyToCpuInt64(t.c, (*C.int64_t)(unsafe.Pointer(&data[0])))
+    case Uint8:
+        data := val.Interface().([]uint8)
+        C.PD_TensorCopyToCpuUint8(t.c, (*C.uint8_t)(unsafe.Pointer(&data[0])))
+    case Int8:
+        data := val.Interface().([]int8)
+        C.PD_TensorCopyToCpuInt8(t.c, (*C.int8_t)(unsafe.Pointer(&data[0])))
+    }
+}
+
+var types = []struct {
+    typ      reflect.Type
+    dataType C.PD_DataType
+}{
+    {reflect.TypeOf(float32(0)), C.PD_DATA_FLOAT32},
+    {reflect.TypeOf(int32(0)), C.PD_DATA_INT32},
+    {reflect.TypeOf(int64(0)), C.PD_DATA_INT64},
+    {reflect.TypeOf(uint8(0)), C.PD_DATA_UINT8},
+    {reflect.TypeOf(int8(0)), C.PD_DATA_INT8},
+}
+
+func dataTypeOf(val reflect.Value) (dt DataType, err error) {
+    typ := val.Type()
+    // Walk down nested arrays/slices until reaching the element type.
+    for typ.Kind() == reflect.Array || typ.Kind() == reflect.Slice {
+        if val.Len() > 0 {
+            val = val.Index(0)
+        }
+        typ = typ.Elem()
+    }
+    for _, t := range types {
+        if typ.Kind() == t.typ.Kind() {
+            return DataType(t.dataType), nil
+        }
+    }
+    return dt, fmt.Errorf("unsupported type %v", typ)
+}
diff --git a/paddle/fluid/inference/goapi/test.sh b/paddle/fluid/inference/goapi/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b764e2ac72c70e7689af6828c69d0a7bcb716d5b
--- /dev/null
+++ b/paddle/fluid/inference/goapi/test.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# 1. download the mobilenetv1 model to test config and predictor
+if [ ! -d mobilenetv1 ]; then
+    wget https://paddle-inference-dist.bj.bcebos.com/Paddle-Inference-Demo/mobilenetv1.tgz
+    tar xzf mobilenetv1.tgz
+fi
+
+# 2. set LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$PWD/paddle_inference_c/paddle/lib
+
+# 3. go test
+go test -v ./...
diff --git a/paddle/fluid/inference/goapi/utils.go b/paddle/fluid/inference/goapi/utils.go
new file mode 100644
index 0000000000000000000000000000000000000000..fca5298baf9e29637b99b66f5fd1fedd9d55cb16
--- /dev/null
+++ b/paddle/fluid/inference/goapi/utils.go
@@ -0,0 +1,61 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package paddle
+
+// #include <stdio.h>
+// #include <stdlib.h>
+import "C"
+import (
+    "unsafe"
+)
+
+func cvtPDBoolToGo(b C.int8_t) bool {
+    return b != 0
+}
+
+func cvtGoBoolToPD(b bool) C.int8_t {
+    if !b {
+        return 0
+    }
+    return 1
+}
+
+func cvtToGoSliceString(length int, str **C.char) []string {
+    if str == nil {
+        return nil
+    }
+    tmpSlice := (*[1 << 27]*C.char)(unsafe.Pointer(str))[:length:length]
+    goStrings := make([]string, length)
+    for i, s := range tmpSlice {
+        goStrings[i] = C.GoString(s)
+    }
+    return goStrings
+}
+
+func cvtToGoSliceInt32(length int, data *C.int32_t) []int32 {
+    if data == nil {
+        return nil
+    }
+    tmpSlice := (*[1 << 27]C.int32_t)(unsafe.Pointer(data))[:length:length]
+    res := make([]int32, length)
+    for i, s := range tmpSlice {
+        res[i] = int32(s)
+    }
+    return res
+}
diff --git a/go/paddle/common.go b/paddle/fluid/inference/goapi/version.go
similarity index 50%
rename from go/paddle/common.go
rename to paddle/fluid/inference/goapi/version.go
index cbbde6a45f59b80931a3a2c501581819085e8ea7..74b74dd501a00c106c6cc510c09475b9cb31e2c0 100644
--- a/go/paddle/common.go
+++ b/paddle/fluid/inference/goapi/version.go
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,29 +14,13 @@
 package paddle
 
-// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
-// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
-// #include <stdbool.h>
-// #include <paddle_c_api.h>
+// #include "pd_common.h"
+// #include "pd_predictor.h"
+// #include "pd_types.h"
+// #include "pd_utils.h"
 import "C"
-import "fmt"
 
-func ConvertCBooleanToGo(b C.bool) bool {
-    var c_false C.bool
-    if b != c_false {
-        return true
-    }
-    return false
-}
-
-func numel(shape []int32) int32 {
-    n := int32(1)
-    for _, d := range shape {
-        n *= d
-    }
-    return n
-}
-
-func bug(format string, args ...interface{}) error {
-    return fmt.Errorf("Bug %v", fmt.Sprintf(format, args...))
+func Version() string {
+    cVersion := C.PD_GetVersion()
+    return C.GoString(cVersion)
 }
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 96dc8c67969458041a93490d75ce4007f3ae9f33..1945803b2db764b845c1629157e6ddfd2eab522b 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -1985,6 +1985,26 @@ EOF
     fi
 }
 
+function test_go_inference_api() {
+    cat <