diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index b76852fc6c50e80633c8294fb2724b83f15293a7..137a27c8fe87a4726d5554b32889ceafda4d5501 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -185,3 +185,4 @@ if(CUDA_ARCH)
 endif()
 
 set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
+
diff --git a/paddle/capi/PaddleCAPI.h b/paddle/capi/PaddleCAPI.h
index f2340b8a750afe1f8b6aa06b3bc59689071e678e..6ca413117886e0fdf4fc78e0da16b8ad6c7d6047 100644
--- a/paddle/capi/PaddleCAPI.h
+++ b/paddle/capi/PaddleCAPI.h
@@ -17,6 +17,11 @@ limitations under the License. */
 #include <stdbool.h>
 #include <stdint.h>
 #include "config.h"
+
+// Only Linux and macOS builds are supported, so the compiler is always clang
+// or gcc 4.8+, and DLL_IMPORT/DLL_EXPORT reduces to the attribute below.
+#define PD_API __attribute__((visibility("default")))
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -44,28 +49,31 @@ typedef enum {
  */
 typedef void* PD_IVector;
 
-int PDIVecCreateNone(PD_IVector* ivec);
+PD_API int PDIVecCreateNone(PD_IVector* ivec);
 
-int PDIVecDestroy(PD_IVector ivec);
+PD_API int PDIVecDestroy(PD_IVector ivec);
 
-int PDIVectorGet(PD_IVector ivec, int** buffer);
+PD_API int PDIVectorGet(PD_IVector ivec, int** buffer);
 
 /**
  * Matrix functions. Return will be a PD_Error type.
  */
 typedef void* PD_Matrix;
 
-int PDMatCreate(PD_Matrix* mat, uint64_t height, uint64_t width, bool useGpu);
+PD_API int PDMatCreate(PD_Matrix* mat,
+                       uint64_t height,
+                       uint64_t width,
+                       bool useGpu);
 
-int PDMatDestroy(PD_Matrix mat);
+PD_API int PDMatDestroy(PD_Matrix mat);
 
-int PDMatCopyToRow(PD_Matrix mat, uint64_t rowID, pd_real* rowArray);
+PD_API int PDMatCopyToRow(PD_Matrix mat, uint64_t rowID, pd_real* rowArray);
 
-int PDMatGetRow(PD_Matrix mat, uint64_t rowID, pd_real** rawRowBuffer);
+PD_API int PDMatGetRow(PD_Matrix mat, uint64_t rowID, pd_real** rawRowBuffer);
 
-int PDMatCreateNone(PD_Matrix* mat);
+PD_API int PDMatCreateNone(PD_Matrix* mat);
 
-int PDMatGetShape(PD_Matrix mat, uint64_t* height, uint64_t* width);
+PD_API int PDMatGetShape(PD_Matrix mat, uint64_t* height, uint64_t* width);
 
 /**
  * Arguments functions. Each argument means layer output. Arguments means a
@@ -73,48 +81,48 @@ int PDMatGetShape(PD_Matrix mat, uint64_t* height, uint64_t* width);
  */
 typedef void* PD_Arguments;
 
-int PDArgsCreateNone(PD_Arguments* args);
+PD_API int PDArgsCreateNone(PD_Arguments* args);
 
-int PDArgsDestroy(PD_Arguments args);
+PD_API int PDArgsDestroy(PD_Arguments args);
 
-int PDArgsGetSize(PD_Arguments args, uint64_t* size);
+PD_API int PDArgsGetSize(PD_Arguments args, uint64_t* size);
 
-int PDArgsResize(PD_Arguments args, uint64_t size);
+PD_API int PDArgsResize(PD_Arguments args, uint64_t size);
 
-int PDArgsSetValue(PD_Arguments args, uint64_t ID, PD_Matrix mat);
+PD_API int PDArgsSetValue(PD_Arguments args, uint64_t ID, PD_Matrix mat);
 
-int PDArgsGetValue(PD_Arguments args, uint64_t ID, PD_Matrix mat);
+PD_API int PDArgsGetValue(PD_Arguments args, uint64_t ID, PD_Matrix mat);
 
-int PDArgsGetIds(PD_Arguments args, uint64_t ID, PD_IVector ids);
+PD_API int PDArgsGetIds(PD_Arguments args, uint64_t ID, PD_IVector ids);
 
 /**
  * @brief GradientMachine means a neural network.
  */
 typedef void* PD_GradientMachine;
 
-int PDGradientMachineCreateForPredict(PD_GradientMachine* machine,
-                                      void* modelConfigProtobuf,
-                                      int size);
+PD_API int PDGradientMachineCreateForPredict(PD_GradientMachine* machine,
+                                             void* modelConfigProtobuf,
+                                             int size);
 
-int PDGradientMachineLoadParameterFromDisk(PD_GradientMachine machine,
-                                           const char* path);
+PD_API int PDGradientMachineLoadParameterFromDisk(PD_GradientMachine machine,
+                                                  const char* path);
 
-int PDGradientMachineForward(PD_GradientMachine machine,
-                             PD_Arguments inArgs,
-                             PD_Arguments outArgs,
-                             bool isTrain);
+PD_API int PDGradientMachineForward(PD_GradientMachine machine,
+                                    PD_Arguments inArgs,
+                                    PD_Arguments outArgs,
+                                    bool isTrain);
 
-int PDGradientMachineCreateSharedParam(PD_GradientMachine origin,
-                                       void* modelConfigProtobuf,
-                                       int size,
-                                       PD_GradientMachine* slave);
+PD_API int PDGradientMachineCreateSharedParam(PD_GradientMachine origin,
+                                              void* modelConfigProtobuf,
+                                              int size,
+                                              PD_GradientMachine* slave);
 
-int PDGradientMachineDestroy(PD_GradientMachine machine);
+PD_API int PDGradientMachineDestroy(PD_GradientMachine machine);
 
 /**
  * Initialize Paddle.
  */
-int PDInit(int argc, char** argv);
+PD_API int PDInit(int argc, char** argv);
 
 #ifdef __cplusplus
 }