From 7635d6993e1a5a7fea1b1b6fb7c9c3ee97ae4add Mon Sep 17 00:00:00 2001
From: yongqiangma <xing.wo@163.com>
Date: Wed, 20 Nov 2019 11:27:29 +0800
Subject: [PATCH] Support building the C++ CUDA shared library (#2401)

* Support building the C++ CUDA shared library
---
 cmake/generic.cmake     |  5 ++++-
 cmake/lite.cmake        |  1 +
 lite/CMakeLists.txt     |  3 +++
 lite/api/CMakeLists.txt | 14 ++++++++++----
 lite/core/context.h     |  7 +++++++
 lite/tools/build.sh     |  6 +++---
 6 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 415eb451a9..225a3c19a1 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -490,6 +490,9 @@ function(nv_binary TARGET_NAME)
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS})
+    target_link_libraries(${TARGET_NAME} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES})
+    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+    target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
     if(nv_binary_DEPS)
       target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
       add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
@@ -507,7 +510,7 @@ function(nv_test TARGET_NAME)
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
     target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest
-gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES} )
+       gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES} )
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest gflags glog)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
diff --git a/cmake/lite.cmake b/cmake/lite.cmake
index 9cf8b12635..a095eea6d1 100644
--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -248,6 +248,7 @@ endfunction()
 
 set(arm_kernels CACHE INTERNAL "arm kernels")
 set(x86_kernels CACHE INTERNAL "x86 kernels")
+set(cuda_kernels CACHE INTERNAL "cuda kernels")
 set(fpga_kernels CACHE INTERNAL "fpga kernels")
 set(npu_kernels CACHE INTERNAL "npu kernels")
 set(xpu_kernels CACHE INTERNAL "xpu kernels")
diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt
index 60edfc357c..61f07583b2 100644
--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
@@ -117,6 +117,9 @@ if (LITE_WITH_X86)
     add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
 endif()
 
+if(LITE_WITH_CUDA)  # publish_inference must build the full-API shared library first
+    add_dependencies(publish_inference paddle_full_api_shared)
+endif()
 if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
     if (NOT LITE_ON_TINY_PUBLISH)
         # add cxx lib
diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt
index c79927ba10..63d53869ea 100644
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -9,7 +9,7 @@ if (LITE_ON_TINY_PUBLISH)
     set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG")
 endif()
 set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer)
-if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
+if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
     #full api dynamic library
     add_library(paddle_full_api_shared SHARED "")
     target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc)
@@ -19,7 +19,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "and
        add_dependencies(paddle_full_api_shared xxhash)
        target_link_libraries(paddle_full_api_shared xxhash)
     endif()
-    
+    if(LITE_WITH_CUDA)  # --whole-archive keeps every object of the CUDA kernel libraries in the shared lib
+        target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
+    endif()
     #light api dynamic library
     lite_cc_library(paddle_light_api_shared MODULE
     SRCS light_api_shared.cc
@@ -59,6 +61,7 @@ endif()
 
 message(STATUS "get ops ${ops}")
 message(STATUS "get X86 kernels ${x86_kernels}")
+message(STATUS "get CUDA kernels ${cuda_kernels}")
 message(STATUS "get Host kernels ${host_kernels}")
 message(STATUS "get ARM kernels ${arm_kernels}")
 message(STATUS "get NPU kernels ${npu_kernels}")
@@ -289,7 +292,8 @@ if(NOT IOS)
     XPU_DEPS ${xpu_kernels}
     CL_DEPS ${opencl_kernels}
     FPGA_DEPS ${fpga_kernels}
-    X86_DEPS ${x86_kernels})
+    X86_DEPS ${x86_kernels}
+    CUDA_DEPS ${cuda_kernels})
   lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
     ${ops} ${host_kernels}
     ARM_DEPS ${arm_kernels}
@@ -297,7 +301,9 @@ if(NOT IOS)
     XPU_DEPS ${xpu_kernels}
     CL_DEPS ${opencl_kernels}
     FPGA_DEPS ${fpga_kernels}
-    X86_DEPS ${x86_kernels})
+    X86_DEPS ${x86_kernels}
+    CUDA_DEPS ${cuda_kernels})
+
 endif()
 
 #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
diff --git a/lite/core/context.h b/lite/core/context.h
index eb25e7e1d9..545c6d2e88 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -207,6 +207,13 @@ class Context<TargetType::kCUDA> {
     ctx->cublas_fp32_ = cublas_fp32_;
   }
 
+  // Copy-assign: re-Init() from the source's ids, then copy cublas_fp32_.
+  CUDAContext& operator=(const CUDAContext& context) {
+    Init(context.device_id_, context.exec_stream_id_, context.io_stream_id_);
+    cublas_fp32_ = context.cublas_fp32_;
+    return *this;
+  }
+
   const cudaStream_t& exec_stream() const { return exec_stream_; }
   void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; }
 
diff --git a/lite/tools/build.sh b/lite/tools/build.sh
index 90baac40e9..319f26ff82 100755
--- a/lite/tools/build.sh
+++ b/lite/tools/build.sh
@@ -237,10 +237,10 @@ function make_cuda {
             -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
             -DWITH_TESTING=OFF \
             -DLITE_WITH_ARM=OFF \
-            -DLITE_WITH_PYTHON=ON \
+            -DLITE_WITH_PYTHON=${BUILD_PYTHON} \
             -DLITE_BUILD_EXTRA=ON
-
-  make publish_inference_python_lib -j8
+
+  make publish_inference -j4
   cd -
 }
 
-- 
GitLab