use curand

7c274dc0 · qijun · d525abed · 7c274dc0 · 7c274dc0 · 7c274dc0
5 changed file
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -109,6 +109,15 @@ void matmul<platform::CPUPlace, double>(const framework::Tensor& matrix_a,
      matrix_b.data<double>(), beta, matrix_out->data<double>(), context);
 }

+template <>
+void Set<typename CPUPlace, typename float>(const int n, const float alpha,
+                                            float* output,
+                                            platform::DeviceContext* context) {
+  auto* cpu_context = reinterpret_cast<platform::CPUDeviceContext*>(context);
+  framework::EigenVector::Type<T> out(output, n);
+  out.device(*(cpu_context->eigen_device())) = t.constant(T(alpha));
+}
+
 template <>
 void RandUniform<platform::CPUPlace, float>(const int n, const float min,
                                            const float max, float* output,

--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -126,20 +126,48 @@ void matmul<platform::GPUPlace, double>(const framework::Tensor& matrix_a,
      matrix_b.data<double>(), beta, matrix_out->data<double>(), context);
 }

+template <>
+void Set<typename GPUPlace, typename float>(const int n, const float alpha,
+                                            float* output,
+                                            platform::DeviceContext* context) {
+  auto* cuda_context = reinterpret_cast<platform::CUDADeviceContext*>(context);
+  framework::EigenVector::Type<T> out(output, n);
+  out.device(*(cuda_context->eigen_device())) = t.constant(T(alpha));
+}
+
+template <typename T>
+__global__ void UniformShift(const int n, const T min, const T max, T* x) {
+  float scale = max - min;
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    x[i] = x[i] * scale + min;
+  }
+}
+
 template <>
 void RandUniform<platform::GPUPlace, float>(const int n, const float min,
                                            const float max, float* output,
                                            platform::DeviceContext* context) {
  auto* cuda_context = reinterpret_cast<platform::CUDADeviceContext*>(context);
-  thrust::uniform_real_distribution<float> distribution(min, max);
-  thrust::minstd_rand engine = cuda_context->rand_enigne();
-  engine->discard(n);
-
-  thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+  PADDLE_ENFORCE(
+      curandGenerateUniform(cuda_context->curand_generator(), output, n));
+  int block = 512;
+  int grid = (n + block - 1) / block;
+  UniformShift<float><<<grid, block, 0, cuda_context->stream()>>>(n, min, max,
+                                                                  output);
+}

-  thrust::transform(thrust::cuda::par.on(cuda_context->stream()),
-                    index_sequence_begin, index_sequence_begin + n,
-                    thrust::device_ptr<float>(output), distribution(engine));
+template <typename T>
+int HandleOddLengthRandGaussian(const int n, const T mean, const T std,
+                                T* output, CUDADeviceContext* context) {
+  if (n % 2 == 1) {
+    std::default_random_engine generator;
+    std::normal_distribution<T> distribution(mean, std);
+    const T random_value = distribution(generator);
+    Set<T, platform::GPUPlace>(1, random_value, output + (n - 1), context);
+    return n - 1;
+  }
+  return n;
 }

 template <>
@@ -147,15 +175,11 @@ void RandGaussian<platform::GPUPlace, float>(const int n, const float mean,
                                             const float std, float* output,
                                             platform::DeviceContext* context) {
  auto* cuda_context = reinterpret_cast<platform::CUDADeviceContext*>(context);
-  thrust::normal_distribution<float> distribution(mean, std);
-  thrust::minstd_rand engine = cuda_context->rand_enigne();
-  engine->discard(n);
-
-  thrust::counting_iterator<unsigned int> index_sequence_begin(0);

-  thrust::transform(thrust::cuda::par.on(cuda_context->stream()),
-                    index_sequence_begin, index_sequence_begin + n,
-                    thrust::device_ptr<float>(output), distribution(engine));
+  const int even_n =
+      HandleOddLengthRandGaussian<float>(n, mean, std, output, cuda_context);
+  PADDLE_ENFORCE(curandGenerateNormal(cuda_context->curand_generator(), output,
+                                      even_n, mean, std));
 }

 }  // namespace math

--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -54,6 +54,7 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,

 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device_context.h"
+#include "paddle/platform/eigen.h"
 #include "paddle/platform/enforce.h"

 namespace paddle {
@@ -77,6 +78,13 @@ void matmul(const framework::Tensor& matrix_a, bool trans_a,
            framework::Tensor* matrix_out, T beta,
            platform::DeviceContext* context);

+template <typename Place, typename T>
+void Set(const int n, const T alpha, T* output,
+         platform::DeviceContext* context) {
+  framework::EigenVector::Type<T> out(output, n);
+  out.device(*(context->eigen_device())) = t.constant(T(alpha));
+}
+
 template <typename Place, typename T>
 void RandUniform(const int n, const T min, const T max, T* output,
                 platform::DeviceContext* context);

--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -157,12 +157,17 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() {
  return cudnn_handle_;
 }

-thrust::minstd_rand& CPUDeviceContext::rand_engine() {
-  if (!rand_engine_) {
-    rand_engine_.reset(new thrust::minstd_rand());
-    rand_engine_->seed(rand_seed_);
+curandGenerator_t CUDADeviceContext::curand_generator() {
+  if (!curand_generator_) {
+    SetDeviceId(place_.device);
+    PADDLE_ENFORCE(dynload::curandCreateGenerator(&curand_generator_,
+                                                  CURAND_RNG_PSEUDO_DEFAULT));
+    PADDLE_ENFORCE(
+        dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_));
+
+    PADDLE_ENFORCE(dynload::curandSetStream(curand_generator_, stream_));
  }
-  return *(rand_engine_.get());
+  return curand_generator_;
 }

 cudaStream_t CUDADeviceContext::stream() { return stream_; }

--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -15,10 +15,9 @@ limitations under the License. */
 #include "paddle/platform/place.h"

 #ifndef PADDLE_ONLY_CPU
-#include <thrust/device_ptr.h>
-#include <thrust/random.h>
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
+#include "paddle/platform/dynload/curand.h"
 #include "paddle/platform/gpu_info.h"
 #define EIGEN_USE_GPU
 #endif
@@ -80,7 +79,8 @@ class CUDADeviceContext : public DeviceContext {
  /*! \brief  Return cudnn  handle in the device context. */
  cudnnHandle_t     cudnn_handle();

-  thrust::minstd_rand& CPUDeviceContext::rand_engine();
+   /*! \brief  Return curand handle in the device context. */
+  curandGenerator_t curand_generator();

  /*! \brief  Return cuda stream in the device context. */
  cudaStream_t      stream();