truncated_gaussian_random_kernel.cu 4.0 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/truncated_gaussian_random_kernel.h"

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include <limits>

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"

namespace phi {

template <typename T>
struct GPUTruncatedNormal {
  T mean, std;
  T a_normal_cdf;
  T b_normal_cdf;
  unsigned int seed;
  T numeric_min;

  __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed)
      : mean(mean), std(std), seed(seed), numeric_min(numeric_min) {
39 40
    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
41 42 43 44 45
  }

  __host__ __device__ T operator()(const unsigned int n) const {
    thrust::minstd_rand rng;
    rng.seed(seed);
46
    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
47 48
    rng.discard(n);
    T value = dist(rng);
49 50
    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
  }
};

template <typename T>
struct TruncatedNormalOffset {
  T mean, std;
  T a_normal_cdf;
  T b_normal_cdf;
  unsigned int seed;
  T numeric_min;
  int offset_;

  __host__ __device__
  TruncatedNormalOffset(T mean, T std, T numeric_min, int seed, int offset)
      : mean(mean),
        std(std),
        seed(seed),
        numeric_min(numeric_min),
        offset_(offset) {
70 71
    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
72 73 74 75 76
  }

  __host__ __device__ T operator()(const unsigned int n) const {
    thrust::minstd_rand rng;
    rng.seed(seed);
77
    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
78 79
    rng.discard(n + offset_);
    T value = dist(rng);
80 81
    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
82 83 84 85 86
  }
};

template <typename T, typename Context>
void TruncatedGaussianRandomKernel(const Context& dev_ctx,
87
                                   const std::vector<int>& shape,
88 89 90 91 92
                                   float mean,
                                   float std,
                                   int seed,
                                   DataType dtype,
                                   DenseTensor* out) {
93
  T* data = dev_ctx.template Alloc<T>(out);
94 95

  thrust::counting_iterator<int64_t> index_sequence_begin(0);
96
  int64_t size = out->numel();
97

98
  auto gen_cuda = dev_ctx.GetGenerator();
99 100
  if (seed == 0) {
    // use global Generator seed
101
    auto seed_offset = gen_cuda->IncrementOffset(1);
102 103 104 105 106 107 108 109
    uint64_t seed = seed_offset.first;
    uint64_t offset = seed_offset.second;
    thrust::transform(
        index_sequence_begin,
        index_sequence_begin + size,
        thrust::device_ptr<T>(data),
        TruncatedNormalOffset<T>(
            mean, std, std::numeric_limits<T>::min(), seed, size * offset));
110
  } else {
111
    // use OP seed
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
    thrust::transform(
        index_sequence_begin,
        index_sequence_begin + size,
        thrust::device_ptr<T>(data),
        GPUTruncatedNormal<T>(mean, std, std::numeric_limits<T>::min(), seed));
  }
}

}  // namespace phi

PD_REGISTER_KERNEL(truncated_gaussian_random,
                   GPU,
                   ALL_LAYOUT,
                   phi::TruncatedGaussianRandomKernel,
                   float) {}