Unverified commit bbbd75e4 authored by zhouweiwei2014, committed by GitHub

change CUDA implementation of uniform/gaussian OP (#38611)

* change CUDA implementation of uniform/gaussian OP

* fix unittest
Parent a998c077
@@ -21,17 +21,26 @@ limitations under the License. */
#include <hiprand_kernel.h>
#endif
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/platform/hostdevice.h"
#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
// there is no equivalent intrinsics in msvc.
#define UNLIKELY(condition) (condition)
#endif
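// Note (not part of this commit): UNLIKELY wraps GCC/Clang's
// __builtin_expect branch-prediction hint; MSVC has no equivalent intrinsic,
// so there it reduces to the bare condition. Typical use:
//   if (UNLIKELY(val == static_cast<T>(1.0))) { /* cold path */ }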
namespace paddle {
namespace distribution {
using Tensor = framework::Tensor;
/********************* Transformation Function **********************/
template <typename T>
struct exponential_transform {
  explicit exponential_transform(T lambda) : lambda_(lambda) {}
@@ -52,7 +61,37 @@ struct exponential_transform {
  T lambda_;
};
template <typename T>
struct uniform_transform {
  explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {}

  HOSTDEVICE inline T operator()(T val) const {
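    // Note (not part of this commit): cuRAND's uniform generators draw from
    // (0, 1], so an exact 1.0 is folded back to min_ below, keeping results
    // within [min, max).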
    if (UNLIKELY(val == static_cast<T>(1.0))) {
      return min_;
    } else {
      return val * range_ + min_;
    }
  }

 private:
  T range_;
  T min_;
};
template <typename T>
struct normal_transform {
  explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {}

  HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; }

 private:
  T mean_;
  T std_;
};
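// A hedged host-side illustration (not in the commit) of what the functors
// above compute:
//   uniform_transform<float> u(-1.0f, 1.0f);
//   u(0.25f);  // -> -0.5f, i.e. val * range_ + min_
//   u(1.0f);   // -> -1.0f, the exact-1.0 sample folds back to min_
//   normal_transform<float> n(5.0f, 2.0f);
//   n(0.5f);   // -> 6.0f, a standard-normal draw rescaled to N(5, 2^2)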
#if defined(__NVCC__) || defined(__HIPCC__)
/*********************** Distribution Function *************************/
template <typename T>
struct uniform_distribution;
@@ -132,6 +171,7 @@ struct normal_distribution<double> {
};
#endif
/******** Launch GPU function of distribution and transformation *********/
template <typename T, typename DistOp, typename TransformOp>
__global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
                                   DistOp dist, TransformOp trans,
@@ -151,8 +191,8 @@ __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
   for (size_t j = 0; j < returns_count; j++) {
     size_t index = i + j * total_thread;
     if (index < size) {
-      auto random = static_cast<T>((&random_tuple.x)[j]);
-      out_data[index] = trans(random);
+      auto random = (&random_tuple.x)[j];
+      out_data[index] = static_cast<T>(trans(random));
     }
   }
 }
......
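For orientation, a hedged sketch of how a kernel of this shape is typically launched; the grid sizing, the four-samples-per-thread stride, and the seed/offset plumbing are assumptions for illustration, not taken from this diff:

// Hypothetical launch of DistributionKernel (names reused from the diff,
// sizing assumed): Philox yields float4/double2 tuples, so each thread
// writes several outputs per iteration.
size_t size = tensor->numel();
int threads = 256;
int blocks = static_cast<int>((size + threads * 4 - 1) / (threads * 4));
DistributionKernel<T, decltype(dist), decltype(trans)>
    <<<blocks, threads, 0, dev_cxt.stream()>>>(size, seed, offset, dist,
                                               trans, data);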
@@ -19,8 +19,11 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/distribution_helper.h"
#include "paddle/fluid/operators/fill_constant_op.h"
DECLARE_bool(use_curand);
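// Note (not part of this commit): DECLARE_bool only declares the external
// FLAGS_use_curand symbol here; the matching definition is the
// PADDLE_DEFINE_EXPORTED_bool(use_curand, ...) added to flags.cc further
// down in this diff.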
namespace paddle {
namespace operators {
@@ -65,7 +68,10 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
     thrust::counting_iterator<int64_t> index_sequence_begin(0);
     auto shape = GetShape(context);
     tensor->Resize(shape);
-    T* data = tensor->mutable_data<T>(context.GetPlace());
+    auto& dev_cxt =
+        context.template device_context<platform::CUDADeviceContext>();
+    T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
     int64_t size = tensor->numel();
@@ -73,12 +79,20 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
     auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
     if (gen_cuda->GetIsInitPy() && seed_flag) {
-      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int64_t gen_offset = size * seed_offset.second;
-      thrust::transform(
-          index_sequence_begin, index_sequence_begin + size,
-          thrust::device_ptr<T>(data),
-          GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset));
+      if (FLAGS_use_curand) {
+        using MT = typename details::MPTypeTrait<T>::Type;
+        distribution::normal_distribution<MT> dist;
+        distribution::normal_transform<MT> trans(mean, std);
+        distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
+                                                    trans);
+      } else {
+        auto seed_offset = gen_cuda->IncrementOffset(1);
+        int64_t gen_offset = size * seed_offset.second;
+        thrust::transform(
+            index_sequence_begin, index_sequence_begin + size,
+            thrust::device_ptr<T>(data),
+            GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset));
+      }
     } else {
       thrust::transform(index_sequence_begin, index_sequence_begin + size,
                         thrust::device_ptr<T>(data),
......
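A note on the 'MT' alias introduced above: details::MPTypeTrait selects a wider math type for low-precision tensors, so float16 outputs are sampled and transformed in float and only cast back at the end. A simplified sketch of the trait's shape (an assumption based on its use here, not quoted from fp16_type_traits.h):

// Sketch only: default to computing in the tensor's own type T,
// but promote float16 math to float.
template <typename T>
struct MPTypeTraitSketch {
  using Type = T;
};
template <>
struct MPTypeTraitSketch<paddle::platform::float16> {
  using Type = float;
};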
@@ -18,7 +18,12 @@ limitations under the License. */
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/distribution_helper.h"
#include "paddle/fluid/operators/uniform_random_op.h"
DECLARE_bool(use_curand);
namespace paddle {
namespace operators {
@@ -123,7 +128,9 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
"unsupport type: %s.",
framework::ToTypeName(out_var->Type())));
}
T* data = tensor->mutable_data<T>(context.GetPlace());
auto& dev_cxt =
context.template device_context<platform::CUDADeviceContext>();
T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
bool seed_flag = false;
if (seed == 0) {
@@ -144,13 +151,21 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
     int device_id = context.GetPlace().GetDeviceId();
     auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
     if (gen_cuda->GetIsInitPy() && seed_flag) {
-      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int64_t gen_offset = size * seed_offset.second;
-      thrust::transform(
-          index_sequence_begin, index_sequence_begin + size,
-          thrust::device_ptr<T>(data),
-          UniformGeneratorOffset<T>(min, max, seed_offset.first, diag_num,
-                                    diag_step, diag_val, gen_offset));
+      if (FLAGS_use_curand) {
+        using MT = typename details::MPTypeTrait<T>::Type;
+        distribution::uniform_distribution<MT> dist;
+        distribution::uniform_transform<MT> trans(min, max);
+        distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
+                                                    trans);
+      } else {
+        auto seed_offset = gen_cuda->IncrementOffset(1);
+        int64_t gen_offset = size * seed_offset.second;
+        thrust::transform(
+            index_sequence_begin, index_sequence_begin + size,
+            thrust::device_ptr<T>(data),
+            UniformGeneratorOffset<T>(min, max, seed_offset.first, diag_num,
+                                      diag_step, diag_val, gen_offset));
+      }
     } else {
       thrust::transform(
           index_sequence_begin, index_sequence_begin + size,
......
@@ -545,6 +545,8 @@ PADDLE_DEFINE_EXPORTED_double(
  */
 PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");
+PADDLE_DEFINE_EXPORTED_bool(use_curand, false, "Random OP use CURAND");
+
 /**
  * Debug related FLAG
  * Name: FLAGS_call_stack_level
......
@@ -662,6 +662,7 @@ for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%#
 set start=%start:~4,10%
 
 set FLAGS_call_stack_level=2
+set FLAGS_use_curand=True
 dir %THIRD_PARTY_PATH:/=\%\install\openblas\lib
 dir %THIRD_PARTY_PATH:/=\%\install\openblas\bin
 dir %THIRD_PARTY_PATH:/=\%\install\zlib\bin
......
@@ -61,6 +61,8 @@ function init() {
     # NOTE(chenweihang): For easy debugging, CI displays the C++ error stacktrace by default
     export FLAGS_call_stack_level=2
+    export FLAGS_use_curand=True
 
     # set CI_SKIP_CPP_TEST if only *.py changed
     # In order to avoid using in some CI(such as daily performance), the current
     # branch must not be `${BRANCH}` which is usually develop.
......
@@ -274,7 +274,7 @@ class TestDistTraning(unittest.TestCase):
         seq_length = 16
         class_size_per_card = 2
         vocab_size = class_size_per_card * self.model_parallel_size
-        seed = 1025
+        seed = 100
         set_random_seed(seed)
         rank_id = dist.get_rank()
......
@@ -333,7 +333,7 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
             lr_ratio=simple_lr_fun)
         loss_ref = np.array(
-            [4.8383293, 3.084947, 1.3323904, -0.41943002, -2.1710064])
+            [-1.7267396, -2.81524, -3.9250019, -5.05954, -6.2272625])
         for i in range(5):
             a1 = linear1(a)
             out = linear2(a1)
@@ -379,7 +379,7 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
         exe.run(startup)
         loss_ref = np.array(
-            [0.36120513, 0.2720821, 0.67208904, 0.14607805, 0.24098626])
+            [0.33895183, 0.3159437, 0.19472016, 0.17764759, 0.1520702])
         for i in range(5):
             inputs = np.random.random(size=[8, 10]).astype('float32')
             outputs = np.random.random(size=[8, 1]).astype('float32')
......
@@ -287,5 +287,49 @@ class TestStandardNormalDtype(unittest.TestCase):
         paddle.enable_static()
 
 
+class TestRandomValue(unittest.TestCase):
+    def test_fixed_random_number(self):
+        # Test GPU fixed random numbers, which are generated by
+        # 'curandStatePhilox4_32_10_t'.
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        # Note(zhouwei): The number of threads is determined by
+        # 'multiProcessorCount * maxThreadsPerMultiProcessor'. Different
+        # GPUs have different numbers of threads, which results in
+        # different random values. Only test on V100 GPUs here.
+        if "V100" not in paddle.device.cuda.get_device_name():
+            return
+
+        def _check_random_value(dtype, expect, expect_mean, expect_std):
+            x = paddle.randn([32, 3, 1024, 1024], dtype=dtype)
+            actual = x.numpy()
+            self.assertTrue(np.allclose(actual[2, 1, 512, 1000:1010], expect))
+            self.assertEqual(np.mean(actual), expect_mean)
+            self.assertEqual(np.std(actual), expect_std)
+
+        print("Test Fixed Random number on V100 GPU------>")
+        paddle.disable_static()
+        paddle.set_device('gpu')
+        paddle.seed(2021)
+
+        expect = [
+            -0.79037829, -0.54411126, -0.32266671, 0.35791815, 1.44169267,
+            -0.87785644, -1.23909874, -2.18194139, 0.49489656, 0.40703062
+        ]
+        expect_mean = -0.0000053026194133403266873214888799115129813799285329878330230713
+        expect_std = 0.99999191058126390974081232343451119959354400634765625
+        _check_random_value(core.VarDesc.VarType.FP64, expect, expect_mean,
+                            expect_std)
+
+        expect = [
+            -0.7988942, 1.8644791, 0.02782744, 1.3692524, 0.6419724, 0.12436751,
+            0.12058455, -1.9984808, 1.5635862, 0.18506318
+        ]
+        expect_mean = -0.00004762359094456769526004791259765625
+        expect_std = 0.999975681304931640625
+        _check_random_value(core.VarDesc.VarType.FP32, expect, expect_mean,
+                            expect_std)
+
+        paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()
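As context for the Note(zhouwei) comment above, the thread count that fixes the random stream can be queried directly from the device. A hedged standalone probe (not part of the test suite) that prints the quantity the comment refers to:

// Minimal CUDA probe, an illustration only: the Philox state count scales
// with the number of resident threads, so fixed expected values only
// reproduce on GPUs with matching capacity (hence the V100-only guard).
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  printf("resident threads = %d\n",
         prop.multiProcessorCount * prop.maxThreadsPerMultiProcessor);
  return 0;
}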
@@ -562,5 +562,49 @@ class TestUniformDtype(unittest.TestCase):
         paddle.enable_static()
 
 
+class TestRandomValue(unittest.TestCase):
+    def test_fixed_random_number(self):
+        # Test GPU fixed random numbers, which are generated by
+        # 'curandStatePhilox4_32_10_t'.
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        # Note(zhouwei): The number of threads is determined by
+        # 'multiProcessorCount * maxThreadsPerMultiProcessor'. Different
+        # GPUs have different numbers of threads, which results in
+        # different random values. Only test on V100 GPUs here.
+        if "V100" not in paddle.device.cuda.get_device_name():
+            return
+
+        def _check_random_value(dtype, expect, expect_mean, expect_std):
+            x = paddle.rand([32, 3, 1024, 1024], dtype=dtype)
+            actual = x.numpy()
+            self.assertTrue(np.allclose(actual[2, 1, 512, 1000:1010], expect))
+            self.assertEqual(np.mean(actual), expect_mean)
+            self.assertEqual(np.std(actual), expect_std)
+
+        print("Test Fixed Random number on V100 GPU------>")
+        paddle.disable_static()
+        paddle.set_device('gpu')
+        paddle.seed(2021)
+
+        expect = [
+            0.55298901, 0.65184678, 0.49375412, 0.57943639, 0.16459608,
+            0.67181056, 0.03021481, 0.0238559, 0.07742096, 0.55972187
+        ]
+        expect_mean = 0.50000454338820143895816272561205551028251647949218750
+        expect_std = 0.28867379167297479991560749112977646291255950927734375
+        _check_random_value(core.VarDesc.VarType.FP64, expect, expect_mean,
+                            expect_std)
+
+        expect = [
+            0.45320973, 0.17582087, 0.725341, 0.30849215, 0.622257, 0.46352342,
+            0.97228295, 0.12771158, 0.286525, 0.9810645
+        ]
+        expect_mean = 0.50002604722976684570312500
+        expect_std = 0.2886914908885955810546875
+        _check_random_value(core.VarDesc.VarType.FP32, expect, expect_mean,
+                            expect_std)
+
+        paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()