Commit 2b4febb4 authored by mindspore-ci-bot, committed by Gitee

!4436 Refactor uniform ops in GPU context

Merge pull request !4436 from peixu_ren/custom_gpu
@@ -19,19 +19,26 @@ template <typename T>
 __global__ void NormalKernel(int seed, curandState *globalState, T *output, size_t count) {
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
     curand_init(seed, i, 0, &globalState[i]);
-    output[i] = curand_normal(&globalState[i]);
+    output[i] = (T)curand_normal(&globalState[i]);
   }
   return;
 }
 
 template <typename T>
-__global__ void UniformKernel(int seed, curandState *globalState, T *input1, size_t input_size_1,
-                              T *input2, size_t input_size_2, T *output, size_t count) {
+__global__ void UniformIntKernel(int seed, curandState *globalState, T *input1, size_t input_size_1,
+                                 T *input2, size_t input_size_2, T *output, size_t count) {
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-    input1[i] = (input_size_1 == 1 ? input1[0] : input1[i]);
-    input2[i] = (input_size_2 == 1 ? input2[0] : input2[i]);
     curand_init(seed, i, 0, &globalState[i]);
-    output[i] = curand_uniform(&globalState[i]) * (input2[i] - input1[i]) + input1[i];
+    output[i] = (T)(curand_uniform(&globalState[i])) * (input2[0] - input1[0]) + input1[0];
+  }
+  return;
+}
+
+template <typename T>
+__global__ void UniformRealKernel(int seed, curandState *globalState, T *output, size_t count) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
+    curand_init(seed, i, 0, &globalState[i]);
+    output[i] = (T)curand_uniform(&globalState[i]);
   }
   return;
 }
@@ -51,16 +58,46 @@ void StandardNormal(int seed, int seed2, curandState *globalState, T *output, size_t count, cudaStream_t cuda_stream) {
 }
 
 template <typename T>
-void UniformReal(int seed, curandState *globalState, T *input1, size_t input_size_1,
-                 T *input2, size_t input_size_2, T *output, size_t count, cudaStream_t cuda_stream) {
-  seed = (seed == 0 ? time(NULL):seed);
-  UniformKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>
-    (seed, globalState, input1, input_size_1, input2, input_size_2, output, count);
+void UniformInt(int seed, int seed2, curandState *globalState, T *input1, size_t input_size_1,
+                T *input2, size_t input_size_2, T *output, size_t count, cudaStream_t cuda_stream) {
+  int RNG_seed = 0;
+  if (seed2 != 0) {
+    RNG_seed = seed2;
+  } else if (seed != 0) {
+    RNG_seed = seed;
+  } else {
+    RNG_seed = time(NULL);
+  }
+  UniformIntKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>
+    (RNG_seed, globalState, input1, input_size_1, input2, input_size_2, output, count);
+  return;
+}
+
+template <typename T>
+void UniformReal(int seed, int seed2, curandState *globalState, T *output, size_t count, cudaStream_t cuda_stream) {
+  int RNG_seed = 0;
+  if (seed2 != 0) {
+    RNG_seed = seed2;
+  } else if (seed != 0) {
+    RNG_seed = seed;
+  } else {
+    RNG_seed = time(NULL);
+  }
+  UniformRealKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(RNG_seed, globalState, output, count);
   return;
 }
 
 template void StandardNormal<float>(int seed, int seed2, curandState *globalState,
                                     float *output, size_t count, cudaStream_t cuda_stream);
-template void UniformReal<float>(int seed, curandState *globalState, float *input1, size_t input_size_1,
-                                 float *input2, size_t input_size_2, float *output, size_t count,
-                                 cudaStream_t cuda_stream);
+template void StandardNormal<int>(int seed, int seed2, curandState *globalState,
+                                  int *output, size_t count, cudaStream_t cuda_stream);
+template void UniformInt<float>(int seed, int seed2, curandState *globalState, float *input1, size_t input_size_1,
+                                float *input2, size_t input_size_2, float *output, size_t count,
+                                cudaStream_t cuda_stream);
+template void UniformInt<int>(int seed, int seed2, curandState *globalState, int *input1, size_t input_size_1,
+                              int *input2, size_t input_size_2, int *output, size_t count,
+                              cudaStream_t cuda_stream);
+template void UniformReal<float>(int seed, int seed2, curandState *globalState,
+                                 float *output, size_t count, cudaStream_t cuda_stream);
+template void UniformReal<int>(int seed, int seed2, curandState *globalState,
+                               int *output, size_t count, cudaStream_t cuda_stream);
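
One behavioural point worth spelling out from the launchers above: after this refactor StandardNormal, UniformInt and UniformReal all resolve their RNG seed the same way, with the seed2 attribute taking precedence over seed and a time-based seed used only when both are zero. A minimal standalone sketch of that precedence follows; the helper name is ours, for illustration only, since the real code inlines this logic in each launcher.

    #include <ctime>

    // Hypothetical helper, not part of this commit: mirrors the RNG_seed selection
    // duplicated inside the UniformInt and UniformReal launchers above.
    inline int ResolveRngSeed(int seed, int seed2) {
      if (seed2 != 0) {
        return seed2;  // the "seed2" attribute wins
      } else if (seed != 0) {
        return seed;  // then "seed"
      }
      return static_cast<int>(time(nullptr));  // both zero: fall back to a time-based seed
    }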
@@ -24,7 +24,10 @@ template <typename T>
 void StandardNormal(int seed, int seed2, curandState *globalState,
                     T *output, size_t count, cudaStream_t cuda_stream);
 template <typename T>
-void UniformReal(int seed, curandState *globalState,
-                 T *input1, size_t input_size_1, T *input2, size_t input_size_2,
-                 T *output, size_t count, cudaStream_t cuda_stream);
+void UniformInt(int seed, int seed2, curandState *globalState,
+                T *input1, size_t input_size_1, T *input2, size_t input_size_2,
+                T *output, size_t count, cudaStream_t cuda_stream);
+template <typename T>
+void UniformReal(int seed, int seed2, curandState *globalState,
+                 T *output, size_t count, cudaStream_t cuda_stream);
 #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANDOMOPIMPL_H_
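
For orientation, here is a hedged sketch of how a caller might drive the UniformReal entry point declared above. The wrapper name, the explicit cudaMalloc/cudaFree of the curandState workspace and the stream synchronization are assumptions made for this example; inside MindSpore the workspace is sized through InitSizeLists and handed to the kernel by the framework.

    #include <cuda_runtime.h>
    #include <curand_kernel.h>
    // Assumes the random op impl header declaring UniformReal (above) is also included.

    // Illustrative only: fill `count` device floats with U(0, 1) samples.
    void FillUniformRealExample(int seed, int seed2, float *dev_output, size_t count, cudaStream_t stream) {
      curandState *dev_states = nullptr;
      // One curandState per output element, matching the per-thread curand_init in UniformRealKernel.
      cudaMalloc(reinterpret_cast<void **>(&dev_states), count * sizeof(curandState));
      UniformReal(seed, seed2, dev_states, dev_output, count, stream);  // T deduced as float
      cudaStreamSynchronize(stream);
      cudaFree(dev_states);
    }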
@@ -20,12 +20,14 @@ namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_ONE(StandardNormal, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
                       RandomOpGpuKernel, float)
-MS_REG_GPU_KERNEL_ONE(UniformReal,
+MS_REG_GPU_KERNEL_ONE(UniformInt,
                       KernelAttr()
                         .AddInputAttr(kNumberTypeInt32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddOutputAttr(kNumberTypeFloat32),
-                      RandomOpGpuKernel, float)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddInputAttr(kNumberTypeInt32)
+                        .AddOutputAttr(kNumberTypeInt32),
+                      RandomOpGpuKernel, int)
+MS_REG_GPU_KERNEL_ONE(UniformReal, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
+                      RandomOpGpuKernel, float)
 } // namespace kernel
 } // namespace mindspore
@@ -28,16 +28,17 @@
 namespace mindspore {
 namespace kernel {
-enum RandomOptype { RANDOM_OP_NORMAL = 0, RANDOM_OP_UNIFORM_REAL, RANDOM_OP_INVALID_TYPE = 255 };
-const std::map<std::string, RandomOptype> kRandomOpTypeMap = {{"StandardNormal", RANDOM_OP_NORMAL},
-                                                              {"UniformReal", RANDOM_OP_UNIFORM_REAL}};
+enum RandomOptype { RANDOM_OP_NORMAL = 0, RANDOM_OP_UNIFORM_INT, RANDOM_OP_UNIFORM_REAL, RANDOM_OP_INVALID_TYPE = 255 };
+const std::map<std::string, RandomOptype> kRandomOpTypeMap = {
+  {"StandardNormal", RANDOM_OP_NORMAL}, {"UniformInt", RANDOM_OP_UNIFORM_INT}, {"UniformReal", RANDOM_OP_UNIFORM_REAL}};
 template <typename T>
 class RandomOpGpuKernel : public GpuKernel {
  public:
   RandomOpGpuKernel()
       : random_op_type_(RANDOM_OP_INVALID_TYPE),
-        input_size_0_(sizeof(int)),
+        input_size_0_(sizeof(0)),
         input_size_1_(sizeof(T)),
         input_size_2_(sizeof(T)),
         output_size_(sizeof(T)),
@@ -62,11 +63,16 @@ class RandomOpGpuKernel : public GpuKernel {
                        reinterpret_cast<cudaStream_t>(stream_ptr));
         break;
       }
-      case RANDOM_OP_UNIFORM_REAL: {
+      case RANDOM_OP_UNIFORM_INT: {
         T *input_addr_1 = GetDeviceAddress<T>(inputs, 1);
         T *input_addr_2 = GetDeviceAddress<T>(inputs, 2);
-        UniformReal(seed_, devStates, input_addr_1, inputs[1]->size / sizeof(T), input_addr_2,
-                    inputs[2]->size / sizeof(T), output_addr, outputs[0]->size / sizeof(T),
-                    reinterpret_cast<cudaStream_t>(stream_ptr));
+        UniformInt(seed_, seed2_, devStates, input_addr_1, inputs[1]->size / sizeof(T), input_addr_2,
+                   inputs[2]->size / sizeof(T), output_addr, outputs[0]->size / sizeof(T),
+                   reinterpret_cast<cudaStream_t>(stream_ptr));
+        break;
+      }
+      case RANDOM_OP_UNIFORM_REAL: {
+        UniformReal(seed_, seed2_, devStates, output_addr, outputs[0]->size / sizeof(T),
+                    reinterpret_cast<cudaStream_t>(stream_ptr));
         break;
       }
@@ -86,11 +92,11 @@ class RandomOpGpuKernel : public GpuKernel {
       random_op_type_ = iter->second;
     }
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
-    if (random_op_type_ == RANDOM_OP_NORMAL && input_num != 1) {
+    if ((random_op_type_ == RANDOM_OP_NORMAL || random_op_type_ == RANDOM_OP_UNIFORM_REAL) && input_num != 1) {
       MS_LOG(ERROR) << "Input number is " << input_num << ", but random op needs 1 input.";
       return false;
     }
-    if (random_op_type_ == RANDOM_OP_UNIFORM_REAL && input_num != 3) {
+    if (random_op_type_ == RANDOM_OP_UNIFORM_INT && input_num != 3) {
       MS_LOG(ERROR) << "Input number is " << input_num << ", but random op needs 3 inputs.";
       return false;
     }
@@ -104,15 +110,9 @@ class RandomOpGpuKernel : public GpuKernel {
       input_size_0_ += input_shape_0[i];
     }
     input_size_0_ *= sizeof(int);
-    if (random_op_type_ == RANDOM_OP_UNIFORM_REAL) {
-      auto input_shape_1 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
-      for (size_t i = 0; i < input_shape_1.size(); i++) {
-        input_size_1_ *= input_shape_1[i];
-      }
-      auto input_shape_2 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
-      for (size_t i = 0; i < input_shape_2.size(); i++) {
-        input_size_2_ *= input_shape_2[i];
-      }
+    if (random_op_type_ == RANDOM_OP_UNIFORM_INT) {
+      input_size_1_ *= 1;
+      input_size_2_ *= 1;
     }
     auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
     for (size_t i = 0; i < output_shape.size(); i++) {
@@ -120,9 +120,7 @@ class RandomOpGpuKernel : public GpuKernel {
       workspace_size_ *= output_shape[i];
     }
     seed_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("seed"));
-    if (random_op_type_ == RANDOM_OP_NORMAL) {
-      seed2_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("seed2"));
-    }
+    seed2_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("seed2"));
     InitSizeLists();
     return true;
   }
@@ -130,7 +128,7 @@
  protected:
   void InitSizeLists() override {
     input_size_list_.push_back(input_size_0_);
-    if (random_op_type_ == RANDOM_OP_UNIFORM_REAL) {
+    if (random_op_type_ == RANDOM_OP_UNIFORM_INT) {
      input_size_list_.push_back(input_size_1_);
      input_size_list_.push_back(input_size_2_);
    }
......