// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gtest/gtest.h>

#include <algorithm>
#include <iostream>
#include <random>

#define PADDLE_CUDA_FP16
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"

using paddle::platform::PADDLE_CUDA_NUM_THREADS;
using paddle::platform::float16;

// Generates a kernel that applies CudaAtomic##op element-wise over a
// grid-stride loop: data_b[i] <- data_b[i] op data_a[i].
#define CUDA_ATOMIC_KERNEL(op, T)                                      \
  __global__ void op##Kernel(const T* data_a, T* data_b, size_t num) { \
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;       \
         i += blockDim.x * gridDim.x) {                                \
      paddle::platform::CudaAtomic##op(&data_b[i], data_a[i]);         \
    }                                                                  \
  }

template <typename T>
struct AddFunctor {
  T operator()(const T& a, const T& b) { return a + b; }
};

template <typename T>
struct SubFunctor {
  T operator()(const T& a, const T& b) { return a - b; }
};

// NOTE(dzhwinter): float16 addition has a small underflow/overflow error,
// so we use EXPECT_NEAR to check the result.
#define ARITHMETIC_KERNEL_LAUNCH(op, T)                                   \
  void Test##T##op(size_t num) {                                          \
    T *in1, *in2, *out;                                                   \
    T *d_in1, *d_in2;                                                     \
    size_t size = sizeof(T) * num;                                        \
    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);                   \
    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);                   \
    in1 = reinterpret_cast<T*>(malloc(size));                             \
    in2 = reinterpret_cast<T*>(malloc(size));                             \
    out = reinterpret_cast<T*>(malloc(size));                             \
    std::minstd_rand engine;                                              \
    std::uniform_real_distribution<double> dist(0.0, 1.0);                \
    for (size_t i = 0; i < num; ++i) {                                    \
      in1[i] = static_cast<T>(dist(engine));                              \
      in2[i] = static_cast<T>(dist(engine));                              \
    }                                                                     \
    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);                 \
    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);                 \
    op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);        \
    cudaDeviceSynchronize();                                              \
    cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);                 \
    cudaDeviceSynchronize();                                              \
    for (size_t i = 0; i < num; ++i) {                                    \
      EXPECT_NEAR(static_cast<float>(out[i]),                             \
                  static_cast<float>(op##Functor<T>()(in1[i], in2[i])),   \
                  0.001);                                                 \
    }                                                                     \
    free(in1);                                                            \
    free(in2);                                                            \
    free(out);                                                            \
    cudaFree(d_in1);                                                      \
    cudaFree(d_in2);                                                      \
  }

CUDA_ATOMIC_KERNEL(Add, float);
CUDA_ATOMIC_KERNEL(Add, double);
CUDA_ATOMIC_KERNEL(Add, float16);

ARITHMETIC_KERNEL_LAUNCH(Add, float);
ARITHMETIC_KERNEL_LAUNCH(Add, double);
ARITHMETIC_KERNEL_LAUNCH(Add, float16);

// Make CudaAtomicSub(int) available for the Sub kernel below.
namespace paddle {
namespace platform {
USE_CUDA_ATOMIC(Sub, int);
}  // namespace platform
}  // namespace paddle

CUDA_ATOMIC_KERNEL(Sub, int);
ARITHMETIC_KERNEL_LAUNCH(Sub, int);

// cuda primitives
TEST(CudaAtomic, Add) {
  TestfloatAdd(static_cast<size_t>(10));
  TestfloatAdd(static_cast<size_t>(1024 * 1024));
  TestdoubleAdd(static_cast<size_t>(10));
  TestdoubleAdd(static_cast<size_t>(1024 * 1024));
}

TEST(CudaAtomic, Sub) {
  TestintSub(static_cast<size_t>(10));
  TestintSub(static_cast<size_t>(1024 * 1024));
}

TEST(CudaAtomic, float16) {
  using paddle::platform::float16;
  Testfloat16Add(static_cast<size_t>(1));
  Testfloat16Add(static_cast<size_t>(2));
  Testfloat16Add(static_cast<size_t>(3));
  Testfloat16Add(static_cast<size_t>(10));
  Testfloat16Add(static_cast<size_t>(1024 * 1024));
}
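
// --- Added sketch, not part of the original test file -----------------------
// The launches above always use a single block, so the grid-stride loop in
// CUDA_ATOMIC_KERNEL never strides across blocks. The test below is a minimal
// sketch that re-runs the float AddKernel with several blocks so the
// cross-block stride (i += blockDim.x * gridDim.x) is exercised as well. The
// block count of 8 is an arbitrary assumption; everything else reuses the
// kernels and functors defined above.
TEST(CudaAtomic, AddMultiBlock) {
  const size_t num = 1024 * 1024;
  const size_t size = sizeof(float) * num;
  float* in1 = reinterpret_cast<float*>(malloc(size));
  float* in2 = reinterpret_cast<float*>(malloc(size));
  float* out = reinterpret_cast<float*>(malloc(size));
  float *d_in1, *d_in2;
  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
  std::minstd_rand engine;
  std::uniform_real_distribution<double> dist(0.0, 1.0);
  for (size_t i = 0; i < num; ++i) {
    in1[i] = static_cast<float>(dist(engine));
    in2[i] = static_cast<float>(dist(engine));
  }
  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
  // Same float AddKernel as TestfloatAdd, but launched on 8 blocks; each
  // index is still visited exactly once, so the expected result is unchanged.
  AddKernel<<<8, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
  cudaDeviceSynchronize();
  cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
  for (size_t i = 0; i < num; ++i) {
    EXPECT_NEAR(out[i], AddFunctor<float>()(in1[i], in2[i]), 0.001);
  }
  free(in1);
  free(in2);
  free(out);
  cudaFree(d_in1);
  cudaFree(d_in2);
}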