diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu
index 4a47ba5ccad4de338844e60f6fcbd6b7c11e891b..ca5ca1caeb23f01c047feeccf9c276b2dcd1cb68 100644
--- a/paddle/fluid/platform/cuda_helper_test.cu
+++ b/paddle/fluid/platform/cuda_helper_test.cu
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include 
-#include 
 #include 
 #include 
 
@@ -25,13 +24,13 @@
 using paddle::platform::PADDLE_CUDA_NUM_THREADS;
 using paddle::platform::float16;
 
-#define CUDA_ATOMIC_KERNEL(op, T) \
-  __global__ void op##Kernel(const T* data_a, T* data_b, size_t num) { \
-    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; \
-         i += blockDim.x * gridDim.x) { \
-      paddle::platform::CudaAtomic##op(&data_b[i], data_a[i]); \
-    } \
+template <typename T>
+__global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
   }
+}
 
 template <typename T>
 struct AddFunctor {
@@ -39,80 +38,116 @@ struct AddFunctor {
 };
 
 template <typename T>
-struct SubFunctor {
-  T operator()(const T& a, const T& b) { return a - b; }
-};
-
-// NOTE(dzhwinter): the float16 add has small underflow/overflow
-// so we use EXPECT_NEAR to check the result.
-#define ARITHMETIC_KERNEL_LAUNCH(op, T) \
-  void Test##T##op(size_t num) { \
-    T *in1, *in2, *out; \
-    T *d_in1, *d_in2; \
-    size_t size = sizeof(T) * num; \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
-    in1 = reinterpret_cast<T*>(malloc(size)); \
-    in2 = reinterpret_cast<T*>(malloc(size)); \
-    out = reinterpret_cast<T*>(malloc(size)); \
-    std::minstd_rand engine; \
-    std::uniform_real_distribution<double> dist(0.0, 1.0); \
-    for (size_t i = 0; i < num; ++i) { \
-      in1[i] = static_cast<T>(dist(engine)); \
-      in2[i] = static_cast<T>(dist(engine)); \
-    } \
-    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
-    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
-    op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); \
-    cudaDeviceSynchronize(); \
-    cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost); \
-    cudaDeviceSynchronize(); \
-    for (size_t i = 0; i < num; ++i) { \
-      EXPECT_NEAR(static_cast<float>(out[i]), \
-                  static_cast<float>(op##Functor<T>()(in1[i], in2[i])), \
-                  0.001); \
-    } \
-    free(in1); \
-    free(in2); \
-    free(out); \
-    cudaFree(d_in1); \
-    cudaFree(d_in2); \
+void TestCase(size_t num) {
+  T *in1, *in2, *out;
+  T *d_in1, *d_in2;
+  size_t size = sizeof(T) * num;
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+  in1 = reinterpret_cast<T*>(malloc(size));
+  in2 = reinterpret_cast<T*>(malloc(size));
+  out = reinterpret_cast<T*>(malloc(size));
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num; ++i) {
+    in1[i] = static_cast<T>(dist(engine));
+    in2[i] = static_cast<T>(dist(engine));
   }
-CUDA_ATOMIC_KERNEL(Add, float);
-CUDA_ATOMIC_KERNEL(Add, double);
-CUDA_ATOMIC_KERNEL(Add, float16);
-
-ARITHMETIC_KERNEL_LAUNCH(Add, float);
-ARITHMETIC_KERNEL_LAUNCH(Add, double);
-ARITHMETIC_KERNEL_LAUNCH(Add, float16);
-
-namespace paddle {
-namespace platform {
-USE_CUDA_ATOMIC(Sub, int);
-};
-};
-CUDA_ATOMIC_KERNEL(Sub, int);
-ARITHMETIC_KERNEL_LAUNCH(Sub, int);
+  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
+  AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
+  cudaDeviceSynchronize();
+  cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  for (size_t i = 0; i < num; ++i) {
+    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // so we use EXPECT_NEAR to check the result.
+    EXPECT_NEAR(static_cast<float>(out[i]),
+                static_cast<float>(AddFunctor<T>()(in1[i], in2[i])), 0.001);
+  }
+  free(in1);
+  free(in2);
+  free(out);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
 
 // cuda primitives
 TEST(CudaAtomic, Add) {
-  TestfloatAdd(static_cast<size_t>(10));
-  TestfloatAdd(static_cast<size_t>(1024 * 1024));
-  TestdoubleAdd(static_cast<size_t>(10));
-  TestdoubleAdd(static_cast<size_t>(1024 * 1024));
-}
+  TestCase<float>(static_cast<size_t>(10));
+  TestCase<float>(static_cast<size_t>(1024 * 1024));
 
-TEST(CudaAtomic, Sub) {
-  TestintSub(static_cast<size_t>(10));
-  TestintSub(static_cast<size_t>(1024 * 1024));
+  TestCase<double>(static_cast<size_t>(10));
+  TestCase<double>(static_cast<size_t>(1024 * 1024));
 }
 
 TEST(CudaAtomic, float16) {
-  using paddle::platform::float16;
-  Testfloat16Add(static_cast<size_t>(1));
-  Testfloat16Add(static_cast<size_t>(2));
-  Testfloat16Add(static_cast<size_t>(3));
+  TestCase<float16>(static_cast<size_t>(1));
+  TestCase<float16>(static_cast<size_t>(2));
+  TestCase<float16>(static_cast<size_t>(3));
+
+  TestCase<float16>(static_cast<size_t>(10));
+  TestCase<float16>(static_cast<size_t>(1024 * 1024));
+}
+
+// unalignment of uint8
+void TestUnalign(size_t num, const int shift_bit) {
+  PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2");
+  float16 *in1, *in2, *out;
+  float16 *d_in1, *d_in2;
+  size_t size = sizeof(uint8_t) * (num + shift_bit);
+  size_t array_size = sizeof(float16) * (num / 2);
+
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+  in1 = reinterpret_cast<float16*>(malloc(size));
+  in2 = reinterpret_cast<float16*>(malloc(size));
+  out = reinterpret_cast<float16*>(malloc(size));
+
+  // right shift 1, mimic the unalignment of address
+  float16* r_in1 =
+      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in1) + shift_bit);
+  float16* r_in2 =
+      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in2) + shift_bit);
+
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num / 2; ++i) {
+    r_in1[i] = static_cast<float16>(dist(engine));
+    r_in2[i] = static_cast<float16>(dist(engine));
+  }
+  cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
+  AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
+  cudaDeviceSynchronize();
+  cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  for (size_t i = 0; i < num / 2; ++i) {
+    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // so we use EXPECT_NEAR to check the result.
+    EXPECT_NEAR(static_cast<float>(out[i]),
+                static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
+                0.001);
+  }
+  free(in1);
+  free(in2);
+  free(out);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
+TEST(CudaAtomic, float16Unalign) {
+  // same with float16 testcase
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 2);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 2);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 2);
+
+  // shift the address.
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 1);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 1);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 1);
 
-  Testfloat16Add(static_cast<size_t>(10));
-  Testfloat16Add(static_cast<size_t>(1024 * 1024));
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 3);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
 }
diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h
index 94ce83975a7f13daa2b6a4d480cb22cc95811b9b..67ea64833d3b844d88a2e5996f860ef165bd8ffd 100644
--- a/paddle/fluid/platform/cuda_primitives.h
+++ b/paddle/fluid/platform/cuda_primitives.h
@@ -79,41 +79,41 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
 // convert the value into float and do the add arithmetic.
 // then store the result into a uint32.
-inline __device__ uint32_t add_to_low_half(uint32_t val, float x) {
+inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) {
   float16 low_half;
   // the float16 in lower 16bits
-  low_half.x = static_cast<uint16_t>(val & 0xffffu);
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
   low_half = static_cast<float16>(static_cast<float>(low_half) + x);
-  return (val & 0xffff0000u) | low_half.x;
+  return (val & 0xFFFF0000u) | low_half.x;
 }
 
-inline __device__ uint32_t add_to_high_half(uint32_t val, float x) {
+inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
   float16 high_half;
   // the float16 in higher 16bits
   high_half.x = static_cast<uint16_t>(val >> 16);
   high_half = static_cast<float16>(static_cast<float>(high_half) + x);
-  return (val & 0xffffu) | (static_cast<uint32_t>(high_half.x) << 16);
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
 }
 
 CUDA_ATOMIC_WRAPPER(Add, float16) {
   // concrete packed float16 value may exsits in lower or higher 16bits
   // of the 32bits address.
-  uint32_t *address_as_ui =
-      reinterpret_cast<uint32_t *>(reinterpret_cast<char *>(address) -
-                                   (reinterpret_cast<size_t>(address) & 2));
+  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+      reinterpret_cast<char *>(address) -
+      (reinterpret_cast<uintptr_t>(address) & 0x02));
   float val_f = static_cast<float>(val);
   uint32_t old = *address_as_ui;
   uint32_t sum;
   uint32_t newval;
   uint32_t assumed;
-  if (((size_t)address & 2) == 0) {
+  if (((uintptr_t)address & 0x02) == 0) {
     // the float16 value stay at lower 16 bits of the address.
     do {
       assumed = old;
      old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f));
    } while (old != assumed);
    float16 ret;
-    ret.x = old & 0xffffu;
+    ret.x = old & 0xFFFFu;
    return ret;
  } else {
    // the float16 value stay at higher 16 bits of the address.
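
For reviewers unfamiliar with the trick in CUDA_ATOMIC_WRAPPER(Add, float16): because there is no 16-bit atomicCAS, the wrapper rounds the address down to the enclosing 32-bit word and CAS-loops on that word, rewriting only the half that actually holds the float16. Below is a minimal host-side sketch of the same idea, not part of the diff or of the Paddle API: std::atomic<uint32_t> stands in for atomicCAS, a plain integer add stands in for the float16 <-> float round trip, and the names (atomic_add_16, etc.) are illustrative only.

#include <atomic>
#include <cstdint>
#include <cstdio>

// Emulate a 16-bit atomic add by CAS-ing the enclosing 32-bit word.
// `low` mirrors the ((uintptr_t)address & 0x02) == 0 test above: it selects
// whether the 16-bit payload lives in the low or the high half of the word.
static uint16_t atomic_add_16(std::atomic<uint32_t>* word, uint16_t inc,
                              bool low) {
  uint32_t assumed = word->load();
  uint32_t newval;
  do {
    uint16_t half = low ? static_cast<uint16_t>(assumed & 0xFFFFu)
                        : static_cast<uint16_t>(assumed >> 16);
    // The device code converts float16 -> float, adds, and converts back;
    // a plain integer add stands in for that here.
    uint16_t sum = static_cast<uint16_t>(half + inc);
    newval = low
                 ? ((assumed & 0xFFFF0000u) | sum)
                 : ((assumed & 0x0000FFFFu) | (static_cast<uint32_t>(sum) << 16));
    // On failure, compare_exchange_weak reloads `assumed` with the current
    // value, so the loop retries just like the atomicCAS loop above.
  } while (!word->compare_exchange_weak(assumed, newval));
  // `assumed` now holds the pre-update word, matching `ret.x = old & 0xFFFFu`.
  return low ? static_cast<uint16_t>(assumed & 0xFFFFu)
             : static_cast<uint16_t>(assumed >> 16);
}

int main() {
  std::atomic<uint32_t> word{0xAAAA0001u};  // high half 0xAAAA, low half 0x0001
  uint16_t old_low = atomic_add_16(&word, 0x0002, /*low=*/true);
  std::printf("old low half: %04X, word now: %08X\n",
              static_cast<unsigned>(old_low),
              static_cast<unsigned>(word.load()));
  return 0;
}

Expected output: "old low half: 0001, word now: AAAA0003", i.e. only the selected half changes and the caller gets the previous half back, which is the behavior the float16Unalign test relies on.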