From 4a9d21de4987ced5aaf58a318ac598abff853b48 Mon Sep 17 00:00:00 2001
From: Zhong Hui
Date: Thu, 24 Sep 2020 10:11:06 +0800
Subject: [PATCH] Add GPU Kernels of Segment Ops, support sum, max, min, mean

Add GPU Kernels of Segment Ops, support sum, max, min, mean
---
 .../fluid/operators/math/segment_pooling.cu | 365 ++++++++++++++++++
 paddle/fluid/operators/segment_pool_op.cu   |  28 ++
 paddle/fluid/operators/segment_pool_op.h    |  40 ++
 paddle/fluid/platform/cuda_primitives.h     | 107 +++++
 4 files changed, 540 insertions(+)
 create mode 100644 paddle/fluid/operators/math/segment_pooling.cu
 create mode 100644 paddle/fluid/operators/segment_pool_op.cu

diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu
new file mode 100644
index 00000000000..bb2b6db100b
--- /dev/null
+++ b/paddle/fluid/operators/math/segment_pooling.cu
@@ -0,0 +1,365 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
+#include "paddle/fluid/operators/gather.cu.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/segment_pooling.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_param_config.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T, typename Index, int DimTileSize>
+__global__ void SegmentMeanCustomKernel(
+    const Index* segment_ids, const T* input, T* output, T* summed_ids,
+    const Index input_length_size, const Index inner_dim_size,
+    const Index output_length_size, const Index total_stripe_count) {
+  CUDA_KERNEL_LOOP(stripe_index, total_stripe_count) {
+    const Index segment_offset = stripe_index % inner_dim_size;
+    const Index dim_index_base =
+        stripe_index / inner_dim_size * Index(DimTileSize);
+    const Index actual_height =
+        min(Index(DimTileSize), input_length_size - dim_index_base);
+
+    Index first_segment_id = segment_ids[dim_index_base];
+    Index last_segment_id = -1;
+    if (dim_index_base > 0) {
+      last_segment_id = segment_ids[dim_index_base - 1];
+    }
+    if (segment_offset == 0) {
+      T sum = T(0);
+      for (Index j = 0; j < actual_height; j++) {
+        Index current_segment_id = segment_ids[dim_index_base + j];
+        // Note(ZHUI): the following check may cause
+        // cudaErrorLaunchOutOfResources.
+        // PADDLE_ENFORCE(current_segment_id >= last_segment_id,
+        //                "the segment ids should be sorted, but got "
+        //                "segment_ids[%d]:%d > segment_ids[%d]:%d.",
+        //                dim_index_base + j - 1, dim_index_base + j,
+        //                last_segment_id, current_segment_id);
+
+        if (j > 0 && current_segment_id > last_segment_id) {
+          if (last_segment_id == first_segment_id) {
+            platform::CudaAtomicAdd(summed_ids + last_segment_id, sum);
+          } else {
+            *(summed_ids + last_segment_id) = sum;
+          }
+          sum = T(0);
+        }
+        sum += T(1);
+        last_segment_id = current_segment_id;
+      }
+      platform::CudaAtomicAdd(summed_ids + last_segment_id, sum);
+    }
+    // ensure last_segment_id is the largest
+    last_segment_id = output_length_size;
+    __syncthreads();
+    T sum = T(0);
+    for (Index j = 0; j < actual_height; j++) {
+      Index current_segment_id = segment_ids[dim_index_base + j];
+      if (current_segment_id > last_segment_id) {
+        const Index output_index =
+            last_segment_id * inner_dim_size + segment_offset;
+        if (last_segment_id == first_segment_id) {
+          platform::CudaAtomicAdd(output + output_index,
+                                  sum / *(summed_ids + last_segment_id));
+        } else {
+          *(output + output_index) = sum / *(summed_ids + last_segment_id);
+        }
+        sum = T(0);
+      }
+      sum += input[(dim_index_base + j) * inner_dim_size + segment_offset];
+      last_segment_id = current_segment_id;
+    }
+    const Index output_index =
+        last_segment_id * inner_dim_size + segment_offset;
+    platform::CudaAtomicAdd(output + output_index,
+                            sum / *(summed_ids + last_segment_id));
+  }
+}
+
+template <typename T, typename Index, typename Helper, typename Pool>
+__global__ void SegmentOpsKernel(const Index* segment_ids, const T* input,
+                                 T* output, Helper h, Pool pool) {
+  CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) {
+    Index segment_offset, dim_index_base, actual_height;
+    Index inner_dim_size = h.inner_dim_size;
+    h.calculate(stripe_index, segment_offset, dim_index_base, actual_height);
+
+    T minmax = pool.initial();
+    Index first_segment_id = segment_ids[dim_index_base];
+    // -1 is for the start value when interval_id = 0
+    Index last_segment_id = -1;
+    if (dim_index_base > 0) {
+      last_segment_id = segment_ids[dim_index_base - 1];
+    }
+
+    for (Index j = 0; j < actual_height; j++) {
+      Index current_segment_id = segment_ids[dim_index_base + j];
+      // ensure the segment_ids are sorted.
+      PADDLE_ENFORCE(current_segment_id >= last_segment_id,
+                     "The segment ids should be sorted, but got "
+                     "segment_ids[%d]:%d > segment_ids[%d]:%d.",
+                     dim_index_base + j - 1, last_segment_id,
+                     dim_index_base + j, current_segment_id);
+
+      if (current_segment_id > last_segment_id) {
+        // reset the interval values which do not have corresponding ids.
+        for (Index interval_id = last_segment_id + 1;
+             interval_id < current_segment_id; ++interval_id) {
+          *(output + interval_id * inner_dim_size + segment_offset) = 0;
+        }
+        // don't update result when j = 0
+        if (j > 0) {
+          const Index output_index =
+              last_segment_id * inner_dim_size + segment_offset;
+          if (last_segment_id == first_segment_id) {
+            pool.atomic(output + output_index, minmax);
+          } else {
+            *(output + output_index) = minmax;
+          }
+          minmax = pool.initial();
+        }
+      }
+      pool.compute(
+          input[(dim_index_base + j) * inner_dim_size + segment_offset],
+          &minmax);
+      last_segment_id = current_segment_id;
+    }
+    const Index output_index =
+        last_segment_id * inner_dim_size + segment_offset;
+    pool.atomic(output + output_index, minmax);
+  }
+}
+
+template <typename T, typename Index, typename Helper>
+__global__ void SegmentIndexGradKernel(const Index* segment_ids,
+                                       const T* input, const T* output,
+                                       const T* out_grad, T* in_grad,
+                                       Helper h) {
+  CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) {
+    Index segment_offset, dim_index_base, actual_height;
+    h.calculate(stripe_index, segment_offset, dim_index_base, actual_height);
+
+    for (Index j = 0; j < actual_height; j++) {
+      Index current_segment_id = segment_ids[dim_index_base + j];
+      Index input_index =
+          (dim_index_base + j) * h.inner_dim_size + segment_offset;
+      Index output_index =
+          current_segment_id * h.inner_dim_size + segment_offset;
+      if (input[input_index] == output[output_index]) {
+        in_grad[input_index] = out_grad[output_index];
+      }
+    }
+  }
+}
+
+template <typename T>
+class MaxPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
+  DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
+  DEVICE inline T atomic(T* address, const T val) {
+    return platform::CudaAtomicMax(address, val);
+  }
+};
+
+template <typename T>
+class MinPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(FLT_MAX); }
+  DEVICE inline void compute(const T& x, T* y) { *y = *y < x ? *y : x; }
+  DEVICE inline T atomic(T* address, const T val) {
+    return platform::CudaAtomicMin(address, val);
+  }
+};
+
+template <typename T>
+class SumPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(0); }
+  DEVICE inline void compute(const T& x, T* y) { *y = *y + x; }
+  DEVICE inline T atomic(T* address, const T val) {
+    return platform::CudaAtomicAdd(address, val);
+  }
+};
+
+template <typename T>
+class ArrangeHelper {
+ public:
+  const T input_total_size;
+  const T input_length_size;
+  const T output_length_size;
+  T inner_dim_size;
+  T total_stripe_count;
+  const T DimTileSize = 8;
+
+  ArrangeHelper(T a, T b, T c)
+      : input_total_size(a), input_length_size(b), output_length_size(c) {
+    T input_outer_dim_num_stripe =
+        (input_length_size + DimTileSize - 1) / DimTileSize;
+    inner_dim_size = input_total_size / input_length_size;
+    total_stripe_count = inner_dim_size * input_outer_dim_num_stripe;
+  }
+
+  DEVICE inline void calculate(T stripe_index, T& segment_offset,
+                               T& dim_index_base, T& actual_height) {
+    segment_offset = stripe_index % inner_dim_size;
+    dim_index_base = stripe_index / inner_dim_size * DimTileSize;
+    actual_height = min(DimTileSize, input_length_size - dim_index_base);
+  }
+};
+
+template <typename T, typename Index>
+void SegmentPoolCUDAGradFunctor(const platform::CUDADeviceContext& ctx,
+                                const framework::Tensor& input,
+                                const framework::Tensor& segment_ids,
+                                const framework::Tensor& output,
+                                const framework::Tensor& out_grad,
+                                framework::Tensor* in_grad,
+                                const std::string pooltype = "SUM") {
+  auto h = ArrangeHelper<Index>(input.numel(), segment_ids.dims()[0],
+                                output.dims()[0]);
+  auto config = platform::GetGpuLaunchConfig1D(ctx, h.total_stripe_count);
+  if (pooltype == "MAX" || pooltype == "MIN") {
+    SegmentIndexGradKernel<T, Index, ArrangeHelper<Index>><<<
+        config.block_per_grid.x, config.thread_per_block.x, 0, ctx.stream()>>>(
+        segment_ids.data<Index>(), input.data<T>(), output.data<T>(),
+        out_grad.data<T>(), in_grad->data<T>(), h);
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Unsupported segment pooling grad operation, Only MAX, MIN "
+        "available, but got %s.",
+        pooltype));
+  }
+}
+
+template <typename T>
+__global__ void SimpleDiv(T* x, const T* y, const int len, const int dim) {
+  for (int i = blockIdx.x; i < len; i += gridDim.x) {
+    __shared__ T y_i;
+    auto base = i * dim;
+    if (threadIdx.x == 0) {
+      y_i = y[i];
+    }
+    __syncthreads();
+    for (int j = threadIdx.x; j < dim; j += blockDim.x) {
+      x[base + j] /= y_i;
+    }
+  }
+}
+
+template <typename T, typename IndexT>
+class SegmentPoolFunctor<platform::CUDADeviceContext, T, IndexT> {
+ public:
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& input,
+                  const framework::Tensor& segment_ids,
+                  framework::Tensor* output,
+                  framework::Tensor* summed_ids = nullptr,
+                  const std::string pooltype = "SUM") {
+    auto h = ArrangeHelper<IndexT>(input.numel(), segment_ids.dims()[0],
+                                   output->dims()[0]);
+    auto config = platform::GetGpuLaunchConfig1D(ctx, h.total_stripe_count);
+    if (pooltype == "MEAN") {
+      SegmentMeanCustomKernel<
+          T, IndexT, IndexT(8)><<<config.block_per_grid.x,
+                                  config.thread_per_block.x, 0,
+                                  ctx.stream()>>>(
+          segment_ids.data<IndexT>(), input.data<T>(), output->data<T>(),
+          summed_ids->data<T>(), h.input_length_size, h.inner_dim_size,
+          h.output_length_size, h.total_stripe_count);
+    } else if (pooltype == "SUM") {
+      SumPool<T> pool;
+      SegmentOpsKernel<
+          T, IndexT, ArrangeHelper<IndexT>,
+          SumPool<T>><<<config.block_per_grid.x, config.thread_per_block.x, 0,
+                        ctx.stream()>>>(segment_ids.data<IndexT>(),
+                                        input.data<T>(), output->data<T>(), h,
+                                        pool);
+    } else if (pooltype == "MAX") {
+      MaxPool<T> pool;
+      SegmentOpsKernel<
+          T, IndexT, ArrangeHelper<IndexT>,
+          MaxPool<T>><<<config.block_per_grid.x, config.thread_per_block.x, 0,
+                        ctx.stream()>>>(segment_ids.data<IndexT>(),
+                                        input.data<T>(), output->data<T>(), h,
+                                        pool);
+    } else if (pooltype == "MIN") {
+      MinPool<T> pool;
+      SegmentOpsKernel<
+          T, IndexT, ArrangeHelper<IndexT>,
+          MinPool<T>><<<config.block_per_grid.x, config.thread_per_block.x, 0,
+                        ctx.stream()>>>(segment_ids.data<IndexT>(),
+                                        input.data<T>(), output->data<T>(), h,
+                                        pool);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN "
+          "available, but got %s.",
+          pooltype));
+    }
+  }
+};
+
+template <typename T, typename IndexT>
+class SegmentPoolGradFunctor<platform::CUDADeviceContext, T, IndexT> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& segments,
+                  framework::Tensor* in_grad,
+                  const framework::Tensor* summed_ids = nullptr,
+                  const std::string pooltype = "SUM") {
+    if (pooltype == "MAX" || pooltype == "MIN") {
+      SegmentPoolCUDAGradFunctor<T, IndexT>(context, input, segments, output,
+                                            out_grad, in_grad, pooltype);
+    } else if (pooltype == "MEAN") {
+      framework::Tensor mean_grad;
+      mean_grad.mutable_data<T>(input.dims(), context.GetPlace());
+      framework::TensorCopy(out_grad, context.GetPlace(), context, &mean_grad);
+      int len = output.dims()[0];
+      int dim = output.numel() / len;
+      auto config = platform::GetGpuLaunchConfig1D(context, len);
+      SimpleDiv<T><<<config.block_per_grid.x, config.thread_per_block.x, 0,
+                     context.stream()>>>(mean_grad.data<T>(),
+                                         summed_ids->data<T>(), len, dim);
+      GPUGather<T, IndexT>(context, mean_grad, segments, in_grad);
+    } else if (pooltype == "SUM") {
+      GPUGather<T, IndexT>(context, out_grad, segments, in_grad);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN "
+          "available, but got %s.",
+          pooltype));
+    }
+  }
+};
+
+using CUDA = paddle::platform::CUDADeviceContext;
+template class SegmentPoolFunctor<CUDA, float, int>;
+template class SegmentPoolFunctor<CUDA, float, int64_t>;
+template class SegmentPoolFunctor<CUDA, double, int>;
+template class SegmentPoolFunctor<CUDA, double, int64_t>;
+template class SegmentPoolGradFunctor<CUDA, float, int>;
+template class SegmentPoolGradFunctor<CUDA, float, int64_t>;
+template class SegmentPoolGradFunctor<CUDA, double, int>;
+template class SegmentPoolGradFunctor<CUDA, double, int64_t>;
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu
new file mode 100644
index 00000000000..dc92d7fcc3a
--- /dev/null
+++ b/paddle/fluid/operators/segment_pool_op.cu
@@ -0,0 +1,28 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/gather.cu.h"
+#include "paddle/fluid/operators/segment_pool_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_param_config.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    segment_pool,
+    ops::SegmentPoolKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SegmentPoolKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    segment_pool_grad,
+    ops::SegmentPoolGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SegmentPoolGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h
index a505946b9f5..23b0c31608d 100644
--- a/paddle/fluid/operators/segment_pool_op.h
+++ b/paddle/fluid/operators/segment_pool_op.h
@@ -63,6 +63,46 @@ void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) {
     auto& dev_ctx = context.template device_context<DeviceContext>();
     set_zero(dev_ctx, output, static_cast<T>(0));
   }
+#ifdef PADDLE_WITH_CUDA
+  if (!cpu_place) {
+    Tensor length;
+    length.mutable_data<IndexT>(framework::make_ddim({1}),
+                                platform::CPUPlace());
+    IndexT* length_data = length.data<IndexT>();
+    const IndexT* segment_ids = segment->data<IndexT>();
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT),
+                   cudaMemcpyDeviceToHost));
+
+    IndexT length_host = length_data[0];
+    length_host++;
+    PADDLE_ENFORCE_GT(
+        length_host, 0,
+        platform::errors::InvalidArgument(
+            "Segment ids must be >= 0, but got last id %d", length_data[0]));
+    auto dims = input->dims();
+    dims[0] = static_cast<int64_t>(length_host);
+    output->Resize({dims});
+    output->mutable_data<T>(context.GetPlace());
+    T init_value = 0;
+    if (pooltype == "MAX") {
+      init_value = static_cast<T>(-FLT_MAX);
+    } else if (pooltype == "MIN") {
+      init_value = static_cast<T>(FLT_MAX);
+    }
+    math::SetConstant<DeviceContext, T> setconst;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    setconst(dev_ctx, output, static_cast<T>(init_value));
+    // the gpu kernel of mean pool records the counts of segment_ids
+    if (pooltype == "MEAN") {
+      summed_ids = context.Output<Tensor>("SummedIds");
+      summed_ids->Resize({dims[0], 1});
+      summed_ids->mutable_data<T>(context.GetPlace());
+      setconst(dev_ctx, summed_ids, static_cast<T>(1e-12));
+    }
+  }
+#endif
 
   SegmentPoolFunctor<DeviceContext, T, IndexT> pool;
diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h
index 67ea64833d3..f7c77071b12 100644
--- a/paddle/fluid/platform/cuda_primitives.h
+++ b/paddle/fluid/platform/cuda_primitives.h
@@ -128,5 +128,112 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
 }
 #endif
 
+// For atomicMax
+USE_CUDA_ATOMIC(Max, int);
+USE_CUDA_ATOMIC(Max, unsigned int);
+// The CUDA API uses unsigned long long int; we cannot use uint64_t here
+// because unsigned long long int is not necessarily uint64_t.
+USE_CUDA_ATOMIC(Max, unsigned long long int);  // NOLINT
+
+CUDA_ATOMIC_WRAPPER(Max, int64_t) {
+  // Here, we check that long long int must be int64_t.
+  static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
+                "long long should be int64");
+  return CudaAtomicMax(
+      reinterpret_cast<unsigned long long int *>(address),  // NOLINT
+      static_cast<unsigned long long int>(val));            // NOLINT
+}
+
+CUDA_ATOMIC_WRAPPER(Max, float) {
+  if (*address >= val) {
+    return *address;
+  }
+
+  int *const address_as_i = reinterpret_cast<int *>(address);
+  int old = *address_as_i, assumed;
+
+  do {
+    assumed = old;
+    if (__int_as_float(assumed) >= val) {
+      break;
+    }
+
+    old = atomicCAS(address_as_i, assumed, __float_as_int(val));
+  } while (assumed != old);
+
+  return __int_as_float(old);
+}
+
+CUDA_ATOMIC_WRAPPER(Max, double) {
+  if (*address >= val) {
+    return *address;
+  }
+
+  unsigned long long int *const address_as_ull =
+      reinterpret_cast<unsigned long long int *>(address);
+  unsigned long long int old = *address_as_ull, assumed;
+
+  do {
+    assumed = old;
+    if (__longlong_as_double(assumed) >= val) {
+      break;
+    }
+
+    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
+  } while (assumed != old);
+
+  return __longlong_as_double(old);
+}
+
+// For atomicMin
+USE_CUDA_ATOMIC(Min, int);
+USE_CUDA_ATOMIC(Min, unsigned int);
+// The CUDA API uses unsigned long long int; we cannot use uint64_t here
+// because unsigned long long int is not necessarily uint64_t.
+USE_CUDA_ATOMIC(Min, unsigned long long int);  // NOLINT
+
+CUDA_ATOMIC_WRAPPER(Min, int64_t) {
+  // Here, we check that long long int must be int64_t.
+  static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
+                "long long should be int64");
+  return CudaAtomicMin(
+      reinterpret_cast<unsigned long long int *>(address),  // NOLINT
+      static_cast<unsigned long long int>(val));            // NOLINT
+}
+
+CUDA_ATOMIC_WRAPPER(Min, float) {
+  if (*address <= val) {
+    return *address;
+  }
+
+  int *const address_as_i = reinterpret_cast<int *>(address);
+  int old = *address_as_i, assumed;
+
+  do {
+    assumed = old;
+    if (__int_as_float(assumed) <= val) {
+      break;
+    }
+
+    old = atomicCAS(address_as_i, assumed, __float_as_int(val));
+  } while (assumed != old);
+
+  return __int_as_float(old);
+}
+
+CUDA_ATOMIC_WRAPPER(Min, double) {
+  if (*address <= val) {
+    return *address;
+  }
+
+  unsigned long long int *const address_as_ull =
+      reinterpret_cast<unsigned long long int *>(address);
+  unsigned long long int old = *address_as_ull, assumed;
+
+  do {
+    assumed = old;
+    if (__longlong_as_double(assumed) <= val) {
+      break;
+    }
+
+    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
+  } while (assumed != old);
+
+  return __longlong_as_double(old);
+}
+
 }  // namespace platform
 }  // namespace paddle
-- 
GitLab
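
For reference (not part of the patch): the new float/double CudaAtomicMax and CudaAtomicMin wrappers in cuda_primitives.h emulate floating-point atomic max/min with an atomicCAS retry loop over the value's bit pattern. Below is a minimal standalone CUDA sketch of the same idiom; the helper names (AtomicMaxFloat, ReduceMax) and the test data are illustrative only and do not appear in the patch.

#include <cfloat>
#include <cstdio>
#include <cuda_runtime.h>

// Same compare-and-swap idiom as the patch's CudaAtomicMax(float*): keep
// retrying atomicCAS until either the stored value is already >= val or our
// candidate bits were successfully installed.
__device__ float AtomicMaxFloat(float* address, float val) {
  int* address_as_i = reinterpret_cast<int*>(address);
  int old = *address_as_i, assumed;
  do {
    assumed = old;
    if (__int_as_float(assumed) >= val) {
      break;  // stored value already large enough, nothing to do
    }
    old = atomicCAS(address_as_i, assumed, __float_as_int(val));
  } while (assumed != old);  // another thread raced us, retry
  return __int_as_float(old);
}

__global__ void ReduceMax(const float* x, int n, float* result) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    AtomicMaxFloat(result, x[i]);
  }
}

int main() {
  const int n = 1024;
  float h_x[n];
  for (int i = 0; i < n; ++i) h_x[i] = static_cast<float>((i * 37) % 1000);

  float *d_x = nullptr, *d_max = nullptr;
  float init = -FLT_MAX;  // same initial value the MAX pool uses
  cudaMalloc(&d_x, n * sizeof(float));
  cudaMalloc(&d_max, sizeof(float));
  cudaMemcpy(d_x, h_x, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_max, &init, sizeof(float), cudaMemcpyHostToDevice);

  ReduceMax<<<(n + 255) / 256, 256>>>(d_x, n, d_max);

  float h_max = 0.f;
  cudaMemcpy(&h_max, d_max, sizeof(float), cudaMemcpyDeviceToHost);
  printf("max = %f\n", h_max);  // expected: 999.000000

  cudaFree(d_x);
  cudaFree(d_max);
  return 0;
}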