// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_helper.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/gpu/reduce.h" #ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ #include namespace cub = hipcub; #endif namespace phi { #ifdef __HIPCC__ static constexpr int kNumCUDAThreads = 256; #else static constexpr int kNumCUDAThreads = 512; #endif static constexpr int kNumMaxinumNumBlocks = 4096; static inline int NumBlocks(const int N) { return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, kNumMaxinumNumBlocks); } template struct NonzeroFunctor { HOSTDEVICE explicit inline NonzeroFunctor() {} HOSTDEVICE inline T operator()(const T x) const { return static_cast(static_cast(x) != 0); } }; template struct DivFunctor { const T norm_; HOSTDEVICE inline DivFunctor(const T norm) : norm_(norm) {} HOSTDEVICE inline T operator()(T loss) { loss /= norm_; return loss; } }; } // namespace phi