未验证 提交 890d6bc0 编写于 作者: Z Zhang Zheng 提交者: GitHub

Modify some contents for elementwise op impl (#32414)

上级 1064f2b8
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
......@@ -34,7 +33,9 @@ namespace operators {
*/
template <typename T>
struct CudaAddFunctor {
inline HOSTDEVICE T operator()(T args[]) const { return args[0] + args[1]; }
__device__ __forceinline__ T operator()(const T* args) const {
return args[0] + args[1];
}
};
template <typename T>
......
......@@ -13,6 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#ifdef __HIPCC__
#define ELEMENTWISE_BLOCK_SIZE 256
#else
#define ELEMENTWISE_BLOCK_SIZE 512
#endif
namespace paddle {
namespace operators {
......@@ -90,8 +101,7 @@ struct ElementwiseDataWrapper {
template <ElementwiseType ET, int VecSize, typename T, typename Functor>
__device__ void VectorizedKernelImpl(
ElementwiseDataWrapper<ET, VecSize, T> data, int size, Functor func,
int tid) {
ElementwiseDataWrapper<ET, VecSize, T> data, Functor func, int tid) {
using VecType = CudaAlignedVector<T, VecSize>;
VecType ins_vec[ET];
VecType out_vec;
......@@ -121,10 +131,9 @@ __device__ void VectorizedKernelImpl(
data.store_vector(out_vec, tid);
}
template <ElementwiseType ET, typename T, typename Functor>
__device__ void ScalarKernelImpl(ElementwiseDataWrapper<ET, 1, T> data,
int size, Functor func, int start,
int remain) {
template <ElementwiseType ET, int VecSize, typename T, typename Functor>
__device__ void ScalarKernelImpl(ElementwiseDataWrapper<ET, VecSize, T> data,
Functor func, int start, int remain) {
T ins[ET];
T out;
......@@ -146,12 +155,11 @@ __global__ void VectorizedKernel(const T *__restrict__ in0,
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int remain = size - VecSize * tid;
remain = remain > 0 ? remain : 0;
if (remain >= VecSize) {
auto data = ElementwiseDataWrapper<ET, VecSize, T>(out, in0, in1);
VectorizedKernelImpl(data, size, func, tid);
if (remain >= VecSize) {
VectorizedKernelImpl(data, func, tid);
} else {
auto data = ElementwiseDataWrapper<ET, 1, T>(out, in0, in1);
ScalarKernelImpl(data, size, func, tid * VecSize, remain);
ScalarKernelImpl(data, func, tid * VecSize, remain);
}
}
......@@ -162,7 +170,7 @@ __global__ void ScalarKernel(const T *__restrict__ in0,
auto data = ElementwiseDataWrapper<ET, 1, T>(out, in0, in1);
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int remain = tid < size ? 1 : 0;
ScalarKernelImpl(data, size, func, tid, remain);
ScalarKernelImpl(data, func, tid, remain);
}
template <ElementwiseType ET, typename T, typename Functor>
......@@ -173,7 +181,7 @@ void LaunchElementwiseCudaKernel(
// calculate the max vec_size for all ins and outs
auto size = ins[0]->numel();
int vec_size = GetVectorizedSize<T>(ins, *outs);
int block_size = PADDLE_CUDA_THREAD_SIZE;
int block_size = ELEMENTWISE_BLOCK_SIZE;
int grid_size =
((size + vec_size - 1) / vec_size + block_size - 1) / block_size;
const T *in0 = ins[0]->data<T>();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册