// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include "glog/logging.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" template struct AddTernary_1 { inline HOSTDEVICE T operator()(T a, T b, T c) const { return a + b + c; } }; template struct AddTernary_2 { inline HOSTDEVICE T operator()(T a, T b, T c) const { return a + b + c; } }; template struct AddTernary_3 { inline HOSTDEVICE T operator()(T a, T b, T c) const { return a + b + c; } }; template void InitValue(T* data, size_t numel, const int val) { for (auto i = 0; i < numel; ++i) { data[i] = static_cast(val); } } template void TestCase(const phi::GPUContext& dev_ctx, const phi::DDim& dim1, const phi::DDim& dim2, const phi::DDim& dim3, const phi::DDim& dim_out, const size_t times, Func compute) { phi::DataType dtype = phi::CppTypeToDataType::Type(); const auto alloc_cpu = std::make_unique(phi::CPUPlace()); const auto alloc_gpu = std::make_unique(phi::GPUPlace()); auto in1 = std::make_shared( alloc_cpu.get(), phi::DenseTensorMeta(dtype, dim1, phi::DataLayout::NCHW)); auto in2 = std::make_shared( alloc_cpu.get(), phi::DenseTensorMeta(dtype, dim2, phi::DataLayout::NCHW)); auto in3 = std::make_shared( alloc_cpu.get(), phi::DenseTensorMeta(dtype, dim3, phi::DataLayout::NCHW)); InitValue(in1->data(), in1->numel(), 1); InitValue(in2->data(), in2->numel(), 1); InitValue(in3->data(), in3->numel(), 1); auto d_in1 = std::make_shared( alloc_gpu.get(), phi::DenseTensorMeta(dtype, dim1, phi::DataLayout::NCHW)); auto d_in2 = std::make_shared( alloc_gpu.get(), phi::DenseTensorMeta(dtype, dim2, phi::DataLayout::NCHW)); auto d_in3 = std::make_shared( alloc_gpu.get(), phi::DenseTensorMeta(dtype, dim3, phi::DataLayout::NCHW)); auto d_out = std::make_shared( alloc_gpu.get(), phi::DenseTensorMeta(dtype, dim_out, phi::DataLayout::NCHW)); phi::Copy(dev_ctx, *in1.get(), phi::GPUPlace(), false, d_in1.get()); phi::Copy(dev_ctx, *in2.get(), phi::GPUPlace(), false, d_in2.get()); phi::Copy(dev_ctx, *in3.get(), phi::GPUPlace(), false, d_in3.get()); std::vector inputs{ d_in1.get(), d_in2.get(), d_in3.get()}; std::vector outputs{d_out.get()}; for (int i = 0; i < times; ++i) { phi::funcs::BroadcastKernel(dev_ctx, inputs, &outputs, compute); } dev_ctx.Wait(); } TEST(Broadcast, add) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto place = phi::GPUPlace(); phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); auto* dev_ctx = static_cast(pool.GetByPlace(place)); size_t times = 10; do { auto dim1 = phi::make_ddim({1, 2048, 3584}); auto dim2 = phi::make_ddim({1, 2048, 1}); auto dim3 = phi::make_ddim({1, 1, 3584}); auto dim_out = phi::make_ddim({1, 2048, 3584}); TestCase( *dev_ctx, dim1, dim2, dim3, dim_out, times, AddTernary_1()); TestCase(*dev_ctx, dim1, dim2, dim3, dim_out, times, AddTernary_1()); TestCase(*dev_ctx, dim1, dim2, dim3, dim_out, times, AddTernary_1()); } while (0); do { auto dim1 = phi::make_ddim({1, 256, 4, 256, 256}); auto dim2 = phi::make_ddim({1, 256, 1, 1, 256}); auto dim3 = phi::make_ddim({1, 1, 4, 256, 256}); auto dim_out = phi::make_ddim({1, 256, 4, 256, 256}); TestCase( *dev_ctx, dim1, dim2, dim3, dim_out, times, AddTernary_2()); TestCase(*dev_ctx, dim1, dim2, dim3, dim_out, times, AddTernary_2()); TestCase(*dev_ctx, dim1, dim2, dim3, dim_out, times, AddTernary_2()); } while (0); do { auto dim1 = phi::make_ddim({1, 256, 256}); auto dim2 = phi::make_ddim({1, 1, 256}); auto dim3 = phi::make_ddim({1, 256, 1}); auto dim_out = phi::make_ddim({1, 256, 256}); TestCase( *dev_ctx, dim1, dim2, dim3, dim_out, times, AddTernary_3()); TestCase(*dev_ctx, dim1, dim2, dim3, dim_out, times, AddTernary_3()); TestCase(*dev_ctx, dim1, dim2, dim3, dim_out, times, AddTernary_3()); } while (0); #endif }