// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifndef _MSC_VER
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <thrust/shuffle.h>
#endif

#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/errors.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/gpu/shuffle_batch_utils.h"
#include "paddle/phi/kernels/shuffle_batch_kernel.h"

namespace phi {

template <typename T, typename Context>
void ShuffleBatchKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        const DenseTensor& seed,
                        int startup_seed,
                        DenseTensor* out,
                        DenseTensor* shuffleidx,
                        DenseTensor* seed_out) {
#ifdef _MSC_VER
  PADDLE_THROW(phi::errors::Unimplemented(
      "GPU shuffle_batch is not supported on Windows yet"));
#else
  // Treat x as a matrix of shape [elem_size, x_embed_size]: the last
  // dimension is the embedding width, and all preceding dimensions are
  // flattened into the number of rows to shuffle.
  int64_t x_embed_size = x.dims()[x.dims().size() - 1];
  int64_t elem_size = 1;
  for (int i = 0; i < x.dims().size() - 1; i++) {
    elem_size *= x.dims()[i];
  }
  shuffleidx->Resize(phi::make_ddim({elem_size}));

  // Read the random seed from the optional seed tensor, falling back to
  // startup_seed when it is not provided.
  int64_t seed_int = 0;
  if (seed.initialized()) {
    const auto& seed_place = seed.place().GetType();
    bool is_gpu_place = seed_place == phi::AllocationType::GPU;
    if (is_gpu_place) {
      // NOTE: We have overwritten GetKernelTypeForVar, so seed_place would
      // not be CUDAPlace in practice. This case would only happen in Python
      // op_test framework.
      phi::DenseTensor tmp_tensor;
      phi::Copy(dev_ctx, seed, phi::CPUPlace(), false, &tmp_tensor);
      seed_int = *(tmp_tensor.data<int64_t>());
    } else {
      seed_int = *(seed.data<int64_t>());
    }
  } else {
    seed_int = startup_seed;
  }

  auto* shuffleidx_data = dev_ctx.template Alloc<int64_t>(shuffleidx);

#ifdef PADDLE_WITH_CUDA
  CacheAllocator allocator(dev_ctx.GetPlace());
  const auto& exec_policy = thrust::cuda::par(allocator).on(dev_ctx.stream());
#else
  const auto& exec_policy = thrust::hip::par.on(dev_ctx.stream());
#endif
  // Generate a random permutation of [0, elem_size) into shuffleidx.
  thrust::random::default_random_engine engine(seed_int);
  thrust::counting_iterator<int64_t> cnt_iter(0);
  thrust::shuffle_copy(exec_policy,
                       cnt_iter,
                       cnt_iter + elem_size,
                       thrust::device_pointer_cast(shuffleidx_data),
                       engine);
  // TODO(zengjinle): for small data, direct cudaMemcpy may be better
  // Gather rows of x into out according to the shuffled indices.
  auto* x_data = x.data<T>();
  auto* out_data = dev_ctx.template Alloc<T>(out);
  ReorderFunctor<T, true> functor(
      x_data, shuffleidx_data, out_data, x_embed_size);
  phi::funcs::ForRange<Context> for_range(dev_ctx, elem_size * x_embed_size);
  for_range(functor);

  // Advance the engine once and expose the generated value as the seed for
  // the next step.
  seed_out->Resize(phi::make_ddim({1}));
  auto* seed_out_data = dev_ctx.template HostAlloc<int64_t>(seed_out);
  *seed_out_data = engine();
#endif
}

}  // namespace phi

PD_REGISTER_KERNEL(shuffle_batch,
                   GPU,
                   ALL_LAYOUT,
                   phi::ShuffleBatchKernel,
                   float,
                   double,
                   int32_t,
                   int64_t) {
  kernel->OutputAt(1).SetDataType(phi::DataType::INT64);
  kernel->OutputAt(2).SetDataType(phi::DataType::INT64);
}
#endif