Unverified commit 2739bd73, authored by YuanRisheng, committed by GitHub

[Phi]Change Copy from Kernel to basic component utils (#43622)

* perfect copy

* deal with conflict

* deal with conflict

* fix compile bugs

* fix unittest bugs

* change code format

* deal with conflict

* modify code by review

* fix ce bugs

* fix ce bugs

* add lo

* perfect code format

* deal with conflicts
Parent 7985407b
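In short, this change moves the phi::Copy entry point out of the kernel layer (paddle/phi/kernels/copy_kernel.h) and into the basic component utils (paddle/phi/core/tensor_utils.h), so callers no longer dispatch through the kernel factory. A minimal sketch of a migrated call site, assuming a CPU context (the wrapper function and its names are illustrative, not part of this diff):

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/tensor_utils.h"  // was: paddle/phi/kernels/copy_kernel.h

void CopyToCpu(const phi::CPUContext& dev_ctx,
               const phi::DenseTensor& src,
               phi::DenseTensor* dst) {
  // phi::Copy is now a plain utility declared in tensor_utils.h;
  // no kernel lookup through phi::KernelFactory is required.
  phi::Copy(dev_ctx, src, phi::CPUPlace(), /*blocking=*/false, dst);
}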
......@@ -21,13 +21,6 @@
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy_sr, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy_sr, GPU, ALL_LAYOUT);
#endif
namespace eager_test {
using AbstractAutogradMeta = paddle::experimental::AbstractAutogradMeta;
class AutogradMetaTest : public AbstractAutogradMeta {
......@@ -212,7 +205,8 @@ TEST(EagerVariable, Constructor) {
TEST(EagerVariable, DataLayout) {
paddle::experimental::Tensor tensor;
phi::DenseTensorMeta meta =
phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1, 1, 1}),
phi::DenseTensorMeta(phi::DataType::FLOAT32,
phi::make_ddim({1, 1, 1, 1}),
paddle::experimental::DataLayout::UNDEFINED);
std::shared_ptr<phi::DenseTensor> dt = std::make_shared<phi::DenseTensor>(
std::make_unique<paddle::experimental::DefaultAllocator>(
......
......@@ -30,7 +30,6 @@
#include "paddle/phi/core/tensor_meta.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
namespace egr {
......@@ -44,9 +43,12 @@ TEST(Backward, SingleNodeEmptyGrad) {
// Create Target Tensor
paddle::experimental::Tensor target_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
paddle::experimental::Tensor leaf_tensor;
{
......@@ -92,17 +94,24 @@ TEST(Backward, SingleNodeCustomGrad) {
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor
paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor));
std::vector<paddle::experimental::Tensor> grad_tensors;
// Create Grad Tensor
paddle::experimental::Tensor grad_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
10.0 /*value*/,
false /*is_leaf*/);
grad_tensors.emplace_back(std::move(grad_tensor));
paddle::experimental::Tensor leaf_tensor;
......@@ -157,9 +166,13 @@ TEST(Backward, LinearNodes) {
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor
paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor));
paddle::experimental::Tensor leaf_tensor;
......@@ -229,25 +242,39 @@ TEST(Backward, WithAccumulation) {
// Create Target Tensor
std::vector<paddle::experimental::Tensor> target_tensors;
paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor0 =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
paddle::experimental::Tensor tensor1 =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor0));
target_tensors.emplace_back(std::move(tensor1));
// Create Grad Tensor
std::vector<paddle::experimental::Tensor> grad_tensors;
paddle::experimental::Tensor grad_tensor0 =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
5.0 /*value*/,
false /*is_leaf*/);
paddle::experimental::Tensor grad_tensor1 =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
10.0 /*value*/,
false /*is_leaf*/);
grad_tensors.emplace_back(std::move(grad_tensor0));
grad_tensors.emplace_back(std::move(grad_tensor1));
......
......@@ -29,7 +29,6 @@
#include "paddle/phi/core/tensor_meta.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
namespace egr {
......@@ -43,15 +42,21 @@ TEST(Grad, SingleNodeEmptyGrad) {
// Create Target Tensor (output)
paddle::experimental::Tensor output_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
// Create input tensor
const paddle::experimental::Tensor leaf_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
true /*is_leaf*/);
{
// Create Scale Node
......@@ -103,23 +108,33 @@ TEST(Grad, SingleNodeCustomGrad) {
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor
paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor));
std::vector<paddle::experimental::Tensor> grad_tensors;
// Create Grad Tensor
paddle::experimental::Tensor grad_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
10.0 /*value*/,
false /*is_leaf*/);
grad_tensors.emplace_back(std::move(grad_tensor));
paddle::experimental::Tensor leaf_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
true /*is_leaf*/);
{
// Create Scale Node
......@@ -172,15 +187,22 @@ TEST(Grad, LinearNodes) {
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor
paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor));
paddle::experimental::Tensor leaf_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
true /*is_leaf*/);
{
// Create Node0
auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
......@@ -247,25 +269,39 @@ TEST(Grad, WithAccumulation) {
// Create Target Tensor
std::vector<paddle::experimental::Tensor> target_tensors;
paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor0 =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
paddle::experimental::Tensor tensor1 =
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
1.0 /*value*/,
false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor0));
target_tensors.emplace_back(std::move(tensor1));
// Create Grad Tensor
std::vector<paddle::experimental::Tensor> grad_tensors;
paddle::experimental::Tensor grad_tensor0 =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
5.0 /*value*/,
false /*is_leaf*/);
paddle::experimental::Tensor grad_tensor1 =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
egr_utils_api::CreateTensorWithValue(ddim,
paddle::platform::CPUPlace(),
phi::DataType::FLOAT32,
phi::DataLayout::NCHW,
10.0 /*value*/,
false /*is_leaf*/);
grad_tensors.emplace_back(std::move(grad_tensor0));
grad_tensors.emplace_back(std::move(grad_tensor1));
......
......@@ -21,7 +21,7 @@
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/jit/layer.h"
......
......@@ -20,9 +20,9 @@ limitations under the License. */
#include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/autotune/auto_tune_base.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/copy_kernel.h"
namespace paddle {
namespace operators {
......@@ -41,7 +41,9 @@ struct GreaterThan {
// Value can be decided at compile time.
template <typename FUN, int INT_32 = 32>
constexpr bool CheckProperTileSize(int tile_long, int tile_short, int size_T,
constexpr bool CheckProperTileSize(int tile_long,
int tile_short,
int size_T,
FUN op) {
return (size_T == 16 && ((tile_long == INT_32 && op(tile_short, 4)) ||
(tile_long == 2 * INT_32 && op(tile_short, 4)) ||
......@@ -79,7 +81,8 @@ constexpr bool CheckNonLongTileSize(int tile_long, int tile_short, int size_T) {
// Use SM to do data transfer, load a tile into SM then store out.
// All tile reads and writes are coalesced, which speeds up the memory copy.
template <typename T, int NumThreads, int TileX, int TileY>
__global__ void TilingSwapDim1And2(const T* __restrict__ input, Dim3 input_dims,
__global__ void TilingSwapDim1And2(const T* __restrict__ input,
Dim3 input_dims,
T* __restrict__ output) {
assert(blockDim.x == NumThreads);
assert(blockDim.y == 1);
......@@ -218,12 +221,14 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, Dim3 input_dims,
template <int TSIZE>
bool SelectProperTileSize(std::vector<std::pair<int, int>>* tiles) {
PADDLE_ENFORCE_LE(
TSIZE, 16,
TSIZE,
16,
platform::errors::InvalidArgument(
"The tile size should smaller than 16, but received is:%d.", TSIZE));
PADDLE_ENFORCE_EQ(
(TSIZE & (TSIZE - 1)), 0,
(TSIZE & (TSIZE - 1)),
0,
platform::errors::InvalidArgument(
"Data types should be powers of 2, but reived size is:%d.", TSIZE));
......@@ -269,29 +274,37 @@ struct SystemElemType<16> {
};
template <typename T, int tile_long, int tile_short>
void LaunchNarrowDims2TransposeKernel(const phi::GPUContext& d, int tile_size_i,
int tile_size_j, int total_tiles_count,
const T* input, const Dim3& input_dims,
void LaunchNarrowDims2TransposeKernel(const phi::GPUContext& d,
int tile_size_i,
int tile_size_j,
int total_tiles_count,
const T* input,
const Dim3& input_dims,
T* output) {
constexpr int NumThreads = tile_long;
if (tile_size_i <= tile_long && tile_size_j <= tile_short) {
TilingSwapDim1And2<T, NumThreads, tile_long, tile_short>
<<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
output);
<<<total_tiles_count, NumThreads, 0, d.stream()>>>(
input, input_dims, output);
} else {
TilingSwapDim1And2<T, NumThreads, tile_short, tile_long>
<<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
output);
<<<total_tiles_count, NumThreads, 0, d.stream()>>>(
input, input_dims, output);
}
}
template <typename T, int tile_long, int tile_short, typename dummy = void>
struct NarrowDims2TransposeDispatch {
static void DoTranspose(const phi::GPUContext& d, int tile_size_i,
int tile_size_j, int total_tiles_count,
const T* input, const Dim3& input_dims, T* output) {
static void DoTranspose(const phi::GPUContext& d,
int tile_size_i,
int tile_size_j,
int total_tiles_count,
const T* input,
const Dim3& input_dims,
T* output) {
PADDLE_ENFORCE_EQ(
(tile_long & (tile_long - 1)), 0,
(tile_long & (tile_long - 1)),
0,
platform::errors::InvalidArgument(
"The length of the longer side of the tile should be power of 2."
" But received value is:%d.",
......@@ -302,7 +315,12 @@ struct NarrowDims2TransposeDispatch {
if (request_satisfied) {
LaunchNarrowDims2TransposeKernel<T, tile_long, tile_short>(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
d,
tile_size_i,
tile_size_j,
total_tiles_count,
input,
input_dims,
output);
return;
}
......@@ -312,11 +330,21 @@ struct NarrowDims2TransposeDispatch {
if (long_side_request_not_satisfied) {
NarrowDims2TransposeDispatch<T, tile_long * 2, tile_short>::DoTranspose(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
d,
tile_size_i,
tile_size_j,
total_tiles_count,
input,
input_dims,
output);
} else {
NarrowDims2TransposeDispatch<T, tile_long, tile_short + 1>::DoTranspose(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
d,
tile_size_i,
tile_size_j,
total_tiles_count,
input,
input_dims,
output);
}
}
......@@ -325,14 +353,22 @@ struct NarrowDims2TransposeDispatch {
// If Not long tile size, goto this function when compile.
template <typename T, int tile_long, int tile_short>
struct NarrowDims2TransposeDispatch<
T, tile_long, tile_short,
typename std::enable_if<
CheckNonLongTileSize(tile_long, tile_short, sizeof(T)), void>::type> {
static void DoTranspose(const phi::GPUContext& d, int tile_size_i,
int tile_size_j, int total_tiles_count,
const T* input, const Dim3& input_dims, T* output) {
T,
tile_long,
tile_short,
typename std::enable_if<CheckNonLongTileSize(
tile_long, tile_short, sizeof(T)),
void>::type> {
static void DoTranspose(const phi::GPUContext& d,
int tile_size_i,
int tile_size_j,
int total_tiles_count,
const T* input,
const Dim3& input_dims,
T* output) {
PADDLE_ENFORCE_EQ(
(tile_long & (tile_long - 1)), 0,
(tile_long & (tile_long - 1)),
0,
platform::errors::InvalidArgument(
"The length of the longer side of the tile should be power of 2."
" But received value is:%d.",
......@@ -343,13 +379,23 @@ struct NarrowDims2TransposeDispatch<
if (request_satisfied) {
LaunchNarrowDims2TransposeKernel<T, tile_long, tile_short>(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
d,
tile_size_i,
tile_size_j,
total_tiles_count,
input,
input_dims,
output);
return;
}
NarrowDims2TransposeDispatch<T, tile_long, tile_short + 1>::DoTranspose(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
d,
tile_size_i,
tile_size_j,
total_tiles_count,
input,
input_dims,
output);
}
};
......@@ -357,34 +403,49 @@ struct NarrowDims2TransposeDispatch<
// If long tile size, goto this function when compile.
template <typename T, int tile_long, int tile_short>
struct NarrowDims2TransposeDispatch<
T, tile_long, tile_short,
T,
tile_long,
tile_short,
typename std::enable_if<CheckLongTileSize(tile_long, tile_short, sizeof(T)),
void>::type> {
static void DoTranspose(const phi::GPUContext& d, int tile_size_i,
int tile_size_j, int total_tiles_count,
const T* input, const Dim3& input_dims, T* output) {
static void DoTranspose(const phi::GPUContext& d,
int tile_size_i,
int tile_size_j,
int total_tiles_count,
const T* input,
const Dim3& input_dims,
T* output) {
PADDLE_ENFORCE_EQ(
(tile_long & (tile_long - 1)), 0,
(tile_long & (tile_long - 1)),
0,
platform::errors::InvalidArgument(
"The length of the longer side of the tile should be power of 2,"
" but received is:%d.",
tile_long));
LaunchNarrowDims2TransposeKernel<T, tile_long, tile_short>(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
d,
tile_size_i,
tile_size_j,
total_tiles_count,
input,
input_dims,
output);
}
};
template <typename T, bool conjugate = false>
void SwapDim1And2InNarrow(const phi::GPUContext& d, const T* input,
const Dim3& input_dims, T* output,
void SwapDim1And2InNarrow(const phi::GPUContext& d,
const T* input,
const Dim3& input_dims,
T* output,
const int kMinTileSize) {
// First get available tile sizes for the data type requested as backups
std::vector<std::pair<int, int>> tile_sele;
auto ret = SelectProperTileSize<sizeof(T)>(&tile_sele);
PADDLE_ENFORCE_EQ(
ret, true,
ret,
true,
platform::errors::InvalidArgument(
"SelectProperTileSize should return true, but return value is:%d.",
ret));
......@@ -451,16 +512,22 @@ void SwapDim1And2InNarrow(const phi::GPUContext& d, const T* input,
using ElemType = typename SystemElemType<sizeof(T)>::type;
NarrowDims2TransposeDispatch<ElemType, 32, 2>::DoTranspose(
d, select_tile_size_i, select_tile_size_j, total_tiles_count,
reinterpret_cast<const ElemType*>(input), input_dims,
d,
select_tile_size_i,
select_tile_size_j,
total_tiles_count,
reinterpret_cast<const ElemType*>(input),
input_dims,
reinterpret_cast<ElemType*>(output));
}
// This is for the case that cannot do coalesced reads and writes,
// or the input is too small to split into tiles.
template <typename T, int pos0, int pos1, int pos2>
__global__ void TransposeSimpleKernel(int nthreads, const T* __restrict__ input,
Dim3 input_dims, T* __restrict__ output) {
__global__ void TransposeSimpleKernel(int nthreads,
const T* __restrict__ input,
Dim3 input_dims,
T* __restrict__ output) {
Dim3 output_dims;
output_dims[pos0] = input_dims[0];
output_dims[pos1] = input_dims[1];
......@@ -482,8 +549,10 @@ __global__ void TransposeSimpleKernel(int nthreads, const T* __restrict__ input,
// Here we assume all tensors have been converted to dim3, so we just swap dim1 and dim2.
template <typename T>
void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input,
const Dim3& input_dims, T* output) {
void SendSwapDim1And2InTranspose(const phi::GPUContext& d,
const T* input,
const Dim3& input_dims,
T* output) {
// Suppose tile size > 16
static const int kMinTileSize = 16;
static const int kMinNarrowTileSize = 96;
......@@ -508,8 +577,8 @@ void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input,
input_dims_aligned[0] * input_dims_aligned[1] * input_dims_aligned[2];
TilingSwapDim1And2<T, kNumThreads, kTileSize, kTileSize>
<<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
output);
<<<total_tiles_count, kNumThreads, 0, d.stream()>>>(
input, input_dims, output);
} else if (narrow_tile) {
// If the input shape is rectangular, e.g. 2x100, use the narrow tile size.
......@@ -529,8 +598,10 @@ void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input,
template <typename T>
struct SwapDim1And2InTranspose {
typedef phi::GPUContext Device;
void operator()(const Device& d, const T* in,
const std::vector<int>& combined_dims, T* out) {
void operator()(const Device& d,
const T* in,
const std::vector<int>& combined_dims,
T* out) {
Dim3 input_dims = {static_cast<int>(combined_dims[0]),
static_cast<int>(combined_dims[1]),
static_cast<int>(combined_dims[2])};
......@@ -541,8 +612,10 @@ struct SwapDim1And2InTranspose {
template <typename T>
struct SwapDim0And2InTranspose {
typedef phi::GPUContext Device;
void operator()(const Device& d, const T* in,
const std::vector<int>& combined_dims, T* out) {
void operator()(const Device& d,
const T* in,
const std::vector<int>& combined_dims,
T* out) {
Dim3 input_dims = {static_cast<int>(combined_dims[0]),
static_cast<int>(combined_dims[1]),
static_cast<int>(combined_dims[2])};
......@@ -562,11 +635,13 @@ inline void CombineTransposeDim3(const framework::DDim& shape,
const std::vector<int>& perm,
std::vector<int>* new_perm,
framework::DDim* new_dims) {
PADDLE_ENFORCE_EQ(shape.size(), perm.size(),
PADDLE_ENFORCE_EQ(shape.size(),
perm.size(),
platform::errors::InvalidArgument(
" shape should have the save dim with perm, but"
" received shape size is:%d, perm size is:%d.",
shape.size(), perm.size()));
shape.size(),
perm.size()));
std::vector<int> dim_vec;
if (shape.size() == 1) {
......@@ -614,8 +689,10 @@ inline void CombineTransposeDim3(const framework::DDim& shape,
template <typename T>
struct TransposeSimple {
static bool run(const phi::GPUContext& ctx, const Tensor& in,
const std::vector<int32_t> perm, Tensor* out) {
static bool run(const phi::GPUContext& ctx,
const Tensor& in,
const std::vector<int32_t> perm,
Tensor* out) {
// First reduce the dimensions of the input tensor if possible.
std::vector<int> new_perm;
framework::DDim new_dims;
......@@ -805,7 +882,8 @@ __global__ void VectorizedPermuteKernel(PermuteParams<Rank, IndexT> params,
// A general kernel for normal case, only support vectorized write.
template <typename T, typename IndexT, int VecSize, int Rank>
__global__ void GeneralPermuteKernel(PermuteParams<Rank, IndexT> params,
const T* __restrict__ src, T* dst,
const T* __restrict__ src,
T* dst,
const size_t main_cnt,
const size_t tail_cnt,
const size_t offset) {
......@@ -859,10 +937,12 @@ __global__ void GeneralPermuteKernel(PermuteParams<Rank, IndexT> params,
// A general permute method that directly finds the dst data
// coordinate in the source data.
template <typename T, typename IndexT, int VecSize, int Rank>
inline void LaunchPermuteKernel(const phi::GPUContext& ctx, const IndexT count,
inline void LaunchPermuteKernel(const phi::GPUContext& ctx,
const IndexT count,
const PermuteType perm_type,
const std::vector<size_t>& dims,
const std::vector<int>& perm, const T* src,
const std::vector<int>& perm,
const T* src,
T* dst) {
size_t main_count = count / VecSize;
auto params = PermuteParams<Rank, IndexT>(dims, perm);
......@@ -871,14 +951,12 @@ inline void LaunchPermuteKernel(const phi::GPUContext& ctx, const IndexT count,
if (perm_type == PermuteType::kNormalPermute) {
size_t tail_count = count - main_count * VecSize;
size_t offset = count - tail_count;
GeneralPermuteKernel<
T, IndexT, VecSize,
Rank><<<config.GetGridSize(), config.GetBlockSize(), 0, ctx.stream()>>>(
GeneralPermuteKernel<T, IndexT, VecSize, Rank>
<<<config.GetGridSize(), config.GetBlockSize(), 0, ctx.stream()>>>(
params, src, dst, main_count, tail_count, offset);
} else {
VectorizedPermuteKernel<
T, IndexT, VecSize,
Rank><<<config.GetGridSize(), config.GetBlockSize(), 0, ctx.stream()>>>(
VectorizedPermuteKernel<T, IndexT, VecSize, Rank>
<<<config.GetGridSize(), config.GetBlockSize(), 0, ctx.stream()>>>(
params, main_count, src, dst);
}
}
......@@ -889,11 +967,12 @@ inline void LaunchPermuteRankDispatch(const phi::GPUContext& ctx,
const PermuteType perm_type,
const std::vector<size_t>& dims,
const std::vector<int>& perm,
const T* src, T* dst) {
const T* src,
T* dst) {
#define CALL_DISPATCH_RANK(rank) \
case rank: { \
LaunchPermuteKernel<T, IndexT, VecSize, rank>(ctx, count, perm_type, dims, \
perm, src, dst); \
LaunchPermuteKernel<T, IndexT, VecSize, rank>( \
ctx, count, perm_type, dims, perm, src, dst); \
break; \
}
......@@ -915,7 +994,9 @@ inline void LaunchPermuteRankDispatch(const phi::GPUContext& ctx,
// https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/
template <typename T, typename IndexT, int VecSize>
__global__ void BatchTransposeKernel(const T* __restrict__ src_data,
T* dst_data, IndexT rows, IndexT cols) {
T* dst_data,
IndexT rows,
IndexT cols) {
using VecT = phi::AlignedVector<T, VecSize>;
__shared__ VecT tile[kTileSize][kShareCol];
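// Presumably kShareCol pads each tile row by one extra column so that the
// column-wise accesses of the transposed write do not all land in the same
// shared-memory bank (the trick from the blog post linked above).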
......@@ -961,10 +1042,13 @@ __global__ void BatchTransposeKernel(const T* __restrict__ src_data,
// Given the byte limit of shared memory, VecSize must be restricted
// for types whose byte size is less than 8.
template <typename T, typename IndexT, int Size,
template <typename T,
typename IndexT,
int Size,
int VecSize = (sizeof(T) > 8 ? 1 : Size)>
inline void LaunchTransposeKernel(const phi::GPUContext& ctx,
const std::vector<size_t>& dims, const T* src,
const std::vector<size_t>& dims,
const T* src,
T* dst) {
auto rank = dims.size();
IndexT num_batches = (rank == 2) ? 1 : dims[0];
......@@ -976,9 +1060,8 @@ inline void LaunchTransposeKernel(const phi::GPUContext& ctx,
dim3 blocks(num_tile_cols, num_tile_rows, num_batches);
dim3 threads(kTileSize, kBlockRows, 1);
BatchTransposeKernel<T, IndexT,
VecSize><<<blocks, threads, 0, ctx.stream()>>>(
src, dst, rows, cols);
BatchTransposeKernel<T, IndexT, VecSize>
<<<blocks, threads, 0, ctx.stream()>>>(src, dst, rows, cols);
}
template <typename T, typename IndexT>
......@@ -987,14 +1070,16 @@ inline void LaunchWithDispatchVecSize(const phi::GPUContext& ctx,
const PermuteType perm_type,
const std::vector<size_t>& dims,
const std::vector<int>& perm,
const T* src, T* dst, IndexT count) {
const T* src,
T* dst,
IndexT count) {
#define CALL_DISPATCH_VEC_SIZE(vec_size) \
case vec_size: { \
if (perm_type == PermuteType::kTranspose) { \
LaunchTransposeKernel<T, IndexT, vec_size>(ctx, dims, src, dst); \
} else { \
LaunchPermuteRankDispatch<T, IndexT, vec_size>(ctx, count, perm_type, \
dims, perm, src, dst); \
LaunchPermuteRankDispatch<T, IndexT, vec_size>( \
ctx, count, perm_type, dims, perm, src, dst); \
} \
break; \
}
......@@ -1014,45 +1099,64 @@ inline void LaunchWithDispatchVecSize(const phi::GPUContext& ctx,
template <typename T>
inline void LaunchWithDispatchIndex(const phi::GPUContext& ctx,
const size_t count, const int vec_size,
const size_t count,
const int vec_size,
const PermuteType perm_type,
const std::vector<size_t>& dims,
const std::vector<int>& perm, const T* src,
const std::vector<int>& perm,
const T* src,
T* dst) {
if (count < std::numeric_limits<uint32_t>::max()) {
LaunchWithDispatchVecSize<T, uint32_t>(ctx, vec_size, perm_type, dims, perm,
src, dst,
LaunchWithDispatchVecSize<T, uint32_t>(ctx,
vec_size,
perm_type,
dims,
perm,
src,
dst,
static_cast<uint32_t>(count));
} else {
int64_t cnt = static_cast<int64_t>(count);
LaunchWithDispatchVecSize<T, int64_t>(ctx, vec_size, perm_type, dims, perm,
src, dst,
LaunchWithDispatchVecSize<T, int64_t>(ctx,
vec_size,
perm_type,
dims,
perm,
src,
dst,
static_cast<int64_t>(count));
}
}
template <typename DeviceContext, typename T>
inline void SimplifyThenLaunch(const int rank, const DeviceContext& ctx,
const Tensor& in, Tensor* out,
inline void SimplifyThenLaunch(const int rank,
const DeviceContext& ctx,
const Tensor& in,
Tensor* out,
const std::vector<int32_t>& perm) {
int sm_count = ctx.GetSMCount();
auto src_dims = phi::vectorize<size_t>(in.dims());
auto simplifier = DimsSimplifier<T>(sm_count, rank, perm, src_dims,
in.data<T>(), out->data<T>());
auto simplifier = DimsSimplifier<T>(
sm_count, rank, perm, src_dims, in.data<T>(), out->data<T>());
if (simplifier.GetPermType() == PermuteType::kCopy) {
// If perm is [0,1,2,3], then just perform a DtoD copy.
phi::Copy(ctx, in, ctx.GetPlace(), false, out);
} else {
LaunchWithDispatchIndex<T>(
ctx, simplifier.GetCount(), simplifier.GetVecSize(),
simplifier.GetPermType(), simplifier.GetDims(), simplifier.GetPerm(),
in.data<T>(), out->data<T>());
LaunchWithDispatchIndex<T>(ctx,
simplifier.GetCount(),
simplifier.GetVecSize(),
simplifier.GetPermType(),
simplifier.GetDims(),
simplifier.GetPerm(),
in.data<T>(),
out->data<T>());
}
}
template <typename T>
size_t GetTransposeKey(const int rank, const Tensor& in,
size_t GetTransposeKey(const int rank,
const Tensor& in,
const std::vector<int32_t>& perm) {
auto in_shape = phi::vectorize(in.dims());
return phi::autotune::GetKey(
......@@ -1060,15 +1164,19 @@ size_t GetTransposeKey(const int rank, const Tensor& in,
}
template <typename T>
void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, const int rank,
void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx,
const int rank,
const Tensor& in,
const std::vector<int32_t>& perm, Tensor* out) {
const std::vector<int32_t>& perm,
Tensor* out) {
PADDLE_ENFORCE_LT(
rank, phi::DDim::kMaxRank,
rank,
phi::DDim::kMaxRank,
platform::errors::OutOfRange(
"The maximum dimension rank of "
"tensor is expected to be less than %d, but here is %d.",
phi::DDim::kMaxRank, rank));
phi::DDim::kMaxRank,
rank));
auto ret = TransposeSimple<T>::run(dev_ctx, in, perm, out);
if (!ret) {
......
......@@ -19,8 +19,8 @@ limitations under the License. */
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/cast_kernel.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/transfer_layout_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
......@@ -31,10 +32,7 @@ void copy(const Tensor& src, const Place& place, bool blocking, Tensor* dst) {
kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place));
auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
VLOG(6) << "copy API kernel key: " << kernel_key;
auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"copy", kernel_key);
VLOG(6) << "copy API kernel: " << kernel;
VLOG(6) << "start copy. ";
auto target_place = phi::TransToPhiPlace(kernel_key.backend());
auto& pool = paddle::experimental::DeviceContextPool::Instance();
......@@ -47,14 +45,9 @@ void copy(const Tensor& src, const Place& place, bool blocking, Tensor* dst) {
phi::MetaTensor meta_out(kernel_out);
phi::UnchangedInferMeta(*dense_x, &meta_out);
using kernel_signature = void (*)(const platform::DeviceContext&,
const phi::DenseTensor&,
phi::Place,
bool,
phi::DenseTensor*);
phi::Copy(*dev_ctx, *dense_x, place, blocking, kernel_out);
auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out);
VLOG(6) << "copy finished. ";
}
} // namespace experimental
......
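The copy(...) hunk above drops the manual kernel selection and the variadic kernel-function call in favor of a direct call into phi. A condensed sketch of the remaining flow, assuming DeviceContextPool hands back a context for the chosen place as the diff suggests (output setup and logging omitted):

void CopySketch(const phi::DenseTensor& src, const phi::Place& place,
                bool blocking, phi::DenseTensor* dst) {
  auto& pool = paddle::experimental::DeviceContextPool::Instance();
  auto* dev_ctx = pool.GetMutable(place);          // context for the target place
  phi::Copy(*dev_ctx, src, place, blocking, dst);  // direct utility call
}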
......@@ -19,9 +19,11 @@ limitations under the License. */
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/tensor_base.h"
#include "paddle/phi/api/include/context_pool.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/infermeta/unary.h"
// clang-format off
......@@ -113,9 +115,15 @@ void Tensor::copy_(const Tensor &src,
// Deep Copy AutoGrad info from src to self.
*autograd_meta_ = *(src.autograd_meta_);
}
kernel_key_set.backend_set =
kernel_key_set.backend_set |
BackendSet(phi::TransToPhiBackend(target_place));
auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
auto *dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
auto place = phi::TransToPhiPlace(kernel_key.backend());
auto& pool = paddle::experimental::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetMutable(
place.GetType() == target_place.GetType() ? target_place : place);
Backend kernel_backend = Backend::UNDEFINED;
DataLayout kernel_layout = DataLayout::UNDEFINED;
DataType kernel_data_type = DataType::UNDEFINED;
......@@ -135,45 +143,25 @@ void Tensor::copy_(const Tensor &src,
}
if (kernel_type == KernelType::DENSE_TENSOR_KENREL) {
auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"copy", {kernel_backend, kernel_layout, kernel_data_type});
VLOG(6) << "copy API kernel key: " << kernel_key;
VLOG(6) << "copy API kernel: " << kernel;
using kernel_signature = void (*)(const platform::DeviceContext &,
const phi::DenseTensor &,
phi::Place,
bool,
phi::DenseTensor *);
SetKernelOutput(kernel_backend, this);
phi::MetaTensor meta_out(impl_.get());
phi::UnchangedInferMeta(
MakeMetaTensor(
*(std::static_pointer_cast<phi::DenseTensor>(src.impl_))),
&meta_out);
auto *kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(*dev_ctx,
phi::Copy(*dev_ctx,
(*(std::static_pointer_cast<phi::DenseTensor>(src.impl_))),
target_place,
blocking,
static_cast<phi::DenseTensor *>(impl_.get()));
} else if (kernel_type == KernelType::SELECTED_ROWS_KENREL) {
auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"copy_sr", {kernel_backend, kernel_layout, kernel_data_type});
VLOG(6) << "copy API kernel key: " << kernel_key;
VLOG(6) << "copy API kernel: " << kernel;
using kernel_signature = void (*)(const platform::DeviceContext &,
const phi::SelectedRows &,
phi::Place,
bool,
phi::SelectedRows *);
SetSelectedRowsKernelOutput(kernel_backend, this);
phi::MetaTensor meta_out(impl_.get());
phi::UnchangedInferMeta(
MakeMetaTensor(
*(std::static_pointer_cast<phi::SelectedRows>(src.impl_))),
&meta_out);
auto *kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(*dev_ctx,
phi::Copy(*dev_ctx,
(*(std::static_pointer_cast<phi::SelectedRows>(src.impl_))),
target_place,
blocking,
......
......@@ -81,3 +81,43 @@ if(WITH_MKLDNN)
add_dependencies(dense_tensor mkldnn)
add_dependencies(tensor_base mkldnn)
endif()
if(WITH_GPU)
nv_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS cpu_context
gpu_context
dense_tensor
selected_rows
malloc
memcpy
device_context)
elseif(WITH_ROCM)
hip_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS cpu_context
gpu_context
dense_tensor
selected_rows
malloc
memcpy
device_context)
elseif(WITH_XPU_KP)
xpu_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS cpu_context
xpu_context
dense_tensor
selected_rows
malloc
memcpy
device_context)
else()
cc_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS cpu_context dense_tensor selected_rows malloc memcpy device_context)
endif()
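With the library in place, any target that needs phi::Copy simply links phi_tensor_utils; the COMMON_KERNEL_DEPS hunk further below does exactly that for all kernels. A hedged sketch for a standalone consumer (target and source names are hypothetical):

cc_library(
  my_copy_demo
  SRCS my_copy_demo.cc
  DEPS phi_tensor_utils)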
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/data_type.h"
......@@ -43,27 +43,46 @@ void Copy(const Context& dev_ctx,
void* dst_ptr = nullptr;
if (paddle::platform::is_cpu_place(dst_place)) {
dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
} else if (paddle::platform::is_cuda_pinned_place(dst_place)) {
// now we only can use mutable_data to Alloc pinned memory here,
// dev_ctx can not alloc pinned memory now
dst_ptr = dst->mutable_data(dst_place, src.dtype());
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} else if (paddle::platform::is_gpu_place(dst_place) ||
paddle::platform::is_cuda_pinned_place(dst_place)) {
dst_ptr = dev_ctx.Alloc(
dst, src.dtype(), 0, paddle::platform::is_cuda_pinned_place(dst_place));
#endif
#ifdef PADDLE_WITH_XPU
} else if (paddle::platform::is_xpu_place(dst_place)) {
dst_ptr = dev_ctx.Alloc(dst, src.dtype());
#endif
}
auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
if (UNLIKELY(size == 0)) {
return;
}
PADDLE_ENFORCE_EQ(
dst->place(),
dst_place,
phi::errors::Unavailable(
"The Dst Tensor's place and dst_place do not match, Tensor's place "
"place is %s, dst_place is %s.",
dst->place(),
dst_place));
if (src_ptr == dst_ptr && src_place == dst_place) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
CHECK(dst->layout() == src.layout());
auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
if ((paddle::platform::is_cpu_place(src_place) ||
if (paddle::platform::is_cpu_place(src_place) &&
paddle::platform::is_cpu_place(dst_place)) {
paddle::memory::Copy(src_place, dst_ptr, src_place, src_ptr, size);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} else if ((paddle::platform::is_cpu_place(src_place) ||
paddle::platform::is_cuda_pinned_place(src_place)) && // NOLINT
(paddle::platform::is_cpu_place(dst_place) ||
paddle::platform::is_cuda_pinned_place(dst_place))) {
......@@ -176,13 +195,87 @@ void Copy(const Context& dev_ctx,
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
#endif
}
#ifdef PADDLE_WITH_XPU
else if (paddle::platform::is_xpu_place(src_place) && // NOLINT
paddle::platform::is_cpu_place(dst_place)) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (paddle::platform::is_cpu_place(src_place) &&
paddle::platform::is_xpu_place(dst_place)) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (paddle::platform::is_xpu_place(src_place) &&
paddle::platform::is_xpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else {
PADDLE_THROW(phi::errors::InvalidArgument(
"Place type error. Please check the place of src and dst Tensor."));
PADDLE_THROW(phi::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
}
} // namespace phi
template <typename Context>
void Copy(const Context& dev_ctx,
const SelectedRows& src,
Place dst_place,
bool blocking,
SelectedRows* dst) {
if (src.value().Holder() != dst->value().Holder() ||
src.value().data() != dst->value().data()) {
dst->set_rows(src.rows());
dst->set_height(src.height());
}
Copy<Context>(
dev_ctx, src.value(), dst_place, blocking, dst->mutable_value());
}
template void Copy(const CPUContext& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst);
template void Copy(const DeviceContext& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst);
template void Copy(const CPUContext& dev_ctx,
const SelectedRows& src,
Place dst_place,
bool blocking,
SelectedRows* dst);
template void Copy(const DeviceContext& dev_ctx,
const SelectedRows& src,
Place dst_place,
bool blocking,
SelectedRows* dst);
PD_REGISTER_GENERAL_KERNEL(
copy, GPU, ALL_LAYOUT, phi::Copy<phi::GPUContext>, ALL_DTYPE) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template void Copy(const GPUContext& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst);
template void Copy(const GPUContext& dev_ctx,
const SelectedRows& src,
Place dst_place,
bool blocking,
SelectedRows* dst);
#endif
#ifdef PADDLE_WITH_XPU
template void Copy(const XPUContext& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst);
#endif
} // namespace phi
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/tensor_meta.h"
namespace phi {
......@@ -70,4 +71,18 @@ class DenseTensorUtils {
}
};
template <typename Context>
void Copy(const Context& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst);
template <typename Context>
void Copy(const Context& dev_ctx,
const SelectedRows& src,
Place dst_place,
bool blocking,
SelectedRows* dst);
} // namespace phi
......@@ -29,7 +29,8 @@ set(COMMON_KERNEL_DEPS
arg_map_context
convert_utils
lod_utils
custom_kernel)
custom_kernel
phi_tensor_utils)
set(COMMON_KERNEL_DEPS
${COMMON_KERNEL_DEPS}
eigen_function
......
......@@ -16,7 +16,7 @@
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/utils/optional.h"
namespace phi {
......
......@@ -20,8 +20,8 @@
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/autotune/auto_tune_base.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
namespace tune = phi::autotune;
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename Context>
void Copy(const Context& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst);
} // namespace phi
......@@ -20,7 +20,7 @@
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
DECLARE_int32(inner_op_parallelism);
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
namespace phi {
// NOTE(chenweihang): blocking is useless in cpu kernel
template <typename Context>
void Copy(const Context& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst) {
auto* src_ptr = src.data();
const auto& src_place = src.place();
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
dst->Resize(src.dims());
auto* dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
CHECK(dst->layout() == src.layout());
auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
if (paddle::platform::is_cpu_place(src_place)) {
paddle::memory::Copy(src_place, dst_ptr, src_place, src_ptr, size);
}
}
} // namespace phi
PD_REGISTER_GENERAL_KERNEL(
copy, CPU, ALL_LAYOUT, phi::Copy<phi::CPUContext>, ALL_DTYPE) {}
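Note that the moved function is still registered under the kernel name "copy", so string-based dispatch keeps working alongside the direct calls. A sketch of such a lookup, reusing the KernelFactory API shown earlier in this diff (FLOAT32 is an arbitrary example):

auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
    "copy",
    {phi::Backend::CPU, phi::DataLayout::ALL_LAYOUT, phi::DataType::FLOAT32});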
......@@ -16,8 +16,8 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/softmax_kernel.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/cpu/elementwise_grad.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/cpu/elementwise_grad.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/cpu/elementwise_grad.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
......
......@@ -15,7 +15,7 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -22,7 +22,7 @@
#endif
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/mode.h"
namespace phi {
......
......@@ -19,7 +19,7 @@
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -19,7 +19,7 @@
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -17,7 +17,7 @@
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/cpu/rnn_functor.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/cpu/rnn_functor.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/gather.h"
#include "paddle/phi/kernels/funcs/scatter.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/scatter.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/gather.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/scatter.h"
namespace phi {
......
......@@ -21,7 +21,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/compare_functors.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -16,8 +16,8 @@
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/infermeta/unary.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
namespace phi {
......
......@@ -35,7 +35,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -20,7 +20,7 @@
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -24,7 +24,7 @@
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
......
......@@ -24,7 +24,7 @@
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/range_function.h"
namespace phi {
......
......@@ -28,8 +28,8 @@ namespace cub = hipcub;
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -28,8 +28,8 @@ namespace cub = hipcub;
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -19,7 +19,7 @@
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/gpu/elementwise_grad.h"
#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/phi/common/place.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_grad_base.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
......
......@@ -19,7 +19,7 @@
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/gpu/elementwise_grad.h"
#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
......
......@@ -17,7 +17,7 @@
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/elementwise_grad_kernel.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/gpu/elementwise_grad.h"
......
......@@ -17,7 +17,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/norm_utils.h"
#include "paddle/phi/kernels/gpu/instance_norm_utils.h"
......
......@@ -17,7 +17,7 @@
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/data_type_transform.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -19,7 +19,7 @@
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/psroi_pool_kernel.h"
......
......@@ -20,7 +20,7 @@
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -19,7 +19,7 @@
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -19,7 +19,7 @@
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/gather.cu.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h"
namespace phi {
......
......@@ -22,7 +22,7 @@
#include "paddle/phi/backends/gpu/gpu_helper.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/gpu/reduce.h"
......
......@@ -17,7 +17,7 @@
#include "paddle/fluid/operators/top_k_function_cuda.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -28,7 +28,7 @@
#include "paddle/fluid/framework/tensor_util.h" // TensorToVector()
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/unique_functor.h"
namespace phi {
......
......@@ -33,7 +33,7 @@ namespace cub = hipcub;
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/compare_functors.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
......
......@@ -17,7 +17,6 @@
#include "paddle/phi/kernels/cholesky_solve_grad_kernel.h"
#include "paddle/phi/kernels/cholesky_solve_kernel.h"
#include "paddle/phi/kernels/complex_kernel.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/elementwise_add_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/expand_kernel.h"
......
......@@ -16,7 +16,6 @@
#include "paddle/phi/kernels/cholesky_solve_kernel.h"
#include "paddle/phi/kernels/complex_kernel.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/expand_kernel.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/determinant_grad_kernel.h"
#include "paddle/phi/kernels/elementwise_multiply_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/impl/expand_as_kernel_impl.h"
namespace phi {
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/impl/expand_kernel_impl.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/meshgrid_kernel.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
......
......@@ -17,7 +17,7 @@
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
template <typename T, typename Context>
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include <type_traits>
#include <vector>
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/tile_grad_kernel.h"
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
template <typename T, typename Context>
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
namespace phi {
......
......@@ -20,7 +20,7 @@
#include "paddle/fluid/operators/math/sequence_scale.h"
#include "paddle/phi/backends/dynload/warpctc.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/utils/optional.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
......@@ -16,8 +16,8 @@
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/infermeta/unary.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
namespace phi {
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/selected_rows/copy_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
namespace phi {
namespace sr {

template <typename Context>
void Copy(const Context& dev_ctx,
          const SelectedRows& src,
          Place dst_place,
          bool blocking,
          SelectedRows* dst) {
  // When src and dst do not already share the same underlying storage,
  // mirror the SelectedRows metadata (rows and height) onto dst.
  if (src.value().Holder() != dst->value().Holder() ||
      src.value().data() != dst->value().data()) {
    dst->set_rows(src.rows());
    dst->set_height(src.height());
  }
  // Delegate the actual data movement of the value tensor to the shared
  // phi::Copy utility.
  phi::Copy<Context>(
      dev_ctx, src.value(), dst_place, blocking, dst->mutable_value());
}

}  // namespace sr
}  // namespace phi
PD_REGISTER_GENERAL_KERNEL(
copy_sr, CPU, ALL_LAYOUT, phi::sr::Copy<phi::CPUContext>, ALL_DTYPE) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_GENERAL_KERNEL(
copy_sr, GPU, ALL_LAYOUT, phi::sr::Copy<phi::GPUContext>, ALL_DTYPE) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
namespace phi {
namespace sr {

template <typename Context>
void Copy(const Context& dev_ctx,
          const SelectedRows& src,
          Place dst_place,
          bool blocking,
          SelectedRows* dst);

}  // namespace sr
}  // namespace phi
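For illustration, a minimal usage sketch of the SelectedRows copy declared above; the device context, tensor names, and places below are assumptions for the example, not part of this commit:

// Hypothetical caller: copy a SelectedRows to CPU memory.
// Assumes `dev_ctx` is a live phi::CPUContext or phi::GPUContext and
// `src` is an initialized phi::SelectedRows.
phi::SelectedRows dst;
phi::sr::Copy(dev_ctx, src, phi::CPUPlace(), /*blocking=*/true, &dst);
// dst now carries src's rows/height metadata and a copy of its value tensor.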
......@@ -19,7 +19,7 @@
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
namespace phi {
......
......@@ -20,7 +20,7 @@
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
......
......@@ -24,7 +24,7 @@
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
namespace sparse {
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h"
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
......
......@@ -18,8 +18,8 @@ limitations under the License. */
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h"
......
......@@ -15,8 +15,8 @@ limitations under the License. */
#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h"
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
namespace sparse {
......
......@@ -23,7 +23,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/sparse/utils.cu.h"
......
......@@ -20,8 +20,8 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h"
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
namespace phi {
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/sparse/sparse_blas.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
......
......@@ -23,7 +23,7 @@ limitations under the License. */
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/sparse/sparse_blas.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
......
......@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h"
......
......@@ -18,8 +18,8 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/pooling.h"
......
......@@ -19,8 +19,8 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/activation_grad_kernel.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#define DEFINE_SPARSE_UNARY_GRAD_KERNEL(DenseKernelFunc) \
......
......@@ -19,8 +19,8 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/activation_kernel.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#define DEFINE_SPARSE_UNARY_KERNEL(DenseKernelFunc) \
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/pstring.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/strings/gpu/copy_utils.h"
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
namespace phi {

template <typename Context>
void Copy(const Context& dev_ctx,
          const DenseTensor& src,
          Place dst_place,
          bool blocking,
          DenseTensor* dst) {
  auto* src_ptr = src.data();
  void* dst_ptr = nullptr;
  dst->Resize(src.dims());
  // Allocate dst on the host or on the XPU device, depending on dst_place.
  if (paddle::platform::is_cpu_place(dst_place)) {
    dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
  } else {
    dst_ptr = dev_ctx.Alloc(dst, src.dtype());
  }

  const auto& src_place = src.place();
  // Nothing to do when src and dst already alias the same buffer and place.
  if (src_ptr == dst_ptr && src_place == dst_place) {
    VLOG(3) << "Skip copy the same data async from " << src_place << " to "
            << dst_place;
    return;
  }
  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
          << dst_place;

  CHECK(dst->layout() == src.layout());
  auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
  // Dispatch to the matching memcpy path: XPU->CPU, CPU->XPU, or XPU->XPU.
  if (paddle::platform::is_xpu_place(src_place) &&  // NOLINT
      paddle::platform::is_cpu_place(dst_place)) {
    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else if (paddle::platform::is_cpu_place(src_place) &&
             paddle::platform::is_xpu_place(dst_place)) {
    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else if (paddle::platform::is_xpu_place(src_place) &&
             paddle::platform::is_xpu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
              << dst_place;
      return;
    }
    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else {
    PADDLE_THROW(phi::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
}

}  // namespace phi
PD_REGISTER_GENERAL_KERNEL(
copy, XPU, ALL_LAYOUT, phi::Copy<phi::XPUContext>, ALL_DTYPE) {}
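As a usage sketch of the XPU path above (assuming a live phi::XPUContext named `xpu_ctx` and an XPU-resident DenseTensor `src`, neither of which appears in this diff):

// Hypothetical caller: bring an XPU tensor back to host memory.
phi::DenseTensor host_tensor;
phi::Copy(xpu_ctx, src, phi::CPUPlace(), /*blocking=*/true, &host_tensor);
// host_tensor is resized to src.dims() and filled via paddle::memory::Copy.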
......@@ -29,7 +29,6 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
#endif
namespace paddle {
......
......@@ -22,7 +22,6 @@ limitations under the License. */
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
namespace paddle {
namespace tests {
......
......@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/device_context.h"
......
......@@ -17,12 +17,6 @@
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
#endif
namespace paddle {
namespace tests {
......
......@@ -25,7 +25,6 @@ limitations under the License. */
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(scale_sr, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
namespace paddle {
namespace tests {
......
......@@ -21,11 +21,6 @@ limitations under the License. */
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
#endif
namespace paddle {
namespace tests {
......
......@@ -22,10 +22,8 @@ limitations under the License. */
#include "paddle/phi/kernels/full_kernel.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
#endif
namespace phi {
......
......@@ -25,8 +25,6 @@ limitations under the License. */
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
namespace phi {
namespace tests {
......
......@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
namespace tests {
......
......@@ -23,16 +23,6 @@ limitations under the License. */
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/flatten_kernel.h"
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
#endif
#ifdef PADDLE_WITH_XPU
PD_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT);
#endif
namespace phi {
namespace tests {
......
......@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
#include "paddle/phi/kernels/sparse/convolution_kernel.h"
......
......@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
......
......@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
namespace phi {
......