Unverified · Commit cf9d651b authored by HongyuJia, committed by GitHub

[phi] Transfer coalesce_tensor to phi (#45478)

* add coalesce_tensor kernel

* polish coalesce_tensor kernel

* add sig and InferMeta

* add testcase

* add legacy_api.yaml

* fix infermeta

* fix yaml

* fix kernel implementation

* add compile dependency of phi/kernels

* fix MetaConfig

* add python api

* add and fix testcase

* rnn.py add import

* change _C_ops.coalesce_tensor

* remove useless comments

* add SetBackend

* restore XPU kernel temporarily

* fix code according to PR comments
Parent f69d2c32
......@@ -28,6 +28,8 @@
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
......@@ -506,24 +508,16 @@ value.
} // namespace operators
} // namespace paddle
DECLARE_INFER_SHAPE_FUNCTOR(coalesce_tensor,
CoalesceTensorInferShapeFunctor,
PD_INFER_META(phi::CoalesceTensorInferMeta));
REGISTER_OPERATOR(coalesce_tensor,
paddle::operators::CoalesceTensorOp,
paddle::operators::CoalesceTensorOpMaker);
paddle::operators::CoalesceTensorOpMaker,
CoalesceTensorInferShapeFunctor);
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CPU_KERNEL(coalesce_tensor,
ops::CoalesceTensorOpKernel<phi::CPUContext, int>,
ops::CoalesceTensorOpKernel<phi::CPUContext, float>,
ops::CoalesceTensorOpKernel<phi::CPUContext, double>);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_OP_CUDA_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<phi::GPUContext, plat::float16>,
ops::CoalesceTensorOpKernel<phi::GPUContext, int>,
ops::CoalesceTensorOpKernel<phi::GPUContext, float>,
ops::CoalesceTensorOpKernel<phi::GPUContext, double>);
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
REGISTER_OP_CUDA_KERNEL(
......
......@@ -493,6 +493,15 @@
kernel :
func : clip_by_norm
- api : coalesce_tensor
args : (Tensor[] input, DataType dtype, bool copy_data = false, bool set_constant = false, bool persist_output = false, float constant = 0.0, bool use_align = true, int align_size = -1, int size_of_dtype = -1, int64_t[] concated_shapes = {}, int64_t[] concated_ranks = {})
output : Tensor[](output){input.size()}, Tensor(fused_output)
infer_meta :
func : CoalesceTensorInferMeta
kernel :
func : coalesce_tensor
data_type : dtype
- api : complex
args : (Tensor x, Tensor y)
output : Tensor
......
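For reference, a minimal eager-mode sketch of calling the API declared above, using the same positional argument order as the yaml entry (the tensors and values here are illustrative only, not part of this PR):

import paddle
from paddle.fluid import core

x = paddle.ones([4], dtype='float32')
y = paddle.ones([6], dtype='float32')
# Returns the list of per-tensor views plus the fused buffer.
outs, fused = paddle._C_ops.coalesce_tensor(
    [x, y],
    core.VarDesc.VarType.FP32,  # dtype
    True,   # copy_data
    False,  # set_constant
    False,  # persist_output
    0.0,    # constant
    True,   # use_align
    -1,     # align_size
    -1,     # size_of_dtype
    [],     # concated_shapes
    [])     # concated_ranks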
......@@ -785,6 +785,56 @@ void CheckFiniteAndUnscaleInferMeta(const std::vector<const MetaTensor*>& xs,
found_infinite->set_dtype(DataType::BOOL);
}
void CoalesceTensorInferMeta(const std::vector<const MetaTensor*>& input,
DataType dtype,
bool copy_data,
bool set_constant,
bool persist_output,
float constant,
bool use_align,
int align_size,
int size_of_dtype,
const std::vector<int64_t>& concated_shapes,
const std::vector<int64_t>& concated_ranks,
std::vector<MetaTensor*> output,
MetaTensor* fused_output,
MetaConfig config) {
if (config.is_runtime) {
return;
}
if (size_of_dtype == -1) {
size_of_dtype = paddle::experimental::SizeOf(dtype);
}
auto alignment = [](size_t size, size_t align_size) {
size_t remaining = size % align_size;
auto aligned_size = remaining == 0 ? size : size + (align_size - remaining);
VLOG(4) << remaining << " " << size << " " << align_size << " "
<< aligned_size;
return aligned_size;
};
VLOG(4) << "align_size: " << align_size;
if (use_align && align_size > 0) {
int64_t numel = 0;
for (size_t i = 0; i < input.size(); ++i) {
const auto& dim = input[i]->dims();
auto size = phi::product(dim);
auto len = use_align
? alignment(static_cast<size_t>(size) * size_of_dtype,
align_size) /
size_of_dtype
: static_cast<size_t>(size);
numel += len;
}
if (fused_output) {
fused_output->set_dims(phi::make_ddim({numel}));
fused_output->set_dtype(dtype);
VLOG(4) << "fused_output size:" << phi::make_ddim({numel});
}
}
}
void ConcatInferMeta(const std::vector<const MetaTensor*>& x,
const Scalar& axis_scalar,
MetaTensor* out,
......
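As a worked example of the alignment lambda above (assuming float32, so size_of_dtype = 4, with align_size = 256): a 100-element input occupies 400 bytes, 400 % 256 = 144, so the aligned size is 400 + (256 - 144) = 512 bytes, i.e. 128 elements reserved in the fused buffer. A small sketch of the same computation:

def aligned_numel(numel, size_of_dtype=4, align_size=256):
    # Mirrors the alignment lambda in CoalesceTensorInferMeta.
    size = numel * size_of_dtype
    remaining = size % align_size
    aligned = size if remaining == 0 else size + (align_size - remaining)
    return aligned // size_of_dtype

assert aligned_numel(100) == 128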
......@@ -201,6 +201,21 @@ void CheckFiniteAndUnscaleInferMeta(const std::vector<const MetaTensor*>& xs,
std::vector<MetaTensor*> outs,
MetaTensor* found_infinite);
void CoalesceTensorInferMeta(const std::vector<const MetaTensor*>& input,
DataType dtype,
bool copy_data,
bool set_constant,
bool persist_output,
float constant,
bool use_align,
int align_size,
int size_of_dtype,
const std::vector<int64_t>& concated_shapes,
const std::vector<int64_t>& concated_ranks,
std::vector<MetaTensor*> output,
MetaTensor* fused_output,
MetaConfig config = MetaConfig());
void ConcatInferMeta(const std::vector<const MetaTensor*>& x,
const Scalar& axis_scalar,
MetaTensor* out,
......
......@@ -83,7 +83,8 @@ set(COMMON_KERNEL_DEPS
custom_kernel
string_infermeta
gpc
utf8proc)
utf8proc
device_memory_aligment)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup)
if(WITH_NCCL OR WITH_RCCL)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/coalesce_tensor_kernel.h"
#include <sstream>
#include <vector>
#include "paddle/fluid/platform/device_memory_aligment.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename Context>
struct FillConstantVisitor {
FillConstantVisitor(const Context &dev_ctx,
DenseTensor *tensor,
const float value)
: dev_ctx_(dev_ctx), tensor_(tensor), value_(value) {}
template <typename T>
void apply(typename std::enable_if<std::is_same<T, int8_t>::value ||
std::is_same<T, int16_t>::value>::type * =
nullptr) const {
PADDLE_THROW(
errors::InvalidArgument("Not support data type for set_constant attr"));
}
template <typename T>
void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value ||
std::is_same<T, int16_t>::value)>::type
* = nullptr) const {
phi::funcs::SetConstant<Context, T> set_constant;
set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
}
const Context &dev_ctx_;
DenseTensor *tensor_;
float value_;
};
void GetMemSizeAndDtype(const std::vector<const DenseTensor *> &lod_tensors,
size_t *numel,
const size_t &size_of_dtype,
const phi::Place &place,
const bool use_align = true,
const int align_size = -1) {
*numel = 0;
std::stringstream ss;
ss << "alloc_space_for_vars: ";
for (size_t i = 0; i < lod_tensors.size(); ++i) {
auto size = lod_tensors[i]->numel();
PADDLE_ENFORCE_GT(size,
0,
errors::InvalidArgument(
"The number of `%d`-th tensor's elements is 0.", i));
auto len = use_align ? paddle::platform::Alignment(
static_cast<size_t>(size) * size_of_dtype,
place,
align_size) /
size_of_dtype
: static_cast<size_t>(size);
const void *ptr =
lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
VLOG(4) << size << " " << len;
ss << "input(" << i << "-th tensor) dim:(" << lod_tensors[i]->dims() << ") "
<< " addres:" << ptr << " len: " << len << ", ";
*numel += len;
}
VLOG(10) << ss.str();
}
template <typename T, typename Context>
void CoalesceTensorKernel(const Context &dev_ctx,
const std::vector<const DenseTensor *> &input,
DataType dtype,
bool copy_data,
bool set_constant,
bool persist_output,
float constant,
bool use_align,
int align_size,
int size_of_dtype,
const std::vector<int64_t> &concated_shapes,
const std::vector<int64_t> &concated_ranks,
std::vector<DenseTensor *> output,
DenseTensor *fused_output) {
PADDLE_ENFORCE_GT(
input.size(),
static_cast<size_t>(0),
errors::InvalidArgument("The CoalesceTensor operator has no input."));
PADDLE_ENFORCE_EQ(input.size(),
output.size(),
errors::InvalidArgument(
"The number of CoalesceTensor operator's input and "
"output is not match, "
"input number is %u, output number is %u.",
input.size(),
output.size()));
// Input & Output check: only support LoDTensor
bool has_not_init_in_vars = false;
for (size_t i = 0; i < input.size(); ++i) {
PADDLE_ENFORCE_NOT_NULL(
input[i],
errors::InvalidArgument("The %d-th input tensor cannot be nullptr.",
i));
PADDLE_ENFORCE_NOT_NULL(
output[i],
errors::InvalidArgument("The %d-th output tensor cannot be nullptr.",
i));
if (!input[i]->IsInitialized()) {
has_not_init_in_vars = true;
}
}
if (has_not_init_in_vars) {
PADDLE_ENFORCE_EQ(
concated_ranks.size(),
output.size(),
errors::InvalidArgument("The attribute(concated_ranks) length must be "
"equal to the output tensor number."));
int64_t accumulated_ranks = 0;
for (size_t i = 0; i < input.size(); ++i) {
phi::DDim dims(concated_shapes.data() + accumulated_ranks,
concated_ranks[i]);
if (!input[i]->IsInitialized()) {
PADDLE_ENFORCE_EQ(
input[i],
output[i],
errors::InvalidArgument(
"The %d-th output tensor and %d-th input tensor when the "
"%d-th input tensor is not initialized.",
i,
i,
i));
output[i]->Resize(dims);
} else {
PADDLE_ENFORCE_EQ(input[i]->dims(),
dims,
errors::InvalidArgument(
"The %d-th input tensor shape does not match the "
"attribute(concated_shapes) and "
"attribute(concated_ranks).",
i));
}
accumulated_ranks += concated_ranks[i];
PADDLE_ENFORCE_LE(
accumulated_ranks,
concated_shapes.size(),
errors::InvalidArgument("The attribute(concated_shapes) and "
"attribute(concated_ranks) do not match."));
}
PADDLE_ENFORCE_EQ(
accumulated_ranks,
concated_shapes.size(),
errors::InvalidArgument("The attribute(concated_shapes) and "
"attribute(concated_ranks) do not match."));
}
// Initialize the outputs with the inputs' dims
for (size_t i = 0; i < input.size(); ++i) {
output[i]->Resize(input[i]->dims());
}
// Get numel and dtype
size_t numel = 0;
if (size_of_dtype == -1) {
size_of_dtype = paddle::experimental::SizeOf(dtype);
}
GetMemSizeAndDtype(
input, &numel, size_of_dtype, dev_ctx.GetPlace(), use_align, align_size);
// Alloc the contiguous space
void *fused_tensor_ptr = dev_ctx.Alloc(
&fused_output->Resize(phi::make_ddim({static_cast<int64_t>(numel)})),
dtype);
VLOG(10) << "Fused tensor addr " << fused_tensor_ptr;
// Init the contiguous space
size_t offset = 0;
if (copy_data) {
for (size_t i = 0; i < input.size(); ++i) {
size_t len = static_cast<size_t>(input[i]->numel());
auto sub_tensor = fused_output->Slice(static_cast<int64_t>(offset),
static_cast<int64_t>(offset + len));
phi::Copy(dev_ctx, *input[i], dev_ctx.GetPlace(), false, &sub_tensor);
offset += use_align
? paddle::platform::Alignment(
len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
size_of_dtype
: len;
}
} else if (set_constant) {
phi::VisitDataType(
dtype, FillConstantVisitor<Context>(dev_ctx, fused_output, constant));
} else if (persist_output) {
for (size_t i = 0; i < output.size(); ++i) {
size_t len = static_cast<size_t>(output[i]->numel());
auto sub_tensor = fused_output->Slice(static_cast<int64_t>(offset),
static_cast<int64_t>(offset + len));
// some vars may not be persistable, or a persistable var may not be initialized
if (output[i]->IsInitialized()) {
phi::Copy(dev_ctx, *output[i], dev_ctx.GetPlace(), false, &sub_tensor);
}
offset += use_align
? paddle::platform::Alignment(
len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
size_of_dtype
: len;
}
}
// Make the outputs point into the contiguous space.
offset = 0;
std::stringstream ss;
ss << "alloc_space_for_vars: ";
for (size_t i = 0; i < output.size(); ++i) {
size_t len = static_cast<size_t>(output[i]->numel());
auto dim = output[i]->dims();
VLOG(4) << len << " " << dim << " " << offset;
output[i]
->ShareDataWith(fused_output->Slice(static_cast<int64_t>(offset),
static_cast<int64_t>(offset + len)))
.Resize(dim);
len = use_align ? paddle::platform::Alignment(
len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
size_of_dtype
: len;
ss << "output(" << i << "-th tensor) dim:(" << dim << ")"
<< " address: " << output[i]->data() << " len: " << len << ", ";
offset += len;
}
PADDLE_ENFORCE_EQ((int64_t)offset,
fused_output->numel(),
errors::InvalidArgument(
"The alloc_space_for_vars's offset: %s is unequal with "
"fused_output's numel: %s.",
offset,
fused_output->numel()));
VLOG(10) << ss.str();
}
} // namespace phi
PD_REGISTER_KERNEL(coalesce_tensor,
CPU,
ALL_LAYOUT,
phi::CoalesceTensorKernel,
int,
float,
double) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(coalesce_tensor,
GPU,
ALL_LAYOUT,
phi::CoalesceTensorKernel,
phi::dtype::float16,
int,
float,
double) {
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
}
#endif
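Because every output is turned into a view of the fused buffer via ShareDataWith, the set_constant path (FillConstantVisitor writing into fused_output) is observable through each output tensor. A sketch of that behavior in eager mode, reusing the illustrative x and y from the earlier example:

outs, fused = paddle._C_ops.coalesce_tensor(
    [x, y], core.VarDesc.VarType.FP32,
    False,  # copy_data
    True,   # set_constant
    False,  # persist_output
    5.0,    # constant
    True, -1, -1, [], [])
# The fused buffer and both output views read back the constant.
assert float(fused.numpy()[0]) == 5.0
assert float(outs[0].numpy()[0]) == 5.0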
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void CoalesceTensorKernel(const Context& dev_ctx,
const std::vector<const DenseTensor*>& input,
DataType dtype,
bool copy_data,
bool set_constant,
bool persist_output,
float constant,
bool use_align,
int align_size,
int size_of_dtype,
const std::vector<int64_t>& concated_shapes,
const std::vector<int64_t>& concated_ranks,
std::vector<DenseTensor*> output,
DenseTensor* fused_output);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature CoalesceTensorOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("coalesce_tensor",
{"Input"},
{"dtype",
"copy_data",
"set_constant",
"persist_output",
"constant",
"use_align",
"align_size",
"user_defined_size_of_dtype",
"concated_shapes",
"concated_ranks"},
{"Output", "FusedOutput"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(coalesce_tensor,
phi::CoalesceTensorOpArgumentMapping);
......@@ -18,8 +18,28 @@ import unittest
import numpy as np
from op_test import OpTest
from paddle.fluid import core
alignment = 256
import paddle.fluid as fluid
import paddle
def coalesce_tensor_eager_api(Input,
datatype=core.VarDesc.VarType.FP32,
copy_data=False,
set_constant=False,
persist_output=False,
constant=0.0,
use_align=True,
align_size=-1,
user_defined_size_of_dtype=-1,
concated_shapes=[],
concated_ranks=[]):
if datatype == int(core.VarDesc.VarType.FP32):
datatype = core.VarDesc.VarType.FP32
return paddle._C_ops.coalesce_tensor(Input, datatype, copy_data,
set_constant, persist_output, constant,
use_align, align_size,
user_defined_size_of_dtype,
concated_shapes, concated_ranks)
@unittest.skipIf(not core.is_compiled_with_cuda(),
......@@ -27,17 +47,14 @@ alignment = 256
class TestAllocContinuousSpace(OpTest):
def setUp(self):
self.python_api = coalesce_tensor_eager_api
self.op_type = "coalesce_tensor"
self.dtype, self.fluid_dtype = self.init_dtype()
attrs = self.init_attr()
self.copy_data = attrs["copy_data"]
self.constant = attrs["constant"]
self.set_constant = attrs["set_constant"]
self.attrs = self.init_attr()
self.Inputs = self.init_input()
self.Outputs, self.FusedOutput = self.init_output(
self.Inputs, self.set_constant, self.constant)
self.Inputs, self.attrs["set_constant"], self.attrs["constant"])
self.inputs = {'Input': self.Inputs}
self.attrs = attrs
self.outputs = {'Output': self.Outputs, 'FusedOutput': self.FusedOutput}
def init_dtype(self):
......@@ -64,10 +81,14 @@ class TestAllocContinuousSpace(OpTest):
def init_output(self, input_list, set_constant, constant):
inputs = []
outputs = input_list
# GpuMinChunkSize=256 bytes, FP32=4 bytes
alignment = 256 / 4
if 'user_defined_size_of_dtype' in self.attrs:
alignment = 256 / self.attrs['user_defined_size_of_dtype']
for input in input_list:
length = len(input[1].flatten())
aligned_len = (length + alignment) / alignment * alignment
aligned_len = (length + alignment) // alignment * alignment
out = np.zeros(int(aligned_len))
out[0:length] = input[1].flatten()
inputs.append(out)
......@@ -80,10 +101,45 @@ class TestAllocContinuousSpace(OpTest):
for out in outputs]
return outputs, coalesce_tensor_var
def verify_output(self, place):
with fluid.dygraph.base.guard(place=place):
tensor_input = [
fluid.dygraph.base.to_variable(value=data[1])
for data in self.inputs["Input"]
]
eager_outputs, eager_fused_output = coalesce_tensor_eager_api(
tensor_input,
datatype=self.attrs["dtype"],
copy_data=self.attrs["copy_data"]
if "copy_data" in self.attrs else False,
set_constant=self.attrs["set_constant"]
if "set_constant" in self.attrs else False,
persist_output=False,
constant=self.attrs["constant"]
if "constant" in self.attrs else 0.0,
use_align=True,
align_size=-1,
user_defined_size_of_dtype=self.attrs["user_defined_size_of_dtype"]
if "user_defined_size_of_dtype" in self.attrs else -1,
concated_shapes=[],
concated_ranks=[])
for idx, (expected, eager_output) in enumerate(
zip(self.outputs['Output'], eager_outputs)):
np.testing.assert_allclose(expected[1],
eager_output,
atol=1e-5,
err_msg=f'not equal {idx}')
np.testing.assert_allclose(self.outputs['FusedOutput'],
eager_fused_output,
atol=1e-5,
err_msg='not equal FusedOutput')
def test_check_output(self):
self.check_output_with_place(place=core.CUDAPlace(0),
no_check_set=["FusedOutput"],
atol=1e-5)
self.verify_output(core.CUDAPlace(0))
@unittest.skipIf(not core.is_compiled_with_cuda(),
......@@ -103,6 +159,7 @@ class TestAllocContinuousSpace2(TestAllocContinuousSpace):
self.check_output_with_place(place=core.CUDAPlace(0),
no_check_set=["FusedOutput"],
atol=1e-5)
self.verify_output(core.CUDAPlace(0))
if __name__ == '__main__':
......
......@@ -34,6 +34,7 @@ from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
from paddle.fluid.data_feeder import convert_dtype
from paddle import _C_ops, _legacy_C_ops
from paddle import in_dynamic_mode
from paddle.fluid.framework import in_dygraph_mode
from paddle.framework import core
from paddle.static import default_startup_program
from paddle.static import program_guard
......