Unverified commit c65ef07c, authored by lzy and committed by GitHub

[Inference] Make share_external_data support bf16 and bool; fix while_op cache_inference_while_scope when using fleet_executor. (#56055)

* 1. Make share_external_data support bf16 and bool; 2. do not call DropKids when cache_inference_while_scope is enabled (a usage sketch for the new bf16/bool inputs follows the commit notes below)

* fix FLAGS_cache_inference_while_scope

* add unit test

* add unit test

* skip unit test when cudnn_version < 8100

* skip test share_external_data_bf16 when CUDA_ARCH < 80
Parent 7f5c14bc
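
A minimal usage sketch (not part of the commit) of the new bf16/bool share_external_data path; it mirrors the unit tests added at the end of this diff. The model file names and input shape are placeholders, and a CUDA-enabled Paddle build is assumed.

```python
import numpy as np
import paddle
from paddle.inference import Config, create_predictor

# Hypothetical model files; any exported inference model whose first input
# matches the shape/dtype below will do.
config = Config("model.pdmodel", "model.pdiparams")
config.enable_use_gpu(100, 0)  # 100 MB initial GPU memory pool on device 0
predictor = create_predictor(config)

in_name = predictor.get_input_names()[0]
in_handle = predictor.get_input_handle(in_name)

# With this change, bf16 (and bool) paddle.Tensor inputs can be shared
# directly with the predictor instead of being copied in.
in_data = paddle.to_tensor(np.ones((1, 6, 32, 32)), "bfloat16")
in_handle.share_external_data(in_data)
predictor.run()
```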
......@@ -35,6 +35,7 @@ PADDLE_DEFINE_EXPORTED_bool(
"Use standalone executor to run ops. Temporary FLAGS, will be removed "
"after all fleet executor cases are modified to run ops with standalone "
"executor.");
PHI_DECLARE_bool(cache_inference_while_scope);
namespace paddle {
namespace distributed {
......@@ -194,14 +195,17 @@ void Carrier::Start() {
// TODO(wangxi): async step
Wait();
dev_ctx_->Wait();
for (auto* micro_scope : microbatch_scopes_) {
// By default, we should delete all kid scopes after run executor because
// some operators may create local scope when running, such as while_op.
// But when while_op also create a local executor to run it's sub block,
// the sub scopes it created should not be dropped immediately, because
// while_grad_op will use some variables created during while_op run, so
// we need to keep the kids and wait for the outer executor to drop them.
micro_scope->DropKids();
if (!FLAGS_cache_inference_while_scope) {
// do not drop kid scopes when FLAGS_cache_inference_while_scope is enabled
for (auto* micro_scope : microbatch_scopes_) {
// By default, we should delete all kid scopes after run executor because
// some operators may create local scope when running, such as while_op.
// But when while_op also creates a local executor to run its sub block,
// the sub scopes it created should not be dropped immediately, because
// while_grad_op will use some variables created during while_op run, so
// we need to keep the kids and wait for the outer executor to drop them.
micro_scope->DropKids();
}
}
}
......
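
The scope-caching behavior referenced in the comment above is gated by cache_inference_while_scope, which this commit re-defines as a PHI exported flag (see the flags hunk further down). A hedged sketch of enabling it, assuming it is settable like other exported Paddle flags (via environment variable or paddle.set_flags); verify against your Paddle build:

```python
import paddle

# Either export FLAGS_cache_inference_while_scope=1 in the shell before the
# process starts, or set it programmatically before building the predictor.
# This assumes the flag is registered with the exported-flag mechanism, as the
# PHI_DEFINE_EXPORTED_bool definition later in this diff suggests.
paddle.set_flags({"FLAGS_cache_inference_while_scope": True})
```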
......@@ -226,6 +226,8 @@ bool PaddleTensorToDenseTensor(const PaddleTensor &pt,
input_ptr = t->mutable_data<int32_t>(ddim, place);
} else if (pt.dtype == PaddleDType::FLOAT16) {
input_ptr = t->mutable_data<float16>(ddim, place);
} else if (pt.dtype == PaddleDType::BFLOAT16) {
input_ptr = t->mutable_data<bfloat16>(ddim, place);
} else {
LOG(ERROR) << "unsupported feed type " << pt.dtype;
return false;
......@@ -1318,9 +1320,13 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
} else if (type == framework::proto::VarType::FP16) {
GetFetchOne<float16>(fetch, output);
output->dtype = PaddleDType::FLOAT16;
} else if (type == framework::proto::VarType::BF16) {
GetFetchOne<bfloat16>(fetch, output);
output->dtype = PaddleDType::BFLOAT16;
} else {
LOG(ERROR) << "unknown type, only support float32, float16, int64 and "
"int32 now.";
LOG(ERROR)
<< "unknown type, only support float32, float16, bfloat16, int64 and "
"int32 now.";
}
}
return true;
......@@ -1881,6 +1887,8 @@ AnalysisPredictor::GetInputTypes() {
input_type[name] = paddle_infer::DataType::FLOAT32;
} else if (dtype == paddle::framework::proto::VarType::FP16) {
input_type[name] = paddle_infer::DataType::FLOAT16;
} else if (dtype == paddle::framework::proto::VarType::BF16) {
input_type[name] = paddle_infer::DataType::BFLOAT16;
} else if (dtype == paddle::framework::proto::VarType::INT64) {
input_type[name] = paddle_infer::DataType::INT64;
} else if (dtype == paddle::framework::proto::VarType::INT32) {
......@@ -1938,6 +1946,8 @@ AnalysisPredictor::GetOutputTypes() {
output_type[name] = paddle_infer::DataType::FLOAT32;
} else if (dtype == paddle::framework::proto::VarType::FP16) {
output_type[name] = paddle_infer::DataType::FLOAT16;
} else if (dtype == paddle::framework::proto::VarType::BF16) {
output_type[name] = paddle_infer::DataType::BFLOAT16;
} else if (dtype == paddle::framework::proto::VarType::INT64) {
output_type[name] = paddle_infer::DataType::INT64;
} else if (dtype == paddle::framework::proto::VarType::INT32) {
......
......@@ -22,6 +22,7 @@
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/core/allocator.h"
#ifdef PADDLE_WITH_ONNXRUNTIME
#include "onnxruntime_c_api.h" // NOLINT
......@@ -31,6 +32,7 @@
namespace paddle_infer {
using float16 = paddle::platform::float16;
using bfloat16 = phi::dtype::bfloat16;
void Tensor::Reshape(const std::vector<int> &shape) {
#ifdef PADDLE_WITH_ONNXRUNTIME
......@@ -173,6 +175,8 @@ DataType Tensor::type() const {
return DataType::FLOAT32;
} else if (type == paddle::framework::proto::VarType::FP16) {
return DataType::FLOAT16;
} else if (type == paddle::framework::proto::VarType::BF16) {
return DataType::BFLOAT16;
} else if (type == paddle::framework::proto::VarType::INT64) {
return DataType::INT64;
} else if (type == paddle::framework::proto::VarType::INT32) {
......@@ -284,6 +288,11 @@ struct DataTypeInfo<float16> {
phi::DataType TYPE = phi::DataType::FLOAT16;
};
template <>
struct DataTypeInfo<bfloat16> {
phi::DataType TYPE = phi::DataType::BFLOAT16;
};
template <>
struct DataTypeInfo<int64_t> {
phi::DataType TYPE = phi::DataType::INT64;
......@@ -502,6 +511,7 @@ template PD_INFER_DECL void Tensor::CopyFromCpu<int32_t>(const int32_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<uint8_t>(const uint8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<bfloat16>(const bfloat16 *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<bool>(const bool *data);
template PD_INFER_DECL void Tensor::ShareExternalData<double>(
......@@ -539,6 +549,11 @@ template PD_INFER_DECL void Tensor::ShareExternalData<float16>(
const std::vector<int> &shape,
PlaceType place,
DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<bfloat16>(
const bfloat16 *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<bool>(
const bool *data,
const std::vector<int> &shape,
......@@ -552,6 +567,7 @@ template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<bfloat16>(bfloat16 *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<bool>(bool *data) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<double>(
......@@ -570,6 +586,8 @@ template PD_INFER_DECL void Tensor::CopyToCpuImpl<int8_t>(
int8_t *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<float16>(
float16 *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<bfloat16>(
bfloat16 *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<bool>(bool *data,
void *exec_stream,
CallbackFunc cb,
......@@ -589,6 +607,8 @@ template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
int8_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
float16 *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bfloat16>(
bfloat16 *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bool>(
bool *data, void *exec_stream) const;
......@@ -606,6 +626,8 @@ template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
int8_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
float16 *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bfloat16>(
bfloat16 *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bool>(bool *data,
CallbackFunc cb,
void *cb_params) const;
......@@ -624,6 +646,8 @@ template PD_INFER_DECL int8_t *Tensor::data<int8_t>(PlaceType *place,
int *size) const;
template PD_INFER_DECL float16 *Tensor::data<float16>(PlaceType *place,
int *size) const;
template PD_INFER_DECL bfloat16 *Tensor::data<bfloat16>(PlaceType *place,
int *size) const;
template PD_INFER_DECL bool *Tensor::data<bool>(PlaceType *place,
int *size) const;
......@@ -634,6 +658,8 @@ template PD_INFER_DECL int32_t *Tensor::mutable_data<int32_t>(PlaceType place);
template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);
template PD_INFER_DECL bfloat16 *Tensor::mutable_data<bfloat16>(
PlaceType place);
template PD_INFER_DECL bool *Tensor::mutable_data<bool>(PlaceType place);
Tensor::Tensor(void *scope, const void *device_contexts)
......@@ -923,6 +949,8 @@ template void InternalUtils::CopyFromCpuWithIoStream<int8_t>(
paddle_infer::Tensor *t, const int8_t *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<float16>(
paddle_infer::Tensor *t, const float16 *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<bfloat16>(
paddle_infer::Tensor *t, const bfloat16 *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<bool>(
paddle_infer::Tensor *t, const bool *data, cudaStream_t stream);
......@@ -940,6 +968,8 @@ template void InternalUtils::CopyToCpuWithIoStream<int8_t>(
paddle_infer::Tensor *t, int8_t *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<float16>(
paddle_infer::Tensor *t, float16 *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<bfloat16>(
paddle_infer::Tensor *t, bfloat16 *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<bool>(
paddle_infer::Tensor *t, bool *data, cudaStream_t stream);
......
......@@ -108,9 +108,17 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
cb,
cb_params);
break;
case PaddleDType::BFLOAT16:
src.CopyToCpuImpl(
dst.mutable_data<paddle::platform::bfloat16>(PlaceType::kCPU),
exec_stream,
cb,
cb_params);
break;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Only INT32, INT64, UINT8, INT8, BOOL, FLOAT16, FLOAT32 and "
"Only INT32, INT64, UINT8, INT8, BOOL, FLOAT16, BFLOAT16, FLOAT32 "
"and "
"FLOAT64 is supported in Tensor. Others not implements"));
}
// gpu => gpu or cpu => gpu
......@@ -172,9 +180,17 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
src.data<paddle::platform::float16>(&src_place, &data_size));
data_len = data_size * 2;
break;
case PaddleDType::BFLOAT16:
dst_data = static_cast<void*>(
dst.mutable_data<paddle::platform::bfloat16>(PlaceType::kGPU));
src_data = static_cast<void*>(
src.data<paddle::platform::bfloat16>(&src_place, &data_size));
data_len = data_size * 2;
break;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Only INT32, INT64, UINT8, INT8, BOOL, FLOAT16, FLOAT32 and "
"Only INT32, INT64, UINT8, INT8, BOOL, FLOAT16, BFLOAT16, FLOAT32 "
"and "
"FLOAT64 is supported in Tensor. Others not implements"));
}
......
......@@ -64,6 +64,7 @@ enum DataType {
FLOAT16,
BOOL,
FLOAT64,
BFLOAT16,
// TODO(Inference): support more data types if needed.
};
......
......@@ -24,11 +24,7 @@
#endif
#include "paddle/fluid/platform/flags.h"
PADDLE_DEFINE_EXPORTED_bool(
cache_inference_while_scope,
false,
"Cache the scope of the while op to avoid repeated creation of the scope "
"for each iteration and improve inference performance.");
PHI_DECLARE_bool(cache_inference_while_scope);
namespace paddle {
namespace framework {
......
......@@ -271,6 +271,16 @@ void PaddleInferShareExternalData(paddle_infer::Tensor &tensor, // NOLINT
static_cast<phi::dtype::float16 *>(input_tensor.data()),
shape,
ToPaddleInferPlace(input_tensor.place().GetType()));
} else if (input_tensor.dtype() == phi::DataType::BFLOAT16) {
tensor.ShareExternalData(
static_cast<bfloat16 *>(input_tensor.data()),
shape,
ToPaddleInferPlace(input_tensor.place().GetType()));
} else if (input_tensor.dtype() == phi::DataType::BOOL) {
tensor.ShareExternalData(
static_cast<bool *>(input_tensor.data()),
shape,
ToPaddleInferPlace(input_tensor.place().GetType()));
} else if (input_tensor.dtype() == phi::DataType::INT32) {
tensor.ShareExternalData(
static_cast<int32_t *>(input_tensor.data()),
......@@ -284,7 +294,7 @@ void PaddleInferShareExternalData(paddle_infer::Tensor &tensor, // NOLINT
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported data type. Now share_external_data only supports INT32, "
"INT64, FLOAT64, FLOAT32 and FLOAT16."));
"INT64, FLOAT64, FLOAT32, FLOAT16, BFLOAT16 and BOOL."));
}
}
......@@ -311,6 +321,16 @@ void PaddleTensorShareExternalData(paddle_infer::Tensor &tensor, // NOLINT
paddle_tensor.data<paddle::platform::float16>()),
shape,
ToPaddleInferPlace(paddle_tensor.place().GetType()));
} else if (paddle_tensor.dtype() == phi::DataType::BFLOAT16) {
tensor.ShareExternalData(
static_cast<bfloat16 *>(paddle_tensor.data<bfloat16>()),
shape,
ToPaddleInferPlace(paddle_tensor.place().GetType()));
} else if (paddle_tensor.dtype() == phi::DataType::BOOL) {
tensor.ShareExternalData(
static_cast<bool *>(paddle_tensor.data<bool>()),
shape,
ToPaddleInferPlace(paddle_tensor.place().GetType()));
} else if (paddle_tensor.dtype() == phi::DataType::INT32) {
tensor.ShareExternalData(
static_cast<int32_t *>(paddle_tensor.data<int32_t>()),
......@@ -324,7 +344,7 @@ void PaddleTensorShareExternalData(paddle_infer::Tensor &tensor, // NOLINT
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported data type. Now share_external_data only supports INT32, "
"INT64, FLOAT32 and FLOAT16."));
"INT64, FLOAT32, FLOAT16, BFLOAT16 and BOOL."));
}
}
......
......@@ -1073,6 +1073,11 @@ PHI_DEFINE_EXPORTED_bool(
gpugraph_enable_hbm_table_collision_stat,
false,
"enable hash collisions stat for hbm table, default false");
PHI_DEFINE_EXPORTED_bool(
cache_inference_while_scope,
false,
"Cache the scope of the while op to avoid repeated creation of the scope "
"for each iteration and improve inference performance.");
PHI_DEFINE_EXPORTED_double(gpugraph_hbm_table_load_factor,
0.75,
"the load factor of hbm table, default 0.75");
......
......@@ -21,6 +21,7 @@ import numpy as np
from paddle import fluid
from paddle.fluid.core import PaddleDType, PaddleTensor
from paddle.framework import core
from paddle.inference import (
Config,
create_predictor,
......@@ -101,6 +102,36 @@ def get_sample_model():
return serialized_program, serialized_params
def get_sample_model_cuda(data_type):
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
data = paddle.static.data(
name="data", shape=[-1, 6, 64, 64], dtype=data_type
)
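# Cast the input (bf16 or bool) to bfloat16 so the conv2d below runs on a supported dtype.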
data_float = paddle.cast(data, "bfloat16")
res = paddle.static.nn.conv2d(
input=data_float,
num_filters=3,
filter_size=3,
groups=1,
padding=0,
bias_attr=False,
act=None,
)
exe.run(startup_program)
serialized_program = paddle.static.serialize_program(
data, res, program=main_program
)
serialized_params = paddle.static.serialize_persistables(
data, res, executor=exe, program=main_program
)
return serialized_program, serialized_params
class TestInferenceBaseAPI(unittest.TestCase):
def get_config(self, model, params):
config = Config()
......@@ -171,5 +202,51 @@ class TestInferenceBaseAPI(unittest.TestCase):
test_paddle_tensor()
@unittest.skipIf(
not core.is_compiled_with_cuda()
or paddle.get_cudnn_version() < 8100
or paddle.device.cuda.get_device_capability()[0] < 8,
"share_external_data_bf16 requires cudnn >= 8.1 and CUDA_ARCH >= 8",
)
class TestInferenceShareExternalDataAPI(unittest.TestCase):
def get_config(self, model, params):
config = Config()
config.set_model_buffer(model, len(model), params, len(params))
config.enable_use_gpu(100, 0)
return config
def test_share_external_data_cuda(self):
def test_paddle_tensor_bf16():
paddle.set_default_dtype("bfloat16")
program, params = get_sample_model_cuda("bfloat16")
paddle.disable_static()
config = self.get_config(program, params)
predictor = create_predictor(config)
in_names = predictor.get_input_names()
in_handle = predictor.get_input_handle(in_names[0])
in_data = paddle.to_tensor(np.ones((1, 6, 32, 32)), "bfloat16")
in_handle.share_external_data(in_data)
predictor.run()
paddle.set_default_dtype("float32")
paddle.enable_static()
def test_paddle_tensor_bool():
paddle.set_default_dtype("bfloat16")
program, params = get_sample_model_cuda("bool")
paddle.disable_static()
config = self.get_config(program, params)
predictor = create_predictor(config)
in_names = predictor.get_input_names()
in_handle = predictor.get_input_handle(in_names[0])
in_data = paddle.to_tensor(np.ones((1, 6, 32, 32)), "bool")
in_handle.share_external_data(in_data)
predictor.run()
paddle.set_default_dtype("float32")
paddle.enable_static()
test_paddle_tensor_bf16()
test_paddle_tensor_bool()
if __name__ == '__main__':
unittest.main()