未验证 提交 151c5d74 编写于 作者: C CtfGo 提交者: GitHub

cache scope and place on CinnLaunchContext and pass them to callback (#37983)

cinn_launch_op: cache scope and place on CinnLaunchContext to skip duplicate alloc/free callback construction
上级 9776888a
......@@ -2,7 +2,7 @@ cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper l
cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector graph_pattern_detector cinn_compiler errors enforce)
cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn)
cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph transform_desc cinn)
cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn)
cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn cinn_launch_context)
if (WITH_TESTING)
cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn)
......
......@@ -41,6 +41,7 @@
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/analysis/dot.h"
#include "paddle/fluid/operators/cinn/cinn_launch_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/string_helper.h"
......@@ -217,6 +218,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
*compiled_obj = {std::move(graph_compiler),
std::move(compiled_res.runtime_program), scope,
symbol.var_model_to_program_map()};
compiled_obj->launch_context =
std::make_unique<operators::details::CinnLaunchContext>(
compiled_obj->paddle2cinn_varmap, compiled_obj->scope);
return compiled_obj;
}
......
......@@ -31,6 +31,13 @@
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace operators {
namespace details {
class CinnLaunchContext;
}
}
namespace framework {
namespace paddle2cinn {
......@@ -39,6 +46,7 @@ struct CinnCompiledObject {
std::unique_ptr<::cinn::hlir::framework::Program> runtime_program;
std::shared_ptr<::cinn::hlir::framework::Scope> scope;
std::unordered_map<std::string, std::string> paddle2cinn_varmap;
std::unique_ptr<operators::details::CinnLaunchContext> launch_context;
};
// Entrance to use CINN.
......
......@@ -32,7 +32,30 @@ CinnLaunchContext::CinnLaunchContext(
[](const auto& name_view) { return std::string(name_view.data()); });
}
bool CinnLaunchContext::IsVariableUsed(const std::string& paddle_name) {
void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope,
const platform::Place& place) {
if (std::addressof(scope) == cached_scope_ &&
std::addressof(place) == cached_place_) {
VLOG(4) << "Captured scope:" << cached_scope_ << ", place:" << cached_place_
<< " are not changed";
return;
}
cached_scope_ = std::addressof(scope);
cached_place_ = std::addressof(place);
cached_temp_scope_ = scope.NewTmpScope();
VLOG(4) << "Captured env is update, scope:" << cached_scope_ << "->"
<< std::addressof(scope) << ", place:" << cached_place_ << "->"
<< std::addressof(place);
}
bool CinnLaunchContext::IsArgumentsInitialized() const {
if (hold_buffers_.empty() || name2argument_.empty()) {
return false;
}
return true;
}
bool CinnLaunchContext::IsVariableUsed(const std::string& paddle_name) const {
return paddle2cinn_varmap_.count(paddle_name) > 0 &&
cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_name)) > 0;
}
......@@ -67,85 +90,83 @@ void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name,
// TODO(CtfGo): check the underlying data type after CINN ready
}
void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name,
const platform::Place& place,
LoDTensor* paddle_tensor) {
void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name) {
PADDLE_ENFORCE_EQ(IsVariableUsed(paddle_name), true,
platform::errors::InvalidArgument(
"Paddle variable(%s) not used by cinn", paddle_name));
const auto& cinn_name = paddle2cinn_varmap_.at(paddle_name);
const auto& paddle_tensor =
cached_scope_->GetVar(paddle_name)->Get<LoDTensor>();
CinnTensor cinn_tensor = GetCinnTensor(cinn_name);
if (!paddle_tensor->IsInitialized()) {
paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data()));
if (paddle_tensor.IsInitialized()) {
CheckTensorEquivalent(paddle_name, paddle_tensor, cinn_tensor);
}
CheckTensorEquivalent(paddle_name, *paddle_tensor, cinn_tensor);
return SetArgument(cinn_name, place, /* free_mem_callback = */ false,
paddle_tensor);
auto cinn_buffer = std::make_unique<cinn_buffer_t>();
// assign dimensions and alloc/free callback of cinn_buffer_t
cinn_buffer->resize(cinn_tensor->shape().data().data(),
cinn_tensor->shape().data().size());
cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
[this, paddle_name](void* ctx, cinn_buffer_t* buffer) {
auto* tensor =
cached_scope_->GetVar(paddle_name)->GetMutable<LoDTensor>();
tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions));
buffer->memory = reinterpret_cast<uint8_t*>(
tensor->mutable_data<float>(*cached_place_));
return 0;
});
// external variables will be recycled by global gc, so do nothing here
cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
[](void* ctx, cinn_buffer_t* buffer) {
// Do nothing
return 0;
});
return SetArgument(cinn_name, std::move(cinn_buffer));
}
void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name,
const platform::Place& place,
LoDTensor* paddle_tensor) {
void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name) {
PADDLE_ENFORCE_GT(cinn_variable_names_.count(cinn_name), 0,
platform::errors::InvalidArgument(
"Variable(%s) not found in cinn socpe.", cinn_name));
CinnTensor cinn_tensor = GetCinnTensor(cinn_name);
if (!paddle_tensor->IsInitialized()) {
paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data()));
}
CheckTensorEquivalent(cinn_name, *paddle_tensor, cinn_tensor);
return SetArgument(cinn_name, place, /* free_mem_callback = */ true,
paddle_tensor);
}
std::unique_ptr<cinn_buffer_t> CinnLaunchContext::ShareTensorWithCinnBuffer(
const platform::Place& place, bool free_mem_callback, LoDTensor* tensor) {
// convert paddle dimensions array to cinn format
std::vector<cinn_dimension_t> cinn_dims(tensor->dims().size());
for (auto i = 0; i < tensor->dims().size(); ++i) {
cinn_dims[i] = static_cast<cinn_dimension_t>(tensor->dims().at(i));
}
auto cinn_buffer = std::make_unique<cinn_buffer_t>();
// assign size and memory
cinn_buffer->resize(cinn_dims.data(), cinn_dims.size());
// assign dimensions and alloc/free callback of cinn_buffer_t
cinn_buffer->resize(cinn_tensor->shape().data().data(),
cinn_tensor->shape().data().size());
cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
[place, tensor](void* ctx, cinn_buffer_t* buffer) {
buffer->memory =
reinterpret_cast<uint8_t*>(tensor->mutable_data<float>(place));
[this, cinn_name](void* ctx, cinn_buffer_t* buffer) {
auto* tensor =
cached_temp_scope_->Var(cinn_name)->GetMutable<LoDTensor>();
tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions));
buffer->memory = reinterpret_cast<uint8_t*>(
tensor->mutable_data<float>(*cached_place_));
return 0;
});
if (free_mem_callback) {
// internal variables should release its buffer immediately
// if no instruction use it
cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
[tensor](void* ctx, cinn_buffer_t* buffer) {
[this, cinn_name](void* ctx, cinn_buffer_t* buffer) {
auto* tensor =
cached_temp_scope_->GetVar(cinn_name)->GetMutable<LoDTensor>();
tensor->clear();
return 0;
});
return cinn_buffer;
}
cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
[](void* ctx, cinn_buffer_t* buffer) {
// Do nothing
return 0;
});
return cinn_buffer;
return SetArgument(cinn_name, std::move(cinn_buffer));
}
void CinnLaunchContext::SetArgument(const std::string& cinn_name,
const platform::Place& place,
bool free_mem_callback,
LoDTensor* paddle_tensor) {
auto buffer =
ShareTensorWithCinnBuffer(place, free_mem_callback, paddle_tensor);
std::unique_ptr<cinn_buffer_t>&& buffer) {
VLOG(4) << "SetArgument-" << name2argument_.size() << ": name(" << cinn_name
<< "), dims(" << framework::DDim(buffer->dims, buffer->dimensions)
<< ").";
name2argument_.emplace(cinn_name, buffer.get());
hold_buffers_.emplace_back(std::move(buffer));
VLOG(4) << "SetArgument-" << name2argument_.size() << ": "
<< "name(" << cinn_name << "), dims(" << paddle_tensor->dims()
<< ").";
}
const std::map<std::string, cinn_pod_value_t>&
......
......@@ -24,7 +24,7 @@
#include "cinn/runtime/cinn_runtime.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace operators {
......@@ -40,16 +40,22 @@ class CinnLaunchContext {
const std::unordered_map<std::string, std::string>& paddle2cinn_varmap,
const std::shared_ptr<CinnScope>& cinn_scope);
// explicitly update several environment variables captured
// by callback of execution arguments
void UpdateCapturedEnv(const framework::Scope& scope,
const platform::Place& place);
// Return whether execution arguments has been initialized
bool IsArgumentsInitialized() const;
// Return whether a Paddle variable used on compiled kernels
bool IsVariableUsed(const std::string& var_name);
bool IsVariableUsed(const std::string& paddle_name) const;
// Assign tensor buffer to input or output variables
void AssignExternalVariable(const std::string& var_name,
const platform::Place& place, LoDTensor* tensor);
void AssignExternalVariable(const std::string& paddle_name);
// Assign tensor buffer to internal variables
void AssignInternalVariable(const std::string& var_name,
const platform::Place& place, LoDTensor* tensor);
void AssignInternalVariable(const std::string& cinn_name);
// Extract internal variable names from CinnScope
// by excluding used input and output variables
......@@ -58,10 +64,6 @@ class CinnLaunchContext {
// Finalize all execution arguments and return them
const std::map<std::string, cinn_pod_value_t>& FinalizeArguments() const;
std::vector<std::unique_ptr<cinn_buffer_t>> HandoverBuffers() {
return std::move(hold_buffers_);
}
private:
// Get CinnTensor with CINN variable name
CinnTensor GetCinnTensor(const std::string& var_name);
......@@ -72,16 +74,15 @@ class CinnLaunchContext {
const LoDTensor& paddle_tensor,
const CinnTensor& cinn_tensor);
// Share the buffer of a Paddle tensor to CINN by delivering memory address
// to a cinn_buffer_t object
std::unique_ptr<cinn_buffer_t> ShareTensorWithCinnBuffer(
const platform::Place& place, bool free_mem_callback, LoDTensor* tensor);
// Set an argument with (cinn name)->(paddle tensor) pair
void SetArgument(const std::string& cinn_name, const platform::Place& place,
bool free_mem_callback, LoDTensor* paddle_tensor);
// Set an argument with (cinn name)->(cinn_buffer_t) pair
void SetArgument(const std::string& cinn_name,
std::unique_ptr<cinn_buffer_t>&& buffer);
private:
const framework::Scope* cached_scope_ = nullptr;
const platform::Place* cached_place_ = nullptr;
std::unique_ptr<framework::Scope> cached_temp_scope_ = nullptr;
// a variable name map from paddle to cinn
const std::unordered_map<std::string, std::string>& paddle2cinn_varmap_;
// the variable scope of cinn
......
......@@ -45,81 +45,86 @@ std::unique_ptr<CinnLaunchContext> CreateDefaultLaunchContext() {
return std::make_unique<CinnLaunchContext>(paddle2cinn_varmap, cinn_scope);
}
TEST(CinnLaunchContextTest, TestIsVariableUsed) {
TEST(CinnLaunchContextTest, TestBasic) {
auto launch_context = CreateDefaultLaunchContext();
// test IsVariableUsed
ASSERT_EQ(launch_context->IsVariableUsed("var1"), true);
ASSERT_EQ(launch_context->IsVariableUsed("var4"), false);
}
TEST(CinnLaunchContextTest, TestGetInternalVariableNames) {
auto launch_context = CreateDefaultLaunchContext();
auto internal_variable_names = launch_context->GetInternalVariableNames();
ASSERT_EQ(internal_variable_names.size(), 3);
EXPECT_NE(internal_variable_names.find("cinn_var2"),
internal_variable_names.end());
// test UpdateCapturedEnv
platform::CPUPlace place;
framework::Scope scope;
ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place));
// test IsArgumentsInitialized
ASSERT_FALSE(launch_context->IsArgumentsInitialized());
}
TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) {
auto launch_context = CreateDefaultLaunchContext();
platform::CPUPlace place;
framework::Scope scope;
auto launch_context = CreateDefaultLaunchContext();
launch_context->UpdateCapturedEnv(scope, place);
auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();
// CheckTensorEquivalent: tensor dimension not equivalent
tensor1->mutable_data<float>(framework::make_ddim({3, 5}), place);
ASSERT_THROW(launch_context->AssignExternalVariable("var1", place, tensor1),
ASSERT_THROW(launch_context->AssignExternalVariable("var1"),
paddle::platform::EnforceNotMet);
}
TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) {
auto launch_context = CreateDefaultLaunchContext();
platform::CPUPlace place;
framework::Scope scope;
auto launch_context = CreateDefaultLaunchContext();
launch_context->UpdateCapturedEnv(scope, place);
auto* tensor4 = scope.Var("var4")->GetMutable<LoDTensor>();
// not used
ASSERT_THROW(launch_context->AssignExternalVariable("var4", place, tensor4),
ASSERT_THROW(launch_context->AssignExternalVariable("var4"),
paddle::platform::EnforceNotMet);
// not found
ASSERT_THROW(
launch_context->AssignExternalVariable("cinn_var4", place, tensor4),
ASSERT_THROW(launch_context->AssignInternalVariable("cinn_var4"),
paddle::platform::EnforceNotMet);
}
TEST(CinnLaunchContextTest, TestSetArgument) {
platform::CPUPlace cpu_place;
platform::Place place(cpu_place);
framework::Scope scope;
auto launch_context = CreateDefaultLaunchContext();
launch_context->UpdateCapturedEnv(scope, place);
platform::CPUPlace place;
framework::Scope scope;
// assign external variables
auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();
float* data1 =
tensor1->mutable_data<float>(framework::make_ddim({3, 4}), place);
data1[0] = 9.99f;
data1[10] = 19.99f;
ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1"));
// assign external variable
ASSERT_NO_THROW(
launch_context->AssignExternalVariable("var1", place, tensor1));
auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>();
tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place);
ASSERT_NO_THROW(
launch_context->AssignInternalVariable("cinn_var2", place, tensor2));
// FinalizeArguments not missed check
ASSERT_THROW(launch_context->FinalizeArguments(),
paddle::platform::EnforceNotMet);
auto* tensor3 = scope.Var("var3")->GetMutable<LoDTensor>();
tensor3->mutable_data<float>(framework::make_ddim({10, 16}), place);
ASSERT_NO_THROW(
launch_context->AssignExternalVariable("var3", place, tensor3));
ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3"));
// FinalizeArguments missed check
ASSERT_THROW(launch_context->FinalizeArguments(),
paddle::platform::EnforceNotMet);
// test get internal variables
auto internal_variable_names = launch_context->GetInternalVariableNames();
ASSERT_EQ(internal_variable_names.size(), 1);
EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2");
auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>();
tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place);
ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2"));
// check argument is set correctly and alloc/free callbacks work well
auto name2argument = launch_context->FinalizeArguments();
ASSERT_EQ(name2argument.size(), 3);
ASSERT_EQ(name2argument.count("cinn_var1"), 1);
// check ShareTensorWithCinnBuffer
ASSERT_TRUE(launch_context->IsArgumentsInitialized());
auto* cinn_buffer =
static_cast<cinn_buffer_t*>(name2argument.at("cinn_var1"));
ASSERT_EQ(cinn_buffer->memory, nullptr);
cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer);
ASSERT_NE(cinn_buffer->memory, nullptr);
......
......@@ -31,26 +31,6 @@ namespace operators {
namespace details {
#ifdef PADDLE_WITH_CUDA
void CUDART_CB ReleaseScope(void* data) {
auto* temp_scope = static_cast<framework::Scope*>(data);
delete temp_scope;
}
void CUDART_CB ReleaseBuffers(void* data) {
auto* buffers =
static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(data);
delete buffers;
}
template <>
void ReleaseResource<platform::CUDADeviceContext>(
const std::vector<void*>& resources, void* stream) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc(
static_cast<gpuStream_t>(stream), ReleaseScope, resources[0]));
PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc(
static_cast<gpuStream_t>(stream), ReleaseBuffers, resources[1]));
}
template <>
void* GetStream<platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx) {
......
......@@ -56,25 +56,12 @@ void LaunchCinnExecution(const CinnCompiledObject& compiled_obj,
// Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS.
void SetCinnRuntimeFlags();
template <typename DeviceContext>
void ReleaseResource(const std::vector<void*>& resources, void* stream) {
auto* temp_scope = static_cast<framework::Scope*>(resources[0]);
auto* buffers =
static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(resources[1]);
delete temp_scope;
delete buffers;
}
template <typename DeviceContext>
void* GetStream(const framework::ExecutionContext& ctx) {
return nullptr;
}
#ifdef PADDLE_WITH_CUDA
template <>
void ReleaseResource<platform::CUDADeviceContext>(
const std::vector<void*>& resources, void* stream);
template <>
void* GetStream<platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx);
......@@ -116,10 +103,10 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
compilation_key, inputs_name2tensor, target, stream);
details::DebugCinnCompiledResult(cinn_compiled_object);
auto launch_context = std::make_unique<details::CinnLaunchContext>(
cinn_compiled_object.paddle2cinn_varmap, cinn_compiled_object.scope);
const auto& launch_context = cinn_compiled_object.launch_context;
// Step 3. Prepare arguments needed for the compiled executable program.
launch_context->UpdateCapturedEnv(scope, place);
if (!launch_context->IsArgumentsInitialized()) {
VLOG(4) << "CinnLaunchOp prepare arguments";
// 3.1 Prepare input variables: tensors of input variables have
......@@ -134,8 +121,7 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
continue;
}
launch_context->AssignExternalVariable(
var_name, place, scope.GetVar(var_name)->GetMutable<LoDTensor>());
launch_context->AssignExternalVariable(var_name);
}
// 3.2 Prepare output variables: all output variables should
......@@ -147,12 +133,12 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
// Paddle allocation and CINN buffer assginment.
auto output_variable_names = ctx.OutputNames(kOutputs);
for (const auto var_name : output_variable_names) {
PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true,
PADDLE_ENFORCE_EQ(
launch_context->IsVariableUsed(var_name), true,
platform::errors::InvalidArgument(
"Output variable(%s) not used by cinn", var_name));
auto* tensor = scope.GetVar(var_name)->GetMutable<LoDTensor>();
launch_context->AssignExternalVariable(var_name, place, tensor);
launch_context->AssignExternalVariable(var_name);
}
// 3.3 Prepare internal or temporary variables: Create a temporary
......@@ -162,10 +148,9 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
// names, because they will not be used outside the graph
// and should be destructed after computation finished.
auto internal_variable_names = launch_context->GetInternalVariableNames();
framework::Scope* temp_scope = scope.NewTmpScope().release();
for (const auto& var_name : internal_variable_names) {
auto* tensor = temp_scope->Var(var_name)->GetMutable<LoDTensor>();
launch_context->AssignInternalVariable(var_name, place, tensor);
launch_context->AssignInternalVariable(var_name);
}
}
// Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
......@@ -175,12 +160,6 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
VLOG(4) << "Run Cinn compiled executable program with stream: " << stream;
details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
VLOG(4) << "CinnLaunchOp launch execution done.";
// Step 6. Release some resources, such as `temp_scope` and cinn_buffers.
auto* buffers_holder = new std::vector<std::unique_ptr<cinn_buffer_t>>{
launch_context->HandoverBuffers()};
details::ReleaseResource<DeviceContext>({temp_scope, buffers_holder},
stream);
}
};
......
......@@ -130,8 +130,9 @@ TEST(CinnLaunchOpTest, TestElementwiseAddPass) {
scope.Var(test_out_name)->GetMutable<LoDTensor>();
scope.Var(expected_out_name)->GetMutable<LoDTensor>();
cinn_launch_op->Run(scope, place);
elementwise_add_op->Run(scope, place);
platform::Place run_place(place);
cinn_launch_op->Run(scope, run_place);
elementwise_add_op->Run(scope, run_place);
LoDTensor test_out, expected_out;
TensorCopySync(scope.Var(test_out_name)->Get<LoDTensor>(),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册