Unverified commit 2a1f009e, authored by Aurelius84 and committed by GitHub

[NewExe] Support layout/dtype transform by adding transfer_layout/transfer_dtype op (#37299)

* Add transfer_layout/dtype op

* clean up useless code

* fix unused var

* add optest in white.txt

* split into data_transfer.cc

* fix cmake

* modify according to reviewer comments

* replace cast_op with transfer_dtype_op
Parent 684de4b3
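The two new ops have simple tensor-level semantics: transfer_dtype behaves like cast (Out is X converted to out_dtype), and transfer_layout permutes dimensions between NCHW and NHWC. A minimal NumPy sketch of the expected behavior, inferred from the unit tests added at the end of this commit (an illustration only, not the executor code):

import numpy as np

# transfer_dtype behaves like cast: Out = X.astype(out_dtype)
# (mirrors test_transfer_dtype_op.py below).
x = np.random.random([10, 10]).astype('float32')
out = x.astype('float64')
assert out.dtype == np.float64

# transfer_layout with dst_layout = 0 (kNHWC) permutes an NCHW tensor to NHWC,
# i.e. a transpose with axis order [0, 2, 3, 1]
# (mirrors test_transfer_layout_op.py below).
x_nchw = np.random.random([2, 3, 10, 10]).astype('float32')
out_nhwc = x_nchw.transpose([0, 2, 3, 1])
assert out_nhwc.shape == (2, 10, 10, 3)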
@@ -37,30 +37,19 @@ std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to) {
  }
}

template <typename T>
void CastDataLayout::apply() {
  auto place = ctx_->GetPlace();

  if (platform::is_cpu_place(place)) {
    operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
    auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
    trans4(*context, in_, out_, axis_);
  } else {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "Unsupported data layout cast from CPU to GPU."));
  }
}

void TransDataLayout(const OpKernelType& kernel_type_for_var,
                     const OpKernelType& expected_kernel_type, const Tensor& in,
......
@@ -36,6 +36,21 @@ class Tensor;
namespace paddle {
namespace framework {
struct CastDataLayout {
CastDataLayout(const platform::DeviceContext* ctx,
const std::vector<int>& axis, const framework::Tensor& in,
framework::Tensor* out)
: in_(in), out_(out), ctx_(ctx), axis_(axis) {}
const framework::Tensor in_;
framework::Tensor* out_;
const platform::DeviceContext* ctx_;
const std::vector<int> axis_;
template <typename T>
void apply();
};
#ifdef PADDLE_WITH_MKLDNN
using MKLDNNDataType = dnnl::memory::data_type;
......
@@ -2,10 +2,11 @@ set(INTERPRETERCORE_DEPS op_registry device_context scope framework_proto data_f
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor nan_inf_utils)

cc_library(data_transfer SRCS data_transfer.cc DEPS enforce scope glog)
cc_library(workqueue SRCS workqueue.cc workqueue_utils.cc DEPS enforce)
cc_library(new_executor_defs SRCS new_executor_defs.cc DEPS enforce glog scope)
cc_library(interpretercore_garbage_collector SRCS interpretercore_garbage_collector.cc DEPS workqueue ${DEVICE_EVENT_LIBS} executor_gc_helper)
cc_library(interpretercore_util SRCS interpretercore_util.cc DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs data_transfer)
cc_library(event_manager SRCS event_manager.cc DEPS ${DEVICE_EVENT_LIBS} glog new_executor_defs)
cc_library(stream_analyzer SRCS stream_analyzer.cc DEPS ${DEVICE_EVENT_LIBS} glog device_context new_executor_defs)
cc_library(interpretercore SRCS interpretercore.cc DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util interpretercore_garbage_collector stream_analyzer event_manager)
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/new_executor/data_transfer.h"
namespace paddle {
namespace framework {
namespace interpreter {
bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key,
const std::string& var_name,
std::string* new_var_name,
std::vector<OpFuncNode>* op_func_nodes,
bool use_local_scope) {
bool is_transferred = false;
auto* src_var_name = &var_name;
Scope* local_scope = use_local_scope ? var_scope_->GetMutableLocalScope()
: var_scope_->GetMutableScope();
// 1. layout transform
if (need_layout_transform(kernel_type_for_var, expected_kernel_key)) {
auto op = TransferLayout(
*src_var_name, new_var_name, kernel_type_for_var.data_layout_,
expected_kernel_key.data_layout_, var_scope_, local_scope);
RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
// update src_var_name
src_var_name = new_var_name;
is_transferred = true;
}
// 2. dtype transform
if (need_dtype_transform(kernel_type_for_var, expected_kernel_key)) {
auto op = TransferDtype(
*src_var_name, new_var_name, kernel_type_for_var.data_type_,
expected_kernel_key.data_type_, var_scope_, local_scope);
RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
// update src_var_name
src_var_name = new_var_name;
is_transferred = true;
}
// 3. device transform
if (need_device_transform(kernel_type_for_var, expected_kernel_key)) {
auto src_place = kernel_type_for_var.place_;
auto dst_place = expected_kernel_key.place_;
auto op = TransferDevice(*src_var_name, new_var_name, src_place, dst_place,
var_scope_, local_scope);
RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
is_transferred = true;
}
return is_transferred;
}
void DataTranferHelper::RunAndConstructOpFuncNode(
const std::shared_ptr<OperatorBase>& op, const std::string& var_name,
const std::string& new_var_name,
std::vector<OpFuncNode>* new_op_func_nodes) {
auto& op_type = op->Type();
// 1. Construct RuntimeContext
RuntimeContext runtime_context({}, {});
runtime_context.inputs["X"] = {var_scope_->Var(var_name)};
runtime_context.outputs["Out"] = {var_scope_->Var(new_var_name)};
InterpretercoreInferShapeContext infer_shape_ctx(*op, runtime_context);
// 2. Execute infer shape and choose kernel
auto& all_op_kernels = OperatorWithKernel::AllOpKernels();
static_cast<const framework::OperatorWithKernel*>(op.get())->InferShape(
&infer_shape_ctx);
auto kernels_iter = all_op_kernels.find(op_type);
PADDLE_ENFORCE_NE(kernels_iter, all_op_kernels.end(),
platform::errors::Unavailable(
"There are no kernels which are registered in "
"the %s operator.",
op_type));
OpKernelMap& kernels = kernels_iter->second;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place_);
Scope scope;
auto exec_ctx = ExecutionContext(*op, scope, *dev_ctx, runtime_context);
auto expected_kernel_key =
dynamic_cast<const framework::OperatorWithKernel*>(op.get())
->GetExpectedKernelType(exec_ctx);
auto kernel_iter = kernels.find(expected_kernel_key);
// 3. Execute transfer op and construct OpFuncNode
OpFuncNode new_op_func_node;
new_op_func_node.input_index["X"] = {var_scope_->VarId(var_name)};
new_op_func_node.output_index["Out"] = {var_scope_->VarId(new_var_name)};
new_op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second);
new_op_func_node.kernel_func_(exec_ctx);
// NOTE(Aurelius84): data_transform ops are expensive operations, so we tag them
// as kQueueSync and execute them in the thread pool.
new_op_func_node.type_ = OpFuncType::kQueueSync;
new_op_func_node.dev_ctx_ = dev_ctx;
new_op_func_node.operator_base_ = op;
VLOG(3) << "Run " << op_type << " done.";
new_op_func_nodes->emplace_back(std::move(new_op_func_node));
}
std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
std::string* new_var_name,
DataLayout in_layout,
DataLayout out_layout,
VariableScope* var_scope,
framework::Scope* local_scope) {
// 1. Generate new_var_name and Initialize it
*new_var_name =
var_name + "_layout_" + std::to_string(var_scope->VarSize() + 1);
auto* ptr = local_scope->Var(new_var_name);
auto var_type = var_scope->Var(var_name)->Type();
InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));
VLOG(3) << "Create Variable " << var_name << " locally, which pointer is "
<< ptr << "Variable Type " << var_type;
var_scope->SetVarDesc(var_name, nullptr);
// 2. Construct VariableNameMap
VariableNameMap in_name_map = {{"X", {var_name}}};
VariableNameMap out_name_map = {{"Out", {*new_var_name}}};
AttributeMap attr_map = {{"dst_layout", static_cast<int>(out_layout)}};
// 3. Create transfer_op
std::string op_type("transfer_layout");
auto& op_info = OpInfoMap::Instance().Get(op_type);
auto op = std::shared_ptr<OperatorBase>(
op_info.Creator()(op_type, in_name_map, out_name_map, attr_map));
VLOG(3) << string::Sprintf("Insert %s(%s) with %s -> %s(%s).", op_type,
var_name, in_layout, *new_var_name, out_layout);
return op;
}
std::shared_ptr<OperatorBase> TransferDtype(const std::string& var_name,
std::string* new_var_name,
proto::VarType::Type in_dtype,
proto::VarType::Type out_dtype,
VariableScope* var_scope,
framework::Scope* local_scope) {
// 1. Generate new_var_name and Initialize it
*new_var_name =
var_name + "_dtype_" + std::to_string(var_scope->VarSize() + 1);
auto* ptr = local_scope->Var(new_var_name);
auto var_type = var_scope->Var(var_name)->Type();
InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));
VLOG(3) << "Create Variable " << var_name << " locally, which pointer is "
<< ptr << "Variable Type " << var_type;
var_scope->SetVarDesc(var_name, nullptr);
// 2. Construct VariableNameMap
VariableNameMap in_name_map = {{"X", {var_name}}};
VariableNameMap out_name_map = {{"Out", {*new_var_name}}};
AttributeMap attr_map;
attr_map["in_dtype"] = static_cast<int>(in_dtype);
attr_map["out_dtype"] = static_cast<int>(out_dtype);
// NOTE(Aurelius84): In which case should use_mkldnn be true?
attr_map["use_mkldnn"] = false;
// 3. Create transfer_op
std::string op_type("transfer_dtype");
auto& op_info = OpInfoMap::Instance().Get(op_type);
auto op = std::shared_ptr<OperatorBase>(
op_info.Creator()(op_type, in_name_map, out_name_map, attr_map));
VLOG(3) << string::Sprintf("Insert %s with %s(%s) -> %s(%s).", op_type,
var_name, DataTypeToString(in_dtype),
*new_var_name, DataTypeToString(out_dtype));
return op;
}
std::shared_ptr<OperatorBase> TransferDevice(const std::string& var_name,
std::string* new_var_name,
const platform::Place& src_place,
const platform::Place& dst_place,
VariableScope* var_scope,
framework::Scope* local_scope) {
// 1. Generate new_var_name and Initialize it
*new_var_name =
var_name + "_device_" + std::to_string(var_scope->VarSize() + 1);
auto* ptr = local_scope->Var(new_var_name);
auto var_type = var_scope->Var(var_name)->Type();
InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));
VLOG(3) << "Create Variable " << var_name << " locally, which pointer is "
<< ptr << "Variable Type " << var_type;
var_scope->SetVarDesc(var_name, nullptr);
// 2. Construct VariableNameMap
VariableNameMap in_name_map = {{"X", {var_name}}};
VariableNameMap out_name_map = {{"Out", {*new_var_name}}};
int dst_place_type = platform::is_cpu_place(dst_place)
? 0
: platform::is_gpu_place(dst_place) ? 1 : -1;
AttributeMap attr_map = {{"dst_place_type", dst_place_type}};
// 3. Create transfer_op
std::string op_type = get_memcpy_type(src_place, dst_place);
auto& op_info = OpInfoMap::Instance().Get(op_type);
auto op = std::shared_ptr<OperatorBase>(
op_info.Creator()(op_type, in_name_map, out_name_map, attr_map));
VLOG(3) << string::Sprintf("Insert %s with %s(%s) -> %s(%s).", op_type,
var_name, src_place, *new_var_name, dst_place);
return op;
}
void ApplyDataTransform(const OpKernelType& expected_kernel_key,
const platform::Place& place,
VariableValueMap* ins_map_temp,
VariableScope* var_scope, OpFuncNode* op_func_node,
std::vector<OpFuncNode>* new_op_func_nodes,
bool use_local_scope) {
auto op_base = op_func_node->operator_base_.get();
PADDLE_ENFORCE_NOT_NULL(op_base, platform::errors::PreconditionNotMet(
"op_base is null, please pass a valid "
"op_base in ApplyDataTransform."));
VariableNameMap new_ins(op_base->Inputs());
// record the indices of variables that need no data transform.
std::unordered_set<int> no_data_transform_index;
DataTranferHelper data_transfer_helper(place, var_scope);
for (auto& var_name_item : *ins_map_temp) {
for (size_t i = 0; i < var_name_item.second.size(); ++i) {
auto var = var_name_item.second[i];
if (!(var->IsType<LoDTensor>() || var->IsType<SelectedRows>())) {
continue;
}
auto& var_name = new_ins[var_name_item.first].at(i);
auto tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
if (!tensor_in->IsInitialized()) {
continue;
}
auto kernel_type_for_var =
static_cast<const framework::OperatorWithKernel*>(op_base)
->GetKernelTypeForVar(var_name_item.first, *tensor_in,
expected_kernel_key);
// apply data transform
std::string new_var_name;
bool is_transferred = data_transfer_helper.apply(
kernel_type_for_var, expected_kernel_key, var_name, &new_var_name,
new_op_func_nodes, use_local_scope);
if (is_transferred) {
// update RuntimeContext.inputs and original op_func_node inputs
op_func_node->input_index[var_name_item.first][i] =
var_scope->VarId(new_var_name);
var_name_item.second[i] = var_scope->Var(new_var_name);
new_ins[var_name_item.first][i] = new_var_name;
// NOTE(Aurelius84): avoid deepcopy twice if we already insert data
// transfer op.
if (op_base->Type() == "fetch_v2") {
op_base->SetAttr("deepcopy", false);
}
} else {
// record the var_id of inputs that need no data transform
VLOG(3) << op_base->Type()
<< " found no data_transform var: " << var_name
<< " with id: " << var_scope->VarId(var_name);
no_data_transform_index.emplace(var_scope->VarId(var_name));
}
}
}
// NOTE(zhiqiu): UPDATE the corresponding OperatorBase to make it consistent
// with instruction. (hot fix, it is not good design here)
op_func_node->operator_base_ =
std::shared_ptr<OperatorBase>(framework::OpRegistry::CreateOp(
op_base->Type(), new_ins, op_base->Outputs(), op_base->Attrs()));
op_func_node->no_data_transform_index = std::move(no_data_transform_index);
}
std::string get_memcpy_type(const platform::Place& src_place,
const platform::Place& dst_place) {
PADDLE_ENFORCE_EQ(platform::is_same_place(src_place, dst_place), false,
platform::errors::PreconditionNotMet(
"Required src_place shall be different with dst_place, "
"but received same place: %s",
src_place));
if (platform::is_gpu_place(dst_place)) {
return kMemcpyH2D;
} else if (platform::is_gpu_place(src_place)) {
return kMemcpyD2H;
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Not support Memcpy typ : %s -> %s", src_place, dst_place));
}
}
} // namespace interpreter
} // namespace framework
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/framework/op_kernel_type.h"
namespace paddle {
namespace framework {
namespace interpreter {
/*
* A helper class that implements the data transform operation.
* It applies layout/dtype/device transfers in turn.
*/
class DataTranferHelper {
public:
DataTranferHelper(const platform::Place& place, VariableScope* var_scope)
: place_(place), var_scope_(var_scope) {}
bool apply(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key,
const std::string& var_name, std::string* new_var_name,
std::vector<OpFuncNode>* new_op_func_nodes, bool use_local_scope);
private:
platform::Place place_;
VariableScope* var_scope_;
void RunAndConstructOpFuncNode(const std::shared_ptr<OperatorBase>& op,
const std::string& var_name,
const std::string& new_var_name,
std::vector<OpFuncNode>* op_func_nodes);
};
void ApplyDataTransform(const OpKernelType& expected_kernel_key,
const platform::Place& place,
VariableValueMap* ins_map_temp,
VariableScope* var_scope, OpFuncNode* op_func_node,
std::vector<OpFuncNode>* op_func_nodes,
bool use_local_scope = true);
std::string get_memcpy_type(const platform::Place& src_place,
const platform::Place& dst_place);
inline bool need_device_transform(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key) {
auto& src_place = kernel_type_for_var.place_;
auto& dst_place = expected_kernel_key.place_;
if (platform::is_same_place(src_place, dst_place) ||
(platform::is_cuda_pinned_place(src_place) &&
platform::is_cpu_place(dst_place))) {
return false;
}
return true;
}
inline bool need_dtype_transform(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key) {
return framework::NeedTransformDataType(kernel_type_for_var,
expected_kernel_key);
}
inline bool need_layout_transform(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key) {
return framework::NeedTransformLayout(kernel_type_for_var.data_layout_,
expected_kernel_key.data_layout_);
}
std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
std::string* new_var_name,
DataLayout in_layout,
DataLayout out_layout,
VariableScope* var_scope,
framework::Scope* local_scope);
std::shared_ptr<OperatorBase> TransferDtype(const std::string& var_name,
std::string* new_var_name,
proto::VarType::Type in_dtype,
proto::VarType::Type out_dtype,
VariableScope* var_scope,
framework::Scope* local_scope);
std::shared_ptr<OperatorBase> TransferDevice(const std::string& var_name,
std::string* new_var_name,
const platform::Place& src_place,
const platform::Place& dst_place,
VariableScope* var_scope,
framework::Scope* local_scope);
} // namespace interpreter
} // namespace framework
} // namespace paddle
@@ -15,6 +15,7 @@
#include <algorithm>

#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/new_executor/data_transfer.h"

namespace paddle {
namespace framework {
@@ -114,23 +115,6 @@ get_unused_vars(const BlockDesc& block,
  return result;
}
std::string get_memcpy_type(const platform::Place& src_place,
const platform::Place& dst_place) {
PADDLE_ENFORCE_EQ(platform::is_same_place(src_place, dst_place), false,
platform::errors::PreconditionNotMet(
"Required src_place shall be different with dst_place, "
"but received same place: %s",
src_place));
if (platform::is_gpu_place(dst_place)) {
return kMemcpyH2D;
} else if (platform::is_gpu_place(src_place)) {
return kMemcpyD2H;
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Not support Memcpy typ : %s -> %s", src_place, dst_place));
}
}
void build_variable_scope(const framework::BlockDesc& block,
                          VariableScope* var_scope, bool use_local_scope) {
  VLOG(3) << "Creating Variables";
@@ -269,195 +253,6 @@ void deal_operator_base(const platform::Place& place,
  op_func_node->dev_ctx_ = dev_ctx;
}
// the return value is whether data transformer is needed for this var
bool need_place_transform_for_var(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key) {
if (platform::is_same_place(kernel_type_for_var.place_,
expected_kernel_key.place_) ||
(is_cuda_pinned_place(kernel_type_for_var.place_) &&
is_cpu_place(expected_kernel_key.place_))) {
return false;
} else {
return true;
}
}
bool need_dtype_transform_for_var(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key) {
return false; // TODO(@xiongkun) add dtype judgement here
}
bool need_layout_transform_for_var(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key) {
return false; // TODO(@xiongkun) add layout judgement here
}
// NOTE(@xiongkun03)
// the difference between var_name and outer_name :
// if "X": ["var1", "var2"], then X is the outer name,
// var1 and var2 is the var_name
std::tuple<std::string, OpFuncNode> apply_place_transform_for_var(
const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key, const platform::Place& place,
const std::string& var_name, const std::string& outer_name,
const OpFuncNode& op_func_node, Variable* var, VariableScope* var_scope,
bool use_local_scope = true) {
Scope* local_scope = use_local_scope ? var_scope->GetMutableLocalScope()
: var_scope->GetMutableScope();
auto& all_op_kernels = OperatorWithKernel::AllOpKernels();
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
std::string new_var_name =
var_name + "_copy_" + std::to_string(var_scope->VarSize() + 1);
auto* ptr = local_scope->Var(new_var_name);
InitializeVariable(ptr, static_cast<proto::VarType::Type>(var->Type()));
VLOG(3) << "Create Variable " << var_name << " locally, which pointer is "
<< ptr << "Variable Type " << var->Type();
var_scope->SetVarDesc(var_name, nullptr);
VariableNameMap copy_in_map;
copy_in_map["X"] = {var_name};
VariableNameMap copy_out_map;
copy_out_map["Out"] = {new_var_name};
AttributeMap attr_map;
attr_map["dst_place_type"] =
is_cpu_place(expected_kernel_key.place_)
? 0
: is_gpu_place(expected_kernel_key.place_) ? 1 : -1;
std::map<std::string, std::vector<int>> copy_ins_name2id;
copy_ins_name2id["X"] = {var_scope->VarId(var_name)};
std::map<std::string, std::vector<int>> copy_out_name2id;
copy_out_name2id["Out"] = {var_scope->VarId(new_var_name)};
VariableValueMap copy_ins_value_map;
copy_ins_value_map["X"] = {var};
VariableValueMap copy_outs_value_map;
copy_outs_value_map["Out"] = {var_scope->Var(new_var_name)};
// memcpy_d2h, memcpy_h2d
auto memcpy_op_type =
get_memcpy_type(kernel_type_for_var.place_, expected_kernel_key.place_);
VLOG(3) << string::Sprintf("Insert %s with %s(%s) -> %s(%s).", memcpy_op_type,
var_name, kernel_type_for_var.place_, new_var_name,
expected_kernel_key.place_);
auto& copy_info = OpInfoMap::Instance().Get(memcpy_op_type);
auto copy_op = std::shared_ptr<OperatorBase>(
copy_info.Creator()(memcpy_op_type, copy_in_map, copy_out_map, attr_map));
OpFuncNode copy_op_func_node;
copy_op_func_node.input_index = copy_ins_name2id;
copy_op_func_node.output_index = copy_out_name2id;
RuntimeContext copy_runtime_context({}, {});
copy_runtime_context.inputs.swap(copy_ins_value_map);
copy_runtime_context.outputs.swap(copy_outs_value_map);
InterpretercoreInferShapeContext copy_infer_shape_ctx(*copy_op.get(),
copy_runtime_context);
static_cast<const framework::OperatorWithKernel*>(copy_op.get())
->InferShape(&copy_infer_shape_ctx);
auto kernels_iter = all_op_kernels.find(memcpy_op_type);
PADDLE_ENFORCE_NE(kernels_iter, all_op_kernels.end(),
platform::errors::Unavailable(
"There are no kernels which are registered in "
"the memcpy operator."));
OpKernelMap& kernels = kernels_iter->second;
auto* dev_ctx = pool.Get(place);
Scope scope;
auto copy_exec_ctx =
ExecutionContext(*copy_op, scope, *dev_ctx, copy_runtime_context);
auto copy_expected_kernel_key =
dynamic_cast<const framework::OperatorWithKernel*>(copy_op.get())
->GetExpectedKernelType(copy_exec_ctx);
auto kernel_iter = kernels.find(copy_expected_kernel_key);
copy_op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second);
copy_op_func_node.kernel_func_(copy_exec_ctx);
VLOG(3) << "Run " << memcpy_op_type << " done.";
// NOTE(Aurelius84): memcpy_op is expensive operation, so we tag them
// as kQueueSync and execute them in thread pool.
copy_op_func_node.type_ = OpFuncType::kQueueSync;
copy_op_func_node.dev_ctx_ = dev_ctx;
copy_op_func_node.operator_base_ = copy_op;
return std::make_pair(new_var_name, copy_op_func_node);
}
void apply_data_transform(const OpKernelType& expected_kernel_key,
const platform::Place& place,
VariableValueMap* ins_map_temp,
VariableScope* var_scope, OpFuncNode* op_func_node,
std::vector<OpFuncNode>* copy_func_nodes,
bool use_local_scope = true) {
auto op_base = op_func_node->operator_base_.get();
PADDLE_ENFORCE_NOT_NULL(op_base, platform::errors::PreconditionNotMet(
"op_base is null, please pass a valid "
"op_base in apply_data_transform."));
VariableNameMap new_ins(op_base->Inputs());
std::unordered_set<int>
no_data_transform_index; // record the no need transform variable index.
for (auto& var_name_item : *ins_map_temp) {
for (size_t i = 0; i < var_name_item.second.size(); ++i) {
auto var = var_name_item.second[i];
if (!(var->IsType<LoDTensor>() || var->IsType<SelectedRows>())) {
continue;
}
auto& var_name = new_ins[var_name_item.first].at(i);
auto tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
if (!tensor_in->IsInitialized()) {
continue;
}
auto kernel_type_for_var = // the true kernel type for op_base
static_cast<const framework::OperatorWithKernel*>(op_base)
->GetKernelTypeForVar(var_name_item.first, *tensor_in,
expected_kernel_key);
if (need_place_transform_for_var(kernel_type_for_var,
expected_kernel_key)) {
if (op_base->Type() == "fetch_v2") {
op_base->SetAttr("deepcopy", false);
}
std::string new_var_name;
OpFuncNode copy_op_func_node;
std::tie(new_var_name, copy_op_func_node) =
apply_place_transform_for_var(kernel_type_for_var,
expected_kernel_key, place, var_name,
var_name_item.first, *op_func_node,
var, var_scope, use_local_scope);
op_func_node->input_index[var_name_item.first][i] =
var_scope->VarId(new_var_name);
copy_func_nodes->emplace_back(copy_op_func_node);
var_name_item.second[i] = var_scope->Var(new_var_name);
new_ins[var_name_item.first][i] = new_var_name;
} else if (need_dtype_transform_for_var(kernel_type_for_var,
expected_kernel_key)) {
// TODO(@xiongkun) add dtype judgement here
} else if (need_layout_transform_for_var(kernel_type_for_var,
expected_kernel_key)) {
// TODO(@xiongkun) add layout judgement here
} else {
// record no need data transformer input var_id
VLOG(3) << op_base->Type()
<< " found no data_transform var: " << var_name
<< " with id: " << var_scope->VarId(var_name);
no_data_transform_index.emplace(var_scope->VarId(var_name));
}
}
}
// NOTE(zhiqiu): UPDATE the corresponding OperatorBase to make it consistent
// with instruction
// hot fix, it is not good design here
op_func_node->operator_base_ =
std::shared_ptr<OperatorBase>(framework::OpRegistry::CreateOp(
op_base->Type(), new_ins, op_base->Outputs(), op_base->Attrs()));
op_func_node->no_data_transform_index = std::move(no_data_transform_index);
}
void build_op_func_list(const platform::Place& place,
                        const framework::BlockDesc& block,
                        std::vector<OpFuncNode>* vec_func_list,
@@ -498,6 +293,7 @@ void build_op_func_list(const platform::Place& place,
    // step 2: build OpFuncNode
    OpFuncNode op_func_node;
    op_func_node.operator_base_ = ops[i];
    op_func_node.input_index = ins_name2id;
    op_func_node.output_index = outs_name2id;
@@ -538,16 +334,13 @@ void build_op_func_list(const platform::Place& place,
                            &expected_kernel_key);  // change device by the device_guard()
    VLOG(3) << "expected_kernel_key : " << expected_kernel_key;

    // step 3. apply data transforms and insert data transfer ops
    VariableValueMap& ins_map_temp = runtime_context.inputs;
    std::vector<OpFuncNode> new_op_func_nodes;
    ApplyDataTransform(expected_kernel_key, place, &ins_map_temp, var_scope,
                       &op_func_node, &new_op_func_nodes, use_local_scope);
    for (auto& item : new_op_func_nodes) {
      vec_func_list->emplace_back(std::move(item));
    }
    // step 4. Run op kernel
    VLOG(3) << op->Type()
@@ -660,12 +453,13 @@ void update_var_min_rw_op(const std::map<int, std::set<int>>& op2dependences,
                          int cur_op, int rw_var) {
  // rw_var is inputs or outputs of cur_op
  // this function updates the var2min_rw_op set.
  if (var2min_rw_op->find(rw_var) == var2min_rw_op->end()) {
    (*var2min_rw_op)[rw_var] = std::list<int>();
  }
  for (auto dep_op : op2dependences.at(cur_op)) {
    var2min_rw_op->at(rw_var).remove(dep_op);
  }
  var2min_rw_op->at(rw_var).push_back(cur_op);
}

std::map<int, std::list<int>> get_downstream_map(
......
@@ -94,9 +94,6 @@ class AsyncWorkQueue {
  AtomicVectorSizeT atomic_var_ref_;
};
std::string get_memcpy_type(const platform::Place& src_place,
const platform::Place& dst_place);
void build_variable_scope(const framework::BlockDesc& block,
                          VariableScope* var_scope,
                          bool use_local_scope = true);
......
@@ -112,16 +112,23 @@ class CastOp : public framework::OperatorWithKernel {
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;

#define REGISTER_CAST_CPU_BASE(op_name, ...)                                   \
  REGISTER_OPERATOR(op_name, ops::CastOp,                                      \
                    ops::CastOpGradMaker<paddle::framework::OpDesc>,           \
                    ops::CastOpGradMaker<paddle::imperative::OpBase>,          \
                    ops::CastOpProtoMaker);                                    \
  REGISTER_OP_CPU_KERNEL(                                                      \
      op_name, ops::CastOpKernel<CPU, float>, ops::CastOpKernel<CPU, double>,  \
      ops::CastOpKernel<CPU, int>, ops::CastOpKernel<CPU, int64_t>,            \
      ops::CastOpKernel<CPU, int>, ops::CastOpKernel<CPU, int16_t>,            \
      ops::CastOpKernel<CPU, bool>, ops::CastOpKernel<CPU, uint8_t>,           \
      ops::CastOpKernel<CPU, paddle::platform::float16>,                       \
      ops::CastOpKernel<CPU, paddle::platform::bfloat16>,                      \
      ops::CastOpKernel<CPU, paddle::platform::complex<float>>,                \
      ops::CastOpKernel<CPU, paddle::platform::complex<double>>);
REGISTER_CAST_CPU_BASE(cast)
// [ why register transfer_dtype_op alias with cast_op? ]
// In the case of InterpreterCore, if we reuse cast_op, we cannot distinguish
// which cast_op was inserted by the new executor when profiling.
REGISTER_CAST_CPU_BASE(transfer_dtype)
@@ -107,6 +107,9 @@ namespace plat = paddle::platform;
#if !defined(PADDLE_WITH_HIP)
REGISTER_CAST_CUDA_BASE(cast, ops::CastCUDAOpKernel<plat::bfloat16>)
// See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc
REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastCUDAOpKernel<plat::bfloat16>)
#else
REGISTER_CAST_CUDA_BASE(cast)
REGISTER_CAST_CUDA_BASE(transfer_dtype)
#endif
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/transfer_layout_op.h"
#include <string>
namespace paddle {
namespace framework {
class OpDesc;
class InferShapeContext;
template <typename T>
class EmptyGradOpMaker;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
} // namespace paddle
namespace paddle {
namespace operators {
class TransferLayoutOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "TransferLayout");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TransferLayout");
auto dst_layout = ctx->Attrs().Get<int>("dst_layout");
auto low_bound = static_cast<int>(framework::DataLayout::kNHWC);
auto upper_bound = static_cast<int>(framework::DataLayout::kMKLDNN);
PADDLE_ENFORCE_GE(
dst_layout, low_bound,
platform::errors::PreconditionNotMet(
"Required dst_layout >= %d, but received dst_layout = %d",
low_bound, dst_layout));
PADDLE_ENFORCE_LE(
dst_layout, upper_bound,
platform::errors::PreconditionNotMet(
"Required dst_layout <= %d, but received dst_layout = %d",
upper_bound, dst_layout));
// TODO(Aurelius84): Out's ddim is different from X's because they have
// different layouts
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
// kernel's device type is decided by input tensor place
auto *in = ctx.InputVar("X");
auto *in_tensor = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in);
PADDLE_ENFORCE_EQ(in_tensor->IsInitialized(), true,
platform::errors::PreconditionNotMet(
"The tensor of Input(X) is not initialized."));
// dtype is not important
return framework::OpKernelType(framework::proto::VarType::FP32,
in_tensor->place());
}
framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const override {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(),
expected_kernel_type.data_layout_);
}
};
class TransferLayoutInferVarType : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext *ctx) const override {
ctx->SyncTypeAndDataType("X", "Out");
}
};
class TransferLayoutKernel {
public:
void operator()(const framework::ExecutionContext &ctx) const {
auto *x = ctx.InputVar("X");
auto *out = ctx.OutputVar("Out");
auto &dev_ctx = ctx.device_context();
auto dst_layout = ctx.Attr<int>("dst_layout");
TransferLayoutFunctor(x, out, dev_ctx, dst_layout)();
}
};
class TransferLayoutOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(LoDTensor) The input Tensor");
AddOutput("Out", "(LoDTensor) The Output Tensor with desired layout");
AddAttr<int>("dst_layout",
"kNHWC = 0, kNCHW = 1, kAnyLayout = 2, kMKLDNN = 3");
AddComment(R"DOC(
TransferLayout Operator)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(
transfer_layout, ops::TransferLayoutOp, ops::TransferLayoutOpProtoMaker,
ops::TransferLayoutInferVarType,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
// dtype is not important
REGISTER_OP_CPU_KERNEL_FUNCTOR(transfer_layout, float,
ops::TransferLayoutKernel);
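With the op registered, it can also be appended to a static program by hand. The sketch below is a hypothetical usage (the new executor normally inserts the op automatically); the input/output/attribute names follow TransferLayoutOpProtoMaker, while the variable names and the manual append_op call are assumptions for illustration:

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
main_prog = fluid.Program()
with fluid.program_guard(main_prog):
    x = paddle.static.data(name='x', shape=[2, 3, 10, 10], dtype='float32')
    block = main_prog.global_block()
    out = block.create_var(name='x_nhwc', dtype='float32')
    # dst_layout follows the ProtoMaker encoding:
    # kNHWC = 0, kNCHW = 1, kAnyLayout = 2, kMKLDNN = 3
    block.append_op(
        type='transfer_layout',
        inputs={'X': x},
        outputs={'Out': out},
        attrs={'dst_layout': 0})

exe = paddle.static.Executor(paddle.CPUPlace())
res = exe.run(main_prog,
              feed={'x': np.random.random([2, 3, 10, 10]).astype('float32')},
              fetch_list=[out])
print(res[0].shape)  # expected: (2, 10, 10, 3)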
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
class LoDTensor;
class Variable;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace operators {
using DataLayout = framework::DataLayout;
class TransferLayoutFunctor {
public:
TransferLayoutFunctor(const framework::Variable *in, framework::Variable *out,
const platform::DeviceContext &dev_ctx,
const int dst_layout)
: in_(in), out_(out), dev_ctx_(dev_ctx), dst_layout_(dst_layout) {}
void operator()() const {
auto &in_tensor = *framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_);
framework::LoDTensor out_tensor;
auto out_layout = static_cast<DataLayout>(dst_layout_);
out_tensor.set_layout(out_layout);
#ifdef PADDLE_WITH_MKLDNN
auto in_layout = in_tensor.layout();
if (in_layout == DataLayout::kMKLDNN || out_layout == DataLayout::kMKLDNN) {
PADDLE_ENFORCE_NE(
in_layout, out_layout,
platform::errors::PreconditionNotMet(
"No layout transform needed between two MKLDNN OPKernels."));
if (in_layout != DataLayout::kMKLDNN &&
out_layout == DataLayout::kMKLDNN) {
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occurs
auto out_format = platform::MKLDNNFormatForSize(
in_tensor.dims().size(), ToMKLDNNFormat(in_layout));
out_tensor.ShareDataWith(in_tensor);
// For NHWC data we need reshape of tensors as MKL-DNN
// is expecting NHWC dims description order
platform::MatchShapeToLayout(&out_tensor, in_layout, out_layout);
paddle::platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout(
in_layout);
out_tensor.set_layout(DataLayout::kMKLDNN);
out_tensor.set_format(out_format);
} else {
// Case2 - transform from MKLDNN OPKernel to Non-MKLDNN OPKernel
// Do transform via MKLDNN lib
innerTransDataLayoutFromMKLDNN(
in_layout, paddle::platform::MKLDNNDeviceContext::tls()
.get_cur_paddle_data_layout(),
in_tensor, &out_tensor, dev_ctx_.GetPlace());
}
} else {
// Case3 - transform between Non-MKLDNN OPKernels
TransDataLayout(dev_ctx_, in_tensor, &out_tensor);
}
#else
// Case3 - transform between Non-MKLDNN OPKernels
TransDataLayout(dev_ctx_, in_tensor, &out_tensor);
#endif
framework::SetTensorToVariable(*in_, out_tensor, out_);
}
private:
void TransDataLayout(const platform::DeviceContext &dev_ctx,
const framework::Tensor &in,
framework::Tensor *out) const {
PADDLE_ENFORCE_EQ(
framework::arity(in.dims()), 4,
platform::errors::InvalidArgument(
"Input dimension arity only can be 4, the input dimension is %s.",
in.dims()));
auto src_dim = in.dims();
std::vector<int64_t> dst_dim;
auto axis = framework::GetAxis(in.layout(), out->layout());
dst_dim.resize(axis.size());
for (size_t i = 0; i < axis.size(); i++) {
dst_dim[i] = src_dim[axis[i]];
}
out->Resize(framework::make_ddim(dst_dim));
out->mutable_data(in.place(), in.type());
framework::VisitDataType(
in.type(), framework::CastDataLayout(&dev_ctx, axis, in, out));
}
const framework::Variable *in_;
framework::Variable *out_;
const platform::DeviceContext &dev_ctx_;
const int dst_layout_;
};
} // namespace operators
} // namespace paddle
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
from op_test import OpTest, convert_uint16_to_float, convert_float_to_uint16
class TestTransferDtypeOpFp32ToFp64(OpTest):
def setUp(self):
ipt = np.random.random(size=[10, 10])
self.inputs = {'X': ipt.astype('float32')}
self.outputs = {'Out': ipt.astype('float64')}
self.attrs = {
'out_dtype': int(core.VarDesc.VarType.FP64),
'in_dtype': int(core.VarDesc.VarType.FP32)
}
self.op_type = 'transfer_dtype'
def test_check_output(self):
self.check_output()
class TestTransferDtypeOpFp16ToFp32(OpTest):
def setUp(self):
ipt = np.random.random(size=[10, 10])
self.inputs = {'X': ipt.astype('float16')}
self.outputs = {'Out': ipt.astype('float32')}
self.attrs = {
'out_dtype': int(core.VarDesc.VarType.FP32),
'in_dtype': int(core.VarDesc.VarType.FP16)
}
self.op_type = 'transfer_dtype'
def test_check_output(self):
self.check_output(atol=1e-3)
class TestTransferDtypeOpFp32ToFp16(OpTest):
def setUp(self):
ipt = np.random.random(size=[10, 10])
self.inputs = {'X': ipt.astype('float32')}
self.outputs = {'Out': ipt.astype('float16')}
self.attrs = {
'out_dtype': int(core.VarDesc.VarType.FP16),
'in_dtype': int(core.VarDesc.VarType.FP32)
}
self.op_type = 'transfer_dtype'
def test_check_output(self):
self.check_output(atol=1e-3)
class TestTransferDtypeOpBf16ToFp32(OpTest):
def setUp(self):
ipt = np.array(np.random.randint(10, size=[10, 10])).astype('uint16')
self.inputs = {'X': ipt}
self.outputs = {'Out': convert_uint16_to_float(ipt)}
self.attrs = {
'out_dtype': int(core.VarDesc.VarType.FP32),
'in_dtype': int(core.VarDesc.VarType.BF16)
}
self.op_type = 'transfer_dtype'
def test_check_output(self):
self.check_output()
class TestTransferDtypeFp32ToBf16(OpTest):
def setUp(self):
ipt = np.random.random(size=[10, 10]).astype('float32')
self.inputs = {'X': ipt}
self.outputs = {'Out': convert_float_to_uint16(ipt)}
self.attrs = {
'out_dtype': int(core.VarDesc.VarType.BF16),
'in_dtype': int(core.VarDesc.VarType.FP32)
}
self.op_type = 'transfer_dtype'
def test_check_output(self):
self.check_output()
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
from op_test import OpTest
# default kNCHW
class TestTransferLayoutOpkNCHWTokNHWC(OpTest):
def setUp(self):
ipt = np.random.random(size=[2, 3, 10, 10])
self.inputs = {'X': ipt.astype('float32')}
self.outputs = {'Out': ipt.transpose([0, 2, 3, 1])}
self.attrs = {
'dst_layout': 0 # kNHWC
}
self.op_type = 'transfer_layout'
def test_check_output(self):
self.check_output()
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
@@ -14,6 +14,8 @@
STATIC_MODE_TESTING_LIST = [
    'test_affine_channel_op',
    'test_transfer_dtype_op',
    'test_transfer_layout_op',
    'test_concat_op',
    'test_elementwise_add_op',
    'test_elementwise_sub_op',
......