diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index 1701359b045999a90ffb96a3106a86025dba828c..688835cc3c93b03f06abc539c276f0b668a36259 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -37,30 +37,19 @@ std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to) {
   }
 }
 
-struct CastDataLayout {
-  CastDataLayout(const platform::DeviceContext* ctx,
-                 const std::vector<int>& axis, const framework::Tensor& in,
-                 framework::Tensor* out)
-      : in_(in), out_(out), ctx_(ctx), axis_(axis) {}
-  const framework::Tensor in_;
-  framework::Tensor* out_;
-  const platform::DeviceContext* ctx_;
-  const std::vector<int> axis_;
-
-  template <typename T>
-  void apply() {
-    auto place = ctx_->GetPlace();
-
-    if (platform::is_cpu_place(place)) {
-      operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
-      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
-      trans4(*context, in_, out_, axis_);
-    } else {
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "Unsupported data layout cast from CPU to GPU."));
-    }
+template <typename T>
+void CastDataLayout::apply() {
+  auto place = ctx_->GetPlace();
+
+  if (platform::is_cpu_place(place)) {
+    operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
+    auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
+    trans4(*context, in_, out_, axis_);
+  } else {
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "Unsupported data layout cast from CPU to GPU."));
   }
-};
+}
 
 void TransDataLayout(const OpKernelType& kernel_type_for_var,
                      const OpKernelType& expected_kernel_type, const Tensor& in,
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
index 3c58a2d09f2599072c550244b287b3fb64450fef..f7b4a36d2f400173e4e2d0f16ac112456f249212 100644
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -36,6 +36,21 @@ class Tensor;
 namespace paddle {
 namespace framework {
 
+struct CastDataLayout {
+  CastDataLayout(const platform::DeviceContext* ctx,
+                 const std::vector<int>& axis, const framework::Tensor& in,
+                 framework::Tensor* out)
+      : in_(in), out_(out), ctx_(ctx), axis_(axis) {}
+
+  const framework::Tensor in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext* ctx_;
+  const std::vector<int> axis_;
+
+  template <typename T>
+  void apply();
+};
+
 #ifdef PADDLE_WITH_MKLDNN
 using MKLDNNDataType = dnnl::memory::data_type;
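Note (illustration, not part of the patch): CastDataLayout's definition moves into the header so other translation units, such as the new transfer_layout op below, can dispatch it by runtime dtype. A minimal usage sketch follows, assuming the Paddle framework headers; TransposeToLayout is a hypothetical helper name.

#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/data_type.h"

// Dispatch CastDataLayout::apply<T>() over the tensor's runtime dtype.
// Assumes `out` already carries the target layout and an allocated 4-D shape.
void TransposeToLayout(const paddle::platform::DeviceContext& dev_ctx,
                       const paddle::framework::Tensor& in,
                       paddle::framework::Tensor* out) {
  namespace fw = paddle::framework;
  // Axis permutation between the two layouts, e.g. kNCHW -> kNHWC.
  auto axis = fw::GetAxis(in.layout(), out->layout());
  fw::VisitDataType(in.type(), fw::CastDataLayout(&dev_ctx, axis, in, out));
}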
diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt
index 622aeec142c3a2d50d2d07e4c4afaf46e46ff011..59f027cf4894f3783cf958f549e6363f1602ca42 100644
--- a/paddle/fluid/framework/new_executor/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -2,10 +2,11 @@ set(INTERPRETERCORE_DEPS op_registry device_context scope framework_proto data_f
 lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method graph_to_program_pass variable_helper timer monitor nan_inf_utils)
 
+cc_library(data_transfer SRCS data_transfer.cc DEPS enforce scope glog)
 cc_library(workqueue SRCS workqueue.cc workqueue_utils.cc DEPS enforce)
 cc_library(new_executor_defs SRCS new_executor_defs.cc DEPS enforce glog scope)
 cc_library(interpretercore_garbage_collector SRCS interpretercore_garbage_collector.cc DEPS workqueue ${DEVICE_EVENT_LIBS} executor_gc_helper)
-cc_library(interpretercore_util SRCS interpretercore_util.cc DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs)
+cc_library(interpretercore_util SRCS interpretercore_util.cc DEPS
+${INTERPRETERCORE_DEPS} workqueue new_executor_defs data_transfer)
 cc_library(event_manager SRCS event_manager.cc DEPS ${DEVICE_EVENT_LIBS} glog new_executor_defs)
 cc_library(stream_analyzer SRCS stream_analyzer.cc DEPS ${DEVICE_EVENT_LIBS} glog device_context new_executor_defs)
 cc_library(interpretercore SRCS interpretercore.cc DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util interpretercore_garbage_collector stream_analyzer event_manager)
diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7b6ea752b12a7934a0e20e2df6d94d350b022600
--- /dev/null
+++ b/paddle/fluid/framework/new_executor/data_transfer.cc
@@ -0,0 +1,305 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/new_executor/data_transfer.h"
+
+namespace paddle {
+namespace framework {
+namespace interpreter {
+
+bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
+                              const OpKernelType& expected_kernel_key,
+                              const std::string& var_name,
+                              std::string* new_var_name,
+                              std::vector<OpFuncNode>* op_func_nodes,
+                              bool use_local_scope) {
+  bool is_transferred = false;
+  auto* src_var_name = &var_name;
+
+  Scope* local_scope = use_local_scope ? var_scope_->GetMutableLocalScope()
+                                       : var_scope_->GetMutableScope();
+
+  // 1. layout transform
+  if (need_layout_transform(kernel_type_for_var, expected_kernel_key)) {
+    auto op = TransferLayout(
+        *src_var_name, new_var_name, kernel_type_for_var.data_layout_,
+        expected_kernel_key.data_layout_, var_scope_, local_scope);
+    RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
+    // update src_var_name
+    src_var_name = new_var_name;
+    is_transferred = true;
+  }
+  // 2. dtype transform
+  if (need_dtype_transform(kernel_type_for_var, expected_kernel_key)) {
+    auto op = TransferDtype(
+        *src_var_name, new_var_name, kernel_type_for_var.data_type_,
+        expected_kernel_key.data_type_, var_scope_, local_scope);
+    RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
+    // update src_var_name
+    src_var_name = new_var_name;
+    is_transferred = true;
+  }
+  // 3. device transform
+  if (need_device_transform(kernel_type_for_var, expected_kernel_key)) {
+    auto src_place = kernel_type_for_var.place_;
+    auto dst_place = expected_kernel_key.place_;
+    auto op = TransferDevice(*src_var_name, new_var_name, src_place, dst_place,
+                             var_scope_, local_scope);
+    RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
+    is_transferred = true;
+  }
+  return is_transferred;
+}
+
+void DataTranferHelper::RunAndConstructOpFuncNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string& var_name,
+    const std::string& new_var_name,
+    std::vector<OpFuncNode>* new_op_func_nodes) {
+  auto& op_type = op->Type();
+
+  // 1. Construct RuntimeContext
+  RuntimeContext runtime_context({}, {});
+  runtime_context.inputs["X"] = {var_scope_->Var(var_name)};
+  runtime_context.outputs["Out"] = {var_scope_->Var(new_var_name)};
+  InterpretercoreInferShapeContext infer_shape_ctx(*op, runtime_context);
+
+  // 2. Run InferShape and choose the kernel
+  auto& all_op_kernels = OperatorWithKernel::AllOpKernels();
+  static_cast<framework::OperatorWithKernel*>(op.get())->InferShape(
+      &infer_shape_ctx);
+  auto kernels_iter = all_op_kernels.find(op_type);
+  PADDLE_ENFORCE_NE(kernels_iter, all_op_kernels.end(),
+                    platform::errors::Unavailable(
+                        "There are no kernels registered for the %s operator.",
+                        op_type));
+  OpKernelMap& kernels = kernels_iter->second;
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(place_);
+  Scope scope;
+  auto exec_ctx = ExecutionContext(*op, scope, *dev_ctx, runtime_context);
+  auto expected_kernel_key =
+      dynamic_cast<framework::OperatorWithKernel*>(op.get())
+          ->GetExpectedKernelType(exec_ctx);
+  auto kernel_iter = kernels.find(expected_kernel_key);
+
+  // 3. Execute the transfer op and construct an OpFuncNode
+  OpFuncNode new_op_func_node;
+  new_op_func_node.input_index["X"] = {var_scope_->VarId(var_name)};
+  new_op_func_node.output_index["Out"] = {var_scope_->VarId(new_var_name)};
+  new_op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second);
+  new_op_func_node.kernel_func_(exec_ctx);
+  // NOTE(Aurelius84): data transfer ops are expensive operations, so we tag
+  // them as kQueueSync and execute them in the thread pool.
+  new_op_func_node.type_ = OpFuncType::kQueueSync;
+  new_op_func_node.dev_ctx_ = dev_ctx;
+  new_op_func_node.operator_base_ = op;
+  VLOG(3) << "Run " << op_type << " done.";
+
+  new_op_func_nodes->emplace_back(std::move(new_op_func_node));
+}
+
+std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
+                                             std::string* new_var_name,
+                                             DataLayout in_layout,
+                                             DataLayout out_layout,
+                                             VariableScope* var_scope,
+                                             framework::Scope* local_scope) {
+  // 1. Generate new_var_name and initialize it
+  *new_var_name =
+      var_name + "_layout_" + std::to_string(var_scope->VarSize() + 1);
+  auto* ptr = local_scope->Var(*new_var_name);
+
+  auto var_type = var_scope->Var(var_name)->Type();
+  InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));
+  VLOG(3) << "Create Variable " << *new_var_name
+          << " locally, which pointer is " << ptr << ", variable type is "
+          << var_type;
+  var_scope->SetVarDesc(var_name, nullptr);
+
+  // 2. Construct VariableNameMap
+  VariableNameMap in_name_map = {{"X", {var_name}}};
+  VariableNameMap out_name_map = {{"Out", {*new_var_name}}};
+  AttributeMap attr_map = {{"dst_layout", static_cast<int>(out_layout)}};
+
+  // 3. Create transfer_op
+  std::string op_type("transfer_layout");
+  auto& op_info = OpInfoMap::Instance().Get(op_type);
+  auto op = std::shared_ptr<OperatorBase>(
+      op_info.Creator()(op_type, in_name_map, out_name_map, attr_map));
+
+  VLOG(3) << string::Sprintf("Insert %s(%s) with %s -> %s(%s).", op_type,
+                             var_name, in_layout, *new_var_name, out_layout);
+  return op;
+}
+
+std::shared_ptr<OperatorBase> TransferDtype(const std::string& var_name,
+                                            std::string* new_var_name,
+                                            proto::VarType::Type in_dtype,
+                                            proto::VarType::Type out_dtype,
+                                            VariableScope* var_scope,
+                                            framework::Scope* local_scope) {
+  // 1. Generate new_var_name and initialize it
+  *new_var_name =
+      var_name + "_dtype_" + std::to_string(var_scope->VarSize() + 1);
+  auto* ptr = local_scope->Var(*new_var_name);
+
+  auto var_type = var_scope->Var(var_name)->Type();
+  InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));
+  VLOG(3) << "Create Variable " << *new_var_name
+          << " locally, which pointer is " << ptr << ", variable type is "
+          << var_type;
+  var_scope->SetVarDesc(var_name, nullptr);
+
+  // 2. Construct VariableNameMap
+  VariableNameMap in_name_map = {{"X", {var_name}}};
+  VariableNameMap out_name_map = {{"Out", {*new_var_name}}};
+  AttributeMap attr_map;
+  attr_map["in_dtype"] = static_cast<int>(in_dtype);
+  attr_map["out_dtype"] = static_cast<int>(out_dtype);
+  // NOTE(Aurelius84): In which case is use_mkldnn = true?
+  attr_map["use_mkldnn"] = false;
+
+  // 3. Create transfer_op
+  std::string op_type("transfer_dtype");
+  auto& op_info = OpInfoMap::Instance().Get(op_type);
+  auto op = std::shared_ptr<OperatorBase>(
+      op_info.Creator()(op_type, in_name_map, out_name_map, attr_map));
+
+  VLOG(3) << string::Sprintf("Insert %s with %s(%s) -> %s(%s).", op_type,
+                             var_name, DataTypeToString(in_dtype),
+                             *new_var_name, DataTypeToString(out_dtype));
+  return op;
+}
+
+std::shared_ptr<OperatorBase> TransferDevice(const std::string& var_name,
+                                             std::string* new_var_name,
+                                             const platform::Place& src_place,
+                                             const platform::Place& dst_place,
+                                             VariableScope* var_scope,
+                                             framework::Scope* local_scope) {
+  // 1. Generate new_var_name and initialize it
+  *new_var_name =
+      var_name + "_device_" + std::to_string(var_scope->VarSize() + 1);
+  auto* ptr = local_scope->Var(*new_var_name);
+
+  auto var_type = var_scope->Var(var_name)->Type();
+  InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));
+  VLOG(3) << "Create Variable " << *new_var_name
+          << " locally, which pointer is " << ptr << ", variable type is "
+          << var_type;
+  var_scope->SetVarDesc(var_name, nullptr);
+
+  // 2. Construct VariableNameMap
+  VariableNameMap in_name_map = {{"X", {var_name}}};
+  VariableNameMap out_name_map = {{"Out", {*new_var_name}}};
+  int dst_place_type = platform::is_cpu_place(dst_place)
+                           ? 0
+                           : platform::is_gpu_place(dst_place) ? 1 : -1;
+  AttributeMap attr_map = {{"dst_place_type", dst_place_type}};
+
+  // 3. Create transfer_op
+  std::string op_type = get_memcpy_type(src_place, dst_place);
+  auto& op_info = OpInfoMap::Instance().Get(op_type);
+  auto op = std::shared_ptr<OperatorBase>(
+      op_info.Creator()(op_type, in_name_map, out_name_map, attr_map));
+
+  VLOG(3) << string::Sprintf("Insert %s with %s(%s) -> %s(%s).", op_type,
+                             var_name, src_place, *new_var_name, dst_place);
+  return op;
+}
+
+void ApplyDataTransform(const OpKernelType& expected_kernel_key,
+                        const platform::Place& place,
+                        VariableValueMap* ins_map_temp,
+                        VariableScope* var_scope, OpFuncNode* op_func_node,
+                        std::vector<OpFuncNode>* new_op_func_nodes,
+                        bool use_local_scope) {
+  auto op_base = op_func_node->operator_base_.get();
+  PADDLE_ENFORCE_NOT_NULL(op_base, platform::errors::PreconditionNotMet(
+                                       "op_base is null, please pass a valid "
+                                       "op_base in ApplyDataTransform."));
+
+  VariableNameMap new_ins(op_base->Inputs());
+  // Record the indices of input variables that need no data transform.
+  std::unordered_set<int> no_data_transform_index;
+
+  DataTranferHelper data_transfer_helper(place, var_scope);
+  for (auto& var_name_item : *ins_map_temp) {
+    for (size_t i = 0; i < var_name_item.second.size(); ++i) {
+      auto var = var_name_item.second[i];
+      if (!(var->IsType<LoDTensor>() || var->IsType<SelectedRows>())) {
+        continue;
+      }
+      auto& var_name = new_ins[var_name_item.first].at(i);
+      auto tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+      if (!tensor_in->IsInitialized()) {
+        continue;
+      }
+      auto kernel_type_for_var =
+          static_cast<const framework::OperatorWithKernel*>(op_base)
+              ->GetKernelTypeForVar(var_name_item.first, *tensor_in,
+                                    expected_kernel_key);
+      // apply data transform
+      std::string new_var_name;
+      bool is_transferred = data_transfer_helper.apply(
+          kernel_type_for_var, expected_kernel_key, var_name, &new_var_name,
+          new_op_func_nodes, use_local_scope);
+
+      if (is_transferred) {
+        // update RuntimeContext.inputs and the original op_func_node inputs
+        op_func_node->input_index[var_name_item.first][i] =
+            var_scope->VarId(new_var_name);
+        var_name_item.second[i] = var_scope->Var(new_var_name);
+        new_ins[var_name_item.first][i] = new_var_name;
+        // NOTE(Aurelius84): avoid deep-copying twice if we already inserted a
+        // data transfer op.
+        if (op_base->Type() == "fetch_v2") {
+          op_base->SetAttr("deepcopy", false);
+        }
+      } else {
+        // record the input var_id that needs no data transform
+        VLOG(3) << op_base->Type()
+                << " found no data_transform var: " << var_name
+                << " with id: " << var_scope->VarId(var_name);
+        no_data_transform_index.emplace(var_scope->VarId(var_name));
+      }
+    }
+  }
+
+  // NOTE(zhiqiu): UPDATE the corresponding OperatorBase to make it consistent
+  // with the instruction. (hot fix, this is not a good design here)
+  op_func_node->operator_base_ =
+      std::shared_ptr<OperatorBase>(framework::OpRegistry::CreateOp(
+          op_base->Type(), new_ins, op_base->Outputs(), op_base->Attrs()));
+  op_func_node->no_data_transform_index = std::move(no_data_transform_index);
+}
+
+std::string get_memcpy_type(const platform::Place& src_place,
+                            const platform::Place& dst_place) {
+  PADDLE_ENFORCE_EQ(platform::is_same_place(src_place, dst_place), false,
+                    platform::errors::PreconditionNotMet(
+                        "Required src_place shall be different from "
+                        "dst_place, but received the same place: %s",
+                        src_place));
+  if (platform::is_gpu_place(dst_place)) {
+    return kMemcpyH2D;
+  } else if (platform::is_gpu_place(src_place)) {
+    return kMemcpyD2H;
+  } else {
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "Unsupported Memcpy type: %s -> %s", src_place, dst_place));
+  }
+}
+
+}  // namespace interpreter
+}  // namespace framework
+}  // namespace paddle
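A quick illustration of the get_memcpy_type() contract above (a minimal sketch, not part of the patch; assumes a CUDA build, and the returned strings are the kMemcpyH2D/kMemcpyD2H op-name constants defined by the new executor):

#include "paddle/fluid/framework/new_executor/data_transfer.h"
#include "paddle/fluid/platform/place.h"

// Sketch of the mapping implemented above; the places are arbitrary examples.
void MemcpyTypeExamples() {
  using paddle::framework::interpreter::get_memcpy_type;
  paddle::platform::CPUPlace cpu;
  paddle::platform::CUDAPlace gpu(0);
  get_memcpy_type(cpu, gpu);  // kMemcpyH2D: the destination is a GPU place
  get_memcpy_type(gpu, cpu);  // kMemcpyD2H: the source is a GPU place
  // Identical places violate the precondition; CPU -> CPU throws.
}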
diff --git a/paddle/fluid/framework/new_executor/data_transfer.h b/paddle/fluid/framework/new_executor/data_transfer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d66188709f726e78b09b0fe24ba36c45ffdd97bc
--- /dev/null
+++ b/paddle/fluid/framework/new_executor/data_transfer.h
@@ -0,0 +1,107 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
+#include "paddle/fluid/framework/op_kernel_type.h"
+
+namespace paddle {
+namespace framework {
+namespace interpreter {
+
+/*
+ * A helper class that implements the data transform operations.
+ * It applies layout, dtype and device transfers in turn.
+ */
+class DataTranferHelper {
+ public:
+  DataTranferHelper(const platform::Place& place, VariableScope* var_scope)
+      : place_(place), var_scope_(var_scope) {}
+
+  bool apply(const OpKernelType& kernel_type_for_var,
+             const OpKernelType& expected_kernel_key,
+             const std::string& var_name, std::string* new_var_name,
+             std::vector<OpFuncNode>* new_op_func_nodes, bool use_local_scope);
+
+ private:
+  platform::Place place_;
+  VariableScope* var_scope_;
+
+  void RunAndConstructOpFuncNode(const std::shared_ptr<OperatorBase>& op,
+                                 const std::string& var_name,
+                                 const std::string& new_var_name,
+                                 std::vector<OpFuncNode>* op_func_nodes);
+};
+
+void ApplyDataTransform(const OpKernelType& expected_kernel_key,
+                        const platform::Place& place,
+                        VariableValueMap* ins_map_temp,
+                        VariableScope* var_scope, OpFuncNode* op_func_node,
+                        std::vector<OpFuncNode>* op_func_nodes,
+                        bool use_local_scope = true);
+
+std::string get_memcpy_type(const platform::Place& src_place,
+                            const platform::Place& dst_place);
+
+inline bool need_device_transform(const OpKernelType& kernel_type_for_var,
+                                  const OpKernelType& expected_kernel_key) {
+  auto& src_place = kernel_type_for_var.place_;
+  auto& dst_place = expected_kernel_key.place_;
+  if (platform::is_same_place(src_place, dst_place) ||
+      (platform::is_cuda_pinned_place(src_place) &&
+       platform::is_cpu_place(dst_place))) {
+    return false;
+  }
+  return true;
+}
+
+inline bool need_dtype_transform(const OpKernelType& kernel_type_for_var,
+                                 const OpKernelType& expected_kernel_key) {
+  return framework::NeedTransformDataType(kernel_type_for_var,
+                                          expected_kernel_key);
+}
+
+inline bool need_layout_transform(const OpKernelType& kernel_type_for_var,
+                                  const OpKernelType& expected_kernel_key) {
+  return framework::NeedTransformLayout(kernel_type_for_var.data_layout_,
+                                        expected_kernel_key.data_layout_);
+}
+
+std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
+                                             std::string* new_var_name,
+                                             DataLayout in_layout,
+                                             DataLayout out_layout,
+                                             VariableScope* var_scope,
+                                             framework::Scope* local_scope);
+
+std::shared_ptr<OperatorBase> TransferDtype(const std::string& var_name,
+                                            std::string* new_var_name,
+                                            proto::VarType::Type in_dtype,
+                                            proto::VarType::Type out_dtype,
+                                            VariableScope* var_scope,
+                                            framework::Scope* local_scope);
+
+std::shared_ptr<OperatorBase> TransferDevice(const std::string& var_name,
+                                             std::string* new_var_name,
+                                             const platform::Place& src_place,
+                                             const platform::Place& dst_place,
+                                             VariableScope* var_scope,
+                                             framework::Scope* local_scope);
+
+}  // namespace interpreter
+}  // namespace framework
+}  // namespace paddle
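For reference, how the three predicates above behave (a minimal sketch, not part of the patch; the OpKernelType values are made up for illustration):

#include "paddle/fluid/framework/new_executor/data_transfer.h"

void TransformPredicateExamples() {
  namespace itp = paddle::framework::interpreter;
  namespace fw = paddle::framework;
  fw::OpKernelType var_key(fw::proto::VarType::FP32,
                           paddle::platform::CUDAPinnedPlace());
  fw::OpKernelType expected(fw::proto::VarType::FP32,
                            paddle::platform::CPUPlace());
  // CUDA-pinned memory is host-accessible, so no device transfer is needed:
  bool dev = itp::need_device_transform(var_key, expected);  // false
  // The dtypes match, so no dtype transfer is needed either:
  bool dty = itp::need_dtype_transform(var_key, expected);  // false
  (void)dev;
  (void)dty;
}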
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc
index 07743150b60038e89c88500a38daa73ae910afde..061b2e7806b85c1a30bb6e71ad597b116bd826ac 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -15,6 +15,7 @@
 #include <algorithm>
 
 #include "paddle/fluid/framework/executor_gc_helper.h"
+#include "paddle/fluid/framework/new_executor/data_transfer.h"
 
 namespace paddle {
 namespace framework {
@@ -114,23 +115,6 @@ get_unused_vars(const BlockDesc& block,
   return result;
 }
 
-std::string get_memcpy_type(const platform::Place& src_place,
-                            const platform::Place& dst_place) {
-  PADDLE_ENFORCE_EQ(platform::is_same_place(src_place, dst_place), false,
-                    platform::errors::PreconditionNotMet(
-                        "Required src_place shall be different with dst_place, "
-                        "but received same place: %s",
-                        src_place));
-  if (platform::is_gpu_place(dst_place)) {
-    return kMemcpyH2D;
-  } else if (platform::is_gpu_place(src_place)) {
-    return kMemcpyD2H;
-  } else {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Not support Memcpy typ : %s -> %s", src_place, dst_place));
-  }
-}
-
 void build_variable_scope(const framework::BlockDesc& block,
                           VariableScope* var_scope, bool use_local_scope) {
   VLOG(3) << "Creating Variables";
@@ -269,195 +253,6 @@ void deal_operator_base(const platform::Place& place,
   op_func_node->dev_ctx_ = dev_ctx;
 }
 
-// the return value is whether data transformer is needed for this var
-bool need_place_transform_for_var(const OpKernelType& kernel_type_for_var,
-                                  const OpKernelType& expected_kernel_key) {
-  if (platform::is_same_place(kernel_type_for_var.place_,
-                              expected_kernel_key.place_) ||
-      (is_cuda_pinned_place(kernel_type_for_var.place_) &&
-       is_cpu_place(expected_kernel_key.place_))) {
-    return false;
-  } else {
-    return true;
-  }
-}
-
-bool need_dtype_transform_for_var(const OpKernelType& kernel_type_for_var,
-                                  const OpKernelType& expected_kernel_key) {
-  return false;  // TODO(@xiongkun) add dtype judgement here
-}
-
-bool need_layout_transform_for_var(const OpKernelType& kernel_type_for_var,
-                                   const OpKernelType& expected_kernel_key) {
-  return false;  // TODO(@xiongkun) add layout judgement here
-}
-
-// NOTE(@xiongkun03)
-// the difference between var_name and outer_name :
-// if "X": ["var1", "var2"], then X is the outer name,
-// var1 and var2 is the var_name
-std::tuple<std::string, OpFuncNode> apply_place_transform_for_var(
-    const OpKernelType& kernel_type_for_var,
-    const OpKernelType& expected_kernel_key, const platform::Place& place,
-    const std::string& var_name, const std::string& outer_name,
-    const OpFuncNode& op_func_node, Variable* var, VariableScope* var_scope,
-    bool use_local_scope = true) {
-  Scope* local_scope = use_local_scope ? var_scope->GetMutableLocalScope()
-                                       : var_scope->GetMutableScope();
-
-  auto& all_op_kernels = OperatorWithKernel::AllOpKernels();
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  std::string new_var_name =
-      var_name + "_copy_" + std::to_string(var_scope->VarSize() + 1);
-
-  auto* ptr = local_scope->Var(new_var_name);
-  InitializeVariable(ptr, static_cast<proto::VarType::Type>(var->Type()));
-  VLOG(3) << "Create Variable " << var_name << " locally, which pointer is "
-          << ptr << "Variable Type " << var->Type();
-  var_scope->SetVarDesc(var_name, nullptr);
-
-  VariableNameMap copy_in_map;
-  copy_in_map["X"] = {var_name};
-  VariableNameMap copy_out_map;
-  copy_out_map["Out"] = {new_var_name};
-  AttributeMap attr_map;
-  attr_map["dst_place_type"] =
-      is_cpu_place(expected_kernel_key.place_)
-          ? 0
-          : is_gpu_place(expected_kernel_key.place_) ? 1 : -1;
-
-  std::map<std::string, std::vector<int>> copy_ins_name2id;
-  copy_ins_name2id["X"] = {var_scope->VarId(var_name)};
-  std::map<std::string, std::vector<int>> copy_out_name2id;
-  copy_out_name2id["Out"] = {var_scope->VarId(new_var_name)};
-
-  VariableValueMap copy_ins_value_map;
-  copy_ins_value_map["X"] = {var};
-  VariableValueMap copy_outs_value_map;
-  copy_outs_value_map["Out"] = {var_scope->Var(new_var_name)};
-
-  // memcpy_d2h, memcpy_h2d
-  auto memcpy_op_type =
-      get_memcpy_type(kernel_type_for_var.place_, expected_kernel_key.place_);
-  VLOG(3) << string::Sprintf("Insert %s with %s(%s) -> %s(%s).", memcpy_op_type,
-                             var_name, kernel_type_for_var.place_, new_var_name,
-                             expected_kernel_key.place_);
-  auto& copy_info = OpInfoMap::Instance().Get(memcpy_op_type);
-  auto copy_op = std::shared_ptr<OperatorBase>(
-      copy_info.Creator()(memcpy_op_type, copy_in_map, copy_out_map, attr_map));
-
-  OpFuncNode copy_op_func_node;
-  copy_op_func_node.input_index = copy_ins_name2id;
-  copy_op_func_node.output_index = copy_out_name2id;
-
-  RuntimeContext copy_runtime_context({}, {});
-  copy_runtime_context.inputs.swap(copy_ins_value_map);
-  copy_runtime_context.outputs.swap(copy_outs_value_map);
-  InterpretercoreInferShapeContext copy_infer_shape_ctx(*copy_op.get(),
-                                                        copy_runtime_context);
-  static_cast<framework::OperatorWithKernel*>(copy_op.get())
-      ->InferShape(&copy_infer_shape_ctx);
-
-  auto kernels_iter = all_op_kernels.find(memcpy_op_type);
-  PADDLE_ENFORCE_NE(kernels_iter, all_op_kernels.end(),
-                    platform::errors::Unavailable(
-                        "There are no kernels which are registered in "
-                        "the memcpy operator."));
-
-  OpKernelMap& kernels = kernels_iter->second;
-  auto* dev_ctx = pool.Get(place);
-  Scope scope;
-  auto copy_exec_ctx =
-      ExecutionContext(*copy_op, scope, *dev_ctx, copy_runtime_context);
-  auto copy_expected_kernel_key =
-      dynamic_cast<framework::OperatorWithKernel*>(copy_op.get())
-          ->GetExpectedKernelType(copy_exec_ctx);
-  auto kernel_iter = kernels.find(copy_expected_kernel_key);
-  copy_op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second);
-  copy_op_func_node.kernel_func_(copy_exec_ctx);
-  VLOG(3) << "Run " << memcpy_op_type << " done.";
-  // NOTE(Aurelius84): memcpy_op is expensive operation, so we tag them
-  // as kQueueSync and execute them in thread pool.
-  copy_op_func_node.type_ = OpFuncType::kQueueSync;
-  copy_op_func_node.dev_ctx_ = dev_ctx;
-  copy_op_func_node.operator_base_ = copy_op;
-
-  return std::make_pair(new_var_name, copy_op_func_node);
-}
-
-void apply_data_transform(const OpKernelType& expected_kernel_key,
-                          const platform::Place& place,
-                          VariableValueMap* ins_map_temp,
-                          VariableScope* var_scope, OpFuncNode* op_func_node,
-                          std::vector<OpFuncNode>* copy_func_nodes,
-                          bool use_local_scope = true) {
-  auto op_base = op_func_node->operator_base_.get();
-  PADDLE_ENFORCE_NOT_NULL(op_base, platform::errors::PreconditionNotMet(
-                                       "op_base is null, please pass a valid "
-                                       "op_base in apply_data_transform."));
-
-  VariableNameMap new_ins(op_base->Inputs());
-
-  std::unordered_set<int>
-      no_data_transform_index;  // record the no need transform variable index.
-
-  for (auto& var_name_item : *ins_map_temp) {
-    for (size_t i = 0; i < var_name_item.second.size(); ++i) {
-      auto var = var_name_item.second[i];
-      if (!(var->IsType<LoDTensor>() || var->IsType<SelectedRows>())) {
-        continue;
-      }
-      auto& var_name = new_ins[var_name_item.first].at(i);
-      auto tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
-      if (!tensor_in->IsInitialized()) {
-        continue;
-      }
-      auto kernel_type_for_var =  // the true kernel type for op_base
-          static_cast<const framework::OperatorWithKernel*>(op_base)
-              ->GetKernelTypeForVar(var_name_item.first, *tensor_in,
-                                    expected_kernel_key);
-      if (need_place_transform_for_var(kernel_type_for_var,
-                                       expected_kernel_key)) {
-        if (op_base->Type() == "fetch_v2") {
-          op_base->SetAttr("deepcopy", false);
-        }
-        std::string new_var_name;
-        OpFuncNode copy_op_func_node;
-        std::tie(new_var_name, copy_op_func_node) =
-            apply_place_transform_for_var(kernel_type_for_var,
-                                          expected_kernel_key, place, var_name,
-                                          var_name_item.first, *op_func_node,
-                                          var, var_scope, use_local_scope);
-        op_func_node->input_index[var_name_item.first][i] =
-            var_scope->VarId(new_var_name);
-        copy_func_nodes->emplace_back(copy_op_func_node);
-        var_name_item.second[i] = var_scope->Var(new_var_name);
-        new_ins[var_name_item.first][i] = new_var_name;
-      } else if (need_dtype_transform_for_var(kernel_type_for_var,
-                                              expected_kernel_key)) {
-        // TODO(@xiongkun) add dtype judgement here
-      } else if (need_layout_transform_for_var(kernel_type_for_var,
-                                               expected_kernel_key)) {
-        // TODO(@xiongkun) add layout judgement here
-      } else {
-        // record no need data transformer input var_id
-        VLOG(3) << op_base->Type()
-                << " found no data_transform var: " << var_name
-                << " with id: " << var_scope->VarId(var_name);
-        no_data_transform_index.emplace(var_scope->VarId(var_name));
-      }
-    }
-  }
-
-  // NOTE(zhiqiu): UPDATE the corresponding OeratorBase to make it consistent
-  // with instruction
-  // hot fix, it is not good design here
-  op_func_node->operator_base_ =
-      std::shared_ptr<OperatorBase>(framework::OpRegistry::CreateOp(
-          op_base->Type(), new_ins, op_base->Outputs(), op_base->Attrs()));
-  op_func_node->no_data_transform_index = std::move(no_data_transform_index);
-}
-
 void build_op_func_list(const platform::Place& place,
                         const framework::BlockDesc& block,
                         std::vector<OpFuncNode>* vec_func_list,
@@ -498,6 +293,7 @@ void build_op_func_list(const platform::Place& place,
 
     // step 2: build OpFuncNode
     OpFuncNode op_func_node;
+    op_func_node.operator_base_ = ops[i];
     op_func_node.input_index = ins_name2id;
     op_func_node.output_index = outs_name2id;
@@ -538,16 +334,13 @@ void build_op_func_list(const platform::Place& place,
                               &expected_kernel_key);  // change device by the device_guard()
       VLOG(3) << "expected_kernel_key : " << expected_kernel_key;
 
-      // step 3. apply data transforms and insert memory ops
+      // step 3. apply data transforms and insert data transfer ops
       VariableValueMap& ins_map_temp = runtime_context.inputs;
-      std::vector<OpFuncNode> copy_op_to_insert;
-      // NOTE(xiongkun03): assign op_base here to reduce parameter number of
-      // apply_data_transform.
-      op_func_node.operator_base_ = ops[i];
-      apply_data_transform(expected_kernel_key, place, &ins_map_temp, var_scope,
-                           &op_func_node, &copy_op_to_insert, use_local_scope);
-      for (auto& item : copy_op_to_insert) {
-        vec_func_list->push_back(item);
+      std::vector<OpFuncNode> new_op_func_nodes;
+      ApplyDataTransform(expected_kernel_key, place, &ins_map_temp, var_scope,
+                         &op_func_node, &new_op_func_nodes, use_local_scope);
+      for (auto& item : new_op_func_nodes) {
+        vec_func_list->emplace_back(std::move(item));
       }
      // step 4. Run op kernel
       VLOG(3) << op->Type()
@@ -660,12 +453,13 @@ void update_var_min_rw_op(
     const std::map<int, std::set<int>>& op2dependences,
     std::map<int, std::list<int>>* var2min_rw_op, int cur_op, int rw_var) {
   // rw_var is inputs or outputs of cur_op
   // this function update the var2min_rw_op set .
-  if (var2min_rw_op->find(rw_var) == var2min_rw_op->end())
+  if (var2min_rw_op->find(rw_var) == var2min_rw_op->end()) {
     (*var2min_rw_op)[rw_var] = std::list<int>();
+  }
   for (auto dep_op : op2dependences.at(cur_op)) {
-    (*var2min_rw_op)[rw_var].remove(dep_op);
+    var2min_rw_op->at(rw_var).remove(dep_op);
   }
-  (*var2min_rw_op)[rw_var].push_back(cur_op);
+  var2min_rw_op->at(rw_var).push_back(cur_op);
 }
 
 std::map<int, std::list<int>> get_downstream_map(
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h
index 60312d153c361eeb1162d1b82e583bb2c33371e5..9fc93afb5dedb9a583ca3ddd6bf74e7e439d64cc 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.h
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.h
@@ -94,9 +94,6 @@ class AsyncWorkQueue {
   AtomicVectorSizeT atomic_var_ref_;
 };
 
-std::string get_memcpy_type(const platform::Place& src_place,
-                            const platform::Place& dst_place);
-
 void build_variable_scope(const framework::BlockDesc& block,
                           VariableScope* var_scope,
                           bool use_local_scope = true);
diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc
index 5fc97924ef27fedd7503207ea6e8cd7d63ecb1a2..7278d80ce9ba195a1c83a3ba67dcb449d7f81e59 100644
--- a/paddle/fluid/operators/cast_op.cc
+++ b/paddle/fluid/operators/cast_op.cc
@@ -112,16 +112,23 @@ class CastOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 using CPU = paddle::platform::CPUDeviceContext;
 
-REGISTER_OPERATOR(cast, ops::CastOp,
-                  ops::CastOpGradMaker<paddle::framework::OpDesc>,
-                  ops::CastOpGradMaker<paddle::imperative::OpBase>,
-                  ops::CastOpProtoMaker);
-REGISTER_OP_CPU_KERNEL(
-    cast, ops::CastOpKernel<CPU, float>, ops::CastOpKernel<CPU, double>,
-    ops::CastOpKernel<CPU, int>, ops::CastOpKernel<CPU, int64_t>,
-    ops::CastOpKernel<CPU, int16_t>, ops::CastOpKernel<CPU, bool>,
-    ops::CastOpKernel<CPU, uint8_t>, ops::CastOpKernel<CPU, int8_t>,
-    ops::CastOpKernel<CPU, paddle::platform::float16>,
-    ops::CastOpKernel<CPU, paddle::platform::bfloat16>,
-    ops::CastOpKernel<CPU, paddle::platform::complex<float>>,
-    ops::CastOpKernel<CPU, paddle::platform::complex<double>>);
+#define REGISTER_CAST_CPU_BASE(op_name, ...)                                  \
+  REGISTER_OPERATOR(op_name, ops::CastOp,                                     \
+                    ops::CastOpGradMaker<paddle::framework::OpDesc>,          \
+                    ops::CastOpGradMaker<paddle::imperative::OpBase>,         \
+                    ops::CastOpProtoMaker);                                   \
+  REGISTER_OP_CPU_KERNEL(                                                     \
+      op_name, ops::CastOpKernel<CPU, float>, ops::CastOpKernel<CPU, double>, \
+      ops::CastOpKernel<CPU, int>, ops::CastOpKernel<CPU, int64_t>,           \
+      ops::CastOpKernel<CPU, int16_t>, ops::CastOpKernel<CPU, bool>,          \
+      ops::CastOpKernel<CPU, uint8_t>, ops::CastOpKernel<CPU, int8_t>,        \
+      ops::CastOpKernel<CPU, paddle::platform::float16>,                      \
+      ops::CastOpKernel<CPU, paddle::platform::bfloat16>,                     \
+      ops::CastOpKernel<CPU, paddle::platform::complex<float>>,               \
+      ops::CastOpKernel<CPU, paddle::platform::complex<double>>);
+
+REGISTER_CAST_CPU_BASE(cast)
+// [ why register transfer_dtype_op alias with cast_op? ]
+// In case of InterpreterCore, if we reuse cast_op, we cannot distinguish
+// which cast_op was inserted by the new executor when profiling.
+REGISTER_CAST_CPU_BASE(transfer_dtype)
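A small check of the alias registration above (illustrative only, not part of the patch; AllOpKernels() is the same registry the new executor's helper queries):

#include "paddle/fluid/framework/operator.h"

// Both op names resolve to registered kernels, so profiler traces can
// distinguish executor-inserted dtype transfers from user-level casts.
bool BothCastOpsRegistered() {
  auto& all = paddle::framework::OperatorWithKernel::AllOpKernels();
  return all.count("cast") > 0 && all.count("transfer_dtype") > 0;
}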
diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu
index 05a110fe65b839e5d305fea0a14aaeb1c83aeaf6..bb4246e3e9b845078894d5ac6604d65668aaaa89 100644
--- a/paddle/fluid/operators/cast_op.cu
+++ b/paddle/fluid/operators/cast_op.cu
@@ -107,6 +107,9 @@ namespace plat = paddle::platform;
 
 #if !defined(PADDLE_WITH_HIP)
 REGISTER_CAST_CUDA_BASE(cast, ops::CastCUDAOpKernel<plat::bfloat16>)
+// See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc
+REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastCUDAOpKernel<plat::bfloat16>)
 #else
 REGISTER_CAST_CUDA_BASE(cast)
+REGISTER_CAST_CUDA_BASE(transfer_dtype)
 #endif
diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..994aabd66cf9381ea31d35d7b258af87927cf3b8
--- /dev/null
+++ b/paddle/fluid/operators/transfer_layout_op.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/transfer_layout_op.h"
+
+#include <string>
+
+namespace paddle {
+namespace framework {
+class OpDesc;
+class InferShapeContext;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
+namespace paddle {
+namespace operators {
+
+class TransferLayoutOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "TransferLayout");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TransferLayout");
+
+    auto dst_layout = ctx->Attrs().Get<int>("dst_layout");
+    auto low_bound = static_cast<int>(framework::DataLayout::kNHWC);
+    auto upper_bound = static_cast<int>(framework::DataLayout::kMKLDNN);
+    PADDLE_ENFORCE_GE(
+        dst_layout, low_bound,
+        platform::errors::PreconditionNotMet(
+            "Required dst_layout >= %d, but received dst_layout = %d",
+            low_bound, dst_layout));
+    PADDLE_ENFORCE_LE(
+        dst_layout, upper_bound,
+        platform::errors::PreconditionNotMet(
+            "Required dst_layout <= %d, but received dst_layout = %d",
+            upper_bound, dst_layout));
+
+    // TODO(Aurelius84): Out's ddim differs from X's because they have
+    // different layouts
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    // kernel's device type is decided by input tensor place
+    auto *in = ctx.InputVar("X");
+    auto *in_tensor = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in);
+    PADDLE_ENFORCE_EQ(in_tensor->IsInitialized(), true,
+                      platform::errors::PreconditionNotMet(
+                          "The tensor of Input(X) is not initialized."));
+    // dtype is not important
+    return framework::OpKernelType(framework::proto::VarType::FP32,
+                                   in_tensor->place());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string &var_name, const framework::Tensor &tensor,
+      const framework::OpKernelType &expected_kernel_type) const override {
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(),
+                                   expected_kernel_type.data_layout_);
+  }
+};
+
+class TransferLayoutInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    ctx->SyncTypeAndDataType("X", "Out");
+  }
+};
+
+class TransferLayoutKernel {
+ public:
+  void operator()(const framework::ExecutionContext &ctx) const {
+    auto *x = ctx.InputVar("X");
+    auto *out = ctx.OutputVar("Out");
+    auto &dev_ctx = ctx.device_context();
+    auto dst_layout = ctx.Attr<int>("dst_layout");
+    TransferLayoutFunctor(x, out, dev_ctx, dst_layout)();
+  }
+};
+
+class TransferLayoutOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(LoDTensor) The input Tensor");
+    AddOutput("Out", "(LoDTensor) The Output Tensor with desired layout");
+    AddAttr<int>("dst_layout",
+                 "kNHWC = 0, kNCHW = 1, kAnyLayout = 2, kMKLDNN = 3");
+    AddComment(R"DOC(
+    TransferLayout Operator)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OPERATOR(
+    transfer_layout, ops::TransferLayoutOp, ops::TransferLayoutOpProtoMaker,
+    ops::TransferLayoutInferVarType,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+
+// dtype is not important
+REGISTER_OP_CPU_KERNEL_FUNCTOR(transfer_layout, float,
+                               ops::TransferLayoutKernel);
diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d740093b4fbf882a029f2bd4baf3f198186c5cd
--- /dev/null
+++ b/paddle/fluid/operators/transfer_layout_op.h
@@ -0,0 +1,130 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/data_layout_transform.h"
+#include "paddle/fluid/framework/data_transform.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace operators {
+
+using DataLayout = framework::DataLayout;
+
+class TransferLayoutFunctor {
+ public:
+  TransferLayoutFunctor(const framework::Variable *in, framework::Variable *out,
+                        const platform::DeviceContext &dev_ctx,
+                        const int dst_layout)
+      : in_(in), out_(out), dev_ctx_(dev_ctx), dst_layout_(dst_layout) {}
+
+  void operator()() const {
+    auto &in_tensor = *framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_);
+    framework::LoDTensor out_tensor;
+
+    auto out_layout = static_cast<DataLayout>(dst_layout_);
+    out_tensor.set_layout(out_layout);
+
+#ifdef PADDLE_WITH_MKLDNN
+    auto in_layout = in_tensor.layout();
+    if (in_layout == DataLayout::kMKLDNN || out_layout == DataLayout::kMKLDNN) {
+      PADDLE_ENFORCE_NE(
+          in_layout, out_layout,
+          platform::errors::PreconditionNotMet(
+              "No layout transform needed between two MKLDNN OPKernels."));
+
+      if (in_layout != DataLayout::kMKLDNN &&
+          out_layout == DataLayout::kMKLDNN) {
+        // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
+        // Just set layout/format. No real transform occurs
+        auto out_format = platform::MKLDNNFormatForSize(
+            in_tensor.dims().size(), ToMKLDNNFormat(in_layout));
+        out_tensor.ShareDataWith(in_tensor);
+        // For NHWC data we need a reshape of the tensors, as MKL-DNN
+        // expects the NHWC dims description order
+        platform::MatchShapeToLayout(&out_tensor, in_layout, out_layout);
+        paddle::platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout(
+            in_layout);
+        out_tensor.set_layout(DataLayout::kMKLDNN);
+        out_tensor.set_format(out_format);
+      } else {
+        // Case2 - transform from MKLDNN OPKernel to Non-MKLDNN OPKernel
+        // Do the transform via the MKLDNN lib
+        innerTransDataLayoutFromMKLDNN(
+            in_layout, paddle::platform::MKLDNNDeviceContext::tls()
+                           .get_cur_paddle_data_layout(),
+            in_tensor, &out_tensor, dev_ctx_.GetPlace());
+      }
+    } else {
+      // Case3 - transform between Non-MKLDNN OPKernels
+      TransDataLayout(dev_ctx_, in_tensor, &out_tensor);
+    }
+#else
+    // Case3 - transform between Non-MKLDNN OPKernels
+    TransDataLayout(dev_ctx_, in_tensor, &out_tensor);
+#endif
+    framework::SetTensorToVariable(*in_, out_tensor, out_);
+  }
+
+ private:
+  void TransDataLayout(const platform::DeviceContext &dev_ctx,
+                       const framework::Tensor &in,
+                       framework::Tensor *out) const {
+    PADDLE_ENFORCE_EQ(
+        framework::arity(in.dims()), 4,
+        platform::errors::InvalidArgument(
+            "Input dimension arity can only be 4, the input dimension is %s.",
+            in.dims()));
+
+    auto src_dim = in.dims();
+    std::vector<int64_t> dst_dim;
+
+    auto axis = framework::GetAxis(in.layout(), out->layout());
+    dst_dim.resize(axis.size());
+    for (size_t i = 0; i < axis.size(); i++) {
+      dst_dim[i] = src_dim[axis[i]];
+    }
+
+    out->Resize(framework::make_ddim(dst_dim));
+    out->mutable_data(in.place(), in.type());
+
+    framework::VisitDataType(
+        in.type(), framework::CastDataLayout(&dev_ctx, axis, in, out));
+  }
+
+  const framework::Variable *in_;
+  framework::Variable *out_;
+  const platform::DeviceContext &dev_ctx_;
+  const int dst_layout_;
+};
+
+}  // namespace operators
+}  // namespace paddle
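To make the layout permutation above concrete, here is a standalone sketch of the dim shuffle TransDataLayout performs (not part of the patch; the values match the Python unit test below):

#include <vector>

// Worked example: kNCHW -> kNHWC uses axis = {0, 2, 3, 1}, as returned by
// GetAxis(kNCHW, kNHWC).
std::vector<int64_t> PermuteNCHWToNHWC(const std::vector<int64_t>& src_dim) {
  const std::vector<int> axis = {0, 2, 3, 1};
  std::vector<int64_t> dst_dim(axis.size());
  for (size_t i = 0; i < axis.size(); ++i) {
    dst_dim[i] = src_dim[axis[i]];
  }
  return dst_dim;  // {2, 3, 10, 10} -> {2, 10, 10, 3}
}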
diff --git a/python/paddle/fluid/tests/unittests/test_transfer_dtype_op.py b/python/paddle/fluid/tests/unittests/test_transfer_dtype_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..637a6c144685bbc094a79656839aa4fc84893a1e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_transfer_dtype_op.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from op_test import OpTest, convert_uint16_to_float, convert_float_to_uint16
+
+
+class TestTransferDtypeOpFp32ToFp64(OpTest):
+    def setUp(self):
+        ipt = np.random.random(size=[10, 10])
+        self.inputs = {'X': ipt.astype('float32')}
+        self.outputs = {'Out': ipt.astype('float64')}
+        self.attrs = {
+            'out_dtype': int(core.VarDesc.VarType.FP64),
+            'in_dtype': int(core.VarDesc.VarType.FP32)
+        }
+        self.op_type = 'transfer_dtype'
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestTransferDtypeOpFp16ToFp32(OpTest):
+    def setUp(self):
+        ipt = np.random.random(size=[10, 10])
+        self.inputs = {'X': ipt.astype('float16')}
+        self.outputs = {'Out': ipt.astype('float32')}
+        self.attrs = {
+            'out_dtype': int(core.VarDesc.VarType.FP32),
+            'in_dtype': int(core.VarDesc.VarType.FP16)
+        }
+        self.op_type = 'transfer_dtype'
+
+    def test_check_output(self):
+        self.check_output(atol=1e-3)
+
+
+class TestTransferDtypeOpFp32ToFp16(OpTest):
+    def setUp(self):
+        ipt = np.random.random(size=[10, 10])
+        self.inputs = {'X': ipt.astype('float32')}
+        self.outputs = {'Out': ipt.astype('float16')}
+        self.attrs = {
+            'out_dtype': int(core.VarDesc.VarType.FP16),
+            'in_dtype': int(core.VarDesc.VarType.FP32)
+        }
+        self.op_type = 'transfer_dtype'
+
+    def test_check_output(self):
+        self.check_output(atol=1e-3)
+
+
+class TestTransferDtypeOpBf16ToFp32(OpTest):
+    def setUp(self):
+        ipt = np.array(np.random.randint(10, size=[10, 10])).astype('uint16')
+        self.inputs = {'X': ipt}
+        self.outputs = {'Out': convert_uint16_to_float(ipt)}
+        self.attrs = {
+            'out_dtype': int(core.VarDesc.VarType.FP32),
+            'in_dtype': int(core.VarDesc.VarType.BF16)
+        }
+        self.op_type = 'transfer_dtype'
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestTransferDtypeFp32ToBf16(OpTest):
+    def setUp(self):
+        ipt = np.random.random(size=[10, 10]).astype('float32')
+        self.inputs = {'X': ipt}
+        self.outputs = {'Out': convert_float_to_uint16(ipt)}
+        self.attrs = {
+            'out_dtype': int(core.VarDesc.VarType.BF16),
+            'in_dtype': int(core.VarDesc.VarType.FP32)
+        }
+        self.op_type = 'transfer_dtype'
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py b/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..49cf1bf65c3edc3b3e9c268d176bd069097663e8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from op_test import OpTest
+
+
+# default kNCHW
+class TestTransferLayoutOpkNCHWTokNHWC(OpTest):
+    def setUp(self):
+        ipt = np.random.random(size=[2, 3, 10, 10])
+        self.inputs = {'X': ipt.astype('float32')}
+        self.outputs = {'Out': ipt.transpose([0, 2, 3, 1])}
+        self.attrs = {
+            'dst_layout': 0  # kNHWC
+        }
+        self.op_type = 'transfer_layout'
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index 8705e29cbb220f8e117f9a13c1a01e8d45e055f6..3e1167b6586b328e491dc99e58dc59caf828af2f 100644
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -14,6 +14,8 @@
 STATIC_MODE_TESTING_LIST = [
     'test_affine_channel_op',
+    'test_transfer_dtype_op',
+    'test_transfer_layout_op',
     'test_concat_op',
     'test_elementwise_add_op',
     'test_elementwise_sub_op',
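Finally, a hedged sketch of how such a transfer op can be constructed and run standalone, mirroring what TransferLayout() in data_transfer.cc does internally (not part of the patch; variable names are illustrative, and "x" is assumed to already hold an initialized 4-D LoDTensor):

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"

// Build and run one transfer_layout op through the operator registry,
// the same mechanism the new executor uses when inserting transfer ops.
void RunTransferLayoutOnce(paddle::framework::Scope* scope) {
  namespace fw = paddle::framework;
  fw::VariableNameMap ins = {{"X", {"x"}}};
  fw::VariableNameMap outs = {{"Out", {"x_layout"}}};
  fw::AttributeMap attrs = {{"dst_layout", 0 /* kNHWC */}};
  auto op = fw::OpRegistry::CreateOp("transfer_layout", ins, outs, attrs);
  scope->Var("x_layout");  // output variable must exist before Run()
  op->Run(*scope, paddle::platform::CPUPlace());
}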