Commit 696cf9d9 authored by liuqi

Bug: Support more reasonable auto transformation between CPU and GPU
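
Summary (as read from the diff below): SerialNet now tracks the required memory type and data type of each operator input individually through the new OpConstructContext::SetInputInfo / GetInputMemType / GetInputDataType API, instead of assuming every input matches the op's output memory type. A kNoTransformOp set skips transform insertion for ops whose outputs are host-side metadata (Shape, InferConv2dShape), and the GPU registrations of Shape, Stack, StridedSlice and ScalarMath are removed so these ops always run on the CPU.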

Parent bfbe1a30
......@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <utility>
#include <algorithm>
#include <limits>
#include <unordered_set>
#include <utility>
#include "mace/core/future.h"
#include "mace/core/macros.h"
......@@ -53,6 +54,13 @@ std::string TransformedName(const std::string &input_name,
ss << input_name << "_mem_type_" << mem_type;
return ss.str();
}
bool TransformRequiredOp(const std::string &op_type) {
static const std::unordered_set<std::string> kNoTransformOp = {
"Shape", "InferConv2dShape"
};
return kNoTransformOp.count(op_type) == 0;
}
#endif // MACE_ENABLE_OPENCL
} // namespace
......@@ -72,6 +80,7 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
// otherwise, fall back to the CPU device.
DeviceType device_type = DeviceType::CPU;
construct_context->set_device(cpu_device_);
construct_context->set_operator_def(op_def);
construct_context->set_output_mem_type(MemoryType::CPU_BUFFER);
for (auto device : available_devices) {
if (device == target_device_type) {
......@@ -103,7 +112,6 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
}
}
}
construct_context->set_operator_def(op_def);
std::unique_ptr<Operation> op(
op_registry->CreateOperation(construct_context, device_type));
return std::move(op);
......@@ -126,7 +134,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
std::unordered_map<std::string, InternalOutputInfo> output_map;
// used for memory optimization
std::unordered_map<std::string, MemoryType> output_mem_map;
std::unordered_map<std::string, std::string> transformed_map;
std::unordered_set<std::string> transformed_set;
// add input information
MemoryType target_mem_type;
// quantize model flag
......@@ -180,71 +188,80 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
#ifdef MACE_ENABLE_OPENCL
// Add input transform operation if necessary
if (target_device_->device_type() == DeviceType::GPU) {
const DataType dt =
static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
// the memory type of the operation's outputs
MemoryType out_mem_type = construct_context.output_mem_type();
int input_size = op_def->input_size();
for (int i = 0; i < input_size; ++i) {
if (output_map.count(op_def->input(i)) == 1) {
// if op is a memory-reuse op, no transformation is needed
if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
out_mem_type = output_map.at(op_def->input(i)).mem_type;
break;
}
// check whether is the output tensor of other operation
if (output_map.at(op_def->input(i)).mem_type != out_mem_type ||
output_map.at(op_def->input(i)).dtype != dt) {
auto key = TransformedName(op_def->input(i), out_mem_type);
auto &output_info = output_map.at(op_def->input(i));
// check whether the tensor has been transformed
if (transformed_map.count(key) == 0) {
VLOG(1) << "Add Transform operation to transform tensor '"
<< op_def->input(i) << "', from memory type "
<< output_info.mem_type << " to " << out_mem_type
<< ", from Data Type " << output_info.dtype << " to "
<< dt;
std::string input_name = op_def->input(i);
std::string t_input_name =
TransformedName(input_name,
out_mem_type);
op_def->set_input(i, t_input_name);
auto input_shape = output_info.shape;
if (output_info.mem_type == MemoryType::CPU_BUFFER &&
input_shape.size() == 4) {
// NCHW -> NHWC
input_shape =
TransposeShape<index_t, index_t>(input_shape,
{0, 2, 3, 1});
// skip transform insertion for ops that do not need it (e.g. Shape)
if (TransformRequiredOp(op_def->type())) {
for (int i = 0; i < input_size; ++i) {
if (output_map.count(op_def->input(i)) == 1) {
// if op is a memory-reuse op, no transformation is needed
if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
out_mem_type = output_map.at(op_def->input(i)).mem_type;
break;
}
// check whether to do transform
MemoryType wanted_in_mem_type =
construct_context.GetInputMemType(i);
DataType wanted_in_dt = construct_context.GetInputDataType(i);
if (output_map.at(op_def->input(i)).mem_type != wanted_in_mem_type
|| output_map.at(op_def->input(i)).dtype != wanted_in_dt) {
auto t_input_name = TransformedName(op_def->input(i),
wanted_in_mem_type);
auto &output_info = output_map.at(op_def->input(i));
// check whether the tensor has been transformed
if (transformed_set.count(t_input_name) == 0) {
VLOG(1) << "Add Transform operation to transform tensor '"
<< op_def->input(i) << "', from memory type "
<< output_info.mem_type << " to "
<< wanted_in_mem_type
<< ", from Data Type " << output_info.dtype << " to "
<< wanted_in_dt;
std::string input_name = op_def->input(i);
op_def->set_input(i, t_input_name);
auto input_shape = output_info.shape;
if (output_info.mem_type == MemoryType::CPU_BUFFER &&
input_shape.size() == 4) {
// NCHW -> NHWC
input_shape =
TransposeShape<index_t, index_t>(input_shape,
{0, 2, 3, 1});
}
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
input_name, input_shape, t_input_name,
wanted_in_dt, wanted_in_mem_type);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
data_format_flag);
operators_.emplace_back(std::move(transform_op));
transformed_set.insert(t_input_name);
output_mem_map[t_input_name] = wanted_in_mem_type;
// update the graph reference count for the new transform op.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
} else {
op_def->set_input(i, t_input_name);
}
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
input_name, input_shape, t_input_name,
dt, out_mem_type);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
data_format_flag);
operators_.emplace_back(std::move(transform_op));
transformed_map.emplace(key, t_input_name);
output_mem_map[t_input_name] = out_mem_type;
// update the graph reference count for the new transform op.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
} else {
op_def->set_input(i, transformed_map[key]);
}
} else {
MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
&& ws_->GetTensor(op_def->input(i))->is_weight(),
"Tensor ", op_def->input(i), " of ",
op_def->name(), " not allocated");
}
} else {
MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
&& ws_->GetTensor(op_def->input(i))->is_weight(),
"Tensor ", op_def->input(i), " of ",
op_def->name(), " not allocated");
}
}
// update the map: output_tensor -> Operation
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
DataType dt;
if (op_def->output_type_size() == op_def->output_size()) {
dt = op_def->output_type(out_idx);
} else {
dt = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
}
output_mem_map[op_def->output(out_idx)] = out_mem_type;
output_map.emplace(
op_def->output(out_idx),
......@@ -272,13 +289,13 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
auto &internal_output_info = output_map.at(output_info.name());
if ((internal_output_info.mem_type != target_mem_type &&
internal_output_info.mem_type != MemoryType::CPU_BUFFER) ||
internal_output_info.dtype != DataType::DT_FLOAT) {
internal_output_info.dtype != output_info.data_type()) {
VLOG(1) << "Add Transform operation to transform output tensor '"
<< output_info.name() << "', from memory type "
<< internal_output_info.mem_type
<< " to " << target_mem_type
<< ", from Data Type " << internal_output_info.dtype
<< " to " << DataType::DT_FLOAT;
<< " to " << output_info.data_type();
std::string t_output_name = TransformedName(output_info.name(),
target_mem_type);
auto output_op_def =
......@@ -298,7 +315,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
t_output_name,
internal_output_info.shape,
output_info.name(),
DataType::DT_FLOAT,
output_info.data_type(),
target_mem_type);
auto transform_op = CreateOperation(
op_registry,
......
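Condensed, the per-input transform decision added above amounts to this sketch (a simplification of the diff, with error handling omitted; all names are as in the code above):

  // For each input i: compare what the producer wrote (output_map)
  // with what this op declared it wants (construct_context).
  MemoryType wanted_mt = construct_context.GetInputMemType(i);
  DataType wanted_dt = construct_context.GetInputDataType(i);
  const auto &produced = output_map.at(op_def->input(i));
  if (produced.mem_type != wanted_mt || produced.dtype != wanted_dt) {
    std::string t_name = TransformedName(op_def->input(i), wanted_mt);
    if (transformed_set.count(t_name) == 0) {
      // first consumer with this need: insert a transform op and
      // remember its output so later consumers can reuse it
      transformed_set.insert(t_name);
    }
    op_def->set_input(i, t_name);  // rewire the input in both cases
  }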
......@@ -24,6 +24,57 @@ namespace mace {
OpConstructContext::OpConstructContext(Workspace *ws)
: operator_def_(nullptr), ws_(ws), device_(nullptr) {}
void OpConstructContext::set_operator_def(
std::shared_ptr<mace::OperatorDef> operator_def) {
operator_def_ = operator_def;
input_data_types_.clear();
}
void OpConstructContext::set_output_mem_type(mace::MemoryType type) {
MACE_CHECK(operator_def_ != nullptr);
output_mem_type_ = type;
input_mem_types_.clear();
}
void OpConstructContext::SetInputInfo(size_t idx,
mace::MemoryType mem_type,
mace::DataType dt) {
if (input_mem_types_.empty()) {
// by default, inputs' memory types are the same as the output memory type.
input_mem_types_.resize(operator_def_->input_size(), output_mem_type_);
}
if (input_data_types_.empty()) {
// by default, inputs' data types are the same as the operation's data type.
DataType op_dt = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def_, "T", static_cast<int>(DataType::DT_FLOAT)));
input_data_types_.resize(operator_def_->input_size(), op_dt);
}
MACE_CHECK(idx < input_mem_types_.size() && idx < input_data_types_.size());
input_mem_types_[idx] = mem_type;
input_data_types_[idx] = dt;
}
MemoryType OpConstructContext::GetInputMemType(size_t idx) const {
if (input_mem_types_.empty()) {
return output_mem_type_;
}
MACE_CHECK(idx < input_mem_types_.size(),
idx, " < ", input_mem_types_.size());
return input_mem_types_[idx];
}
DataType OpConstructContext::GetInputDataType(size_t idx) const {
if (input_data_types_.empty()) {
// by default, inputs' data types are the same as the operation's data type.
return static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def_, "T", static_cast<int>(DataType::DT_FLOAT)));
}
MACE_CHECK(idx < input_data_types_.size());
return input_data_types_[idx];
}
OpInitContext::OpInitContext(Workspace *ws, Device *device)
: ws_(ws), device_(device) {}
......
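A minimal usage sketch of the new per-input API (the SetInputInfo call mirrors the Deconv2d change later in this commit; the fallbacks are as implemented above):

  // In an op's constructor: pin input 2 to a host int32 buffer,
  // e.g. a shape tensor that must stay on the CPU.
  context->SetInputInfo(2, MemoryType::CPU_BUFFER, DataType::DT_INT32);

  // While building the graph, SerialNet then asks per input:
  MemoryType mt = construct_context.GetInputMemType(2);  // CPU_BUFFER
  DataType dt = construct_context.GetInputDataType(2);   // DT_INT32
  // Inputs never touched by SetInputInfo fall back to
  // output_mem_type() and the operation's "T" argument.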
......@@ -35,9 +35,7 @@ class OpConstructContext {
explicit OpConstructContext(Workspace *ws);
~OpConstructContext() = default;
inline void set_operator_def(std::shared_ptr<OperatorDef> operator_def) {
operator_def_ = operator_def;
}
void set_operator_def(std::shared_ptr<OperatorDef> operator_def);
inline std::shared_ptr<OperatorDef> operator_def() const {
return operator_def_;
......@@ -55,19 +53,26 @@ class OpConstructContext {
return device_;
}
inline void set_output_mem_type(MemoryType type) {
output_mem_type_ = type;
}
void set_output_mem_type(MemoryType type);
inline MemoryType output_mem_type() const {
return output_mem_type_;
}
void SetInputInfo(size_t idx, MemoryType mem_type, DataType dt);
MemoryType GetInputMemType(size_t idx) const;
DataType GetInputDataType(size_t idx) const;
private:
std::shared_ptr<OperatorDef> operator_def_;
Workspace *ws_;
Device *device_;
MemoryType output_mem_type_; // used for transform memory
// used for memory transform
std::vector<MemoryType> input_mem_types_;
std::vector<DataType> input_data_types_;
MemoryType output_mem_type_; // there is only one output memory type now.
};
// memory_optimizer, device
......@@ -93,6 +98,12 @@ class OpInitContext {
Device *device_;
};
// Conventions
// * If a data format exists, NHWC is the default format.
// * The input/output format of CPU ops with float data type is NCHW.
// * The input/output format of GPU ops and CPU quantization ops is NHWC.
// * Inputs' data types are the same as the operation's data type by default.
// * Outputs' data types are the same as the operation's data type by default.
class Operation {
public:
explicit Operation(OpConstructContext *context);
......
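For example, the operation data type that the last two conventions refer to is resolved from the "T" argument and defaults to float (the same lookup OpConstructContext performs above):

  DataType op_dt = static_cast<DataType>(
      ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
          *operator_def_, "T", static_cast<int>(DataType::DT_FLOAT)));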
......@@ -612,11 +612,9 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
const std::vector<int> &dilations,
const int wino_blk_size = 0) {
testing::internal::LogToStderr();
srand(time(NULL));
auto func = [&](index_t batch, int stride_h, int stride_w, Padding padding) {
// generate random input
static unsigned int seed = time(NULL);
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t kernel_h = filter_shape[0];
......
......@@ -375,10 +375,16 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
} else if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 3, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
} else {
if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>(
context,
operator_def_.get(),
3,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
}
context->SetInputInfo(2, MemoryType::CPU_BUFFER, DataType::DT_INT32);
}
}
MaceStatus Run(OpContext *context) override {
......
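Note: in the non-Caffe branch above, input 2 is presumably the deconvolution's output-shape tensor; instead of transforming it to GPU memory, the op pins it to a CPU int32 buffer via SetInputInfo, and only the optional bias (input 3) still goes through TransformFilter.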
......@@ -166,10 +166,15 @@ bool OpsTestNet::Setup(mace::DeviceType device) {
}
}
for (auto output : op_def_.output()) {
ws_.RemoveTensor(output);
for (int i = 0; i < op_def_.output_size(); ++i) {
ws_.RemoveTensor(op_def_.output(i));
auto output_info = net_def.add_output_info();
output_info->set_name(output);
output_info->set_name(op_def_.output(i));
if (op_def_.output_type_size() == op_def_.output_size()) {
output_info->set_data_type(op_def_.output_type(i));
} else {
output_info->set_data_type(DataType::DT_FLOAT);
}
}
}
MemoryOptimizer mem_optimizer;
......
......@@ -21,6 +21,7 @@
namespace mace {
namespace ops {
namespace {
template <typename T, typename DstType>
void ScalarEltwise(const T* in0,
const T* in1,
......@@ -81,6 +82,7 @@ void ScalarEltwise(const T* in0,
LOG(FATAL) << "Eltwise op not support type " << type;
}
}
} // namespace
template <DeviceType D, typename T>
......@@ -156,12 +158,6 @@ void RegisterScalarMath(OpRegistryBase *op_registry) {
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp,
DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp,
DeviceType::GPU, int32_t);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -79,30 +79,6 @@ TEST_F(ScalarMathOpTest, SimpleCPU) {
ops::EltwiseType::EQUAL, 3, 3, 1, 1);
}
TEST_F(ScalarMathOpTest, SimpleGPU) {
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::SUM, 1, 2, 1, 3);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::SUB, 1, 2, 1, -1);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::PROD, 3, -2, 1, -6);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::DIV, 3, -2, 1, -1.5);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::MIN, 3, -2, 1, -2);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::MAX, 3, -2, 1, 3);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::NEG, 3, -2, 1, -3);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::ABS, 3, -2, 1, 3);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::SQR_DIFF, 3, -2, 1, 25);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::POW, 3, 1, 1, 3);
ScalarMathTest<DeviceType::GPU, float, int32_t>(
ops::EltwiseType::EQUAL, 3, 3, 1, 1);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -21,11 +21,7 @@ template <DeviceType D, typename T>
class ShapeOp : public Operation {
public:
explicit ShapeOp(OpConstructContext *context)
: Operation(context) {
if (D == DeviceType::GPU) {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
}
}
: Operation(context) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -66,12 +62,6 @@ class ShapeOp : public Operation {
void RegisterShape(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Shape", ShapeOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Shape", ShapeOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Shape", ShapeOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
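With per-input transforms in place, ShapeOp no longer needs to force a GPU_BUFFER output or keep a GPU registration: it is listed in kNoTransformOp at the top of this commit, always runs on the CPU, and its output is consumed as host-side metadata.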
......@@ -25,11 +25,7 @@ class StackOp : public Operation {
public:
explicit StackOp(OpConstructContext *context)
: Operation(context),
axis_(Operation::GetOptionalArg<int>("axis", 0)) {
if (D == DeviceType::GPU) {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
}
}
axis_(Operation::GetOptionalArg<int>("axis", 0)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -54,6 +50,7 @@ class StackOp : public Operation {
}
// Output is on host, no need to map data
Tensor::MappingGuard output_guard(output);
auto *output_data = output->mutable_data<T>();
std::vector<const T *> input_data(inputs.size());
for (size_t i = 0; i < inputs.size(); ++i) {
......@@ -83,10 +80,6 @@ class StackOp : public Operation {
void RegisterStack(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::GPU, int32_t);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -217,12 +217,6 @@ void RegisterStridedSlice(OpRegistryBase *op_registry) {
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp,
DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp,
DeviceType::GPU, int32_t);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -741,7 +741,7 @@ def download_file(url, dst, num_retries=3):
MaceLogger.info('\nDownloaded successfully.')
except (urllib.error.ContentTooShortError, urllib.error.HTTPError,
urllib.error.URLError) as e:
MaceLogger.warning('Download error:', e)
MaceLogger.warning('Download error:' + str(e))
if num_retries > 0:
return download_file(url, dst, num_retries - 1)
else:
......
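The logging fix above concatenates the exception into the message, presumably because MaceLogger.warning accepts a single string rather than a printf-style argument list.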