diff --git a/mace/core/net.cc b/mace/core/net.cc index 279724f6e791623923e8772b5db88a4bb8293413..1732cfe1a36f04b9fed6c378e67b4637554113ae 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include #include +#include +#include #include "mace/core/future.h" #include "mace/core/macros.h" @@ -53,6 +54,13 @@ std::string TransformedName(const std::string &input_name, ss << input_name << "_mem_type_" << mem_type; return ss.str(); } + +bool TransformRequiredOp(const std::string &op_type) { + static const std::unordered_set kNoTransformOp = { + "Shape", "InferConv2dShape" + }; + return kNoTransformOp.count(op_type) == 0; +} #endif // MACE_ENABLE_OPENCL } // namespace @@ -72,6 +80,7 @@ std::unique_ptr SerialNet::CreateOperation( // otherwise, fallback to CPU device. DeviceType device_type = DeviceType::CPU; construct_context->set_device(cpu_device_); + construct_context->set_operator_def(op_def); construct_context->set_output_mem_type(MemoryType::CPU_BUFFER); for (auto device : available_devices) { if (device == target_device_type) { @@ -103,7 +112,6 @@ std::unique_ptr SerialNet::CreateOperation( } } } - construct_context->set_operator_def(op_def); std::unique_ptr op( op_registry->CreateOperation(construct_context, device_type)); return std::move(op); @@ -126,7 +134,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, std::unordered_map output_map; // used for memory optimization std::unordered_map output_mem_map; - std::unordered_map transformed_map; + std::unordered_set transformed_set; // add input information MemoryType target_mem_type; // quantize model flag @@ -180,71 +188,80 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, #ifdef MACE_ENABLE_OPENCL // Add input transform operation if necessary if (target_device_->device_type() == DeviceType::GPU) { - const DataType dt = - static_cast( - ProtoArgHelper::GetOptionalArg( - *op_def, "T", static_cast(DataType::DT_FLOAT))); // the outputs' memory type of the operation MemoryType out_mem_type = construct_context.output_mem_type(); int input_size = op_def->input_size(); - for (int i = 0; i < input_size; ++i) { - if (output_map.count(op_def->input(i)) == 1) { - // if op is memory-reuse op, no transformation - if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) { - out_mem_type = output_map.at(op_def->input(i)).mem_type; - break; - } - // check whether is the output tensor of other operation - if (output_map.at(op_def->input(i)).mem_type != out_mem_type || - output_map.at(op_def->input(i)).dtype != dt) { - auto key = TransformedName(op_def->input(i), out_mem_type); - auto &output_info = output_map.at(op_def->input(i)); - // check whether the tensor has been transformed - if (transformed_map.count(key) == 0) { - VLOG(1) << "Add Transform operation to transform tensor '" - << op_def->input(i) << "', from memory type " - << output_info.mem_type << " to " << out_mem_type - << ", from Data Type " << output_info.dtype << " to " - << dt; - std::string input_name = op_def->input(i); - std::string t_input_name = - TransformedName(input_name, - out_mem_type); - op_def->set_input(i, t_input_name); - auto input_shape = output_info.shape; - if (output_info.mem_type == MemoryType::CPU_BUFFER && - input_shape.size() == 4) { - // NCHW -> NHWC - input_shape = - TransposeShape(input_shape, - {0, 2, 3, 1}); + // if op is memory-unused op, no transformation + if (TransformRequiredOp(op_def->type())) { + for (int i = 0; i < input_size; ++i) { + if (output_map.count(op_def->input(i)) == 1) { + // if op is memory-reuse op, no transformation + if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) { + out_mem_type = output_map.at(op_def->input(i)).mem_type; + break; + } + // check whether to do transform + MemoryType wanted_in_mem_type = + construct_context.GetInputMemType(i); + DataType wanted_in_dt = construct_context.GetInputDataType(i); + if (output_map.at(op_def->input(i)).mem_type != wanted_in_mem_type + || output_map.at(op_def->input(i)).dtype != wanted_in_dt) { + auto t_input_name = TransformedName(op_def->input(i), + wanted_in_mem_type); + auto &output_info = output_map.at(op_def->input(i)); + // check whether the tensor has been transformed + if (transformed_set.count(t_input_name) == 0) { + VLOG(1) << "Add Transform operation to transform tensor '" + << op_def->input(i) << "', from memory type " + << output_info.mem_type << " to " + << wanted_in_mem_type + << ", from Data Type " << output_info.dtype << " to " + << wanted_in_dt; + std::string input_name = op_def->input(i); + op_def->set_input(i, t_input_name); + auto input_shape = output_info.shape; + if (output_info.mem_type == MemoryType::CPU_BUFFER && + input_shape.size() == 4) { + // NCHW -> NHWC + input_shape = + TransposeShape(input_shape, + {0, 2, 3, 1}); + } + auto transform_op_def = OpenCLUtil::CreateTransformOpDef( + input_name, input_shape, t_input_name, + wanted_in_dt, wanted_in_mem_type); + auto transform_op = CreateOperation( + op_registry, + &construct_context, + transform_op_def, + data_format_flag); + operators_.emplace_back(std::move(transform_op)); + transformed_set.insert(t_input_name); + output_mem_map[t_input_name] = wanted_in_mem_type; + // where to do graph reference count. + mem_optimizer->UpdateTensorRef(transform_op_def.get()); + } else { + op_def->set_input(i, t_input_name); } - auto transform_op_def = OpenCLUtil::CreateTransformOpDef( - input_name, input_shape, t_input_name, - dt, out_mem_type); - auto transform_op = CreateOperation( - op_registry, - &construct_context, - transform_op_def, - data_format_flag); - operators_.emplace_back(std::move(transform_op)); - transformed_map.emplace(key, t_input_name); - output_mem_map[t_input_name] = out_mem_type; - // where to do graph reference count. - mem_optimizer->UpdateTensorRef(transform_op_def.get()); - } else { - op_def->set_input(i, transformed_map[key]); } + } else { + MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr + && ws_->GetTensor(op_def->input(i))->is_weight(), + "Tensor ", op_def->input(i), " of ", + op_def->name(), " not allocated"); } - } else { - MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr - && ws_->GetTensor(op_def->input(i))->is_weight(), - "Tensor ", op_def->input(i), " of ", - op_def->name(), " not allocated"); } } // update the map : output_tensor -> Operation for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) { + DataType dt; + if (op_def->output_type_size() == op_def->output_size()) { + dt = op_def->output_type(out_idx); + } else { + dt = static_cast( + ProtoArgHelper::GetOptionalArg( + *op_def, "T", static_cast(DataType::DT_FLOAT))); + } output_mem_map[op_def->output(out_idx)] = out_mem_type; output_map.emplace( op_def->output(out_idx), @@ -272,13 +289,13 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, auto &internal_output_info = output_map.at(output_info.name()); if ((internal_output_info.mem_type != target_mem_type && internal_output_info.mem_type != MemoryType::CPU_BUFFER) || - internal_output_info.dtype != DataType::DT_FLOAT) { + internal_output_info.dtype != output_info.data_type()) { VLOG(1) << "Add Transform operation to transform output tensor '" << output_info.name() << "', from memory type " << internal_output_info.mem_type << " to " << target_mem_type << ", from Data Type " << internal_output_info.dtype - << " to " << DataType::DT_FLOAT; + << " to " << output_info.data_type(); std::string t_output_name = TransformedName(output_info.name(), target_mem_type); auto output_op_def = @@ -298,7 +315,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, t_output_name, internal_output_info.shape, output_info.name(), - DataType::DT_FLOAT, + output_info.data_type(), target_mem_type); auto transform_op = CreateOperation( op_registry, diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 6a437f884c506af231db882a500560bdd8dc67ec..ad88c35b2d0bc0b5a216148084783cc5941cf9d1 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -24,6 +24,57 @@ namespace mace { OpConstructContext::OpConstructContext(Workspace *ws) : operator_def_(nullptr), ws_(ws), device_(nullptr) {} +void OpConstructContext::set_operator_def( + std::shared_ptr operator_def) { + operator_def_ = operator_def; + input_data_types_.clear(); +} + +void OpConstructContext::set_output_mem_type(mace::MemoryType type) { + MACE_CHECK(operator_def_ != nullptr); + output_mem_type_ = type; + input_mem_types_.clear(); +} + +void OpConstructContext::SetInputInfo(size_t idx, + mace::MemoryType mem_type, + mace::DataType dt) { + if (input_mem_types_.empty()) { + // the default inputs' memory types are same as output memory type. + input_mem_types_.resize(operator_def_->input_size(), output_mem_type_); + } + if (input_data_types_.empty()) { + // the default inputs' data types are same as operation's data type. + DataType op_dt = static_cast( + ProtoArgHelper::GetOptionalArg( + *operator_def_, "T", static_cast(DataType::DT_FLOAT))); + input_data_types_.resize(operator_def_->input_size(), op_dt); + } + MACE_CHECK(idx < input_mem_types_.size() && idx < input_data_types_.size()); + input_mem_types_[idx] = mem_type; + input_data_types_[idx] = dt; +} + +MemoryType OpConstructContext::GetInputMemType(size_t idx) const { + if (input_mem_types_.empty()) { + return output_mem_type_; + } + MACE_CHECK(idx < input_mem_types_.size(), + idx, " < ", input_mem_types_.size()); + return input_mem_types_[idx]; +} + +DataType OpConstructContext::GetInputDataType(size_t idx) const { + if (input_data_types_.empty()) { + // the default inputs' data types are same as operation's data type. + return static_cast( + ProtoArgHelper::GetOptionalArg( + *operator_def_, "T", static_cast(DataType::DT_FLOAT))); + } + MACE_CHECK(idx < input_data_types_.size()); + return input_data_types_[idx]; +} + OpInitContext::OpInitContext(Workspace *ws, Device *device) : ws_(ws), device_(device) {} diff --git a/mace/core/operator.h b/mace/core/operator.h index 8d3e1557bd5673ea07ddc4b3008711e43a8e27c2..5a119d1ee0cde520ac1820117080c7d0a19bc52b 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -35,9 +35,7 @@ class OpConstructContext { explicit OpConstructContext(Workspace *ws); ~OpConstructContext() = default; - inline void set_operator_def(std::shared_ptr operator_def) { - operator_def_ = operator_def; - } + void set_operator_def(std::shared_ptr operator_def); inline std::shared_ptr operator_def() const { return operator_def_; @@ -55,19 +53,26 @@ class OpConstructContext { return device_; } - inline void set_output_mem_type(MemoryType type) { - output_mem_type_ = type; - } + void set_output_mem_type(MemoryType type); inline MemoryType output_mem_type() const { return output_mem_type_; } + void SetInputInfo(size_t idx, MemoryType mem_type, DataType dt); + + MemoryType GetInputMemType(size_t idx) const; + + DataType GetInputDataType(size_t idx) const; + private: std::shared_ptr operator_def_; Workspace *ws_; Device *device_; - MemoryType output_mem_type_; // used for transform memory + // used for memory transform + std::vector input_mem_types_; + std::vector input_data_types_; + MemoryType output_mem_type_; // there is only one output memory type now. }; // memory_optimizer, device @@ -93,6 +98,12 @@ class OpInitContext { Device *device_; }; +// Conventions +// * If there exist format, NHWC is the default format +// * The input/output format of CPU ops with float data type is NCHW +// * The input/output format of GPU ops and CPU Quantization ops is NHWC +// * Inputs' data type is same as the operation data type by default. +// * The outputs' data type is same as the operation data type by default. class Operation { public: explicit Operation(OpConstructContext *context); diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index eb21ef2c3e596ba28ce4178574dcb74db59a434f..d94b208d7bff66b36e3b179d7f33c471a17e8c8b 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -612,11 +612,9 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, const std::vector &dilations, const int wino_blk_size = 0) { testing::internal::LogToStderr(); - srand(time(NULL)); auto func = [&](index_t batch, int stride_h, int stride_w, Padding padding) { // generate random input - static unsigned int seed = time(NULL); index_t height = input_shape[0]; index_t width = input_shape[1]; index_t kernel_h = filter_shape[0]; diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 0b11667e39843378d7b58e86abefb15fa76fae89..5697c8413544742ad1517154c84511f9031cbabb 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -375,10 +375,16 @@ class Deconv2dOp : public Deconv2dOpBase { context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } - } else if (operator_def_->input_size() >= 4) { - MACE_CHECK(TransformFilter( - context, operator_def_.get(), 3, OpenCLBufferType::ARGUMENT, mem_type) - == MaceStatus::MACE_SUCCESS); + } else { + if (operator_def_->input_size() >= 4) { + MACE_CHECK(TransformFilter( + context, + operator_def_.get(), + 3, + OpenCLBufferType::ARGUMENT, + mem_type) == MaceStatus::MACE_SUCCESS); + } + context->SetInputInfo(2, MemoryType::CPU_BUFFER, DataType::DT_INT32); } } MaceStatus Run(OpContext *context) override { diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc index 21407c6a743491820d431e077d01e30aa629ac9b..6b08761e34eec22992db490c21740865bdfe3660 100644 --- a/mace/ops/ops_test_util.cc +++ b/mace/ops/ops_test_util.cc @@ -166,10 +166,15 @@ bool OpsTestNet::Setup(mace::DeviceType device) { } } - for (auto output : op_def_.output()) { - ws_.RemoveTensor(output); + for (int i = 0; i < op_def_.output_size(); ++i) { + ws_.RemoveTensor(op_def_.output(i)); auto output_info = net_def.add_output_info(); - output_info->set_name(output); + output_info->set_name(op_def_.output(i)); + if (op_def_.output_type_size() == op_def_.output_size()) { + output_info->set_data_type(op_def_.output_type(i)); + } else { + output_info->set_data_type(DataType::DT_FLOAT); + } } } MemoryOptimizer mem_optimizer; diff --git a/mace/ops/scalar_math.cc b/mace/ops/scalar_math.cc index 5539e53f83be152a839e9bfa98178c2fedb933c6..297dcb33700d5c258676f28fa954f9692831ba0a 100644 --- a/mace/ops/scalar_math.cc +++ b/mace/ops/scalar_math.cc @@ -21,6 +21,7 @@ namespace mace { namespace ops { +namespace { template void ScalarEltwise(const T* in0, const T* in1, @@ -81,6 +82,7 @@ void ScalarEltwise(const T* in0, LOG(FATAL) << "Eltwise op not support type " << type; } } +} // namespace template @@ -156,12 +158,6 @@ void RegisterScalarMath(OpRegistryBase *op_registry) { DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, DeviceType::CPU, int32_t); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, - DeviceType::GPU, float); - MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, - DeviceType::GPU, int32_t); -#endif // MACE_ENABLE_OPENCL } } // namespace ops diff --git a/mace/ops/scalar_math_test.cc b/mace/ops/scalar_math_test.cc index b9d8fd0b59de9df82cd9cfa17683d75ee643de08..743dd2565d6d6226b873e73fcfb7f2299a9dbfc2 100644 --- a/mace/ops/scalar_math_test.cc +++ b/mace/ops/scalar_math_test.cc @@ -79,30 +79,6 @@ TEST_F(ScalarMathOpTest, SimpleCPU) { ops::EltwiseType::EQUAL, 3, 3, 1, 1); } -TEST_F(ScalarMathOpTest, SimpleGPU) { - ScalarMathTest( - ops::EltwiseType::SUM, 1, 2, 1, 3); - ScalarMathTest( - ops::EltwiseType::SUB, 1, 2, 1, -1); - ScalarMathTest( - ops::EltwiseType::PROD, 3, -2, 1, -6); - ScalarMathTest( - ops::EltwiseType::DIV, 3, -2, 1, -1.5); - ScalarMathTest( - ops::EltwiseType::MIN, 3, -2, 1, -2); - ScalarMathTest( - ops::EltwiseType::MAX, 3, -2, 1, 3); - ScalarMathTest( - ops::EltwiseType::NEG, 3, -2, 1, -3); - ScalarMathTest( - ops::EltwiseType::ABS, 3, -2, 1, 3); - ScalarMathTest( - ops::EltwiseType::SQR_DIFF, 3, -2, 1, 25); - ScalarMathTest( - ops::EltwiseType::POW, 3, 1, 1, 3); - ScalarMathTest( - ops::EltwiseType::EQUAL, 3, 3, 1, 1); -} } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/shape.cc b/mace/ops/shape.cc index 675ab7c82a7fa553d9ec69cd6f4a77b68f5ceb98..58031ae098583d6be2108d791dfedf44cbfd8968 100644 --- a/mace/ops/shape.cc +++ b/mace/ops/shape.cc @@ -21,11 +21,7 @@ template class ShapeOp : public Operation { public: explicit ShapeOp(OpConstructContext *context) - : Operation(context) { - if (D == DeviceType::GPU) { - context->set_output_mem_type(MemoryType::GPU_BUFFER); - } - } + : Operation(context) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -66,12 +62,6 @@ class ShapeOp : public Operation { void RegisterShape(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Shape", ShapeOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Shape", ShapeOp, - DeviceType::GPU, float); - MACE_REGISTER_OP(op_registry, "Shape", ShapeOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL } } // namespace ops diff --git a/mace/ops/stack.cc b/mace/ops/stack.cc index f6269b0f4a08d471a0e25efbe3374142e5a9e20c..97719f18dc41dfd73bd9861901a497a54594303b 100644 --- a/mace/ops/stack.cc +++ b/mace/ops/stack.cc @@ -25,11 +25,7 @@ class StackOp : public Operation { public: explicit StackOp(OpConstructContext *context) : Operation(context), - axis_(Operation::GetOptionalArg("axis", 0)) { - if (D == DeviceType::GPU) { - context->set_output_mem_type(MemoryType::GPU_BUFFER); - } - } + axis_(Operation::GetOptionalArg("axis", 0)) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -54,6 +50,7 @@ class StackOp : public Operation { } // Output is on host, no need to map data + Tensor::MappingGuard output_guard(output); auto *output_data = output->mutable_data(); std::vector input_data(inputs.size()); for (size_t i = 0; i < inputs.size(); ++i) { @@ -83,10 +80,6 @@ class StackOp : public Operation { void RegisterStack(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, int32_t); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::GPU, float); - MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::GPU, int32_t); -#endif // MACE_ENABLE_OPENCL } } // namespace ops diff --git a/mace/ops/strided_slice.cc b/mace/ops/strided_slice.cc index 7c60bfe89faf5c091caa2f77420753315682e8c7..b3b53ec859e704328793394437e44160d36c7c76 100644 --- a/mace/ops/strided_slice.cc +++ b/mace/ops/strided_slice.cc @@ -217,12 +217,6 @@ void RegisterStridedSlice(OpRegistryBase *op_registry) { DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, DeviceType::CPU, int32_t); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, - DeviceType::GPU, float); - MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, - DeviceType::GPU, int32_t); -#endif // MACE_ENABLE_OPENCL } } // namespace ops diff --git a/tools/converter.py b/tools/converter.py index e98715fc95def1972c376c76c211758b19c6b2b2..0e9202f1066f87805f52050ca50ff0f9a7042cf0 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -741,7 +741,7 @@ def download_file(url, dst, num_retries=3): MaceLogger.info('\nDownloaded successfully.') except (urllib.error.ContentTooShortError, urllib.error.HTTPError, urllib.error.URLError) as e: - MaceLogger.warning('Download error:', e) + MaceLogger.warning('Download error:' + str(e)) if num_retries > 0: return download_file(url, dst, num_retries - 1) else: