diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 06928c94a1f0be17a03101d15f8418dd0aafdd9b..a574449d3bb81a73566dd2cfaae935b7c991d9c9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -144,7 +144,7 @@ model_tests: - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml - > python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=1 --validate --model_graph_format=file --model_data_format=file --address_sanitizer || exit 1; python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; python tools/converter.py benchmark --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=5 --model_graph_format=file --model_data_format=file || exit 1; python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=code --model_data_format=file || exit 1; @@ -195,7 +195,8 @@ extra_tests: GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git DEVICE_CONF_FILE=generic-mobile-devices/devices.yml fi - - python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS || exit 1; + - python tools/bazel_adb_run.py --target="//mace/utils:utils_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS || exit 1; + - python tools/bazel_adb_run.py --target="//mace/port:port_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS || exit 1; so_size_check: stage: so_size_check diff --git a/BUILD b/BUILD.bazel similarity index 100% rename from BUILD rename to BUILD.bazel diff --git a/docs/development/how_to_debug.rst b/docs/development/how_to_debug.rst index ea4688585562d812d06cdfa4a27935f3252df66a..1f516d28301b73fbe44ecd2cdaa9e1dd0aa7393e 100644 --- a/docs/development/how_to_debug.rst +++ b/docs/development/how_to_debug.rst @@ -101,17 +101,20 @@ MACE also provides model visualization HTML generated in `builds` directory, gen Debug engine using log -------------------------- -Mace defines two sorts of logs: one is for users (LOG), the other is for developers (VLOG). +MACE implements a logging mechanism similar to `glog <https://github.com/google/glog>`__. +There are two types of logs: LOG for normal logging and VLOG for debugging. -LOG includes four levels, i.e, ``INFO``, ``WARNING``, ``ERROR``, ``FATAL``; -Environment variable ``MACE_CPP_MIN_LOG_LEVEL`` can be set to specify log level of users, e.g., -``set MACE_CPP_MIN_LOG_LEVEL=0`` will enable ``INFO`` log level, while ``set MACE_CPP_MIN_LOG_LEVEL=4`` will enable ``FATAL`` log level. +LOG includes four levels, sorted by severity: ``INFO``, ``WARNING``, ``ERROR``, ``FATAL``. +The logging severity threshold can be configured via the environment variable ``MACE_CPP_MIN_LOG_LEVEL``, e.g. ``MACE_CPP_MIN_LOG_LEVEL=WARNING`` sets it to ``WARNING``. 
+Only log messages with severity equal to or above the specified threshold will be printed; the default threshold is ``INFO``. +We don't support integer log severity values as `glog <https://github.com/google/glog>`__ does, because they are easily confused with VLOG levels. +VLOG is verbose logging, which is emitted as ``LOG(INFO)``. VLOG also has finer-grained integer verbose levels, such as 0, 1, 2, 3. +The threshold can be configured through the environment variable ``MACE_CPP_MIN_VLOG_LEVEL``, e.g. ``MACE_CPP_MIN_VLOG_LEVEL=2`` sets it to ``2``. +With VLOG, the lower the verbose level, the more likely the message is to be logged. For example, when the threshold is set +to 2, both ``VLOG(1)`` and ``VLOG(2)`` log messages will be printed, but ``VLOG(3)`` and higher won't. -VLOG level is specified by numbers, e.g., 0, 1, 2. Environment variable ``MACE_CPP_MIN_VLOG_LEVEL`` can be set to specify vlog level. -Logs with higher levels than which is specified will be printed. So simply specifying a very large level number will make all logs printed. - -By using Mace run tool, vlog level can be easily set by option, e.g., +By using the ``mace_run`` tool, the VLOG level can be easily set by an option, e.g., .. code:: sh @@ -168,9 +171,3 @@ things may be a little bit complicated. # then you can use it as host gdb, e.g., bt - - - - - - diff --git a/docs/installation/env_requirement.rst b/docs/installation/env_requirement.rst index be15c67c0917d59caea47836225ba67143098bf9..4a599ec523e31413cf2bd7c169782bba488760d3 100644 --- a/docs/installation/env_requirement.rst +++ b/docs/installation/env_requirement.rst @@ -41,7 +41,7 @@ For Bazel, install it following installation guide. For python dependencies, .. code:: sh - pip install -U --user setup/requirements.txt + pip install -U --user -r setup/requirements.txt @@ -83,7 +83,7 @@ For python dependencies, .. code:: sh - pip install -U --user setup/optionals.txt + pip install -U --user -r setup/optionals.txt .. note:: diff --git a/docs/installation/using_docker.rst b/docs/installation/using_docker.rst index 61e929a33b5f14bf58a1fb74fb13e0598919cd3c..b0c5ac4e5781cb1857eb42f2a8f0fdf268fcb29a 100644 --- a/docs/installation/using_docker.rst +++ b/docs/installation/using_docker.rst @@ -15,18 +15,18 @@ In most cases, the ``lite edition`` image can satisfy developer's basic needs. .. code:: sh - # Pull lite edition docker image + # You can pull lite edition docker image from docker repo (recommended) docker pull registry.cn-hangzhou.aliyuncs.com/xiaomimace/mace-dev-lite - # Build lite edition docker image + # Or build lite edition docker image by yourself docker build -t registry.cn-hangzhou.aliyuncs.com/xiaomimace/mace-dev-lite ./docker/mace-dev-lite - ``full edition`` docker image (which contains multiple NDK versions and other dev tools). .. code:: sh - # Pull full edition docker image + # You can pull full edition docker image from docker repo (recommended) docker pull registry.cn-hangzhou.aliyuncs.com/xiaomimace/mace-dev - # Build full edition docker image + # Or build full edition docker image by yourself docker build -t registry.cn-hangzhou.aliyuncs.com/xiaomimace/mace-dev ./docker/mace-dev .. note:: diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index 58c0f9352df91652220f6c45c0e5a76a504d4d80..dfd69cca91ef8ac90f35d1aa3dc6a4a9d8f832ac 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -81,7 +81,7 @@ in one deployment file. * - backend - The onnx backend framework for validation, could be [tensorflow, caffe2, pytorch], default is tensorflow. 
* - runtime - - The running device, one of [cpu, gpu, dsp, cpu_gpu]. cpu_gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU. + - The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU. * - data_type - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP. * - input_data_types @@ -421,11 +421,6 @@ the detailed information is in :doc:`benchmark`. - 3 - ``run``/``benchmark`` - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH - * - --gpu_perf_hint - - int - - 3 - - ``run``/``benchmark`` - - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH * - --gpu_priority_hint - int - 3 diff --git a/mace/BUILD b/mace/BUILD.bazel similarity index 78% rename from mace/BUILD rename to mace/BUILD.bazel index 4b7da51fccfac614fe845bdd95e58d960c62ed75..ef1c338d0838c12ef2c44035e6b8104baf1d6361 100644 --- a/mace/BUILD +++ b/mace/BUILD.bazel @@ -6,6 +6,22 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "linux", + define_values = { + "linux": "true", + }, + visibility = ["//visibility:public"], +) + +config_setting( + name = "darwin", + define_values = { + "darwin": "true", + }, + visibility = ["//visibility:public"], +) + config_setting( name = "android_armv7", values = { @@ -62,6 +78,17 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "hta_enabled", + define_values = { + "hta": "true", + }, + values = { + "crosstool_top": "//external:android/crosstool", + }, + visibility = ["//visibility:public"], +) + config_setting( name = "openmp_enabled", define_values = { diff --git a/mace/benchmark/BUILD b/mace/benchmark/BUILD.bazel similarity index 100% rename from mace/benchmark/BUILD rename to mace/benchmark/BUILD.bazel diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index 4bd44ada514baf095cdb4bdfb6520808a285172c..adb267f3c8bb5361e5b4f929d3888b37b1c014f2 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -21,9 +21,11 @@ #include // NOLINT(build/c++11) #include "gflags/gflags.h" +#include "mace/port/env.h" +#include "mace/port/file_system.h" #include "mace/public/mace.h" #include "mace/utils/logging.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" #include "mace/benchmark/statistics.h" #ifdef MODEL_GRAPH_FORMAT_CODE #include "mace/codegen/engine/mace_engine_factory.h" @@ -31,24 +33,6 @@ namespace mace { namespace benchmark { -namespace str_util { - -std::vector Split(const std::string &str, char delims) { - std::vector result; - std::string tmp = str; - while (!tmp.empty()) { - size_t next_offset = tmp.find(delims); - result.push_back(tmp.substr(0, next_offset)); - if (next_offset == std::string::npos) { - break; - } else { - tmp = tmp.substr(next_offset + 1); - } - } - return result; -} - -} // namespace str_util void ParseShape(const std::string &str, std::vector *shape) { std::string tmp = str; @@ -90,6 +74,18 @@ DeviceType ParseDeviceType(const std::string &device_str) { } } +DataFormat ParseDataFormat(const std::string &data_format_str) { + if (data_format_str == "NHWC") { + return DataFormat::NHWC; + } else if (data_format_str == "NCHW") { + return DataFormat::NCHW; + } else if (data_format_str == "OIHW") { + return DataFormat::OIHW; + } else { + return DataFormat::DF_NONE; + } +} + bool RunInference(MaceEngine *engine, const std::map &input_infos, std::map *output_infos, @@ -168,6 
+164,12 @@ DEFINE_string(output_node, "output_node0,output_node1", "output nodes, separated by comma"); DEFINE_string(input_shape, "", "input shape, separated by colon and comma"); DEFINE_string(output_shape, "", "output shape, separated by colon and comma"); +DEFINE_string(input_data_format, + "NHWC", + "input data formats, NONE|NHWC|NCHW"); +DEFINE_string(output_data_format, + "NHWC", + "output data formats, NONE|NHWC|NCHW"); DEFINE_string(input_file, "", "input file name"); DEFINE_int32(max_num_runs, 100, "max number of runs"); DEFINE_double(max_seconds, 10.0, "max number of seconds to run"); @@ -213,14 +215,10 @@ int Main(int argc, char **argv) { std::unique_ptr statistician(new OpStat()); - std::vector input_names = - str_util::Split(FLAGS_input_node, ','); - std::vector output_names = - str_util::Split(FLAGS_output_node, ','); - std::vector input_shapes = - str_util::Split(FLAGS_input_shape, ':'); - std::vector output_shapes = - str_util::Split(FLAGS_output_shape, ':'); + std::vector input_names = Split(FLAGS_input_node, ','); + std::vector output_names = Split(FLAGS_output_node, ','); + std::vector input_shapes = Split(FLAGS_input_shape, ':'); + std::vector output_shapes = Split(FLAGS_output_shape, ':'); const size_t input_count = input_shapes.size(); const size_t output_count = output_shapes.size(); @@ -233,6 +231,19 @@ int Main(int argc, char **argv) { ParseShape(output_shapes[i], &output_shape_vec[i]); } + std::vector raw_input_data_formats = + Split(FLAGS_input_data_format, ','); + std::vector raw_output_data_formats = + Split(FLAGS_output_data_format, ','); + std::vector input_data_formats(input_count); + std::vector output_data_formats(output_count); + for (size_t i = 0; i < input_count; ++i) { + input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]); + } + for (size_t i = 0; i < output_count; ++i) { + output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]); + } + mace::DeviceType device_type = ParseDeviceType(FLAGS_device); // configuration @@ -273,41 +284,46 @@ int Main(int argc, char **argv) { std::shared_ptr engine; MaceStatus create_engine_status; // Create Engine - std::vector model_graph_data; + std::unique_ptr model_graph_data; if (FLAGS_model_file != "") { - if (!mace::ReadBinaryFile(&model_graph_data, FLAGS_model_file)) { + auto fs = GetFileSystem(); + auto status = fs->NewReadOnlyMemoryRegionFromFile(FLAGS_model_file.c_str(), + &model_graph_data); + if (status != MaceStatus::MACE_SUCCESS) { LOG(FATAL) << "Failed to read file: " << FLAGS_model_file; } } - const unsigned char *model_weights_data = nullptr; - size_t model_weights_data_size = 0; + std::unique_ptr model_weights_data; if (FLAGS_model_data_file != "") { - MemoryMap(FLAGS_model_data_file, - &model_weights_data, - &model_weights_data_size); - MACE_CHECK(model_weights_data != nullptr && model_weights_data_size != 0); + auto fs = GetFileSystem(); + auto status = fs->NewReadOnlyMemoryRegionFromFile( + FLAGS_model_data_file.c_str(), + &model_weights_data); + if (status != MaceStatus::MACE_SUCCESS) { + LOG(FATAL) << "Failed to read file: " << FLAGS_model_data_file; + } + MACE_CHECK(model_weights_data->length() > 0); } #ifdef MODEL_GRAPH_FORMAT_CODE - create_engine_status = - CreateMaceEngineFromCode(FLAGS_model_name, - model_weights_data, - model_weights_data_size, - input_names, - output_names, - config, - &engine); + create_engine_status = CreateMaceEngineFromCode(FLAGS_model_name, + reinterpret_cast(model_weights_data->data()), + model_weights_data->length(), + input_names, + 
output_names, + config, + &engine); #else - create_engine_status = - CreateMaceEngineFromProto(model_graph_data.data(), - model_graph_data.size(), - model_weights_data, - model_weights_data_size, - input_names, - output_names, - config, - &engine); + create_engine_status = CreateMaceEngineFromProto( + reinterpret_cast(model_graph_data->data()), + model_graph_data->length(), + reinterpret_cast(model_weights_data->data()), + model_weights_data->length(), + input_names, + output_names, + config, + &engine); #endif if (create_engine_status != MaceStatus::MACE_SUCCESS) { LOG(FATAL) << "Create engine error, please check the arguments"; @@ -333,7 +349,8 @@ int Main(int argc, char **argv) { LOG(INFO) << "Open input file failed"; return -1; } - inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in); + inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in, + input_data_formats[i]); } for (size_t i = 0; i < output_count; ++i) { @@ -344,7 +361,8 @@ int Main(int argc, char **argv) { auto buffer_out = std::shared_ptr(new float[output_size], std::default_delete()); outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i], - buffer_out); + buffer_out, + output_data_formats[i]); } int64_t warmup_time_us = 0; @@ -380,10 +398,6 @@ int Main(int argc, char **argv) { statistician->PrintStat(); - if (model_weights_data != nullptr) { - MemoryUnMap(model_weights_data, model_weights_data_size); - } - return 0; } diff --git a/mace/benchmark/model_throughput_test.cc b/mace/benchmark/model_throughput_test.cc index 66b178cf7178919adf57d064f2aa21ccee0dc491..cdc4639155cdab36f45eb038907e7ac71e069f2e 100644 --- a/mace/benchmark/model_throughput_test.cc +++ b/mace/benchmark/model_throughput_test.cc @@ -23,8 +23,7 @@ * --dsp_model_data_file=dsp_model_data.data \ * --run_seconds=10 */ -#include -#include +#include #include #include #include @@ -33,7 +32,7 @@ #include "gflags/gflags.h" #include "mace/public/mace.h" -#include "mace/utils/env_time.h" +#include "mace/port/env.h" #include "mace/utils/logging.h" #include "mace/core/types.h" diff --git a/mace/codegen/BUILD b/mace/codegen/BUILD.bazel similarity index 100% rename from mace/codegen/BUILD rename to mace/codegen/BUILD.bazel diff --git a/mace/core/BUILD b/mace/core/BUILD.bazel similarity index 73% rename from mace/core/BUILD rename to mace/core/BUILD.bazel index 2e37524ffd8a77e400ba2924cd656586744b3af3..91df4f0f1d0d0a66b2903575a4373b26897628cb 100644 --- a/mace/core/BUILD +++ b/mace/core/BUILD.bazel @@ -10,11 +10,14 @@ licenses(["notice"]) # Apache 2.0 load( "//mace:mace.bzl", "if_android", + "if_android_armv7", "if_hexagon_enabled", - "if_not_hexagon_enabled", - "if_openmp_enabled", + "if_hta_enabled", + "if_hexagon_or_hta_enabled", "if_neon_enabled", + "if_not_hexagon_enabled", "if_opencl_enabled", + "if_openmp_enabled", "if_quantize_enabled", ) @@ -32,17 +35,24 @@ cc_library( [ "runtime/opencl/*.cc", ], - )) + if_hexagon_enabled(glob([ - "runtime/hexagon/*.cc", - ])), + )) + if_hexagon_enabled([ + "runtime/hexagon/hexagon_dsp_wrapper.cc", + ]) + if_hta_enabled([ + "runtime/hexagon/hexagon_hta_wrapper.cc", + ]), hdrs = glob([ "*.h", "runtime/cpu/*.h", - ]) + if_opencl_enabled(glob( - [ - "runtime/opencl/*.h", - ], - )) + if_hexagon_enabled(glob(["runtime/hexagon/*.h"])), + ]) + if_opencl_enabled(glob([ + "runtime/opencl/*.h", + ])) + if_hexagon_or_hta_enabled(glob([ + "runtime/hexagon/hexagon_control_wrapper.h", + "runtime/hexagon/hexagon_device.h", + ])) + if_hexagon_enabled(glob([ + "runtime/hexagon/*dsp*.h", + ])) + 
if_hta_enabled(glob([ + "runtime/hexagon/*hta*.h", + ])), copts = [ "-Werror", "-Wextra", @@ -56,17 +66,20 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", + ]) + if_hta_enabled([ + "-DMACE_ENABLE_HTA", ]) + if_neon_enabled([ "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + "-mfloat-abi=softfp", ]), - linkopts = ["-ldl"] + if_android([ - "-pie", - "-lm", - ]), + linkopts = ["-ldl"], deps = [ "//mace/codegen:generated_version", "//mace/proto:mace_cc", "//mace/utils", + "//mace/port", "@half//:half", ] + if_opencl_enabled([ ":opencl_headers", @@ -75,6 +88,8 @@ cc_library( "@gemmlowp", ]) + if_hexagon_enabled([ "//third_party/nnlib:libhexagon", + ]) + if_hta_enabled([ + "//third_party/hta", ]), ) diff --git a/mace/core/allocator.h b/mace/core/allocator.h index 9c9103635245921ca2c354702b8ec9b062c40f37..c7499b92b51053436e61edabf4c93069c93f7a81 100644 --- a/mace/core/allocator.h +++ b/mace/core/allocator.h @@ -15,14 +15,13 @@ #ifndef MACE_CORE_ALLOCATOR_H_ #define MACE_CORE_ALLOCATOR_H_ -#include -#include +#include #include #include #include #include -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/core/types.h" #include "mace/core/runtime_failure_mock.h" #include "mace/public/mace.h" diff --git a/mace/core/buffer.h b/mace/core/buffer.h index 66684db150f459f877ac6b9a893b9027f9644548..d1f5f1a507ffde8f884b81096ea19b7ffd60ba73 100644 --- a/mace/core/buffer.h +++ b/mace/core/buffer.h @@ -21,8 +21,9 @@ #include #include "mace/core/allocator.h" -#include "mace/core/macros.h" #include "mace/core/types.h" +#include "mace/utils/logging.h" +#include "mace/utils/macros.h" namespace mace { namespace core { @@ -434,16 +435,11 @@ class BufferSlice : public BufferBase { } void *Map(index_t offset, index_t length, std::vector *pitch) const { - MACE_UNUSED(offset); - MACE_UNUSED(length); - MACE_UNUSED(pitch); - MACE_NOT_IMPLEMENTED; - return nullptr; + return buffer_->Map(offset_ + offset, length, pitch); } void UnMap(void *mapped_ptr) const { - MACE_UNUSED(mapped_ptr); - MACE_NOT_IMPLEMENTED; + buffer_->UnMap(mapped_ptr); } void Map(std::vector *pitch) { @@ -507,7 +503,7 @@ class ScratchBuffer: public Buffer { virtual ~ScratchBuffer() {} MaceStatus GrowSize(const index_t size) { - if (size > size_) { + if (offset_ + size > size_) { VLOG(1) << "Grow scratch size to: " << size; MACE_CHECK(offset_ == 0, "scratch is being used, cannot grow size"); return Resize(size); diff --git a/mace/core/device.cc b/mace/core/device.cc index 177b443ba25c729c54a49f4d77cc09cfac952879..535b7193633cf6881fea54f129c0485ddc3ed585 100644 --- a/mace/core/device.cc +++ b/mace/core/device.cc @@ -15,16 +15,17 @@ #include "mace/core/device.h" #include "mace/core/buffer.h" +#include "mace/utils/memory.h" namespace mace { CPUDevice::CPUDevice(const int num_threads, const CPUAffinityPolicy policy, const bool use_gemmlowp) - : cpu_runtime_(new CPURuntime(num_threads, - policy, - use_gemmlowp)), - scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {} + : cpu_runtime_(make_unique(num_threads, + policy, + use_gemmlowp)), + scratch_buffer_(make_unique(GetCPUAllocator())) {} CPUDevice::~CPUDevice() = default; diff --git a/mace/core/future.h b/mace/core/future.h index 13382e1bf84575f2b0e5e63b0d881d720fc0e5d9..c7227d4df6ade05b1a6d392de0dbfa4772dff39d 100644 --- a/mace/core/future.h +++ b/mace/core/future.h @@ -20,11 +20,10 @@ #include #include "mace/utils/logging.h" +#include "mace/public/mace.h" namespace mace { -struct CallStats; - // Wait the call to finish 
and get the stats if param is not nullptr struct StatsFuture { std::function wait_fn = [](CallStats *stats) { diff --git a/mace/core/kv_storage.cc b/mace/core/kv_storage.cc index 5eba8567171bba82b6f2e9d2bef094b5614490e8..e2feb8c827b5939098eb0b7d3a451b1ad62b44a6 100644 --- a/mace/core/kv_storage.cc +++ b/mace/core/kv_storage.cc @@ -13,18 +13,18 @@ // limitations under the License. #include -#include #include #include #include +#include #include #include #include #include #include "mace/core/kv_storage.h" -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/utils/logging.h" namespace mace { diff --git a/mace/core/memory_optimizer.cc b/mace/core/memory_optimizer.cc index 756e9321cf3a93559737e8b5e3c897462e3a5488..004fb1a927ae9a15ad733ebcf61918c4983f99e0 100644 --- a/mace/core/memory_optimizer.cc +++ b/mace/core/memory_optimizer.cc @@ -21,8 +21,9 @@ #include #include "mace/core/arg_helper.h" -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/utils/logging.h" +#include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/opencl_util.h" @@ -61,12 +62,22 @@ void MemoryOptimizer::UpdateTensorRef(const mace::OperatorDef *op_def) { } MemoryBlock MemoryOptimizer::CreateMemoryBlock( - std::vector shape, + const OperatorDef *op_def, + int output_idx, DataType dt, - mace::MemoryType mem_type) { + MemoryType mem_type) { + auto shape = std::vector( + op_def->output_shape(output_idx).dims().begin(), + op_def->output_shape(output_idx).dims().end()); MemoryBlock block; #ifdef MACE_ENABLE_OPENCL if (mem_type == MemoryType::GPU_IMAGE) { + OpenCLBufferType buffer_type = OpenCLBufferType::IN_OUT_CHANNEL; + if (op_def->type() == "BufferTransform") { + buffer_type = static_cast( + ProtoArgHelper::GetOptionalArg( + *op_def, "buffer_type", OpenCLBufferType::IN_OUT_CHANNEL)); + } std::vector image_shape; if (shape.size() == 1) { shape = {shape[0], 1, 1, 1}; @@ -75,9 +86,7 @@ MemoryBlock MemoryOptimizer::CreateMemoryBlock( } else { MACE_CHECK(shape.size() == 4) << "GPU only support 1D/2D/4D input"; } - OpenCLUtil::CalImage2DShape(shape, - OpenCLBufferType::IN_OUT_CHANNEL, - &image_shape); + OpenCLUtil::CalImage2DShape(shape, buffer_type, &image_shape); block.set_x(image_shape[0]); block.set_y(image_shape[1]); return block; @@ -95,7 +104,7 @@ MemoryBlock MemoryOptimizer::CreateMemoryBlock( void MemoryOptimizer::Optimize( const mace::OperatorDef *op_def, - const std::unordered_map &mem_types) { + const std::unordered_map *mem_types) { MACE_LATENCY_LOGGER(2, "Optimize memory"); if (op_def->output_size() != op_def->output_shape_size()) { VLOG(1) << op_def->name() @@ -117,6 +126,8 @@ void MemoryOptimizer::Optimize( op_def->output_type_size()); DataType dt; + bool has_data_format = ProtoArgHelper::GetOptionalArg( + *op_def, "has_data_format", 0) != 0; int output_size = op_def->output_size(); for (int i = 0; i < output_size; ++i) { if (i < op_def->output_type_size()) { @@ -127,22 +138,15 @@ void MemoryOptimizer::Optimize( int best_mem_id = -1; MemoryType mem_type = MemoryType::CPU_BUFFER; if (device == DeviceType::GPU) { - mem_type = mem_types.at(op_def->output(i)); + mem_type = mem_types->at(op_def->output(i)); } - auto shape = std::vector( - op_def->output_shape(i).dims().begin(), - op_def->output_shape(i).dims().end()); - MemoryBlock op_mem_block = CreateMemoryBlock(shape, dt, mem_type); + MemoryBlock op_mem_block = CreateMemoryBlock(op_def, i, dt, mem_type); MemoryBlock best_mem_block; if (IsMemoryReuseOp(op_def->type())) { if 
(tensor_mem_map_.count(op_def->input(0)) == 1) { - best_mem_id = tensor_mem_map_[op_def->input(0)].first; + best_mem_id = tensor_mem_map_.at(op_def->input(0)).mem_id; } } else { - auto shape = std::vector( - op_def->output_shape(i).dims().begin(), - op_def->output_shape(i).dims().end()); - int64_t op_mem_size = op_mem_block.x() * op_mem_block.y(); int64_t best_added_mem_size = LLONG_MAX; int64_t best_wasted_mem_size = LLONG_MAX; @@ -206,7 +210,8 @@ void MemoryOptimizer::Optimize( } else { mem_ref_count_[best_mem_id] = 1; } - tensor_mem_map_[op_def->output(i)] = std::make_pair(best_mem_id, dt); + tensor_mem_map_.emplace(op_def->output(i), TensorMemInfo(best_mem_id, + dt, has_data_format)); } } @@ -218,7 +223,7 @@ void MemoryOptimizer::Optimize( tensor_ref_count_[input_name] -= 1; if (tensor_ref_count_.at(input_name) == 0 && tensor_mem_map_.count(input_name) == 1) { - int mem_id = tensor_mem_map_.at(input_name).first; + int mem_id = tensor_mem_map_.at(input_name).mem_id; mem_ref_count_[mem_id] -= 1; if (mem_ref_count_.at(mem_id) == 0) { idle_blocks_.insert(mem_id); @@ -238,7 +243,7 @@ const std::vector& MemoryOptimizer::mem_blocks() const { return mem_blocks_; } -const std::unordered_map>& +const std::unordered_map& MemoryOptimizer::tensor_mem_map() const { return tensor_mem_map_; } diff --git a/mace/core/memory_optimizer.h b/mace/core/memory_optimizer.h index 555613e6a2043a47289bab0d8a44c282097bafc8..986c5450280184990b426b18d99b886ee6f8fcac 100644 --- a/mace/core/memory_optimizer.h +++ b/mace/core/memory_optimizer.h @@ -77,31 +77,44 @@ class MemoryBlock { }; class MemoryOptimizer { + public: + struct TensorMemInfo { + int mem_id; + DataType data_type; + bool has_data_format; + + TensorMemInfo(int mem_id, DataType data_type, bool has_data_format) : + mem_id(mem_id), data_type(data_type), has_data_format(has_data_format) + {} + }; + public: static bool IsMemoryReuseOp(const std::string &op_type); void UpdateTensorRef(const std::string &tensor_name); void UpdateTensorRef(const OperatorDef *op_def); - void Optimize(const OperatorDef *op_def, - const std::unordered_map &mem_types); + void Optimize( + const OperatorDef *op_def, + const std::unordered_map *mem_types = nullptr); const std::vector &mem_blocks() const; - const std::unordered_map> &tensor_mem_map() const; + const std::unordered_map &tensor_mem_map() const; std::string DebugInfo() const; private: - MemoryBlock CreateMemoryBlock(std::vector shape, - DataType dt, - MemoryType mem_type); + MemoryBlock CreateMemoryBlock( + const OperatorDef *op_def, + int output_idx, + DataType dt, + MemoryType mem_type); private: std::unordered_map tensor_ref_count_; std::vector mem_blocks_; // tensor name : // Buffer Memory do not different data type, so store the data type. 
- std::unordered_map> tensor_mem_map_; + std::unordered_map tensor_mem_map_; std::unordered_map mem_ref_count_; std::set idle_blocks_; }; diff --git a/mace/core/net.cc b/mace/core/net.cc index 5ff777b0607715ac5caa9a3beb40c17841b00d3a..fbe1c1b8b9da81929732a77c176195f29dd688b9 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -19,14 +19,17 @@ #include #include "mace/core/future.h" -#include "mace/core/macros.h" #include "mace/core/memory_optimizer.h" #include "mace/core/net.h" #include "mace/core/op_context.h" #include "mace/public/mace.h" -#include "mace/utils/memory_logging.h" +#include "mace/port/env.h" +#include "mace/utils/conf_util.h" +#include "mace/utils/logging.h" +#include "mace/utils/macros.h" +#include "mace/utils/math.h" +#include "mace/utils/memory.h" #include "mace/utils/timer.h" -#include "mace/utils/utils.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/opencl_util.h" @@ -38,12 +41,15 @@ namespace { struct InternalOutputInfo { InternalOutputInfo(const MemoryType mem_type, const DataType dtype, + const DataFormat data_format, const std::vector &shape, int op_idx) - : mem_type(mem_type), dtype(dtype), shape(shape), op_idx(op_idx) {} + : mem_type(mem_type), dtype(dtype), data_format(data_format), + shape(shape), op_idx(op_idx) {} MemoryType mem_type; // transformed memory type DataType dtype; + DataFormat data_format; std::vector shape; // tensor shape int op_idx; // operation which generate the tensor }; @@ -70,12 +76,12 @@ std::unique_ptr SerialNet::CreateOperation( const OpRegistryBase *op_registry, OpConstructContext *construct_context, std::shared_ptr op_def, - DataFormat data_format_flag, + bool has_data_format, bool is_quantize_model) { // Create the Operation DeviceType target_device_type = target_device_->device_type(); DeviceType device_type = DeviceType::CPU; - construct_context->set_device(cpu_device_); + construct_context->set_device(cpu_device_.get()); construct_context->set_operator_def(op_def); construct_context->set_output_mem_type(MemoryType::CPU_BUFFER); // Get available devices @@ -100,8 +106,7 @@ std::unique_ptr SerialNet::CreateOperation( if (!is_quantize_model && device_type == DeviceType::CPU && op_def->output_shape_size() == op_def->output_size()) { for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) { - if (data_format_flag == NHWC && - op_def->output_shape(out_idx).dims_size() == 4) { + if (has_data_format && op_def->output_shape(out_idx).dims_size() == 4) { // NHWC -> NCHW std::vector output_shape = TransposeShape( @@ -115,9 +120,8 @@ std::unique_ptr SerialNet::CreateOperation( } } } - std::unique_ptr op( - op_registry->CreateOperation(construct_context, device_type)); - return std::move(op); + + return op_registry->CreateOperation(construct_context, device_type); } SerialNet::SerialNet(const OpRegistryBase *op_registry, @@ -129,17 +133,11 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, ws_(ws), target_device_(target_device), cpu_device_( - new CPUDevice(target_device->cpu_runtime()->num_threads(), - target_device->cpu_runtime()->policy(), - target_device->cpu_runtime()->use_gemmlowp())) { + make_unique( + target_device->cpu_runtime()->num_threads(), + target_device->cpu_runtime()->policy(), + target_device->cpu_runtime()->use_gemmlowp())) { MACE_LATENCY_LOGGER(1, "Constructing SerialNet"); - // output tensor : related information - std::unordered_map output_map; - // used for memory optimization - std::unordered_map output_mem_map; - std::unordered_set transformed_set; - // add input information - 
MemoryType target_mem_type; // quantize model flag bool is_quantize_model = IsQuantizedModel(*net_def); // Tensor Shape map @@ -149,20 +147,18 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, continue; } for (int i = 0; i < op.output_size(); ++i) { - tensor_shape_map[op.output(i)] = - std::move(std::vector(op.output_shape(i).dims().begin(), - op.output_shape(i).dims().end())); + tensor_shape_map[op.output(i)] = std::vector( + op.output_shape(i).dims().begin(), + op.output_shape(i).dims().end()); } } for (auto &tensor : net_def->tensors()) { tensor_shape_map[tensor.name()] = - std::move(std::vector(tensor.dims().begin(), - tensor.dims().end())); + std::vector(tensor.dims().begin(), tensor.dims().end()); } - DataFormat data_format_flag = NHWC; + bool has_data_format = false; if (target_device_->device_type() == DeviceType::CPU) { - target_mem_type = MemoryType::CPU_BUFFER; for (auto &input_info : net_def->input_info()) { std::vector input_shape = std::vector(input_info.dims().begin(), @@ -170,38 +166,45 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, // update tensor shape map tensor_shape_map[input_info.name()] = input_shape; // Only could be NONE or NHWC - auto input_data_format = static_cast( + DataFormat input_data_format = static_cast( input_info.data_format()); - if (!is_quantize_model && input_data_format == NHWC && + has_data_format = has_data_format || + (input_data_format != DataFormat::DF_NONE); + if (!is_quantize_model && input_data_format == DataFormat::NHWC && input_info.dims_size() == 4) { // NHWC -> NCHW input_shape = TransposeShape(input_shape, {0, 3, 1, 2}); - } else if (input_data_format == DataFormat::DF_NONE) { - data_format_flag = DataFormat::DF_NONE; } - output_map.emplace(input_info.name(), InternalOutputInfo( - target_mem_type, DataType::DT_FLOAT, input_shape, -1)); } } - #ifdef MACE_ENABLE_OPENCL - else { // GPU NOLINT[readability/braces] + // output tensor : related information + std::unordered_map output_map; + // used for memory optimization + std::unordered_map output_mem_map; + std::unordered_set transformed_set; + // add input information + MemoryType target_mem_type; + // default data format of output tensor + DataFormat default_output_df = DataFormat::DF_NONE; + if (target_device_->device_type() == DeviceType::GPU) { target_mem_type = MemoryType::GPU_BUFFER; for (auto &input_info : net_def->input_info()) { - auto input_data_format = static_cast( + DataFormat input_data_format = static_cast( input_info.data_format()); - if (input_data_format == DataFormat::DF_NONE) { - data_format_flag = DataFormat::DF_NONE; - } + has_data_format = input_data_format != DataFormat::DF_NONE; std::vector input_shape = std::vector(input_info.dims().begin(), input_info.dims().end()); // update tensor shape map tensor_shape_map[input_info.name()] = input_shape; output_map.emplace(input_info.name(), InternalOutputInfo( - target_mem_type, DataType::DT_FLOAT, input_shape, -1)); + target_mem_type, DataType::DT_FLOAT, input_data_format, + input_shape, -1)); } + default_output_df = + has_data_format ? 
DataFormat::NHWC : DataFormat::DF_NONE; } #endif // MACE_ENABLE_OPENCL @@ -212,7 +215,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, auto op = CreateOperation(op_registry, &construct_context, op_def, - data_format_flag, + has_data_format, is_quantize_model); #ifdef MACE_ENABLE_OPENCL // Add input transform operation if necessary @@ -246,11 +249,13 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, << output_info.mem_type << " to " << wanted_in_mem_type << ", from Data Type " << output_info.dtype << " to " - << wanted_in_dt; + << wanted_in_dt << ". with data format " + << output_info.data_format; std::string input_name = op_def->input(i); op_def->set_input(i, t_input_name); auto input_shape = output_info.shape; if (output_info.mem_type == MemoryType::CPU_BUFFER && + output_info.data_format == DataFormat::NCHW && input_shape.size() == 4) { // NCHW -> NHWC input_shape = @@ -258,14 +263,15 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, {0, 2, 3, 1}); } auto transform_op_def = OpenCLUtil::CreateTransformOpDef( - input_name, input_shape, t_input_name, - wanted_in_dt, wanted_in_mem_type, data_format_flag); + input_name, input_shape, t_input_name, wanted_in_dt, + construct_context.GetInputOpenCLBufferType(i), + wanted_in_mem_type, has_data_format); OpConstructContext t_construct_context(ws_); auto transform_op = CreateOperation( op_registry, &t_construct_context, transform_op_def, - data_format_flag); + has_data_format); operators_.emplace_back(std::move(transform_op)); transformed_set.insert(t_input_name); output_mem_map[t_input_name] = wanted_in_mem_type; @@ -299,6 +305,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, InternalOutputInfo( out_mem_type, dt, + default_output_df, op_def->output_shape().empty() ? std::vector() : std::vector( @@ -340,20 +347,21 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, output_mem_map[output_info.name()] = target_mem_type; } } - auto output_data_format = + bool output_has_data_format = static_cast(output_info.data_format()); auto transform_op_def = OpenCLUtil::CreateTransformOpDef( t_output_name, internal_output_info.shape, output_info.name(), output_info.data_type(), + OpenCLBufferType::IN_OUT_CHANNEL, target_mem_type, - data_format_flag); + output_has_data_format); auto transform_op = CreateOperation( op_registry, &construct_context, transform_op_def, - output_data_format); + output_has_data_format); operators_.emplace_back(std::move(transform_op)); // where to do graph reference count. 
mem_optimizer->UpdateTensorRef(transform_op_def.get()); @@ -370,7 +378,11 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, for (auto &op : operators_) { VLOG(2) << "Operator " << op->debug_def().name() << "<" << op->device_type() << ", " << op->debug_def().type() << ">"; - mem_optimizer->Optimize(op->operator_def().get(), output_mem_map); +#ifdef MACE_ENABLE_OPENCL + mem_optimizer->Optimize(op->operator_def().get(), &output_mem_map); +#else + mem_optimizer->Optimize(op->operator_def().get()); +#endif // MACE_ENABLE_OPENCL } VLOG(1) << mem_optimizer->DebugInfo(); } @@ -384,7 +396,7 @@ MaceStatus SerialNet::Init() { if (device_type == target_device_->device_type()) { init_context.set_device(target_device_); } else { - init_context.set_device(cpu_device_); + init_context.set_device(cpu_device_.get()); } // Initialize the operation MACE_RETURN_IF_ERROR(op->Init(&init_context)); @@ -395,7 +407,7 @@ MaceStatus SerialNet::Init() { MaceStatus SerialNet::Run(RunMetadata *run_metadata) { MACE_MEMORY_LOGGING_GUARD(); MACE_LATENCY_LOGGER(1, "Running net"); - OpContext context(ws_, cpu_device_); + OpContext context(ws_, cpu_device_.get()); for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { auto &op = *iter; DeviceType device_type = op->device_type(); @@ -408,7 +420,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { if (device_type == target_device_->device_type()) { context.set_device(target_device_); } else { - context.set_device(cpu_device_); + context.set_device(cpu_device_.get()); } CallStats call_stats; @@ -452,7 +464,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { bool transpose_a = op->GetOptionalArg("transpose_a", false); kernels = op->Input(0)->shape(); if (transpose_a) { - std::swap(kernels[kernels.size()-2], kernels[kernels.size()-1]); + std::swap(kernels[kernels.size() - 2], kernels[kernels.size() - 1]); } } else if (type.compare("FullyConnected") == 0) { kernels = op->Input(1)->shape(); @@ -472,7 +484,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { VLOG(3) << "Operator " << op->debug_def().name() << " has shape: " << MakeString(op->Output(0)->shape()); - if (EnvEnabled("MACE_LOG_TENSOR_RANGE")) { + if (EnvConfEnabled("MACE_LOG_TENSOR_RANGE")) { for (int i = 0; i < op->OutputSize(); ++i) { if (op->debug_def().quantize_info_size() == 0) { int data_type = op->GetOptionalArg("T", static_cast(DT_FLOAT)); @@ -498,16 +510,16 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { Tensor::MappingGuard guard(op->Output(i)); auto *output_data = op->Output(i)->data(); for (index_t j = 0; j < op->Output(i)->size(); ++j) { - int index = static_cast((output_data[j] - min_v) / bin_v); - if (index < 0) - index = 0; - else if (index > bin_size-1) - index = bin_size-1; - bin_distribution[index]++; + int index = static_cast((output_data[j] - min_v) / bin_v); + if (index < 0) + index = 0; + else if (index > bin_size - 1) + index = bin_size - 1; + bin_distribution[index]++; } LOG(INFO) << "Tensor range @@" << op->debug_def().output(i) - << "@@" << min_v << "," << max_v<< "@@" - << MakeString(bin_distribution); + << "@@" << min_v << "," << max_v << "@@" + << MakeString(bin_distribution); } } } diff --git a/mace/core/net.h b/mace/core/net.h index 9945d04637d5eafa402297462b3e9adf1375abdd..788eb611a54158791f988d446153b4b50ef8a59e 100644 --- a/mace/core/net.h +++ b/mace/core/net.h @@ -59,14 +59,14 @@ class SerialNet : public NetBase { const OpRegistryBase *op_registry, OpConstructContext *construct_context, std::shared_ptr op_def, - DataFormat 
input_format, + bool has_data_format, bool is_quantize_model = false); protected: Workspace *ws_; Device *target_device_; // CPU is base device. - Device *cpu_device_; + std::unique_ptr cpu_device_; std::vector > operators_; MACE_DISABLE_COPY_AND_ASSIGN(SerialNet); diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 319b0548d6b75794c3061862ee62599af38cdd7f..8fae1bd8a710f0fb9f6536960ae195ab6b94cba1 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -86,6 +86,27 @@ DataType OpConstructContext::GetInputDataType(size_t idx) const { return input_data_types_[idx]; } +#ifdef MACE_ENABLE_OPENCL +void OpConstructContext::SetInputOpenCLBufferType( + size_t idx, OpenCLBufferType buffer_type) { + if (input_opencl_buffer_types_.empty()) { + // the default inputs' memory types are same as output memory type. + input_opencl_buffer_types_.resize(operator_def_->input_size(), + OpenCLBufferType::IN_OUT_CHANNEL); + } + MACE_CHECK(idx < input_opencl_buffer_types_.size()); + input_opencl_buffer_types_[idx] = buffer_type; +} +OpenCLBufferType OpConstructContext::GetInputOpenCLBufferType( + size_t idx) const { + if (input_opencl_buffer_types_.empty()) { + return OpenCLBufferType::IN_OUT_CHANNEL; + } + MACE_CHECK(idx < input_opencl_buffer_types_.size()); + return input_opencl_buffer_types_[idx]; +} +#endif // MACE_ENABLE_OPENCL + OpInitContext::OpInitContext(Workspace *ws, Device *device) : ws_(ws), device_(device) {} diff --git a/mace/core/operator.h b/mace/core/operator.h index 03a0f0749954b052b9b2dae558c0fed36612f5e5..e59af9ab166a5ace99bc7cc59b17a025cc0b1645 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -26,6 +26,9 @@ #include "mace/core/tensor.h" #include "mace/core/workspace.h" #include "mace/proto/mace.pb.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/opencl_util.h" +#endif // MACE_ENABLE_OPENCL namespace mace { @@ -72,6 +75,11 @@ class OpConstructContext { DataType GetInputDataType(size_t idx) const; +#ifdef MACE_ENABLE_OPENCL + void SetInputOpenCLBufferType(size_t idx, OpenCLBufferType buffer_type); + OpenCLBufferType GetInputOpenCLBufferType(size_t idx) const; +#endif // MACE_ENABLE_OPENCL + private: std::shared_ptr operator_def_; Workspace *ws_; @@ -81,6 +89,9 @@ class OpConstructContext { std::vector input_mem_types_; std::vector input_data_types_; MemoryType output_mem_type_; // there is only one output memory type now. 
+#ifdef MACE_ENABLE_OPENCL + std::vector input_opencl_buffer_types_; +#endif // MACE_ENABLE_OPENCL }; // memory_optimizer, device diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index c4ed389265a9881fd6505476ffe45f5852f1bc15..5db5b36b1bb8bd2d2399f1cfa4ba406e78654a40 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -18,9 +18,6 @@ #include #endif -#include -#include -#include #include #include #include @@ -29,8 +26,9 @@ #include #include -#include "mace/core/macros.h" +#include "mace/port/env.h" #include "mace/public/mace.h" +#include "mace/utils/macros.h" #include "mace/utils/logging.h" namespace mace { @@ -42,101 +40,36 @@ struct CPUFreq { float freq; }; -namespace { - -int GetCPUCount() { - int cpu_count = 0; - std::string cpu_sys_conf = "/proc/cpuinfo"; - std::ifstream f(cpu_sys_conf); - if (!f.is_open()) { - LOG(ERROR) << "failed to open " << cpu_sys_conf; - return -1; - } - std::string line; - const std::string processor_key = "processor"; - while (std::getline(f, line)) { - if (line.size() >= processor_key.size() - && line.compare(0, processor_key.size(), processor_key) == 0) { - ++cpu_count; - } - } - if (f.bad()) { - LOG(ERROR) << "failed to read " << cpu_sys_conf; - } - if (!f.eof()) { - LOG(ERROR) << "failed to read end of " << cpu_sys_conf; - } - f.close(); - VLOG(2) << "CPU cores: " << cpu_count; - return cpu_count; -} - -int GetCPUMaxFreq(std::vector *max_freqs) { - int cpu_count = GetCPUCount(); - for (int cpu_id = 0; cpu_id < cpu_count; ++cpu_id) { - std::string cpuinfo_max_freq_sys_conf = MakeString( - "/sys/devices/system/cpu/cpu", - cpu_id, - "/cpufreq/cpuinfo_max_freq"); - std::ifstream f(cpuinfo_max_freq_sys_conf); - if (!f.is_open()) { - LOG(ERROR) << "failed to open " << cpuinfo_max_freq_sys_conf; - return -1; - } - std::string line; - if (std::getline(f, line)) { - float freq = strtof(line.c_str(), nullptr); - max_freqs->push_back(freq); - } - if (f.bad()) { - LOG(ERROR) << "failed to read " << cpuinfo_max_freq_sys_conf; - } - f.close(); - } - - for (float freq : *max_freqs) { - VLOG(2) << "CPU freq: " << freq; - } - - return 0; -} +enum SchedulePolicy { + SCHED_STATIC, + SCHED_GUIDED, +}; -MaceStatus SetThreadAffinity(cpu_set_t mask) { -#if defined(__ANDROID__) - pid_t pid = gettid(); -#else - pid_t pid = syscall(SYS_gettid); -#endif - int err = sched_setaffinity(pid, sizeof(mask), &mask); - if (err) { - LOG(WARNING) << "set affinity error: " << strerror(errno); - return MaceStatus(MaceStatus::MACE_INVALID_ARGS, - "set affinity error: " + std::string(strerror(errno))); - } else { - return MaceStatus::MACE_SUCCESS; - } -} +namespace { MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, - const std::vector &cpu_ids) { + const std::vector &cpu_ids, + SchedulePolicy schedule_policy) { MaceOpenMPThreadCount = omp_num_threads; #ifdef MACE_ENABLE_OPENMP VLOG(1) << "Set OpenMP threads number: " << omp_num_threads << ", CPU core IDs: " << MakeString(cpu_ids); - omp_set_schedule(omp_sched_guided, 1); + if (schedule_policy == SCHED_GUIDED) { + omp_set_schedule(omp_sched_guided, 1); + } else if (schedule_policy == SCHED_STATIC) { + omp_set_schedule(omp_sched_static, 0); + } else { + LOG(WARNING) << "Unknown schedule policy: " << schedule_policy; + } + omp_set_num_threads(omp_num_threads); #else MACE_UNUSED(omp_num_threads); + MACE_UNUSED(schedule_policy); LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled."; #endif - // compute mask - cpu_set_t mask; - 
CPU_ZERO(&mask); - for (auto cpu_id : cpu_ids) { - CPU_SET(cpu_id, &mask); - } #ifdef MACE_ENABLE_OPENMP std::vector status(omp_num_threads, MaceStatus::MACE_INVALID_ARGS); @@ -144,7 +77,7 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, for (int i = 0; i < omp_num_threads; ++i) { VLOG(1) << "Set affinity for OpenMP thread " << omp_get_thread_num() << "/" << omp_get_num_threads(); - status[i] = SetThreadAffinity(mask); + status[i] = SchedSetAffinity(cpu_ids); } for (int i = 0; i < omp_num_threads; ++i) { if (status[i] != MaceStatus::MACE_SUCCESS) @@ -152,8 +85,8 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, } return MaceStatus::MACE_SUCCESS; #else - MaceStatus status = SetThreadAffinity(mask); - VLOG(1) << "Set affinity without OpenMP: " << mask.__bits[0]; + MaceStatus status = SchedSetAffinity(cpu_ids); + VLOG(1) << "Set affinity without OpenMP: " << MakeString(cpu_ids); return status; #endif } @@ -166,8 +99,9 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( void *gemm_context) { // get cpu frequency info std::vector cpu_max_freqs; - if (GetCPUMaxFreq(&cpu_max_freqs) == -1 || cpu_max_freqs.size() == 0) { - return MaceStatus::MACE_INVALID_ARGS; + MACE_RETURN_IF_ERROR(GetCPUMaxFreq(&cpu_max_freqs)); + if (cpu_max_freqs.empty()) { + return MaceStatus::MACE_RUNTIME_ERROR; } std::vector cpu_freq(cpu_max_freqs.size()); @@ -228,6 +162,7 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( } else { cores_to_use = num_threads_hint; } + MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0"); VLOG(2) << "Use " << num_threads_hint << " threads"; std::vector cpu_ids(cores_to_use); @@ -236,6 +171,10 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( << cpu_freq[i].freq; cpu_ids[i] = cpu_freq[i].core_id; } + SchedulePolicy sched_policy = SCHED_GUIDED; + if (std::abs(cpu_freq[0].freq - cpu_freq[cores_to_use - 1].freq) < 1e-6) { + sched_policy = SCHED_STATIC; + } #ifdef MACE_ENABLE_QUANTIZE if (gemm_context) { @@ -244,7 +183,9 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( } #endif // MACE_ENABLE_QUANTIZE - return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, cpu_ids); + return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, + cpu_ids, + sched_policy); } } // namespace mace diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h index 95fee27f5424eeed2f29eb782bd085115ab430c9..ab067ebaae698e2296dcee5469c93961f654b628 100644 --- a/mace/core/runtime/cpu/cpu_runtime.h +++ b/mace/core/runtime/cpu/cpu_runtime.h @@ -22,7 +22,7 @@ #include "public/gemmlowp.h" #endif // MACE_ENABLE_QUANTIZE -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/public/mace.h" #include "mace/utils/logging.h" @@ -52,13 +52,13 @@ class CPURuntime { #ifdef MACE_ENABLE_QUANTIZE ~CPURuntime() { - if (!gemm_context_) { + if (gemm_context_ != nullptr) { delete static_cast(gemm_context_); } } gemmlowp::GemmContext *GetGemmlowpContext() { - if (!gemm_context_) { + if (gemm_context_ == nullptr) { gemm_context_ = new gemmlowp::GemmContext(); } return static_cast(gemm_context_); diff --git a/mace/core/runtime/hexagon/hexagon_control_wrapper.h b/mace/core/runtime/hexagon/hexagon_control_wrapper.h index 1674e6cfdeefd3cfb1df9f5c71383715a6c3b1ba..eda740f400e47bab5fac2ab04057522ad9f9b7ce 100644 --- a/mace/core/runtime/hexagon/hexagon_control_wrapper.h +++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.h @@ -15,49 +15,68 @@ #ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_CONTROL_WRAPPER_H_ #define 
MACE_CORE_RUNTIME_HEXAGON_HEXAGON_CONTROL_WRAPPER_H_ +#include +#include #include -#include "mace/core/runtime/hexagon/quantize.h" #include "mace/core/tensor.h" #include "mace/public/mace.h" -#include "third_party/nnlib/hexagon_nn.h" namespace mace { +struct InOutInfo { + InOutInfo(const std::vector &shape, + const DataType data_type, + const float scale, + const int32_t zero_point, + std::unique_ptr tensor_u8) + : shape(shape), + data_type(data_type), + scale(scale), + zero_point(zero_point), + tensor_u8(std::move(tensor_u8)) {} + + std::vector shape; + DataType data_type; + float scale; + int32_t zero_point; + std::unique_ptr tensor_u8; +}; + class HexagonControlWrapper { public: - HexagonControlWrapper() {} - int GetVersion(); - bool Config(); - bool Init(); - bool Finalize(); - bool SetupGraph(const NetDef &net_def, const unsigned char *model_data); - bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor); - bool ExecuteGraphNew(const std::vector &input_tensors, - std::vector *output_tensors); + HexagonControlWrapper() = default; + virtual ~HexagonControlWrapper() = default; - bool TeardownGraph(); - void PrintLog(); - void PrintGraph(); - void GetPerfInfo(); - void ResetPerfInfo(); - void SetDebugLevel(int level); + virtual int GetVersion() = 0; + virtual bool Config() = 0; + virtual bool Init() = 0; + virtual bool Finalize() = 0; + virtual bool SetupGraph(const NetDef &net_def, + const unsigned char *model_data) = 0; + virtual bool ExecuteGraph(const Tensor &input_tensor, + Tensor *output_tensor) = 0; + virtual bool ExecuteGraphNew(const std::vector &input_tensors, + std::vector *output_tensors) = 0; + virtual bool TeardownGraph() = 0; + virtual void PrintLog() = 0; + virtual void PrintGraph() = 0; + virtual void GetPerfInfo() = 0; + virtual void ResetPerfInfo() = 0; + virtual void SetDebugLevel(int level) = 0; - private: - static constexpr int NODE_ID_OFFSET = 10000; - static constexpr int NUM_METADATA = 4; + protected: + static constexpr int kNodeIdOffset = 10000; + static constexpr int kNumMetaData = 4; - inline uint32_t node_id(uint32_t nodeid) { return NODE_ID_OFFSET + nodeid; } + inline uint32_t node_id(uint32_t nodeid) { return kNodeIdOffset + nodeid; } int nn_id_; - Quantizer quantizer_; - std::vector> input_shapes_; - std::vector> output_shapes_; - std::vector input_data_types_; - std::vector output_data_types_; - uint32_t num_inputs_; - uint32_t num_outputs_; + std::vector input_info_; + std::vector output_info_; + int num_inputs_; + int num_outputs_; MACE_DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper); }; diff --git a/mace/core/runtime/hexagon/hexagon_device.h b/mace/core/runtime/hexagon/hexagon_device.h index 0c933ae0b6ff2171008058cc074c293e1909b819..f80607d3196582f850d0911fec0429784cabaca0 100644 --- a/mace/core/runtime/hexagon/hexagon_device.h +++ b/mace/core/runtime/hexagon/hexagon_device.h @@ -15,18 +15,55 @@ #ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_ #define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_ +#include +#include + #include "mace/core/device.h" +#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" +#ifdef MACE_ENABLE_HEXAGON +#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h" +#endif +#ifdef MACE_ENABLE_HTA +#include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h" +#endif namespace mace { class HexagonDevice : public CPUDevice { public: - HexagonDevice() : CPUDevice(0, AFFINITY_NONE, false) {} + explicit HexagonDevice(DeviceType device_type) + : CPUDevice(0, AFFINITY_NONE, false), + device_type_(device_type) {} 
DeviceType device_type() const override { - return DeviceType::HEXAGON; + return device_type_; }; + + private: + DeviceType device_type_; }; +std::unique_ptr CreateHexagonControlWrapper( + DeviceType device_type) { + std::unique_ptr hexagon_controller; + + switch (device_type) { +#ifdef MACE_ENABLE_HEXAGON + case HEXAGON: + hexagon_controller = make_unique(); + break; +#endif +#ifdef MACE_ENABLE_HTA + case HTA: + hexagon_controller = make_unique(); + break; +#endif + default: + LOG(FATAL) << "Not supported Hexagon device type: " << device_type; + } + + return hexagon_controller; +} + } // namespace mace #endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_ diff --git a/mace/core/runtime/hexagon/hexagon_nn_ops.h b/mace/core/runtime/hexagon/hexagon_dsp_ops.h similarity index 89% rename from mace/core/runtime/hexagon/hexagon_nn_ops.h rename to mace/core/runtime/hexagon/hexagon_dsp_ops.h index 3ebedb8eb8d81850cd29383fd7667c42b2369262..1f50e13cb48bb8133fc31d71752a623fed16217f 100644 --- a/mace/core/runtime/hexagon/hexagon_nn_ops.h +++ b/mace/core/runtime/hexagon/hexagon_dsp_ops.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_ -#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_ +#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_OPS_H_ +#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_OPS_H_ #include #include @@ -57,4 +57,4 @@ class OpMap { }; } // namespace mace -#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_ +#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_OPS_H_ diff --git a/mace/core/runtime/hexagon/hexagon_control_wrapper.cc b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc similarity index 84% rename from mace/core/runtime/hexagon/hexagon_control_wrapper.cc rename to mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc index 5e0cb77213316f29b7f7f08a54d6380696d131a5..a98d9ad1499251a15d7b969cecee2eaf28f84347 100644 --- a/mace/core/runtime/hexagon/hexagon_control_wrapper.cc +++ b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc @@ -12,26 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include #include +#include #include // NOLINT(build/c++11) #include #include #include #include -#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" -#include "mace/core/runtime/hexagon/hexagon_nn_ops.h" +#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h" +#include "mace/core/runtime/hexagon/hexagon_dsp_ops.h" #include "mace/core/types.h" - -namespace { -inline int64_t NowMicros() { - struct timeval tv; - gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; -} -} +#include "mace/port/env.h" +#include "mace/utils/memory.h" +#include "third_party/nnlib/hexagon_nn.h" namespace mace { @@ -92,33 +87,33 @@ std::string FloatToString(const FloatType v, const int32_t precision) { } } // namespace -int HexagonControlWrapper::GetVersion() { +int HexagonDSPWrapper::GetVersion() { int version; MACE_CHECK(hexagon_nn_version(&version) == 0, "get version error"); return version; } -bool HexagonControlWrapper::Config() { +bool HexagonDSPWrapper::Config() { LOG(INFO) << "Hexagon config"; MACE_CHECK(hexagon_nn_set_powersave_level(0) == 0, "hexagon power error"); MACE_CHECK(hexagon_nn_config() == 0, "hexagon config error"); return true; } -bool HexagonControlWrapper::Init() { +bool HexagonDSPWrapper::Init() { LOG(INFO) << "Hexagon init"; MACE_CHECK(hexagon_nn_init(&nn_id_) == 0, "hexagon_nn_init failed"); ResetPerfInfo(); return true; } -bool HexagonControlWrapper::Finalize() { +bool HexagonDSPWrapper::Finalize() { LOG(INFO) << "Hexagon finalize"; return hexagon_nn_set_powersave_level(1) == 0; } -bool HexagonControlWrapper::SetupGraph(const NetDef &net_def, - unsigned const char *model_data) { +bool HexagonDSPWrapper::SetupGraph(const NetDef &net_def, + unsigned const char *model_data) { LOG(INFO) << "Hexagon setup graph"; int64_t t0 = NowMicros(); @@ -236,29 +231,35 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def, cached_outputs.clear(); // input info - num_inputs_ = 0; - for (const InputInfo &input_info : net_def.input_info()) { + num_inputs_ = net_def.input_info_size(); + input_info_.reserve(num_inputs_); + for (const InputOutputInfo &input_info : net_def.input_info()) { std::vector input_shape(input_info.dims().begin(), input_info.dims().end()); while (input_shape.size() < 4) { input_shape.insert(input_shape.begin(), 1); } - input_shapes_.push_back(input_shape); - input_data_types_.push_back(input_info.data_type()); - num_inputs_ += 1; + input_info_.emplace_back(input_shape, + input_info.data_type(), + input_info.scale(), + input_info.zero_point(), + make_unique()); } // output info - num_outputs_ = 0; - for (const OutputInfo &output_info : net_def.output_info()) { + num_outputs_ = net_def.output_info_size(); + output_info_.reserve(num_outputs_); + for (const InputOutputInfo &output_info : net_def.output_info()) { std::vector output_shape(output_info.dims().begin(), output_info.dims().end()); while (output_shape.size() < 4) { output_shape.insert(output_shape.begin(), 1); } - output_shapes_.push_back(output_shape); - output_data_types_.push_back(output_info.data_type()); - num_outputs_ += 1; + output_info_.emplace_back(output_shape, + output_info.data_type(), + output_info.scale(), + output_info.zero_point(), + make_unique()); VLOG(1) << "OutputInfo: " << "\n\t shape: " << output_shape[0] << " " << output_shape[1] << " " << output_shape[2] << " " << output_shape[3] @@ -276,14 +277,14 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def, return true; } -bool HexagonControlWrapper::TeardownGraph() { +bool 
HexagonDSPWrapper::TeardownGraph() { LOG(INFO) << "Hexagon teardown graph"; return hexagon_nn_teardown(nn_id_) == 0; } #define MACE_PRINT_BUFSIZE (2 * 1024 * 1024) -void HexagonControlWrapper::PrintLog() { +void HexagonDSPWrapper::PrintLog() { char *buf; if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return; MACE_CHECK(hexagon_nn_getlog(nn_id_, reinterpret_cast(buf), @@ -293,7 +294,7 @@ void HexagonControlWrapper::PrintLog() { delete[] buf; } -void HexagonControlWrapper::PrintGraph() { +void HexagonDSPWrapper::PrintGraph() { LOG(INFO) << "Print Graph"; char *buf; if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return; @@ -304,13 +305,13 @@ void HexagonControlWrapper::PrintGraph() { delete[] buf; } -void HexagonControlWrapper::SetDebugLevel(int level) { +void HexagonDSPWrapper::SetDebugLevel(int level) { LOG(INFO) << "Set debug level: " << level; MACE_CHECK(hexagon_nn_set_debug_level(nn_id_, level) == 0, "set debug level error"); } -void HexagonControlWrapper::GetPerfInfo() { +void HexagonDSPWrapper::GetPerfInfo() { LOG(INFO) << "Get perf info"; std::vector perf_info(MACE_MAX_NODE); unsigned int n_items = 0; @@ -385,20 +386,20 @@ void HexagonControlWrapper::GetPerfInfo() { LOG(INFO) << "total duration: " << std::fixed << total_duration; } -void HexagonControlWrapper::ResetPerfInfo() { +void HexagonDSPWrapper::ResetPerfInfo() { LOG(INFO) << "Reset perf info"; MACE_CHECK(hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME) == 0, "reset perf error"); } -bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor, - Tensor *output_tensor) { +bool HexagonDSPWrapper::ExecuteGraph(const Tensor &input_tensor, + Tensor *output_tensor) { VLOG(2) << "Execute graph: " << nn_id_; // single input and single output MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num"); MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num"); - output_tensor->SetDtype(output_data_types_[0]); - output_tensor->Resize(output_shapes_[0]); + output_tensor->SetDtype(output_info_[0].data_type); + output_tensor->Resize(output_info_[0].shape); std::vector output_shape(4); uint32_t output_bytes; int res = hexagon_nn_execute( @@ -418,10 +419,11 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor, &output_bytes); MACE_CHECK(res == 0, "execute error"); - MACE_ASSERT(output_shape.size() == output_shapes_[0].size(), + MACE_ASSERT(output_shape.size() == output_info_[0].shape.size(), "wrong output shape inferred"); for (size_t i = 0; i < output_shape.size(); ++i) { - MACE_ASSERT(static_cast(output_shape[i]) == output_shapes_[0][i], + MACE_ASSERT(static_cast(output_shape[i]) + == output_info_[0].shape[i], "wrong output shape inferred"); } MACE_ASSERT(output_bytes == output_tensor->raw_size(), @@ -429,7 +431,7 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor, return res == 0; } -bool HexagonControlWrapper::ExecuteGraphNew( +bool HexagonDSPWrapper::ExecuteGraphNew( const std::vector &input_tensors, std::vector *output_tensors) { VLOG(2) << "Execute graph new: " << nn_id_; @@ -438,14 +440,15 @@ bool HexagonControlWrapper::ExecuteGraphNew( MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num"); MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num"); - std::vector inputs(num_inputs * NUM_METADATA); - std::vector outputs(num_outputs * NUM_METADATA); + std::vector inputs(num_inputs * kNumMetaData); + std::vector outputs(num_outputs * kNumMetaData); std::vector input_metadata(num_inputs); std::vector output_metadata(num_outputs); + // transform mace input to hexagon input for (size_t i 
= 0; i < num_inputs; ++i) { std::vector input_shape = input_tensors[i]->shape(); - size_t index = i * NUM_METADATA; + size_t index = i * kNumMetaData; inputs[index].batches = static_cast(input_shape[0]); inputs[index].height = static_cast(input_shape[1]); inputs[index].width = static_cast(input_shape[2]); @@ -453,8 +456,8 @@ bool HexagonControlWrapper::ExecuteGraphNew( inputs[index].data = const_cast( reinterpret_cast(input_tensors[i]->raw_data())); inputs[index].dataLen = static_cast(input_tensors[i]->raw_size()); - inputs[index].data_valid_len = static_cast( - input_tensors[i]->raw_size()); + inputs[index].data_valid_len = + static_cast(input_tensors[i]->raw_size()); inputs[index].unused = 0; input_metadata[i].Init(.0f, .0f, 1); AddInputMetadata(input_metadata[i].min_val, &inputs[index + 1]); @@ -462,38 +465,44 @@ bool HexagonControlWrapper::ExecuteGraphNew( AddInputMetadata(input_metadata[i].needs_quantization, &inputs[index + 3]); } + // transform mace output to hexagon output for (size_t i = 0; i < num_outputs; ++i) { - size_t index = i * NUM_METADATA; - (*output_tensors)[i]->SetDtype(output_data_types_[i]); - (*output_tensors)[i]->Resize(output_shapes_[i]); + size_t index = i * kNumMetaData; + (*output_tensors)[i]->SetDtype(output_info_[i].data_type); + (*output_tensors)[i]->Resize(output_info_[i].shape); + outputs[index].data = reinterpret_cast( (*output_tensors)[i]->raw_mutable_data()); outputs[index].dataLen = static_cast((*output_tensors)[i]->raw_size()); output_metadata[i].Init(.0f, .0f, 1); + AddOutputMetadata(output_metadata[i].min_val, &outputs[index + 1]); AddOutputMetadata(output_metadata[i].max_val, &outputs[index + 2]); AddOutputMetadata(output_metadata[i].needs_quantization, &outputs[index + 3]); } + // Execute graph int res = hexagon_nn_execute_new(nn_id_, inputs.data(), - num_inputs * NUM_METADATA, + num_inputs * kNumMetaData, outputs.data(), - num_outputs * NUM_METADATA); + num_outputs * kNumMetaData); + // handle hexagon output for (size_t i = 0; i < num_outputs; ++i) { - size_t index = i * NUM_METADATA; + size_t index = i * kNumMetaData; std::vector output_shape{ outputs[index].batches, outputs[index].height, outputs[index].width, outputs[index].depth}; - MACE_ASSERT(output_shape.size() == output_shapes_[i].size(), + MACE_ASSERT(output_shape.size() == output_info_[i].shape.size(), "wrong output shape inferred"); for (size_t j = 0; j < output_shape.size(); ++j) { MACE_ASSERT(static_cast(output_shape[j]) - == output_shapes_[i][j], + == output_info_[i].shape[j], "wrong output shape inferred"); } + MACE_ASSERT(static_cast(outputs[index].data_valid_len) == (*output_tensors)[i]->raw_size(), "wrong output bytes inferred."); diff --git a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..2c46414bf390b87af35f2000e2732b0e50663e95 --- /dev/null +++ b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h @@ -0,0 +1,51 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_WRAPPER_H_ +#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_WRAPPER_H_ + +#include + +#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" +#include "mace/core/tensor.h" +#include "mace/public/mace.h" + +namespace mace { + +class HexagonDSPWrapper : public HexagonControlWrapper { + public: + HexagonDSPWrapper() = default; + + int GetVersion() override; + bool Config() override; + bool Init() override; + bool Finalize() override; + bool SetupGraph(const NetDef &net_def, + const unsigned char *model_data) override; + bool ExecuteGraph(const Tensor &input_tensor, + Tensor *output_tensor) override; + bool ExecuteGraphNew(const std::vector &input_tensors, + std::vector *output_tensors) override; + bool TeardownGraph() override; + void PrintLog() override; + void PrintGraph() override; + void GetPerfInfo() override; + void ResetPerfInfo() override; + void SetDebugLevel(int level) override; + + MACE_DISABLE_COPY_AND_ASSIGN(HexagonDSPWrapper); +}; +} // namespace mace + +#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_WRAPPER_H_ diff --git a/mace/core/runtime/hexagon/hexagon_hta_ops.h b/mace/core/runtime/hexagon/hexagon_hta_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..39a108609d815b2eeaf805d611b5fb4fbd69c564 --- /dev/null +++ b/mace/core/runtime/hexagon/hexagon_hta_ops.h @@ -0,0 +1,50 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_OPS_H_ +#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_OPS_H_ + +#include +#include + +#include "mace/utils/logging.h" +#include "third_party/hta/hta_hexagon_nn_ops.h" + +namespace mace { + +class OpMap { + public: + void Init() { +#define HTA_DEF_OP(NAME) op_map_[#NAME] = HTA_OP_##NAME; + +#include "third_party/hta/hta_ops.h" + +#undef HTA_DEF_OP + } + + hta_op_type GetOpId(const std::string &op_type) { + if (op_map_.find(op_type) != end(op_map_)) { + return op_map_[op_type]; + } else { + LOG(ERROR) << "HTA unsupported op type: " << op_type; + return HTA_NN_OPS_MAX; + } + } + + private: + std::unordered_map op_map_; +}; +} // namespace mace + +#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_OPS_H_ diff --git a/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc b/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..e3754f19ca8f0528e0679816cd18c0ccfbb1197a --- /dev/null +++ b/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc @@ -0,0 +1,318 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "mace/core/runtime/hexagon/hexagon_hta_ops.h" +#include "mace/core/types.h" +#include "mace/utils/memory.h" +#include "mace/utils/quantize.h" +#include "third_party/hta/hta_hexagon_api.h" + +namespace mace { + +int HexagonHTAWrapper::GetVersion() { + int version; + MACE_CHECK(hexagon_hta_nn_version(&version) == 0, "get version error"); + return version; +} + +bool HexagonHTAWrapper::Config() { + LOG(INFO) << "HTA config"; + MACE_CHECK(hexagon_hta_nn_config() == 0, "hexagon config error"); + return true; +} + +bool HexagonHTAWrapper::Init() { + LOG(INFO) << "Hexagon init"; + MACE_CHECK(hexagon_hta_nn_init(&nn_id_) == 0, "hexagon_nn_init failed"); + ResetPerfInfo(); + return true; +} + +bool HexagonHTAWrapper::Finalize() { + LOG(INFO) << "Hexagon finalize"; + return true; +} + +bool HexagonHTAWrapper::SetupGraph(const NetDef &net_def, + unsigned const char *model_data) { + LOG(INFO) << "Hexagon setup graph"; + + int64_t t0 = NowMicros(); + + // const node + for (const ConstTensor &const_tensor : net_def.tensors()) { + std::vector tensor_shape(const_tensor.dims().begin(), + const_tensor.dims().end()); + while (tensor_shape.size() < 4) { + tensor_shape.insert(tensor_shape.begin(), 1); + } + + hexagon_nn_const_node const_node; + const_node.node_id = node_id(const_tensor.node_id()); + const_node.tensor.batches = tensor_shape[0]; + const_node.tensor.height = tensor_shape[1]; + const_node.tensor.width = tensor_shape[2]; + const_node.tensor.depth = tensor_shape[3]; + + if (const_tensor.data_type() == DataType::DT_INT32 && + const_tensor.data_size() == 0) { + const_node.tensor.data = NULL; + const_node.tensor.dataLen = 0; + } else { + const_node.tensor.data = + const_cast(model_data + const_tensor.offset()); + const_node.tensor.dataLen = const_tensor.data_size() * + GetEnumTypeSize(const_tensor.data_type()); + } + + hexagon_hta_nn_append_const_node(nn_id_, + const_node.node_id, + const_node.tensor.batches, + const_node.tensor.height, + const_node.tensor.width, + const_node.tensor.depth, + const_node.tensor.data, + const_node.tensor.dataLen); + } + + // op node + OpMap op_map; + op_map.Init(); + std::vector> cached_inputs; + std::vector> cached_outputs; + std::vector inputs; + std::vector outputs; + + for (const OperatorDef &op : net_def.op()) { + hta_op_type op_id = op_map.GetOpId(op.type()); + inputs.resize(op.node_input().size()); + for (int i = 0; i < op.node_input().size(); ++i) { + inputs[i].src_id = node_id(op.node_input()[i].node_id()); + inputs[i].output_idx = op.node_input()[i].output_port(); + } + outputs.resize(op.output_shape().size()); + for (int i = 0; i < op.output_shape().size(); ++i) { + outputs[i].rank = op.output_shape()[i].dims().size(); + for (size_t j = 0; j < outputs[i].rank; ++j) { + outputs[i].max_sizes[j] = op.output_shape()[i].dims()[j]; + } + if (outputs[i].rank == 0) { + outputs[i].rank = 1; + outputs[i].max_sizes[0] = 1; + } + outputs[i].max_sizes[outputs[i].rank] = 0; + outputs[i].elementsize = 
GetEnumTypeSize( + static_cast(op.output_type()[i])); + outputs[i].zero_offset = 0; + outputs[i].stepsize = 0; + } + cached_inputs.push_back(inputs); + cached_outputs.push_back(outputs); + + auto padding_type = static_cast(op.padding()); + + hexagon_nn_op_node op_node; + op_node.node_id = node_id(op.node_id()); + op_node.operation = op_id; + op_node.padding = padding_type; + op_node.inputs = cached_inputs.back().data(); + op_node.inputsLen = inputs.size(); + op_node.outputs = cached_outputs.back().data(); + op_node.outputsLen = outputs.size(); + + hexagon_hta_nn_append_node(nn_id_, + op_node.node_id, + op_node.operation, + op_node.padding, + op_node.inputs, + op_node.inputsLen, + op_node.outputs, + op_node.outputsLen); + } + + // input info + num_inputs_ = net_def.input_info_size(); + input_info_.reserve(num_inputs_); + for (const InputOutputInfo &input_info : net_def.input_info()) { + std::vector input_shape(input_info.dims().begin(), + input_info.dims().end()); + while (input_shape.size() < 4) { + input_shape.insert(input_shape.begin(), 1); + } + input_info_.emplace_back(input_shape, + input_info.data_type(), + input_info.scale(), + input_info.zero_point(), + make_unique()); + } + + // output info + num_outputs_ = net_def.output_info_size(); + output_info_.reserve(num_outputs_); + for (const InputOutputInfo &output_info : net_def.output_info()) { + std::vector output_shape(output_info.dims().begin(), + output_info.dims().end()); + while (output_shape.size() < 4) { + output_shape.insert(output_shape.begin(), 1); + } + output_info_.emplace_back(output_shape, + output_info.data_type(), + output_info.scale(), + output_info.zero_point(), + make_unique()); + VLOG(1) << "OutputInfo: " + << "\n\t shape: " << output_shape[0] << " " << output_shape[1] + << " " << output_shape[2] << " " << output_shape[3] + << "\n\t type: " << output_info.data_type(); + } + + int64_t t1 = NowMicros(); + + MACE_CHECK(hexagon_hta_nn_prepare(nn_id_) == 0, "hexagon_nn_prepare failed"); + + int64_t t2 = NowMicros(); + + VLOG(1) << "Setup time: " << t1 - t0 << " " << t2 - t1; + + return true; +} + +bool HexagonHTAWrapper::TeardownGraph() { + LOG(INFO) << "Hexagon teardown graph"; + return hexagon_hta_nn_teardown(nn_id_) == 0; +} + +void HexagonHTAWrapper::PrintLog() { + LOG(INFO) << "Print Log"; +} + +void HexagonHTAWrapper::PrintGraph() { + LOG(INFO) << "Print Graph"; +} + +void HexagonHTAWrapper::SetDebugLevel(int level) { + LOG(INFO) << "Set debug level: " << level; + MACE_CHECK(hexagon_hta_nn_set_debug_level(nn_id_, level) == 0, + "set debug level error"); +} + +void HexagonHTAWrapper::GetPerfInfo() { + LOG(INFO) << "Get perf info"; +} + +void HexagonHTAWrapper::ResetPerfInfo() { + LOG(INFO) << "Reset perf info"; +} + +bool HexagonHTAWrapper::ExecuteGraph(const Tensor &input_tensor, + Tensor *output_tensor) { + MACE_UNUSED(input_tensor); + MACE_UNUSED(output_tensor); + MACE_NOT_IMPLEMENTED; + return false; +} + +bool HexagonHTAWrapper::ExecuteGraphNew( + const std::vector &input_tensors, + std::vector *output_tensors) { + VLOG(2) << "Execute graph new: " << nn_id_; + uint32_t num_inputs = static_cast(input_tensors.size()); + uint32_t num_outputs = static_cast(output_tensors->size()); + MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num"); + MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num"); + + std::vector inputs(num_inputs); + std::vector outputs(num_outputs); + + for (size_t i = 0; i < num_inputs; ++i) { + std::vector input_shape = input_tensors[i]->shape(); + inputs[i].batches = 
static_cast(input_shape[0]); + inputs[i].height = static_cast(input_shape[1]); + inputs[i].width = static_cast(input_shape[2]); + inputs[i].depth = static_cast(input_shape[3]); + input_info_[i].tensor_u8->SetDtype(DT_UINT8); + input_info_[i].tensor_u8->Resize(input_shape); + + const float *input_data = input_tensors[i]->data(); + uint8_t *input_data_u8 = input_info_[i].tensor_u8->mutable_data(); + QuantizeWithScaleAndZeropoint(input_data, + input_tensors[i]->size(), + input_info_[i].scale, + input_info_[i].zero_point, + input_data_u8); + + inputs[i].data = const_cast( + reinterpret_cast( + input_info_[i].tensor_u8->raw_data())); + inputs[i].dataLen = static_cast(input_info_[i].tensor_u8->raw_size()); + inputs[i].data_valid_len = static_cast( + input_info_[i].tensor_u8->raw_size()); + inputs[i].unused = 0; + } + + for (size_t i = 0; i < num_outputs; ++i) { + (*output_tensors)[i]->SetDtype(output_info_[i].data_type); + (*output_tensors)[i]->Resize(output_info_[i].shape); + output_info_[i].tensor_u8->SetDtype(DT_UINT8); + output_info_[i].tensor_u8->Resize(output_info_[i].shape); + outputs[i].data = reinterpret_cast( + output_info_[i].tensor_u8->raw_mutable_data()); + outputs[i].dataLen = + static_cast(output_info_[i].tensor_u8->raw_size()); + } + + int res = hexagon_hta_nn_execute_new(nn_id_, + inputs.data(), + num_inputs, + outputs.data(), + num_outputs); + + for (size_t i = 0; i < num_outputs; ++i) { + std::vector output_shape{ + outputs[i].batches, outputs[i].height, outputs[i].width, + outputs[i].depth}; + MACE_ASSERT(output_shape.size() == output_info_[i].shape.size(), + "wrong output shape inferred"); + for (size_t j = 0; j < output_shape.size(); ++j) { + MACE_ASSERT(static_cast(output_shape[j]) + == output_info_[i].shape[j], + "wrong output shape inferred"); + } + MACE_ASSERT(static_cast(outputs[i].data_valid_len) + == (*output_tensors)[i]->raw_size(), + "wrong output bytes inferred."); + + const uint8_t *output_data_u8 = output_info_[i].tensor_u8->data(); + float *output_data = (*output_tensors)[i]->mutable_data(); + Dequantize(output_data_u8, + output_info_[i].tensor_u8->size(), + output_info_[i].scale, + output_info_[i].zero_point, + output_data); + } + + return res == 0; +} + +} // namespace mace diff --git a/mace/core/runtime/hexagon/hexagon_hta_wrapper.h b/mace/core/runtime/hexagon/hexagon_hta_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..26ea17bde45da1853efe222e9f7d30baa25d3471 --- /dev/null +++ b/mace/core/runtime/hexagon/hexagon_hta_wrapper.h @@ -0,0 +1,51 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
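Note the division of labor in the HTA ExecuteGraphNew above: float inputs are quantized to uint8 on the host, using the per-tensor scale and zero_point carried in InputOutputInfo, before hexagon_hta_nn_execute_new runs, and uint8 outputs are dequantized back to float afterwards. A standalone sketch of that affine mapping, assuming the conventional q = round(x / scale) + zero_point form (the real helpers are QuantizeWithScaleAndZeropoint and Dequantize in mace/utils/quantize.h):

    // Sketch of per-tensor affine quantization, not the MACE implementation.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    inline uint8_t QuantizeOne(float x, float scale, int32_t zero_point) {
      // Map a float onto the uint8 grid, then clamp to [0, 255].
      int32_t q = static_cast<int32_t>(std::round(x / scale)) + zero_point;
      return static_cast<uint8_t>(std::max(0, std::min(255, q)));
    }

    inline float DequantizeOne(uint8_t q, float scale, int32_t zero_point) {
      // Inverse mapping back to float.
      return scale * (static_cast<int32_t>(q) - zero_point);
    }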
+
+#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_
+#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_
+
+#include <vector>
+
+#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
+#include "mace/core/tensor.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+
+class HexagonHTAWrapper : public HexagonControlWrapper {
+ public:
+  HexagonHTAWrapper() = default;
+
+  int GetVersion() override;
+  bool Config() override;
+  bool Init() override;
+  bool Finalize() override;
+  bool SetupGraph(const NetDef &net_def,
+                  const unsigned char *model_data) override;
+  bool ExecuteGraph(const Tensor &input_tensor,
+                    Tensor *output_tensor) override;
+  bool ExecuteGraphNew(const std::vector<Tensor *> &input_tensors,
+                       std::vector<Tensor *> *output_tensors) override;
+  bool TeardownGraph() override;
+  void PrintLog() override;
+  void PrintGraph() override;
+  void GetPerfInfo() override;
+  void ResetPerfInfo() override;
+  void SetDebugLevel(int level) override;
+
+  MACE_DISABLE_COPY_AND_ASSIGN(HexagonHTAWrapper);
+};
+}  // namespace mace
+
+#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_
diff --git a/mace/core/runtime/hexagon/quantize.cc b/mace/core/runtime/hexagon/quantize.cc
deleted file mode 100644
index 31a62288f6bf6b4cec8fd0b5692d427ca9376b94..0000000000000000000000000000000000000000
--- a/mace/core/runtime/hexagon/quantize.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
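hexagon_hta_wrapper.h above is deliberately a method-for-method twin of hexagon_dsp_wrapper.h, so MaceEngine can drive either device through one pointer type. A trimmed reconstruction of the shared base class, inferred from the two identical override lists rather than copied from hexagon_control_wrapper.h (which, judging from the .cc files in this patch, also carries protected state such as nn_id_, num_inputs_, and the input/output info vectors):

    // Inferred sketch of the common interface; see
    // mace/core/runtime/hexagon/hexagon_control_wrapper.h for the real one.
    #include <vector>

    namespace mace {

    class NetDef;   // real type: generated from mace/proto/mace.proto
    class Tensor;   // real type: mace/core/tensor.h

    class HexagonControlWrapper {
     public:
      virtual ~HexagonControlWrapper() = default;
      virtual int GetVersion() = 0;
      virtual bool Config() = 0;
      virtual bool Init() = 0;
      virtual bool Finalize() = 0;
      virtual bool SetupGraph(const NetDef &net_def,
                              const unsigned char *model_data) = 0;
      virtual bool ExecuteGraph(const Tensor &input_tensor,
                                Tensor *output_tensor) = 0;
      virtual bool ExecuteGraphNew(const std::vector<Tensor *> &inputs,
                                   std::vector<Tensor *> *outputs) = 0;
      virtual bool TeardownGraph() = 0;
      virtual void PrintLog() = 0;
      virtual void PrintGraph() = 0;
      virtual void GetPerfInfo() = 0;
      virtual void ResetPerfInfo() = 0;
      virtual void SetDebugLevel(int level) = 0;
    };

    }  // namespace mace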
- -#include - -#include "mace/core/runtime/hexagon/quantize.h" - -namespace mace { - -void Quantizer::Quantize(const Tensor &in_tensor, - Tensor *out_tensor, - float *min_out, - float *max_out) { - if (in_tensor.size() == 0) return; - const float *in_data = in_tensor.data(); - float min_in = in_data[0]; - float max_in = in_data[0]; - for (index_t i = 0; i < in_tensor.size(); ++i) { - min_in = std::min(min_in, in_data[i]); - max_in = std::max(max_in, in_data[i]); - } - Quantize(in_tensor, min_in, max_in, out_tensor, min_out, max_out); -} - -void Quantizer::Quantize(const Tensor &in_tensor, - const float min_in, - const float max_in, - Tensor *out_tensor, - float *min_out, - float *max_out) { - float stepsize; - float recip_stepsize; - QuantizeAdjustRange(min_in, max_in, min_out, max_out, &stepsize, - &recip_stepsize); - - const float *in = in_tensor.data(); - uint8_t *out = out_tensor->mutable_data(); - - for (int i = 0; i < in_tensor.size(); i++) { - const float inval = in[i]; - float ival = - static_cast((inval - *min_out) * recip_stepsize + 0.5f); - if (ival < 0) ival = 0; - if (ival > 255) ival = 255; - out[i] = static_cast(ival); - } -} - -void Quantizer::QuantizeAdjustRange(float min_in, - float max_in, - float *min_out, - float *max_out, - float *stepsize_out, - float *recip_stepsize_out) { - float minval = std::min(0.0f, min_in); - float maxval = std::max(0.0f, max_in); - float range = std::max(0.0001f, maxval - minval); - float recip_stepsize = 255.0f / range; - // make z(q0) integer - if (minval < 0.0f) { - float z = -minval * recip_stepsize; - float zi = floorf(z); - float zf = z - zi; - if (zf > 0.0001f && zf < 0.9999f) { - if (zi > 0.0f && (zi >= 254.0f || (zf - 1.0f) * minval > zf * maxval)) { - range = -255.0f * minval / zi; - maxval = minval + range; - } else { - range = 255.0f * maxval / (254.0f - zi); - minval = maxval - range; - } - recip_stepsize = 255.0f / range; - } - } - - *min_out = minval; - *max_out = maxval; - *stepsize_out = range / 255.0f; - *recip_stepsize_out = recip_stepsize; -} - -void Quantizer::DeQuantize(const Tensor &in_tensor, - const float min_in, - const float max_in, - Tensor *out_tensor) { - float range = std::max(0.0001f, max_in - min_in); - float stepsize = range / 255.0f; - - const uint8_t *in = in_tensor.data(); - float *out = out_tensor->mutable_data(); - - for (int i = 0; i < out_tensor->size(); ++i) { - out[i] = (in[i] * stepsize) + min_in; - } -} - - -} // namespace mace diff --git a/mace/core/runtime/hexagon/quantize.h b/mace/core/runtime/hexagon/quantize.h deleted file mode 100644 index f121b0d07448d9c53070d25c74aaa91a8cde7015..0000000000000000000000000000000000000000 --- a/mace/core/runtime/hexagon/quantize.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
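The Quantizer deleted above derived its quantization range from the data's own min/max and then stretched [min, max] so the zero point lands exactly on an integer step (QuantizeAdjustRange); the HTA path that replaces it uses fixed scale/zero_point values from the model instead. A standalone numeric walk-through of the deleted adjustment, with illustrative values:

    // Reproduces the deleted QuantizeAdjustRange logic on one example.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
      float minval = std::min(0.0f, -1.3f);  // input range [-1.3, 2.7]
      float maxval = std::max(0.0f, 2.7f);
      float range = std::max(0.0001f, maxval - minval);  // 4.0
      float recip_stepsize = 255.0f / range;
      float z = -minval * recip_stepsize;  // 82.875: zero is not on the grid
      float zi = std::floor(z);
      float zf = z - zi;
      if (zf > 0.0001f && zf < 0.9999f) {
        // Same branch structure as the deleted code: stretch whichever end
        // keeps the quantized zero exactly on an integer step.
        if (zi > 0.0f && (zi >= 254.0f || (zf - 1.0f) * minval > zf * maxval)) {
          range = -255.0f * minval / zi;
          maxval = minval + range;
        } else {
          range = 255.0f * maxval / (254.0f - zi);
          minval = maxval - range;
        }
      }
      // Prints roughly [-1.302907, 2.700000] with step 0.015698.
      std::printf("adjusted: [%f, %f], step %f\n",
                  minval, maxval, range / 255.0f);
      return 0;
    }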
- -#ifndef MACE_CORE_RUNTIME_HEXAGON_QUANTIZE_H_ -#define MACE_CORE_RUNTIME_HEXAGON_QUANTIZE_H_ - -#include "mace/core/tensor.h" - -namespace mace { - -class Quantizer { - public: - Quantizer() {} - ~Quantizer() {} - - void Quantize(const Tensor &in_tensor, - Tensor *out_tensor, - float *min_out, - float *max_out); - void Quantize(const Tensor &in_tensor, - const float min_in, - const float max_in, - Tensor *out_tensor, - float *min_out, - float *max_out); - void DeQuantize(const Tensor &in_tensor, - const float min_in, - const float max_in, - Tensor *out_tensor); - - private: - void QuantizeAdjustRange(float min_in, - float max_in, - float *min_out, - float *max_out, - float *stepsize, - float *recip_stepsize); - - MACE_DISABLE_COPY_AND_ASSIGN(Quantizer); -}; - -} // namespace mace - -#endif // MACE_CORE_RUNTIME_HEXAGON_QUANTIZE_H_ diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 31cd5a541ee809d53f065e9ef63c67d819963c5f..0a5f9460f1026670224dfa28738cca15486a206e 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -24,7 +24,7 @@ #include #include -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/core/kv_storage.h" #include "mace/core/runtime/opencl/opencl_extension.h" #include "mace/utils/tuner.h" @@ -273,7 +273,7 @@ OpenCLRuntime::OpenCLRuntime( gpu_type_(UNKNOWN) { std::vector all_platforms; cl::Platform::get(&all_platforms); - if (all_platforms.size() == 0) { + if (all_platforms.empty()) { LOG(ERROR) << "No OpenCL platforms found"; return; } @@ -289,7 +289,7 @@ OpenCLRuntime::OpenCLRuntime( // get default device (CPUs, GPUs) of the default platform std::vector all_devices; default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices); - if (all_devices.size() == 0) { + if (all_devices.empty()) { LOG(ERROR) << "No OpenCL devices found"; return; } diff --git a/mace/core/runtime/opencl/opencl_util.cc b/mace/core/runtime/opencl/opencl_util.cc index b190f05f4f258c27aabc0f209e271572257fb4f3..ca11414668d6e95f3d6bd70a13f48a312ea1c616 100644 --- a/mace/core/runtime/opencl/opencl_util.cc +++ b/mace/core/runtime/opencl/opencl_util.cc @@ -17,6 +17,7 @@ #include #include "mace/utils/logging.h" +#include "mace/utils/math.h" namespace mace { @@ -151,8 +152,9 @@ std::shared_ptr OpenCLUtil::CreateTransformOpDef( const std::vector &input_shape, const std::string &output_name, const mace::DataType dt, + const OpenCLBufferType buffer_type, const mace::MemoryType mem_type, - const DataFormat data_format) { + bool has_data_format) { std::unique_ptr op(new OperatorDef); std::string op_name = "mace_node_" + output_name; op->set_name(op_name); @@ -161,7 +163,7 @@ std::shared_ptr OpenCLUtil::CreateTransformOpDef( op->add_output(output_name); Argument *arg = op->add_arg(); arg->set_name("buffer_type"); - arg->set_i(static_cast(OpenCLBufferType::IN_OUT_CHANNEL)); + arg->set_i(static_cast(buffer_type)); arg = op->add_arg(); arg->set_name("mem_type"); arg->set_i(static_cast(mem_type)); @@ -169,8 +171,8 @@ std::shared_ptr OpenCLUtil::CreateTransformOpDef( arg->set_name("T"); arg->set_i(static_cast(dt)); arg = op->add_arg(); - arg->set_name("data_format"); - arg->set_i(data_format); + arg->set_name("has_data_format"); + arg->set_i(has_data_format); if (!input_shape.empty()) { OutputShape *shape = op->add_output_shape(); for (auto value : input_shape) { diff --git a/mace/core/runtime/opencl/opencl_util.h b/mace/core/runtime/opencl/opencl_util.h index 
ec399d87600dc9529c9d94f909ab6d45cd6f4a3e..ea0e239ee17c6826f23a73412ebc0a71d6dd25cf 100644 --- a/mace/core/runtime/opencl/opencl_util.h +++ b/mace/core/runtime/opencl/opencl_util.h @@ -48,8 +48,9 @@ class OpenCLUtil { const std::vector &input_shape, const std::string &output_name, const mace::DataType dt, + const OpenCLBufferType buffer_type, const MemoryType mem_type, - const DataFormat data_format); + bool has_data_format); }; } // namespace mace diff --git a/mace/core/tensor.h b/mace/core/tensor.h index ae999b05df7b7cc1df91cf4a716ea1b48da1b7e8..dc6c8f62d09cf52d2149c18e0ff9239856cbc2ac 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -97,8 +97,6 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) { } } // namespace numerical_chars -enum FilterDataFormat { HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103 }; - class Tensor { public: Tensor(Allocator *alloc, DataType type, @@ -304,10 +302,14 @@ class Tensor { if (buffer_ != nullptr) { MACE_CHECK(!has_opencl_image(), name_, ": Cannot resize image, use ResizeImage."); - if (raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE > buffer_->size()) { + const index_t apply_size = raw_size() + + ((buffer_ != &buffer_slice_) ? MACE_EXTRA_BUFFER_PAD_SIZE : 0); + if (apply_size > buffer_->size()) { LOG(WARNING) << name_ << ": Resize buffer from size " << buffer_->size() - << " to " << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE; - return buffer_->Resize(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE); + << " to " << apply_size; + MACE_CHECK(buffer_ != &buffer_slice_, + ": Cannot resize tensor with buffer slice"); + return buffer_->Resize(apply_size); } return MaceStatus::MACE_SUCCESS; } else { diff --git a/mace/core/testing/test_benchmark.cc b/mace/core/testing/test_benchmark.cc index 03442869230066d081bef599d74d277283d386f0..a7cd149579bc6d6bf875a7b993010d6243a4d49a 100644 --- a/mace/core/testing/test_benchmark.cc +++ b/mace/core/testing/test_benchmark.cc @@ -20,7 +20,7 @@ #include #include "mace/core/testing/test_benchmark.h" -#include "mace/utils/env_time.h" +#include "mace/port/env.h" #include "mace/utils/logging.h" namespace mace { diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index eb10dc89bed268fc1bd8d5772e5acac551c90d0e..8009fda180a7d186ec9e27b0c0751cd34eeb0a11 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -68,7 +68,7 @@ const Tensor *Workspace::GetTensor(const std::string &name) const { if (tensor_map_.count(name)) { return tensor_map_.at(name).get(); } else { - LOG(WARNING) << "Tensor " << name << " does not exist."; + VLOG(1) << "Tensor " << name << " does not exist."; } return nullptr; } @@ -264,31 +264,35 @@ MaceStatus Workspace::PreallocateOutputTensor( bool is_quantize_model = IsQuantizedModel(net_def); for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) { std::unique_ptr tensor - (new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.first), - tensor_mem.second.second, + (new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.mem_id), + tensor_mem.second.data_type, false, tensor_mem.first)); - if (mem_blocks[tensor_mem.second.first].mem_type() - == MemoryType::GPU_IMAGE) { - VLOG(1) << "Tensor: " << tensor_mem.first - << " Mem: " << tensor_mem.second.first - << " Data type: " << tensor->dtype() - << " Image shape: " - << tensor->UnderlyingBuffer()->shape()[0] - << ", " - << tensor->UnderlyingBuffer()->shape()[1]; - tensor->set_data_format(DataFormat::NHWC); - } else { - VLOG(1) << "Tensor: " << tensor_mem.first - << " Mem: " << tensor_mem.second.first - << " Data type: " << 
tensor->dtype() - << ", Buffer size: " << tensor->UnderlyingBuffer()->size(); - if (mem_blocks[tensor_mem.second.first].mem_type() - == MemoryType::GPU_BUFFER || - is_quantize_model) { + if (tensor_mem.second.has_data_format) { + if (mem_blocks[tensor_mem.second.mem_id].mem_type() + == MemoryType::GPU_IMAGE) { + VLOG(1) << "Tensor: " << tensor_mem.first + << " Mem: " << tensor_mem.second.mem_id + << " Data type: " << tensor->dtype() + << " Image shape: " + << tensor->UnderlyingBuffer()->shape()[0] + << ", " + << tensor->UnderlyingBuffer()->shape()[1]; tensor->set_data_format(DataFormat::NHWC); } else { - tensor->set_data_format(DataFormat::NCHW); + VLOG(1) << "Tensor: " << tensor_mem.first + << " Mem: " << tensor_mem.second.mem_id + << " Data type: " << tensor->dtype() + << ", Buffer size: " << tensor->UnderlyingBuffer()->size(); + if (mem_blocks[tensor_mem.second.mem_id].mem_type() + == MemoryType::GPU_BUFFER || + is_quantize_model) { + tensor->set_data_format(DataFormat::NHWC); + } else { + tensor->set_data_format(DataFormat::NCHW); + } } + } else { + tensor->set_data_format(DataFormat::DF_NONE); } tensor_map_[tensor_mem.first] = std::move(tensor); } diff --git a/mace/examples/android/README.md b/mace/examples/android/README.md index 5d2154901a7bd7e270fa67ec5eeaa818459b8d9c..d94a51367b3443d119515348a2111737c492dcad 100644 --- a/mace/examples/android/README.md +++ b/mace/examples/android/README.md @@ -5,7 +5,7 @@ How to build --------------- ```sh -cd mace/exampls/android +cd mace/examples/android ./build.sh dynamic # if libmace.a is needed, update `macelibrary/CMakeLists.txt` and run with `./build.sh static`. ``` diff --git a/mace/examples/cli/BUILD b/mace/examples/cli/BUILD.bazel similarity index 91% rename from mace/examples/cli/BUILD rename to mace/examples/cli/BUILD.bazel index 97e42b7df148e94bd11ab0d1f3cd7bc5470e3fd2..693009e37f0a5a49fc1ca4ffab771c67de25b7c5 100644 --- a/mace/examples/cli/BUILD +++ b/mace/examples/cli/BUILD.bazel @@ -3,6 +3,7 @@ load( "//mace:mace.bzl", "if_android", "if_hexagon_enabled", + "if_hta_enabled", "if_opencl_enabled", "if_openmp_enabled", ) @@ -33,8 +34,11 @@ cc_binary( "//mace/codegen:generated_libmace", "//mace/codegen:generated_opencl_binary", "//mace/codegen:generated_opencl_parameter", + "//mace/utils:utils_hdrs", ] + if_hexagon_enabled([ "//third_party/nnlib:libhexagon", + ]) + if_hta_enabled([ + "//third_party/hta", ]), ) @@ -63,5 +67,6 @@ cc_binary( "//mace/codegen:generated_mace_engine_factory", "//mace/codegen:generated_opencl_binary", "//mace/codegen:generated_opencl_parameter", + "//mace/utils:utils_hdrs", ], ) diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc index 97d3608116914423be21b05f02307b64a850eabd..26f615d132421011207429be6cffc516751863bb 100644 --- a/mace/examples/cli/example.cc +++ b/mace/examples/cli/example.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include #include @@ -26,7 +27,11 @@ #include #include "gflags/gflags.h" +#include "mace/port/env.h" +#include "mace/port/file_system.h" #include "mace/public/mace.h" +#include "mace/utils/logging.h" +#include "mace/utils/string_util.h" // if convert model to code. 
#ifdef MODEL_GRAPH_FORMAT_CODE #include "mace/codegen/engine/mace_engine_factory.h" @@ -45,97 +50,6 @@ size_t OpenCLParameterSize(); namespace mace { namespace examples { -namespace str_util { - -std::vector Split(const std::string &str, char delims) { - std::vector result; - std::string tmp = str; - while (!tmp.empty()) { - size_t next_offset = tmp.find(delims); - result.push_back(tmp.substr(0, next_offset)); - if (next_offset == std::string::npos) { - break; - } else { - tmp = tmp.substr(next_offset + 1); - } - } - return result; -} - -} // namespace str_util - -namespace { -bool ReadBinaryFile(std::vector *data, - const std::string &filename) { - std::ifstream ifs(filename, std::ios::in | std::ios::binary); - if (!ifs.is_open()) { - return false; - } - ifs.seekg(0, ifs.end); - size_t length = ifs.tellg(); - ifs.seekg(0, ifs.beg); - - data->reserve(length); - data->insert(data->begin(), std::istreambuf_iterator(ifs), - std::istreambuf_iterator()); - if (ifs.fail()) { - return false; - } - ifs.close(); - - return true; -} - -bool MemoryMap(const std::string &file, - const unsigned char **data, - size_t *size) { - bool ret = true; - int fd = open(file.c_str(), O_RDONLY); - if (fd < 0) { - std::cerr << "Failed to open file " << file - << ", error code: " << strerror(errno) << std::endl; - ret = false; - } - struct stat st; - fstat(fd, &st); - *size = static_cast(st.st_size); - - *data = static_cast( - mmap(nullptr, *size, PROT_READ, MAP_PRIVATE, fd, 0)); - if (*data == static_cast(MAP_FAILED)) { - std::cerr << "Failed to map file " << file - << ", error code: " << strerror(errno) << std::endl; - ret = false; - } - - if (close(fd) < 0) { - std::cerr << "Failed to close file " << file - << ", error code: " << strerror(errno) << std::endl; - ret = false; - } - - return ret; -} - -bool MemoryUnMap(const unsigned char *data, - const size_t &size) { - bool ret = true; - if (data == nullptr || size == 0) { - std::cerr << "data is null or size is 0" << std::endl; - ret = false; - } - - if (munmap(const_cast(data), size) < 0) { - std::cerr << "Failed to unmap file, error code: " - << strerror(errno) << std::endl; - ret = false; - } - - return ret; -} - -} // namespace - void ParseShape(const std::string &str, std::vector *shape) { std::string tmp = str; while (!tmp.empty()) { @@ -165,11 +79,24 @@ DeviceType ParseDeviceType(const std::string &device_str) { return DeviceType::GPU; } else if (device_str.compare("HEXAGON") == 0) { return DeviceType::HEXAGON; + } else if (device_str.compare("HTA") == 0) { + return DeviceType::HTA; } else { return DeviceType::CPU; } } +DataFormat ParseDataFormat(const std::string &data_format_str) { + if (data_format_str == "NHWC") { + return DataFormat::NHWC; + } else if (data_format_str == "NCHW") { + return DataFormat::NCHW; + } else if (data_format_str == "OIHW") { + return DataFormat::OIHW; + } else { + return DataFormat::DF_NONE; + } +} DEFINE_string(model_name, "", @@ -186,6 +113,12 @@ DEFINE_string(output_node, DEFINE_string(output_shape, "1,224,224,2:1,1,1,10", "output shapes, separated by colon and comma"); +DEFINE_string(input_data_format, + "NHWC", + "input data formats, NONE|NHWC|NCHW"); +DEFINE_string(output_data_format, + "NHWC", + "output data formats, NONE|NHWC|NCHW"); DEFINE_string(input_file, "", "input file name | input file prefix for multiple inputs."); @@ -222,8 +155,10 @@ DEFINE_int32(cpu_affinity_policy, 1, bool RunModel(const std::vector &input_names, const std::vector> &input_shapes, + const std::vector &input_data_formats, const std::vector 
&output_names, - const std::vector> &output_shapes) { + const std::vector> &output_shapes, + const std::vector &output_data_formats) { // load model DeviceType device_type = ParseDeviceType(FLAGS_device); // configuration @@ -266,16 +201,26 @@ bool RunModel(const std::vector &input_names, std::shared_ptr engine; MaceStatus create_engine_status; - std::vector model_graph_data; - if (!ReadBinaryFile(&model_graph_data, FLAGS_model_file)) { - std::cerr << "Failed to read file: " << FLAGS_model_file << std::endl; + std::unique_ptr model_graph_data; + if (FLAGS_model_file != "") { + auto fs = GetFileSystem(); + auto status = fs->NewReadOnlyMemoryRegionFromFile(FLAGS_model_file.c_str(), + &model_graph_data); + if (status != MaceStatus::MACE_SUCCESS) { + LOG(FATAL) << "Failed to read file: " << FLAGS_model_file; + } } - const unsigned char *model_weights_data = nullptr; - size_t model_weights_data_size = 0; - if (!MemoryMap(FLAGS_model_data_file, - &model_weights_data, - &model_weights_data_size)) { - std::cerr << "Failed to read file: " << FLAGS_model_data_file << std::endl; + + std::unique_ptr model_weights_data; + if (FLAGS_model_data_file != "") { + auto fs = GetFileSystem(); + auto status = fs->NewReadOnlyMemoryRegionFromFile( + FLAGS_model_data_file.c_str(), + &model_weights_data); + if (status != MaceStatus::MACE_SUCCESS) { + LOG(FATAL) << "Failed to read file: " << FLAGS_model_data_file; + } + MACE_CHECK(model_weights_data->length() > 0); } // Only choose one of the two type based on the `model_graph_format` @@ -283,24 +228,24 @@ bool RunModel(const std::vector &input_names, #ifdef MODEL_GRAPH_FORMAT_CODE // if model_data_format == code, just pass an empty string("") // to model_data_file parameter. - create_engine_status = - CreateMaceEngineFromCode(FLAGS_model_name, - model_weights_data, - model_weights_data_size, - input_names, - output_names, - config, - &engine); + create_engine_status = CreateMaceEngineFromCode( + FLAGS_model_name, + reinterpret_cast(model_weights_data->data()), + model_weights_data->length(), + input_names, + output_names, + config, + &engine); #else - create_engine_status = - CreateMaceEngineFromProto(model_graph_data.data(), - model_graph_data.size(), - model_weights_data, - model_weights_data_size, - input_names, - output_names, - config, - &engine); + create_engine_status = CreateMaceEngineFromProto( + reinterpret_cast(model_graph_data->data()), + model_graph_data->length(), + reinterpret_cast(model_weights_data->data()), + model_weights_data->length(), + input_names, + output_names, + config, + &engine); #endif if (create_engine_status != MaceStatus::MACE_SUCCESS) { @@ -324,7 +269,8 @@ bool RunModel(const std::vector &input_names, inputs_size[input_names[i]] = input_size; auto buffer_in = std::shared_ptr(new float[input_size], std::default_delete()); - inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in); + inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in, + input_data_formats[i]); } for (size_t i = 0; i < output_count; ++i) { @@ -333,7 +279,8 @@ bool RunModel(const std::vector &input_names, std::multiplies()); auto buffer_out = std::shared_ptr(new float[output_size], std::default_delete()); - outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out); + outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out, + output_data_formats[i]); } if (!FLAGS_input_dir.empty()) { @@ -430,10 +377,6 @@ bool RunModel(const std::vector &input_names, } } - if (model_weights_data != nullptr) { - 
MemoryUnMap(model_weights_data, model_weights_data_size); - } - std::cout << "Finished" << std::endl; return true; @@ -466,13 +409,10 @@ int Main(int argc, char **argv) { << FLAGS_cpu_affinity_policy << std::endl; - std::vector input_names = str_util::Split(FLAGS_input_node, ','); - std::vector output_names = - str_util::Split(FLAGS_output_node, ','); - std::vector input_shapes = - str_util::Split(FLAGS_input_shape, ':'); - std::vector output_shapes = - str_util::Split(FLAGS_output_shape, ':'); + std::vector input_names = Split(FLAGS_input_node, ','); + std::vector output_names = Split(FLAGS_output_node, ','); + std::vector input_shapes = Split(FLAGS_input_shape, ':'); + std::vector output_shapes = Split(FLAGS_output_shape, ':'); const size_t input_count = input_shapes.size(); const size_t output_count = output_shapes.size(); @@ -485,11 +425,25 @@ int Main(int argc, char **argv) { ParseShape(output_shapes[i], &output_shape_vec[i]); } + std::vector raw_input_data_formats = + Split(FLAGS_input_data_format, ','); + std::vector raw_output_data_formats = + Split(FLAGS_output_data_format, ','); + std::vector input_data_formats(input_count); + std::vector output_data_formats(output_count); + for (size_t i = 0; i < input_count; ++i) { + input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]); + } + for (size_t i = 0; i < output_count; ++i) { + output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]); + } + bool ret = false; for (int i = 0; i < FLAGS_restart_round; ++i) { std::cout << "restart round " << i << std::endl; ret = - RunModel(input_names, input_shape_vec, output_names, output_shape_vec); + RunModel(input_names, input_shape_vec, input_data_formats, + output_names, output_shape_vec, output_data_formats); } if (ret) { return 0; diff --git a/mace/libmace/BUILD b/mace/libmace/BUILD.bazel similarity index 65% rename from mace/libmace/BUILD rename to mace/libmace/BUILD.bazel index 1cecc7f60f86ca15904d40eb57188a2e42a83006..36eff0c80a76c3adb0b9e8738281974bf1aa2280 100644 --- a/mace/libmace/BUILD +++ b/mace/libmace/BUILD.bazel @@ -10,13 +10,14 @@ licenses(["notice"]) # Apache 2.0 load( "//mace:mace.bzl", "if_android", + "if_linux", + "if_darwin", "if_neon_enabled", - "if_neon_enabled_str", "if_openmp_enabled", "if_android_armv7", "if_hexagon_enabled", + "if_hta_enabled", "if_opencl_enabled", - "if_opencl_enabled_str", "if_quantize_enabled", ) @@ -40,6 +41,8 @@ cc_library( "-DMACE_ENABLE_QUANTIZE", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", + ]) + if_hta_enabled([ + "-DMACE_ENABLE_HTA", ]), deps = [ "//mace/ops", @@ -77,6 +80,7 @@ cc_library( visibility = ["//visibility:public"], ) +# For details, see https://github.com/bazelbuild/bazel/issues/5200 genrule( name = "libmace_static", srcs = [ @@ -87,10 +91,19 @@ genrule( "//mace/ops:internal_ops", "//mace/ops", "//mace/libmace", + "//mace/port:port_base", + "//mace/port/posix:port_posix", + "//mace/public", "//mace/utils", "//mace/proto:mace_cc", "@com_google_protobuf//:protobuf_lite", - ] + if_opencl_enabled([ + ] + if_android([ + "//mace/port/android:port_android", + ]) + if_linux([ + "//mace/port/linux:port_linux", + ]) + if_darwin([ + "//mace/port/darwin:port_darwin", + ]) + if_opencl_enabled([ "//mace/ops:opencl_kernels", "//mace/codegen:generated_opencl", ]) + if_neon_enabled([ @@ -103,20 +116,44 @@ genrule( "$(locations //mace/core:core) " + "$(locations //mace/ops:common) " + "$(locations //mace/ops:ref_kernels) " + - if_neon_enabled_str("$(locations //mace/ops:arm_neon_kernels) ") + - 
if_opencl_enabled_str("$(locations //mace/ops:opencl_kernels) ") + + if_neon_enabled( + "$(locations //mace/ops:arm_neon_kernels) ", + default_value = "", + ) + + if_opencl_enabled( + "$(locations //mace/ops:opencl_kernels) ", + default_value = "", + ) + "$(locations //mace/ops:internal_ops) " + "$(locations //mace/ops:ops) " + "$(locations //mace/libmace:libmace) " + + "$(locations //mace/port:port_base) " + + "$(locations //mace/port/posix:port_posix) " + + if_android( + "$(locations //mace/port/android:port_android) ", + default_value = "", + ) + + if_linux( + "$(locations //mace/port/linux:port_linux) ", + default_value = "", + ) + + if_darwin( + "$(locations //mace/port/darwin:port_darwin) ", + default_value = "", + ) + + "$(locations //mace/public:public) " + "$(locations //mace/utils:utils) " + "$(locations //mace/proto:mace_cc) " + "$(locations @com_google_protobuf//:protobuf_lite) " + - if_opencl_enabled_str("$(locations //mace/codegen:generated_opencl) ") + + if_opencl_enabled( + "$(locations //mace/codegen:generated_opencl) ", + default_value = "", + ) + "$@ " + "$$tmp_mri_file);" + "$(AR) -M <$$tmp_mri_file;" + - "rm -rf $$tmp_mri_file;" + - "$(STRIP) -x $@;", + "rm -rf $$tmp_mri_file;", + # "$(STRIP) -x $@;", # FIXME this will crash tools = ["//mace/python/tools:archive_static_lib"], visibility = ["//visibility:public"], ) diff --git a/mace/libmace/capability.cc b/mace/libmace/capability.cc index 2989cbc16f8432842858af66e7682678d7a09f2f..d37a62b6616b03bc476e7549b4e1b5d73357148d 100644 --- a/mace/libmace/capability.cc +++ b/mace/libmace/capability.cc @@ -142,14 +142,15 @@ void BMNet::SetUp() { // Add input and output information for (size_t i = 0; i < input_names_.size(); ++i) { - InputInfo *info = net_.add_input_info(); + InputOutputInfo *info = net_.add_input_info(); + info->set_data_format(DataFormat::NHWC); info->set_name(input_names_[i]); for (auto d : input_shapes_[i]) { info->add_dims(static_cast(d)); } } for (auto output_name : output_names_) { - OutputInfo *info = net_.add_output_info(); + InputOutputInfo *info = net_.add_output_info(); info->set_name(output_name); } // allocate weight data @@ -243,8 +244,8 @@ void BMNet::AddConv(const std::string &conv_type, op_def->add_output(output_name); AddIntsArg(op_def, "strides", strides); AddIntArg(op_def, "padding", padding_type); + AddIntArg(op_def, "has_data_format", 1); AddIntArg(op_def, "T", DT_HALF); - AddIntArg(op_def, "data_format", 1); if (has_relu6) { AddStringArg(op_def, "activation", "RELUX"); AddFloatArg(op_def, "max_limit", 6); @@ -270,7 +271,7 @@ void BMNet::AddEltwise(const std::string &op_name, op_def->add_output(output); AddIntArg(op_def, "type", type); AddIntArg(op_def, "T", DT_HALF); - AddIntArg(op_def, "data_format", 1); + AddIntArg(op_def, "has_data_format", 1); OutputShape *shape = op_def->add_output_shape(); for (auto dim : output_shape) { shape->add_dims(dim); diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index ce8a1cc77af08e027c91ed5c57e3b49a55ba7ada..927930fec485769b44a9df48284af3940034d9da 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -21,17 +21,21 @@ #include "mace/core/net.h" #include "mace/ops/ops_registry.h" #include "mace/ops/common/transpose.h" +#include "mace/utils/math.h" +#include "mace/utils/memory.h" +#include "mace/utils/stl_util.h" #include "mace/public/mace.h" +#include "mace/port/env.h" +#include "mace/port/file_system.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/gpu_device.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #endif 
// MACE_ENABLE_OPENCL -#ifdef MACE_ENABLE_HEXAGON -#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) #include "mace/core/runtime/hexagon/hexagon_device.h" -#endif // MACE_ENABLE_HEXAGON +#endif namespace mace { namespace { @@ -289,7 +293,10 @@ MaceTensor::MaceTensor(const std::vector &shape, std::shared_ptr data, const DataFormat format) { MACE_CHECK_NOTNULL(data.get()); - impl_ = std::unique_ptr(new MaceTensor::Impl()); + MACE_CHECK(format == DataFormat::NHWC || format == DataFormat::NCHW + || format == OIHW, + "MACE only support NHWC, NCHW and OIHW formats of input now."); + impl_ = make_unique(); impl_->shape = shape; impl_->data = data; impl_->format = format; @@ -298,11 +305,11 @@ MaceTensor::MaceTensor(const std::vector &shape, } MaceTensor::MaceTensor() { - impl_ = std::unique_ptr(new MaceTensor::Impl()); + impl_ = make_unique(); } MaceTensor::MaceTensor(const MaceTensor &other) { - impl_ = std::unique_ptr(new MaceTensor::Impl()); + impl_ = make_unique(); impl_->shape = other.shape(); impl_->data = other.data(); impl_->format = other.data_format(); @@ -310,7 +317,7 @@ MaceTensor::MaceTensor(const MaceTensor &other) { } MaceTensor::MaceTensor(const MaceTensor &&other) { - impl_ = std::unique_ptr(new MaceTensor::Impl()); + impl_ = make_unique(); impl_->shape = other.shape(); impl_->data = other.data(); impl_->format = other.data_format(); @@ -375,33 +382,31 @@ class MaceEngine::Impl { std::pair *output); private: - const unsigned char *model_data_; - size_t model_data_size_; + std::unique_ptr model_data_; std::unique_ptr op_registry_; DeviceType device_type_; std::unique_ptr device_; std::unique_ptr ws_; std::unique_ptr net_; bool is_quantized_model_; -#ifdef MACE_ENABLE_HEXAGON +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) std::unique_ptr hexagon_controller_; #endif - std::map input_info_map_; - std::map output_info_map_; + std::map input_info_map_; + std::map output_info_map_; MACE_DISABLE_COPY_AND_ASSIGN(Impl); }; MaceEngine::Impl::Impl(const MaceEngineConfig &config) : model_data_(nullptr), - model_data_size_(0), op_registry_(new OpRegistry), device_type_(config.impl_->device_type()), device_(nullptr), ws_(new Workspace()), net_(nullptr), is_quantized_model_(false) -#ifdef MACE_ENABLE_HEXAGON +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) , hexagon_controller_(nullptr) #endif { @@ -424,9 +429,9 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config) config.impl_->use_gemmlowp())); } #endif -#ifdef MACE_ENABLE_HEXAGON - if (device_type_ == DeviceType::HEXAGON) { - device_.reset(new HexagonDevice()); +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) + if (device_type_ == DeviceType::HEXAGON || device_type_ == DeviceType::HTA) { + device_.reset(new HexagonDevice(device_type_)); } #endif MACE_CHECK_NOTNULL(device_); @@ -468,6 +473,9 @@ MaceStatus MaceEngine::Impl::Init( shape[i] = input_info_map_[input_name].dims(i); } input_tensor->Resize(shape); + // Set to the default data format + input_tensor->set_data_format(static_cast( + input_info_map_[input_name].data_format())); } for (auto output_name : output_nodes) { if (output_info_map_.find(output_name) == output_info_map_.end()) { @@ -475,15 +483,17 @@ MaceStatus MaceEngine::Impl::Init( << "' does not belong to model's outputs " << MakeString(MapKeys(output_info_map_)); } +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT); +#endif } -#ifdef 
MACE_ENABLE_HEXAGON - if (device_type_ == HEXAGON) { - hexagon_controller_.reset(new HexagonControlWrapper()); +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) + if (device_type_ == HEXAGON || device_type_ == HTA) { + hexagon_controller_ = CreateHexagonControlWrapper(device_type_); MACE_CHECK(hexagon_controller_->Config(), "hexagon config error"); MACE_CHECK(hexagon_controller_->Init(), "hexagon init error"); hexagon_controller_->SetDebugLevel( - static_cast(mace::logging::LogMessage::MinVLogLevel())); + static_cast(mace::port::MinVLogLevelFromEnv())); MACE_CHECK(hexagon_controller_->SetupGraph(*net_def, model_data), "hexagon setup graph error"); if (VLOG_IS_ON(2)) { @@ -511,7 +521,7 @@ MaceStatus MaceEngine::Impl::Init( ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator()); } MACE_RETURN_IF_ERROR(net_->Init()); -#ifdef MACE_ENABLE_HEXAGON +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) } #endif @@ -525,25 +535,25 @@ MaceStatus MaceEngine::Impl::Init( const std::string &model_data_file) { LOG(INFO) << "Loading Model Data"; - MemoryMap(model_data_file, &model_data_, &model_data_size_); + auto fs = GetFileSystem(); + MACE_RETURN_IF_ERROR(fs->NewReadOnlyMemoryRegionFromFile( + model_data_file.c_str(), &model_data_)); - MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes, model_data_)); + MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes, + reinterpret_cast(model_data_->data()))); if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON || + device_type_ == DeviceType::HTA || (device_type_ == DeviceType::CPU && ws_->diffused_buffer())) { - MemoryUnMap(model_data_, model_data_size_); - model_data_ = nullptr; + model_data_.reset(); } return MaceStatus::MACE_SUCCESS; } MaceEngine::Impl::~Impl() { LOG(INFO) << "Destroying MaceEngine"; - if (model_data_ != nullptr) { - MemoryUnMap(model_data_, model_data_size_); - } -#ifdef MACE_ENABLE_HEXAGON - if (device_type_ == HEXAGON) { +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) + if (device_type_ == HEXAGON || device_type_ == HTA) { if (VLOG_IS_ON(2)) { hexagon_controller_->GetPerfInfo(); hexagon_controller_->PrintLog(); @@ -557,47 +567,51 @@ MaceEngine::Impl::~Impl() { MaceStatus MaceEngine::Impl::TransposeInput( const std::pair &input, Tensor *input_tensor) { - if (device_->device_type() == DeviceType::CPU && - input.second.shape().size() == 4 && - input.second.data_format() == NHWC && - !is_quantized_model_) { - VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW"; - input_tensor->set_data_format(DataFormat::NCHW); - std::vector dst_dims = {0, 3, 1, 2}; - std::vector output_shape = - TransposeShape(input.second.shape(), dst_dims); - MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape)); - Tensor::MappingGuard input_guard(input_tensor); - float *input_data = input_tensor->mutable_data(); - return ops::Transpose(input.second.data().get(), - input.second.shape(), - dst_dims, - input_data); - } else if ( - (is_quantized_model_ || device_->device_type() == DeviceType::GPU) && - input.second.shape().size() == 4 && - input.second.data_format() == DataFormat::NCHW) { - VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC"; - std::vector dst_dims = {0, 2, 3, 1}; - input_tensor->set_data_format(DataFormat::NHWC); - std::vector output_shape = - TransposeShape(input.second.shape(), dst_dims); - MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape)); - Tensor::MappingGuard input_guard(input_tensor); - float *input_data = 
@@ -557,47 +567,51 @@ MaceEngine::Impl::~Impl() {
 MaceStatus MaceEngine::Impl::TransposeInput(
     const std::pair<const std::string, MaceTensor> &input,
     Tensor *input_tensor) {
-  if (device_->device_type() == DeviceType::CPU &&
-      input.second.shape().size() == 4 &&
-      input.second.data_format() == NHWC &&
-      !is_quantized_model_) {
-    VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
-    input_tensor->set_data_format(DataFormat::NCHW);
-    std::vector<int> dst_dims = {0, 3, 1, 2};
-    std::vector<index_t> output_shape =
-        TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
-    MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
-    Tensor::MappingGuard input_guard(input_tensor);
-    float *input_data = input_tensor->mutable_data<float>();
-    return ops::Transpose(input.second.data().get(),
-                          input.second.shape(),
-                          dst_dims,
-                          input_data);
-  } else if (
-      (is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
-      input.second.shape().size() == 4 &&
-      input.second.data_format() == DataFormat::NCHW) {
-    VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
-    std::vector<int> dst_dims = {0, 2, 3, 1};
-    input_tensor->set_data_format(DataFormat::NHWC);
-    std::vector<index_t> output_shape =
-        TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
-    MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
-    Tensor::MappingGuard input_guard(input_tensor);
-    float *input_data = input_tensor->mutable_data<float>();
-    return ops::Transpose(input.second.data().get(),
-                          input.second.shape(),
-                          dst_dims,
-                          input_data);
-  } else {
-    input_tensor->set_data_format(input.second.data_format());
-    MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
-    Tensor::MappingGuard input_guard(input_tensor);
-    float *input_data = input_tensor->mutable_data<float>();
-    memcpy(input_data, input.second.data().get(),
-           input_tensor->size() * sizeof(float));
-    return MaceStatus::MACE_SUCCESS;
+  bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE;
+  DataFormat data_format = DataFormat::DF_NONE;
+  if (has_data_format) {
+    if (device_->device_type() == DeviceType::CPU &&
+        input.second.shape().size() == 4 &&
+        input.second.data_format() == NHWC &&
+        !is_quantized_model_) {
+      VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
+      input_tensor->set_data_format(DataFormat::NCHW);
+      std::vector<int> dst_dims = {0, 3, 1, 2};
+      std::vector<index_t> output_shape =
+          TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
+      MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
+      Tensor::MappingGuard input_guard(input_tensor);
+      float *input_data = input_tensor->mutable_data<float>();
+      return ops::Transpose(input.second.data().get(),
+                            input.second.shape(),
+                            dst_dims,
+                            input_data);
+    } else if (
+        (is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
+        input.second.shape().size() == 4 &&
+        input.second.data_format() == DataFormat::NCHW) {
+      VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
+      std::vector<int> dst_dims = {0, 2, 3, 1};
+      input_tensor->set_data_format(DataFormat::NHWC);
+      std::vector<index_t> output_shape =
+          TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
+      MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
+      Tensor::MappingGuard input_guard(input_tensor);
+      float *input_data = input_tensor->mutable_data<float>();
+      return ops::Transpose(input.second.data().get(),
+                            input.second.shape(),
+                            dst_dims,
+                            input_data);
+    }
+    data_format = input.second.data_format();
   }
+  input_tensor->set_data_format(data_format);
+  MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
+  Tensor::MappingGuard input_guard(input_tensor);
+  float *input_data = input_tensor->mutable_data<float>();
+  memcpy(input_data, input.second.data().get(),
+         input_tensor->size() * sizeof(float));
+  return MaceStatus::MACE_SUCCESS;
 }

 MaceStatus MaceEngine::Impl::TransposeOutput(
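For readers checking the permutations above: `dst_dims[i]` names the source dimension that becomes output dimension `i`, so {0, 3, 1, 2} maps NHWC to NCHW and {0, 2, 3, 1} maps back. A self-contained sketch of the shape rule (TransposeShape's exact template signature is an assumption reconstructed from its call sites):

.. code:: cpp

    #include <cstdint>
    #include <vector>

    // Output dim i takes its extent from input dim dst_dims[i].
    std::vector<int64_t> TransposeShapeSketch(const std::vector<int64_t> &shape,
                                              const std::vector<int> &dst_dims) {
      std::vector<int64_t> out(dst_dims.size());
      for (size_t i = 0; i < dst_dims.size(); ++i) {
        out[i] = shape[dst_dims[i]];
      }
      return out;  // e.g. {1, 224, 224, 3} with {0, 3, 1, 2} -> {1, 3, 224, 224}
    }

The quantized-model and GPU paths go the other way (NCHW inputs transposed to NHWC) because those runtimes expect channels-last buffers.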
@@ -605,38 +619,28 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
     std::pair<const std::string, MaceTensor> *output) {
   // save output
   if (output_tensor != nullptr && output->second.data() != nullptr) {
-    if (device_->device_type() == DeviceType::CPU &&
-        output->second.shape().size() == 4 &&
-        output->second.data_format() != output_tensor->data_format()) {
-      MACE_CHECK(output_tensor->data_format() == NCHW);
-      VLOG(1) << "Transform output " << output->first << " from NCHW to NHWC";
-      std::vector<int> dst_dims = {0, 2, 3, 1};
-      std::vector<int64_t> shape =
-          TransposeShape<index_t, int64_t>(output_tensor->shape(),
-                                           dst_dims);
-      int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
-                                            std::multiplies<int64_t>());
-      MACE_CHECK(output_size <= output->second.impl_->buffer_size)
-          << "Output size exceeds buffer size: shape"
-          << MakeString(shape) << " vs buffer size "
-          << output->second.impl_->buffer_size;
-      output->second.impl_->shape = shape;
-      Tensor::MappingGuard output_guard(output_tensor);
-      const float *output_data = output_tensor->data<float>();
-      return ops::Transpose(output_data,
-                            output_tensor->shape(),
-                            dst_dims,
-                            output->second.data().get());
-    } else if (device_->device_type() == DeviceType::GPU &&
+    if (output_tensor->data_format() != DataFormat::DF_NONE &&
+        output->second.data_format() != DataFormat::DF_NONE &&
         output->second.shape().size() == 4 &&
         output->second.data_format() != output_tensor->data_format()) {
       VLOG(1) << "Transform output " << output->first << " from "
               << output_tensor->data_format() << " to "
               << output->second.data_format();
-      std::vector<int> dst_dims = {0, 3, 1, 2};
-      if (output_tensor->data_format() == NCHW) {
+      std::vector<int> dst_dims;
+      if (output_tensor->data_format() == NCHW &&
+          output->second.data_format() == NHWC) {
         dst_dims = {0, 2, 3, 1};
+      } else if (output_tensor->data_format() == NHWC &&
+                 output->second.data_format() == NCHW) {
+        dst_dims = {0, 3, 1, 2};
+      } else {
+        LOG(FATAL) << "Not supported output data format: "
+                   << output->second.data_format() << " vs "
+                   << output_tensor->data_format();
       }
+      VLOG(1) << "Transform output " << output->first << " from "
+              << output_tensor->data_format() << " to "
+              << output->second.data_format();
       std::vector<int64_t> shape =
           TransposeShape<index_t, int64_t>(output_tensor->shape(),
                                            dst_dims);
@@ -698,15 +702,15 @@ MaceStatus MaceEngine::Impl::Run(
     Tensor *output_tensor = ws_->GetTensor(output.first);
     output_tensors.push_back(output_tensor);
   }
-#ifdef MACE_ENABLE_HEXAGON
-  if (device_type_ == HEXAGON) {
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
+  if (device_type_ == HEXAGON || device_type_ == HTA) {
     MACE_CHECK(input_tensors.size() == 1 && output_tensors.size() == 1,
                "HEXAGON not support multiple inputs and outputs yet.");
     hexagon_controller_->ExecuteGraphNew(input_tensors, &output_tensors);
   } else {
 #endif
     MACE_RETURN_IF_ERROR(net_->Run(run_metadata));
-#ifdef MACE_ENABLE_HEXAGON
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
   }
 #endif

@@ -725,7 +729,7 @@ MaceStatus MaceEngine::Impl::Run(
 }

 MaceEngine::MaceEngine(const MaceEngineConfig &config):
-    impl_(new MaceEngine::Impl(config)) {}
+    impl_(make_unique<MaceEngine::Impl>(config)) {}

 MaceEngine::~MaceEngine() = default;
diff --git a/mace/libmace/mace_version_script.lds b/mace/libmace/mace_version_script.lds
index 9b7d34538ad20417e59051420048e98998c5afd7..a088736de4d1e6c0ab07a397ae5d4164689726b7 100644
--- a/mace/libmace/mace_version_script.lds
+++ b/mace/libmace/mace_version_script.lds
@@ -7,19 +7,20 @@ mace {
     *CreateMaceEngineFromProto*;
     *GetBigLittleCoreIDs*;
     *MaceVersion*;
+    *GetCapability*;

     # api for static library of models
-    *mace*logging*LogMessage*;
+    *mace*port**;
     *mace*MaceStatus*;
     *mace*NetDef*;
     *mace*MemoryType*;
     *mace*DataType*;
-    *mace*InputInfo*;
-    *mace*OutputInfo*;
+    *mace*InputOutputInfo*;
     *mace*OutputShape*;
     *mace*OperatorDef*;
     *mace*ConstTensor*;
     *mace*Argument*;
+    *mace*Split*;
     *mace*MemoryBlock*;

     *google*protobuf*;
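Two build-level changes follow. The version script above widens the exported-symbol list: everything matching the mace::port pattern replaces the old logging symbols, and the new InputOutputInfo proto message supersedes the separate InputInfo and OutputInfo entries. The mace.bzl change below gives the select() helpers an optional default_value, so callers can supply an explicit fallback branch, e.g. if_android(["-DANDROID"], ["-DPLAIN_LINUX"]) instead of always falling back to an empty list (those flag values are illustrative, not taken from the patch).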
diff --git a/mace/mace.bzl b/mace/mace.bzl
index 2afe4560e323d2ad1cbe731832c5a918b09b177b..1f577e7e47d02f6ce23391205110687b49d1efdf 100644
--- a/mace/mace.bzl
+++ b/mace/mace.bzl
@@ -1,15 +1,21 @@
 # -*- Python -*-

-def if_android(a):
+def if_android(a, default_value = []):
     return select({
         "//mace:android": a,
-        "//conditions:default": [],
+        "//conditions:default": default_value,
     })

-def if_not_android(a):
+def if_linux(a, default_value = []):
     return select({
-        "//mace:android": [],
-        "//conditions:default": a,
+        "//mace:linux": a,
+        "//conditions:default": default_value,
+    })
+
+def if_darwin(a, default_value = []):
+    return select({
+        "//mace:darwin": a,
+        "//conditions:default": default_value,
     })

 def if_android_armv7(a):
@@ -36,16 +42,10 @@ def if_arm_linux_armhf(a):
         "//conditions:default": []
     })

-def if_neon_enabled(a):
-    return select({
-        "//mace:neon_enabled": a,
-        "//conditions:default": [],
-    })
-
-def if_neon_enabled_str(a):
+def if_neon_enabled(a, default_value = []):
     return select({
         "//mace:neon_enabled": a,
-        "//conditions:default": "",
+        "//conditions:default": default_value,
     })

 def if_hexagon_enabled(a):
@@ -60,22 +60,29 @@ def if_not_hexagon_enabled(a):
         "//conditions:default": a,
     })

-def if_openmp_enabled(a):
+def if_hta_enabled(a):
     return select({
-        "//mace:openmp_enabled": a,
+        "//mace:hta_enabled": a,
         "//conditions:default": [],
     })

-def if_opencl_enabled(a):
+def if_hexagon_or_hta_enabled(a):
     return select({
-        "//mace:opencl_enabled": a,
+        "//mace:hexagon_enabled": a,
+        "//mace:hta_enabled": a,
+        "//conditions:default": [],
+    })
+
+def if_openmp_enabled(a):
+    return select({
+        "//mace:openmp_enabled": a,
         "//conditions:default": [],
     })

-def if_opencl_enabled_str(a):
+def if_opencl_enabled(a, default_value = []):
     return select({
         "//mace:opencl_enabled": a,
-        "//conditions:default": "",
+        "//conditions:default": default_value,
     })

 def if_quantize_enabled(a):
diff --git a/mace/ops/BUILD b/mace/ops/BUILD.bazel
similarity index 94%
rename from mace/ops/BUILD
rename to mace/ops/BUILD.bazel
index 7f03ce12221a7e074e59a34cdb38f918b86ff51a..bbf5f34822b734eb6555702cc219454bcf4ec051 100644
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD.bazel
@@ -54,37 +54,17 @@ cc_library(

 cc_library(
     name = "testing",
-    srcs = glob(
-        [
-            "testing/*.cc",
-        ],
-    ),
-    hdrs = glob(
-        [
-            "testing/*.h",
-        ],
-    ),
+    hdrs = [
+        "testing/test_utils.h",
+    ],
     copts = [
         "-Werror",
         "-Wextra",
         "-Wno-missing-field-initializers",
-    ] + if_openmp_enabled([
-        "-fopenmp",
-    ]) + if_neon_enabled([
-        "-DMACE_ENABLE_NEON",
-    ]) + if_android_armv7([
-        "-mfpu=neon",
-        "-mfloat-abi=softfp",
-    ]) + if_opencl_enabled([
-        "-DMACE_ENABLE_OPENCL",
-    ]) + if_quantize_enabled([
-        "-DMACE_ENABLE_QUANTIZE",
-    ]) + if_hexagon_enabled([
-        "-DMACE_ENABLE_HEXAGON",
-    ]),
+    ],
     deps = [
         "//mace/core",
-        "@gtest//:gtest",
+        "@gtest",
     ],
 )

@@ -254,7 +234,7 @@ cc_library(
         ":arm_neon_kernels",
         ":ref_kernels",
         ":testing",
-        "@gtest//:gtest",
+        "@gtest",
     ],
     alwayslink = 1,
 )

@@ -289,7 +269,7 @@ cc_library(
         ":opencl_kernels",
         ":ref_kernels",
         ":testing",
-        "@gtest//:gtest",
+        "@gtest",
     ],
     alwayslink = 1,
 )

@@ -329,12 +309,12 @@ cc_library(
             "ops_registry.h",
             "ops_test_util.h",
             "fixpoint.h",
-            "gemmlowp_util.h",
+            "common/gemmlowp_util.h",
             "quantization_util.h",
         ],
     ) + if_quantize_enabled(glob([
         "fixpoint.h",
-        "gemmlowp_util.h",
+        "common/gemmlowp_util.h",
         "quantization_util.h",
     ])),
     copts = [
diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc
index a9e28f1e2b08d985f657d3fa10a9a431a542c9e1..29fee227df0ebac83d9a2e8c9a275a62aff8c68a 100644
--- a/mace/ops/activation.cc
+++ b/mace/ops/activation.cc
@@ -22,6 +22,7 @@
 #include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/activation.h"
 #endif // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"

 namespace mace {
 namespace ops {
@@ -88,9 +89,8 @@ class ActivationOp : public Operation {
     MemoryType mem_type;
     if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
-      kernel_.reset(
-          new opencl::image::ActivationKernel<T>(type, relux_max_limit,
-                                                 leakyrelu_coefficient));
+      kernel_ = make_unique<opencl::image::ActivationKernel<T>>(
+          type, relux_max_limit, leakyrelu_coefficient);
     } else {
       MACE_NOT_IMPLEMENTED;
     }
diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc
index cc11a0efc55fe9568c3635c5a72b54f81b60b1ac..5e387d87684d833eb40c5ebe30e564ef74bb55cd 100644
--- a/mace/ops/addn.cc
+++ b/mace/ops/addn.cc
@@ -24,6 +24,7 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/addn.h"
 #endif // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"

 namespace mace {
 namespace ops {
@@ -107,7 +108,7 @@ class AddNOp : public Operation {
   explicit AddNOp(OpConstructContext *context) : Operation(context) {
     if (context->device()->gpu_runtime()->UseImageMemory()) {
-      kernel_.reset(new opencl::image::AddNKernel<T>);
+      kernel_ = make_unique<opencl::image::AddNKernel<T>>();
     } else {
       MACE_NOT_IMPLEMENTED;
     }
diff --git a/mace/ops/arm/activation_neon.cc b/mace/ops/arm/activation_neon.cc
index 6010d71419dc9ec8f7f091281555f824e0e6e99b..09cfd8d4e0e0bd7ba09bf5f7e31c1bb57afa818b 100644
--- a/mace/ops/arm/activation_neon.cc
+++ b/mace/ops/arm/activation_neon.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/activation_neon.h b/mace/ops/arm/activation_neon.h
index a61b974b3c0dd002dece670a20381f0b9a4a4103..d640e689a2c1e91cb614826b9af1b53d7c90ef94 100644
--- a/mace/ops/arm/activation_neon.h
+++ b/mace/ops/arm/activation_neon.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/common_neon.h b/mace/ops/arm/common_neon.h
index c3451ea0e473b97b8befeb86d20a3743bdd83de9..8d28f5581c6ad43dd90fe1965e16e6ab7bec48c8 100644
--- a/mace/ops/arm/common_neon.h
+++ b/mace/ops/arm/common_neon.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/conv_2d_neon.h b/mace/ops/arm/conv_2d_neon.h
deleted file mode 100644
index 711ef2c8ecf72bad68c8577338218a36e58e140a..0000000000000000000000000000000000000000
--- a/mace/ops/arm/conv_2d_neon.h
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#ifndef MACE_OPS_ARM_CONV_2D_NEON_H_ -#define MACE_OPS_ARM_CONV_2D_NEON_H_ - -#include "mace/core/types.h" -#include "mace/ops/sgemm.h" - -namespace mace { -namespace ops { - -void Conv2dNeonK1x1S1(const float *input, - const float *filter, - const index_t batch, - const index_t height, - const index_t width, - const index_t in_channels, - const index_t out_channels, - float *output, - SGemm *sgemm, - ScratchBuffer *scratch_buffer); - -void Conv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK5x5S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK1x7S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK7x1S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK7x7S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK7x7S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK7x7S3(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK1x15S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Conv2dNeonK15x1S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -// calculate one output channel and one input channel -inline void Conv2dCPUKHxKWCalc(const float *in_ptr, - const float *filter_ptr, - const index_t in_width, - const index_t filter_height, - const index_t filter_width, - const index_t out_height, - const index_t out_width, - float *out_ptr, - const int stride) { - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { - for (int i = 0; i < filter_height; ++i) { - for (int j = 0; j < filter_width; ++j) { - out_ptr[h * out_width + w] += - in_ptr[(h * stride + i) * in_width + (w * stride + j)] * - filter_ptr[i * filter_width + j]; - } - } - } - } -} - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_CONV_2D_NEON_H_ diff --git a/mace/ops/arm/conv_2d_neon_15x1.cc b/mace/ops/arm/conv_2d_neon_15x1.cc deleted file mode 100644 index 8523e494cebf92e359b0d53c9a3e2a7ab8cc2fcb..0000000000000000000000000000000000000000 --- a/mace/ops/arm/conv_2d_neon_15x1.cc +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" -#include "mace/utils/utils.h" - -namespace mace { -namespace ops { - -inline void Conv2dCPUK15x1Calc(const float *in_ptr, - const float *filter_ptr, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t w, - const index_t tile_width, - const index_t out_image_size, - float *out_ptr, - const index_t io, - const int stride) { - for (index_t ih = 0; ih < out_height; ++ih) { - for (index_t iw = 0; iw < tile_width && w + iw < out_width; ++iw) { - for (int i = 0; i < 15; ++i) { - for (int j = 0; j < 1; ++j) { - out_ptr[io * out_image_size + ih * out_width + w + iw] += - in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)] * - filter_ptr[io * in_channels * 15 + i * 1 + j]; - } - } - } - } -} - -// Ho = 4, Wo = 1, Co = 1 -void Conv2dNeonK15x1S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - const index_t tile_width = - out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3]; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; ++m) { - for (index_t w = 0; w < out_shape[3]; w += tile_width) { - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - float *out_ptr_base = output + b * out_batch_size + m * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + m * in_channels * 15 + c * 15; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) - /* load filter (1 outch x 4 height x 1 width) */ - float32x4_t vf0, vf1, vf2, vf3; - vf0 = vld1q_f32(filter_ptr); - vf1 = vld1q_f32(filter_ptr + 4); - vf2 = vld1q_f32(filter_ptr + 8); - vf3 = vld1q_f32(filter_ptr + 11); - - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) { - // load output - index_t out_offset = h * out_width + w + wt; - // output (1 outch x 4 height x 1 width): vo_outch_height - float32x4_t vo = {out_ptr_base[out_offset], - out_ptr_base[out_offset + out_width], - out_ptr_base[out_offset + 2 * out_width], - out_ptr_base[out_offset + 3 * out_width]}; - - // input offset - index_t in_offset = h * in_width + w + wt; - // input (3 slide) - float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width], - in_ptr_base[in_offset + 10 * in_width], - in_ptr_base[in_offset + 11 * in_width]}; - float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], - in_ptr_base[in_offset + 13 * in_width], - in_ptr_base[in_offset + 14 * in_width], - in_ptr_base[in_offset + 15 * in_width]}; - float32x4_t vi16 = 
{in_ptr_base[in_offset + 16 * in_width], - in_ptr_base[in_offset + 17 * in_width]}; - float32x4_t vi1 = vextq_f32(vi0, vi4, 1); - float32x4_t vi2 = vextq_f32(vi0, vi4, 2); - float32x4_t vi3 = vextq_f32(vi0, vi4, 3); - float32x4_t vi5 = vextq_f32(vi4, vi8, 1); - float32x4_t vi6 = vextq_f32(vi4, vi8, 2); - float32x4_t vi7 = vextq_f32(vi4, vi8, 3); - float32x4_t vi9 = vextq_f32(vi8, vi12, 1); - float32x4_t vi10 = vextq_f32(vi8, vi12, 2); - float32x4_t vi11 = vextq_f32(vi8, vi12, 3); - float32x4_t vi13 = vextq_f32(vi12, vi16, 1); - float32x4_t vi14 = vextq_f32(vi12, vi16, 2); - - vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); - vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); - vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); - vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); - vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); - - out_ptr_base[out_offset] = vo[0]; - out_ptr_base[out_offset + out_width] = vo[1]; - out_ptr_base[out_offset + 2 * out_width] = vo[2]; - out_ptr_base[out_offset + 3 * out_width] = vo[3]; - } // wt - } // h -#else - Conv2dCPUK15x1Calc(in_ptr_base, filter_ptr, in_width, in_channels, - out_height, out_width, w, tile_width, - out_image_size, out_ptr_base, 0, 1); -#endif - } // c - } // w - } // m - } // b -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/conv_2d_neon_1x1.cc b/mace/ops/arm/conv_2d_neon_1x1.cc deleted file mode 100644 index 819f5f334f466508f3e7b1affae07a2a156ea358..0000000000000000000000000000000000000000 --- a/mace/ops/arm/conv_2d_neon_1x1.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/arm/conv_2d_neon.h" - -namespace mace { -namespace ops { - -void Conv2dNeonK1x1S1(const float *input, - const float *filter, - const index_t batch, - const index_t height, - const index_t width, - const index_t in_channels, - const index_t out_channels, - float *output, - SGemm *sgemm, - ScratchBuffer *scratch_buffer) { - for (index_t b = 0; b < batch; ++b) { - sgemm->Run(filter, - input + b * in_channels * height * width, - 1, - out_channels, - in_channels, - in_channels, - height * width, - false, - false, - true, - false, - output + b * out_channels * height * width, - scratch_buffer); - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/conv_2d_neon_1x15.cc b/mace/ops/arm/conv_2d_neon_1x15.cc deleted file mode 100644 index 33b9abbfebc2c921423b15288012487038d2b370..0000000000000000000000000000000000000000 --- a/mace/ops/arm/conv_2d_neon_1x15.cc +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" -#include "mace/utils/logging.h" -#include "mace/utils/utils.h" - -namespace mace { -namespace ops { - -inline void Conv2dCPUK1x15Calc(const float *in_ptr, - const float *filter_ptr, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t h, - const index_t tile_height, - const index_t out_width, - const index_t out_image_size, - float *out_ptr, - const index_t io, - const int stride) { - for (index_t ih = 0; ih < tile_height && h + ih < out_height; ++ih) { - for (index_t iw = 0; iw < out_width; ++iw) { - for (int i = 0; i < 1; ++i) { - for (int j = 0; j < 15; ++j) { - out_ptr[io * out_image_size + (h + ih) * out_width + iw] += - in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)] * - filter_ptr[io * in_channels * 15 + i * 15 + j]; - } - } - } - } -} - -// Ho = 1, Wo = 4, Co = 1 -void Conv2dNeonK1x15S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - const index_t tile_height = - out_shape[1] < 4 ? 
RoundUpDiv4(out_shape[2]) : out_shape[2]; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; ++m) { - for (index_t h = 0; h < out_shape[2]; h += tile_height) { - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - float *out_ptr_base = output + b * out_batch_size + m * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + m * in_channels * 15 + c * 15; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) - /* load filter (1 outch x 4 height x 1 width) */ - float32x4_t vf0, vf1, vf2, vf3; - vf0 = vld1q_f32(filter_ptr); - vf1 = vld1q_f32(filter_ptr + 4); - vf2 = vld1q_f32(filter_ptr + 8); - vf3 = vld1q_f32(filter_ptr + 11); - - for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo; - // load output - index_t out_offset = (h + ht) * out_width + w; - vo = vld1q_f32(out_ptr_base + out_offset); - - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, - vi10, vi11, vi12, vi13, vi14, vi16; - // input offset - index_t in_offset = (h + ht) * in_width + w; - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi8 = vld1q_f32(in_ptr_base + in_offset + 8); - vi12 = vld1q_f32(in_ptr_base + in_offset + 12); - vi16 = vld1q_f32(in_ptr_base + in_offset + 16); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = vextq_f32(vi0, vi4, 3); - vi5 = vextq_f32(vi4, vi8, 1); - vi6 = vextq_f32(vi4, vi8, 2); - vi7 = vextq_f32(vi4, vi8, 3); - vi9 = vextq_f32(vi8, vi12, 1); - vi10 = vextq_f32(vi8, vi12, 2); - vi11 = vextq_f32(vi8, vi12, 3); - vi13 = vextq_f32(vi12, vi16, 1); - vi14 = vextq_f32(vi12, vi16, 2); - - vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); - vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); - vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); - vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); - vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); - - vst1q_f32(out_ptr_base + out_offset, vo); - } // w - } // ht -#else - Conv2dCPUK1x15Calc(in_ptr_base, filter_ptr, in_width, in_channels, - out_height, h, tile_height, out_width, - out_image_size, out_ptr_base, 0, 1); -#endif - } // c - } // h - } // m - } // b -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/conv_2d_neon_1x7.cc b/mace/ops/arm/conv_2d_neon_1x7.cc deleted file mode 100644 index e5e249d39b3b51e4c0525d4c6777520c1ff4d846..0000000000000000000000000000000000000000 --- a/mace/ops/arm/conv_2d_neon_1x7.cc +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" - -namespace mace { -namespace ops { - -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK1x7S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) - float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7; -#if defined(MACE_ENABLE_NEON) - const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7; - const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7; - const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7; - /* load filter (4 outch x 1 height x 4 width) */ - float32x4_t vf00, vf01; - float32x4_t vf10, vf11; - float32x4_t vf20, vf21; - float32x4_t vf30, vf31; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - vf10 = vld1q_f32(filter_ptr1); - vf11 = vld1q_f32(filter_ptr1 + 3); - vf20 = vld1q_f32(filter_ptr2); - vf21 = vld1q_f32(filter_ptr2 + 3); - vf30 = vld1q_f32(filter_ptr3); - vf31 = vld1q_f32(filter_ptr3 + 3); - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // output (4 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0, vo1, vo2, vo3; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - vo1 = vld1q_f32(out_ptr1_base + out_offset); - vo2 = vld1q_f32(out_ptr2_base + out_offset); - vo3 = vld1q_f32(out_ptr3_base + out_offset); - - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; - // input offset - index_t in_offset = h * in_width + w; - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi8 = vld1q_f32(in_ptr_base + in_offset + 8); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = 
vextq_f32(vi0, vi4, 3); - vi5 = vextq_f32(vi4, vi8, 1); - vi6 = vextq_f32(vi4, vi8, 2); - -#if defined(__aarch64__) - /* outch 0 */ - vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); - vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); - vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); - vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); - vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); - vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); - vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); - /* outch 1 */ - vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); - vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); - vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); - vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); - vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); - vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); - vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); - /* outch 2 */ - vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); - vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); - vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); - vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); - vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); - vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); - vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); - /* outch 3 */ - vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); - vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); - vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); - vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); - vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); - vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); - vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); -#else - /* outch 0 */ - vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); - vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); - vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); - /* outch 1 */ - vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); - vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); - vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); - /* outch 2 */ - vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); - vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); - vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); - /* outch 3 */ - vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); - vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); - vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); -#endif - - vst1q_f32(out_ptr0_base + out_offset, vo0); - vst1q_f32(out_ptr1_base + out_offset, vo1); - vst1q_f32(out_ptr2_base + out_offset, vo2); - vst1q_f32(out_ptr3_base + out_offset, vo3); - } // w - } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7, - in_width, 1, 7, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } -#endif - } // c - } else { - for (index_t 
mm = m; mm < out_channels; ++mm) { - float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 7 + c * 7; -#if defined(MACE_ENABLE_NEON) - /* load filter (1 outch x 1 height x 4 width) */ - float32x4_t vf00, vf01; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; - // input offset - index_t in_offset = h * in_width + w; - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi8 = vld1q_f32(in_ptr_base + in_offset + 8); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = vextq_f32(vi0, vi4, 3); - vi5 = vextq_f32(vi4, vi8, 1); - vi6 = vextq_f32(vi4, vi8, 2); - -#if defined(__aarch64__) - vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); - vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); - vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); - vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); - vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); - vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); - vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); -#else - vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); - vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); - vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); -#endif - - vst1q_f32(out_ptr0_base + out_offset, vo0); - } // w - } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 1, 7, - out_height, out_width, out_ptr0_base, 1); -#endif - } // c - } - } // if - } // m - } // b -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/conv_2d_neon_7x1.cc b/mace/ops/arm/conv_2d_neon_7x1.cc deleted file mode 100644 index 7aa9309bfd605faa51a833e99eb0c15dd06ded3a..0000000000000000000000000000000000000000 --- a/mace/ops/arm/conv_2d_neon_7x1.cc +++ /dev/null @@ -1,291 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" - -namespace mace { -namespace ops { - -// Ho = 4, Wo = 1, Co = 4 -void Conv2dNeonK7x1S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) - float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7; -#if defined(MACE_ENABLE_NEON) - const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7; - const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7; - const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7; - /* load filter (4 outch x 4 height x 1 width) */ - float32x4_t vf00, vf01; - float32x4_t vf10, vf11; - float32x4_t vf20, vf21; - float32x4_t vf30, vf31; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - vf10 = vld1q_f32(filter_ptr1); - vf11 = vld1q_f32(filter_ptr1 + 3); - vf20 = vld1q_f32(filter_ptr2); - vf21 = vld1q_f32(filter_ptr2 + 3); - vf30 = vld1q_f32(filter_ptr3); - vf31 = vld1q_f32(filter_ptr3 + 3); - - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t w = 0; w < out_width; ++w) { - // load output - index_t out_offset = h * out_width + w; - // output (4 outch x 4 height x 1 width): vo_outch_height - float32x4_t vo0 = {out_ptr0_base[out_offset], - out_ptr0_base[out_offset + out_width], - out_ptr0_base[out_offset + 2 * out_width], - out_ptr0_base[out_offset + 3 * out_width]}; - float32x4_t vo1 = {out_ptr1_base[out_offset], - out_ptr1_base[out_offset + out_width], - out_ptr1_base[out_offset + 2 * out_width], - out_ptr1_base[out_offset + 3 * out_width]}; - float32x4_t vo2 = {out_ptr2_base[out_offset], - out_ptr2_base[out_offset + out_width], - out_ptr2_base[out_offset + 2 * out_width], - out_ptr2_base[out_offset + 3 * out_width]}; - float32x4_t vo3 = {out_ptr3_base[out_offset], - out_ptr3_base[out_offset + out_width], - out_ptr3_base[out_offset + 2 * out_width], - out_ptr3_base[out_offset + 3 * out_width]}; - - // input offset - index_t in_offset = h * in_width + w; - // input (3 slide) - float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - 
float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width]}; - float32x4_t vi1 = vextq_f32(vi0, vi4, 1); - float32x4_t vi2 = vextq_f32(vi0, vi4, 2); - float32x4_t vi3 = vextq_f32(vi0, vi4, 3); - float32x4_t vi5 = vextq_f32(vi4, vi8, 1); - float32x4_t vi6 = vextq_f32(vi4, vi8, 2); - -#if defined(__aarch64__) - /* outch 0 */ - vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); - vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); - vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); - vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); - vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); - vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); - vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); - /* outch 1 */ - vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); - vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); - vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); - vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); - vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); - vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); - vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); - /* outch 2 */ - vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); - vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); - vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); - vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); - vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); - vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); - vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); - /* outch 3 */ - vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); - vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); - vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); - vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); - vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); - vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); - vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); -#else - /* outch 0 */ - vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); - vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); - vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); - /* outch 1 */ - vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); - vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); - vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); - /* outch 2 */ - vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); - vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); - vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); - /* outch 3 */ - vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); - vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); - vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); -#endif - - out_ptr0_base[out_offset] = vo0[0]; - out_ptr0_base[out_offset + out_width] = vo0[1]; - out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; - out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; - 
out_ptr1_base[out_offset] = vo1[0]; - out_ptr1_base[out_offset + out_width] = vo1[1]; - out_ptr1_base[out_offset + 2 * out_width] = vo1[2]; - out_ptr1_base[out_offset + 3 * out_width] = vo1[3]; - out_ptr2_base[out_offset] = vo2[0]; - out_ptr2_base[out_offset + out_width] = vo2[1]; - out_ptr2_base[out_offset + 2 * out_width] = vo2[2]; - out_ptr2_base[out_offset + 3 * out_width] = vo2[3]; - out_ptr3_base[out_offset] = vo3[0]; - out_ptr3_base[out_offset + out_width] = vo3[1]; - out_ptr3_base[out_offset + 2 * out_width] = vo3[2]; - out_ptr3_base[out_offset + 3 * out_width] = vo3[3]; - } // w - } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7, - in_width, 7, 1, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } -#endif - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { - float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 7 + c * 7; -#if defined(MACE_ENABLE_NEON) - /* load filter (1 outch x 4 height x 1 width) */ - float32x4_t vf00, vf01; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t w = 0; w < out_width; ++w) { - // load output - index_t out_offset = h * out_width + w; - // output (1 outch x 4 height x 1 width): vo_outch_height - float32x4_t vo0 = {out_ptr0_base[out_offset], - out_ptr0_base[out_offset + out_width], - out_ptr0_base[out_offset + 2 * out_width], - out_ptr0_base[out_offset + 3 * out_width]}; - - // input offset - index_t in_offset = h * in_width + w; - // input (3 slide) - float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width], - in_ptr_base[in_offset + 10 * in_width], - in_ptr_base[in_offset + 11 * in_width]}; - float32x4_t vi1 = vextq_f32(vi0, vi4, 1); - float32x4_t vi2 = vextq_f32(vi0, vi4, 2); - float32x4_t vi3 = vextq_f32(vi0, vi4, 3); - float32x4_t vi5 = vextq_f32(vi4, vi8, 1); - float32x4_t vi6 = vextq_f32(vi4, vi8, 2); - -#if defined(__aarch64__) - vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); - vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); - vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); - vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); - vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); - vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); - vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); -#else - vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); - vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); - vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); -#endif - - out_ptr0_base[out_offset] = vo0[0]; - out_ptr0_base[out_offset + out_width] = vo0[1]; - out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; - out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; - } // w - } // h -#else - 
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 1, - out_height, out_width, out_ptr0_base, 1); -#endif - } // c - } - } // if - } // m - } // b -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/conv_winograd.h b/mace/ops/arm/conv_winograd.h deleted file mode 100644 index 396d1870b96a4565e56ea5d48faf3e46d616a4da..0000000000000000000000000000000000000000 --- a/mace/ops/arm/conv_winograd.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_ARM_CONV_WINOGRAD_H_ -#define MACE_OPS_ARM_CONV_WINOGRAD_H_ - -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) -#include -#endif - -#include "mace/core/types.h" -#include "mace/ops/sgemm.h" - -namespace mace { -namespace ops { - -void TransformFilter4x4(const float *filter, - const index_t in_channels, - const index_t out_channels, - float *output); - -void TransformFilter8x8(const float *filter, - const index_t in_channels, - const index_t out_channels, - float *output); - -void WinoGradConv3x3s1(const float *input, - const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_channels, - const int out_tile_size, - float *output, - SGemm *sgemm, - ScratchBuffer *scratch_buffer); - -void WinoGradConv3x3s1(const float *input, - const float *transformed_filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_channels, - const int out_tile_size, - float *transformed_input, - float *transformed_output, - float *output, - SGemm *sgemm, - ScratchBuffer *scratch_buffer); - -void ConvRef3x3s1(const float *input, - const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_channels, - float *output); - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_CONV_WINOGRAD_H_ diff --git a/mace/ops/arm/conv_winograd_test.cc b/mace/ops/arm/conv_winograd_test.cc deleted file mode 100644 index 4f28472d5199dcb2f72667e30da10db82c0ba7d2..0000000000000000000000000000000000000000 --- a/mace/ops/arm/conv_winograd_test.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include - -#include "mace/core/tensor.h" -#include "mace/core/types.h" -#include "mace/ops/arm/conv_winograd.h" - -namespace mace { -namespace ops { - -TEST(ConvWinogradTest, winograd) { - index_t batch = 1; - index_t in_height = 32; - index_t in_width = 32; - index_t in_channels = 64; - index_t out_channels = 128; - - index_t out_height = in_height - 2; - index_t out_width = in_width - 2; - index_t input_size = batch * in_channels * in_height * in_width; - index_t filter_size = 3 * 3 * in_channels * out_channels; - index_t output_size = batch * out_channels * out_height * out_width; - - Tensor input(GetCPUAllocator(), DataType::DT_FLOAT); - Tensor filter(GetCPUAllocator(), DataType::DT_FLOAT); - Tensor output(GetCPUAllocator(), DataType::DT_FLOAT); - Tensor output_ref(GetCPUAllocator(), DataType::DT_FLOAT); - - input.Resize({batch, in_channels, in_height, in_width}); - filter.Resize({out_channels, in_channels, 3, 3}); - output.Resize({batch, out_channels, out_height, out_width}); - output_ref.Resize({batch, out_channels, out_height, out_width}); - - float *input_data = input.mutable_data(); - float *filter_data = filter.mutable_data(); - float *output_data = output.mutable_data(); - float *output_data_ref = output.mutable_data(); - - std::random_device rd; - std::mt19937 gen(rd()); - std::normal_distribution nd(0, 1); - std::generate(input_data, input_data + input_size, [&gen, &nd] { - return std::max(-1.0f, std::min(1.0f, nd(gen))); - }); - std::generate(filter_data, filter_data + filter_size, [&gen, &nd] { - return std::max(-1.0f, std::min(1.0f, nd(gen))); - }); - - ops::ConvRef3x3s1(input_data, filter_data, batch, in_height, in_width, - in_channels, out_channels, output_data_ref); - - SGemm sgemm; - ops::WinoGradConv3x3s1(input_data, filter_data, batch, in_height, - in_width, in_channels, out_channels, 6, - output_data, &sgemm, nullptr); - - // test - for (index_t i = 0; i < output_size; ++i) { - EXPECT_NEAR(output_data_ref[i], output_data[i], 0.1) << " with index " << i; - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/deconv_2d_neon.h b/mace/ops/arm/deconv_2d_neon.h index 62e3e9199b00345a8e41751bfb1b165e96cdd634..f45fa923bdd19c6420a4ab0e6b751541ce3b1f76 100644 --- a/mace/ops/arm/deconv_2d_neon.h +++ b/mace/ops/arm/deconv_2d_neon.h @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/mace/ops/arm/deconv_2d_neon_2x2.cc b/mace/ops/arm/deconv_2d_neon_2x2.cc index 74ddbecc48c367c07692e43b6260ece23aee6abb..674864c8b6527631d4d5800a9e892bc662826bc7 100644 --- a/mace/ops/arm/deconv_2d_neon_2x2.cc +++ b/mace/ops/arm/deconv_2d_neon_2x2.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/ops/arm/deconv_2d_neon.h" namespace mace { diff --git a/mace/ops/arm/deconv_2d_neon_3x3.cc b/mace/ops/arm/deconv_2d_neon_3x3.cc index 356680949af572838a070c47f91a69427751d596..04f62325817f5a02919ea859c3e5c5ba4a974f40 100644 --- a/mace/ops/arm/deconv_2d_neon_3x3.cc +++ b/mace/ops/arm/deconv_2d_neon_3x3.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/ops/arm/deconv_2d_neon.h" namespace mace { diff --git a/mace/ops/arm/deconv_2d_neon_4x4.cc b/mace/ops/arm/deconv_2d_neon_4x4.cc index a023154aec04c94ff4dcc77767999522a87a0368..443a188f322c448c6e8bf36b14b3babc91725cf4 100644 --- a/mace/ops/arm/deconv_2d_neon_4x4.cc +++ b/mace/ops/arm/deconv_2d_neon_4x4.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/ops/arm/deconv_2d_neon.h" namespace mace { diff --git a/mace/ops/arm/depthwise_conv2d_neon.h b/mace/ops/arm/depthwise_conv2d_neon.h index a4973ed59e0d31b4dfd97359e0cf3c99b3377c31..b610178c54fd097beb92bf0135d152cd4a96ed29 100644 --- a/mace/ops/arm/depthwise_conv2d_neon.h +++ b/mace/ops/arm/depthwise_conv2d_neon.h @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/mace/ops/arm/depthwise_conv2d_neon_3x3.cc b/mace/ops/arm/depthwise_conv2d_neon_3x3.cc index 652d0231bdc60c6f3b53c65b3a94131e7de47d15..ced509e0d87d796b7ff2ecedc5ae187a926502af 100644 --- a/mace/ops/arm/depthwise_conv2d_neon_3x3.cc +++ b/mace/ops/arm/depthwise_conv2d_neon_3x3.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #include #endif -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/ops/arm/depthwise_conv2d_neon.h" namespace mace { diff --git a/mace/ops/arm/depthwise_deconv2d_neon.h b/mace/ops/arm/depthwise_deconv2d_neon.h index 70f2bb40545cde307ff1c8f75e69607bf6864486..8df6dba15bd61d22054f0d0ecac2b35bd060ec76 100644 --- a/mace/ops/arm/depthwise_deconv2d_neon.h +++ b/mace/ops/arm/depthwise_deconv2d_neon.h @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
index 404c903d30d0ca30695c94d889f9346764967c64..6bba47c280bfb1fe22055c7440e9180b6afdc98e 100644
--- a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
+++ b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/macros.h"
+#include "mace/utils/macros.h"
 #include "mace/ops/arm/depthwise_deconv2d_neon.h"
 
 namespace mace {
diff --git a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
index 1b59264e600c064f76dddfbcf3b6b4ec83d535a2..677eb152bb5f7d984a9f7bd003bcbf0e42a1da1f 100644
--- a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
+++ b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/macros.h"
+#include "mace/utils/macros.h"
 #include "mace/ops/arm/deconv_2d_neon.h"
 
 namespace mace {
diff --git a/mace/ops/arm/fp32/conv_2d.cc b/mace/ops/arm/fp32/conv_2d.cc
new file mode 100644
index 0000000000000000000000000000000000000000..799ee521b83dc22b3e192dc364f486b929b7df7f
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_2d.cc
@@ -0,0 +1,247 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/utils/memory.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+void Conv2dBase::CalOutputShapeAndPadSize(const Tensor *input,
+                                          const Tensor *filter,
+                                          const int out_tile_height,
+                                          const int out_tile_width,
+                                          std::vector<index_t> *output_shape,
+                                          std::vector<int> *in_pad_size,
+                                          std::vector<int> *out_pad_size) {
+  in_pad_size->resize(4);
+  out_pad_size->resize(4);
+  output_shape->resize(4);
+
+  const index_t in_height = input->dim(2);
+  const index_t in_width = input->dim(3);
+
+  const index_t stride_h = strides_[0];
+  const index_t stride_w = strides_[1];
+  const index_t dilation_h = dilations_[0];
+  const index_t dilation_w = dilations_[1];
+  const index_t filter_h = filter->dim(2);
+  const index_t filter_w = filter->dim(3);
+
+  std::vector<int> paddings(2);
+  if (paddings_.empty()) {
+    CalcNCHWPaddingAndOutputSize(input->shape().data(),
+                                 filter->shape().data(),
+                                 dilations_.data(),
+                                 strides_.data(),
+                                 padding_type_,
+                                 output_shape->data(),
+                                 paddings.data());
+  } else {
+    paddings = paddings_;
+    CalcNCHWOutputSize(input->shape().data(),
+                       filter->shape().data(),
+                       paddings_.data(),
+                       dilations_.data(),
+                       strides_.data(),
+                       RoundType::FLOOR,
+                       output_shape->data());
+  }
+  const index_t out_height = (*output_shape)[2];
+  const index_t out_width = (*output_shape)[3];
+  const index_t
+      padded_out_height = RoundUp(out_height, out_tile_height);
+  const index_t padded_out_width = RoundUp(out_width, out_tile_width);
+  const index_t padded_in_height =
+      std::max(in_height + paddings[0], (padded_out_height - 1) * stride_h
+          + (filter_h - 1) * dilation_h + 1);
+  const index_t padded_in_width =
+      std::max(in_width + paddings[1], (padded_out_width - 1) * stride_w
+          + (filter_w - 1) * dilation_w + 1);
+
+  (*in_pad_size)[0] = paddings[0] >> 1;
+  (*in_pad_size)[1] =
+      static_cast<int>(padded_in_height - in_height - (*in_pad_size)[0]);
+  (*in_pad_size)[2] = paddings[1] >> 1;
+  (*in_pad_size)[3] =
+      static_cast<int>(padded_in_width - in_width - (*in_pad_size)[2]);
+
+  (*out_pad_size)[0] = 0;
+  (*out_pad_size)[1] = static_cast<int>(padded_out_height - out_height);
+  (*out_pad_size)[2] = 0;
+  (*out_pad_size)[3] = static_cast<int>(padded_out_width - out_width);
+}
+
+MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
+                                            const Tensor *input,
+                                            const Tensor *filter,
+                                            Tensor *output,
+                                            const int out_tile_height,
+                                            const int out_tile_width,
+                                            std::unique_ptr<const Tensor>
+                                            *padded_input,
+                                            std::unique_ptr<Tensor>
+                                            *padded_output) {
+  std::vector<index_t> output_shape;
+  std::vector<int> in_pad_size;
+  std::vector<int> out_pad_size;
+  CalOutputShapeAndPadSize(input,
+                           filter,
+                           out_tile_height,
+                           out_tile_width,
+                           &output_shape,
+                           &in_pad_size,
+                           &out_pad_size);
+  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+  const index_t batch = input->dim(0);
+  const index_t in_channels = input->dim(1);
+  const index_t in_height = input->dim(2);
+  const index_t in_width = input->dim(3);
+  const index_t out_channels = output->dim(1);
+  const index_t out_height = output->dim(2);
+  const index_t out_width = output->dim(3);
+
+  const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1];
+  const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3];
+  const index_t
+      padded_out_height = out_height + out_pad_size[0] + out_pad_size[1];
+  const index_t
+      padded_out_width = out_width + out_pad_size[2] + out_pad_size[3];
+  const bool is_in_padded =
+      padded_in_height != in_height || padded_in_width != in_width;
+  const bool is_out_padded =
+      padded_out_height != out_height || padded_out_width != out_width;
+
+  auto scratch_buffer = context->device()->scratch_buffer();
+  const index_t padded_in_size =
+      MACE_EXTRA_BUFFER_PAD_SIZE + (is_in_padded ? PadAlignSize(
+          sizeof(float) * batch * in_channels * padded_in_height
+              * padded_in_width) : 0);
+  const index_t padded_out_size = is_out_padded ? PadAlignSize(
+      sizeof(float) * batch * out_channels * padded_out_height
+          * padded_out_width) : 0;
+
+  scratch_buffer->Rewind();
+  scratch_buffer->GrowSize(padded_in_size + padded_out_size);
+  if (is_in_padded) {
+    std::unique_ptr<Tensor>
+        padded_in =
+        make_unique<Tensor>(scratch_buffer->Scratch(padded_in_size),
+                            DataType::DT_FLOAT);
+    padded_in->Resize({batch, in_channels, padded_in_height, padded_in_width});
+    PadInput(*input, in_pad_size[0], in_pad_size[2], padded_in.get());
+    *padded_input = std::move(padded_in);
+  }
+  if (is_out_padded) {
+    std::unique_ptr<Tensor>
+        padded_out = make_unique<Tensor>(scratch_buffer->Scratch(padded_out_size),
+                                         DataType::DT_FLOAT);
+    padded_out->Resize({batch, out_channels, padded_out_height,
+                        padded_out_width});
+    *padded_output = std::move(padded_out);
+  }
+  return MaceStatus::MACE_SUCCESS;
+}
+
+void Conv2dBase::PadInput(const Tensor &src,
+                          const int pad_top,
+                          const int pad_left,
+                          mace::Tensor *dst) {
+  if (dst == &src) return;
+  const index_t batch = src.dim(0);
+  const index_t channels = src.dim(1);
+  const index_t height = src.dim(2);
+  const index_t width = src.dim(3);
+  const index_t padded_height = dst->dim(2);
+  const index_t padded_width = dst->dim(3);
+  const int pad_bottom = static_cast<int>(padded_height - height - pad_top);
+  const int pad_right = static_cast<int>(padded_width - width - pad_left);
+  auto in_data = src.data<float>();
+  auto padded_in_data = dst->mutable_data<float>();
+
+  const index_t img_size = height * width;
+  const index_t padded_img_size = padded_height * padded_width;
+
+#pragma omp parallel for collapse(2) schedule(runtime)
+  for (index_t b = 0; b < batch; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
+      const index_t bc = b * channels + c;
+      const float *in_base = in_data + bc * img_size;
+      float *padded_in_base = padded_in_data + bc * padded_img_size;
+
+      memset(padded_in_base, 0, sizeof(float) * pad_top * padded_width);
+      padded_in_base += pad_top * padded_width;
+      for (index_t h = 0; h < height; ++h) {
+        memset(padded_in_base,
+               0,
+               sizeof(float) * pad_left);
+        memcpy(padded_in_base + pad_left,
+               in_base,
+               sizeof(float) * width);
+        memset(padded_in_base + pad_left + width,
+               0,
+               sizeof(float) * pad_right);
+        in_base += width;
+        padded_in_base += padded_width;
+      }
+      memset(padded_in_base, 0, sizeof(float) * pad_bottom * padded_width);
+    }
+  }
+}
+
+void Conv2dBase::UnPadOutput(const mace::Tensor &src, mace::Tensor *dst) {
+  if (dst == &src) return;
+  const index_t batch = dst->dim(0);
+  const index_t channels = dst->dim(1);
+  const index_t height = dst->dim(2);
+  const index_t width = dst->dim(3);
+  const index_t padded_height = src.dim(2);
+  const index_t padded_width = src.dim(3);
+
+  auto padded_out_data = src.data<float>();
+  auto out_data = dst->mutable_data<float>();
+
+  const index_t img_size = height * width;
+  const index_t padded_img_size = padded_height * padded_width;
+
+#pragma omp parallel for collapse(2) schedule(runtime)
+  for (index_t b = 0; b < batch; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
+      const index_t bc = (b * channels + c);
+      float *out_base = out_data + bc * img_size;
+      const float *padded_out_base = padded_out_data + bc * padded_img_size;
+
+      for (index_t h = 0; h < height; ++h) {
+        memcpy(out_base,
+               padded_out_base,
+               sizeof(float) * width);
+        out_base += width;
+        padded_out_base += padded_width;
+      }  // h
+    }  // c
+  }  // b
+}
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
diff --git a/mace/ops/arm/fp32/conv_2d.h b/mace/ops/arm/fp32/conv_2d.h
index 7d77cf14941d30c24227ef10d948688519a7e995..832f6f2fa35d999ee6192e61f340f070776f5d1f 100644
--- a/mace/ops/arm/fp32/conv_2d.h
+++ b/mace/ops/arm/fp32/conv_2d.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,10 +15,14 @@
 #ifndef MACE_OPS_ARM_FP32_CONV_2D_H_
 #define MACE_OPS_ARM_FP32_CONV_2D_H_
 
+#include <memory>
+#include <vector>
+
 #include "mace/public/mace.h"
 #include "mace/core/tensor.h"
 #include "mace/core/op_context.h"
 #include "mace/ops/arm/fp32/gemm.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
 
 namespace mace {
 namespace ops {
@@ -27,13 +31,51 @@ namespace fp32 {
 
 class Conv2dBase {
  public:
-  Conv2dBase() = default;
+  Conv2dBase(const std::vector<int> strides,
+             const std::vector<int> dilations,
+             const std::vector<int> paddings,
+             const Padding padding_type)
+      : strides_(strides),
+        dilations_(dilations),
+        paddings_(paddings),
+        padding_type_(padding_type) {}
+
   virtual ~Conv2dBase() = default;
+
   virtual MaceStatus Compute(
       const OpContext *context,
       const Tensor *input,
       const Tensor *filter,
       Tensor *output) = 0;
+
+ protected:
+  void CalOutputShapeAndPadSize(const Tensor *input,
+                                const Tensor *filter,
+                                const int out_tile_height,
+                                const int out_tile_width,
+                                std::vector<index_t> *output_shape,
+                                std::vector<int> *in_pad_size,
+                                std::vector<int> *out_pad_size);
+
+  MaceStatus ResizeOutAndPadInOut(const OpContext *context,
+                                  const Tensor *input,
+                                  const Tensor *filter,
+                                  Tensor *output,
+                                  const int out_tile_height,
+                                  const int out_tile_width,
+                                  std::unique_ptr<const Tensor> *padded_input,
+                                  std::unique_ptr<Tensor> *padded_output);
+
+  void PadInput(const Tensor &src,
+                const int pad_top,
+                const int pad_left,
+                Tensor *dst);
+  void UnPadOutput(const Tensor &src, Tensor *dst);
+
+  const std::vector<int> strides_;
+  const std::vector<int> dilations_;
+  const std::vector<int> paddings_;
+  const Padding padding_type_;
 };
 
 }  // namespace fp32
diff --git a/mace/ops/arm/fp32/conv_2d_1x1.cc b/mace/ops/arm/fp32/conv_2d_1x1.cc
index b34e19aae8c8712bf08052deaff7abfe6bd1eb95..d5e03652bbd25bad8eb43bfb67b2ef98092b9b2f 100644
--- a/mace/ops/arm/fp32/conv_2d_1x1.cc
+++ b/mace/ops/arm/fp32/conv_2d_1x1.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
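(Editor's aside, not part of the patch: the tile-aware padding arithmetic in `CalOutputShapeAndPadSize` above is easier to follow with concrete numbers. Below is a minimal standalone sketch, using plain `long long` instead of MACE's `index_t`; the constants mirror a 3x3/stride-1 case with total paddings {2, 2} and the 2x4 output tile that `Conv2dK3x3S1` passes to `ResizeOutAndPadInOut`. All names here are illustrative, not MACE APIs.)

    // Standalone illustration of the padded-size computation above.
    #include <algorithm>
    #include <cstdio>

    long long RoundUp(long long v, long long factor) {
      return (v + factor - 1) / factor * factor;
    }

    int main() {
      const long long in_h = 7, in_w = 7, filter = 3, stride = 1, dilation = 1;
      const long long pad_h = 2, pad_w = 2;    // total padding per axis
      const long long tile_h = 2, tile_w = 4;  // kernel's output tile

      const long long out_h = (in_h + pad_h - filter) / stride + 1;  // 7
      const long long out_w = (in_w + pad_w - filter) / stride + 1;  // 7

      // Round the output up to whole tiles, then size the padded input so
      // the kernel can always consume full tiles without bounds checks.
      const long long padded_out_h = RoundUp(out_h, tile_h);  // 8
      const long long padded_out_w = RoundUp(out_w, tile_w);  // 8
      const long long padded_in_h = std::max(
          in_h + pad_h,
          (padded_out_h - 1) * stride + (filter - 1) * dilation + 1);  // 10
      const long long padded_in_w = std::max(
          in_w + pad_w,
          (padded_out_w - 1) * stride + (filter - 1) * dilation + 1);  // 10

      std::printf("out %lldx%lld, padded out %lldx%lld, padded in %lldx%lld\n",
                  out_h, out_w, padded_out_h, padded_out_w,
                  padded_in_h, padded_in_w);
      // Prints: out 7x7, padded out 8x8, padded in 10x10.
      // The extra padded output row/column is what UnPadOutput discards.
      return 0;
    }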
- #include "mace/ops/arm/fp32/conv_2d_1x1.h" namespace mace { @@ -25,20 +24,68 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context, const Tensor *filter, Tensor *output) { index_t batch = input->dim(0); - index_t height = input->dim(2); - index_t width = input->dim(3); + index_t in_height = input->dim(2); + index_t in_width = input->dim(3); index_t in_channels = input->dim(1); - index_t out_channels = filter->dim(0); - MACE_RETURN_IF_ERROR(output->Resize({batch, out_channels, height, width})); - context->device()->scratch_buffer()->Rewind(); + + std::vector output_shape; + std::vector in_pad_size; + std::vector out_pad_size; + CalOutputShapeAndPadSize(input, + filter, + 1, + 1, + &output_shape, + &in_pad_size, + &out_pad_size); + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + const index_t out_channels = output_shape[1]; + const index_t out_height = output_shape[2]; + const index_t out_width = output_shape[3]; + const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1]; + const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3]; + + // pad input and transform input + const bool is_in_padded = + in_height != padded_in_height || in_width != padded_in_width; + auto scratch_buffer = context->device()->scratch_buffer(); + const index_t padded_in_size = is_in_padded ? PadAlignSize( + sizeof(float) * batch * in_channels * padded_in_height + * padded_in_width) : 0; + const index_t pack_filter_size = + PadAlignSize(sizeof(float) * out_channels * in_channels); + const index_t pack_input_size = + PadAlignSize( + sizeof(float) * in_channels * padded_in_height * padded_in_width); + const index_t pack_output_size = + PadAlignSize( + sizeof(float) * out_channels * padded_in_height * padded_in_width); + + const index_t gemm_pack_size = + pack_filter_size + pack_input_size + pack_output_size; + + scratch_buffer->Rewind(); + scratch_buffer->GrowSize(padded_in_size + gemm_pack_size); + + const Tensor *padded_in = input; + Tensor tmp_padded_in + (scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT); + if (is_in_padded) { + tmp_padded_in.Resize({batch, in_channels, padded_in_height, + padded_in_width}); + PadInput(*input, in_pad_size[0], in_pad_size[2], &tmp_padded_in); + padded_in = &tmp_padded_in; + } + return gemm_.Compute(context, filter, - input, + padded_in, batch, out_channels, in_channels, in_channels, - height * width, + out_height * out_width, false, false, false, diff --git a/mace/ops/arm/fp32/conv_2d_1x1.h b/mace/ops/arm/fp32/conv_2d_1x1.h index fd2077ec2a0f0458ee980ff6c35e2f11e1a6d0ad..68b792fd96b3c5dd77504614894d3008bbd01e01 100644 --- a/mace/ops/arm/fp32/conv_2d_1x1.h +++ b/mace/ops/arm/fp32/conv_2d_1x1.h @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ #ifndef MACE_OPS_ARM_FP32_CONV_2D_1X1_H_ #define MACE_OPS_ARM_FP32_CONV_2D_1X1_H_ +#include #include "mace/public/mace.h" #include "mace/core/tensor.h" #include "mace/core/op_context.h" @@ -28,7 +29,8 @@ namespace fp32 { class Conv2dK1x1 : public Conv2dBase { public: - Conv2dK1x1() : gemm_(true) {} + Conv2dK1x1(const std::vector paddings, const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK1x1() {} MaceStatus Compute( diff --git a/mace/ops/arm/fp32/conv_2d_1xn.cc b/mace/ops/arm/fp32/conv_2d_1xn.cc new file mode 100644 index 0000000000000000000000000000000000000000..1ff99d8021438d8b851b65d6ee2c662e01e72917 --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_1xn.cc @@ -0,0 +1,821 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "mace/ops/arm/fp32/conv_2d_1xn.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + if (m + 3 < out_channels) { + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = 
filter_data + m * in_channels * 7 + c * 7; + const float + *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; + const float + *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; + const float + *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + /* load filter (4 outch x 1 height x 4 width) */ + float32x4_t vf00, vf01; + float32x4_t vf10, vf11; + float32x4_t vf20, vf21; + float32x4_t vf30, vf31; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + vf10 = vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + vf20 = vld1q_f32(filter_ptr2); + vf21 = vld1q_f32(filter_ptr2 + 3); + vf30 = vld1q_f32(filter_ptr3); + vf31 = vld1q_f32(filter_ptr3 + 3); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; + // input offset + index_t in_offset = h * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + /* outch 0 */ + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); + /* outch 1 */ + vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); + vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); + vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); + vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); + vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); + vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); + vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); + /* outch 2 */ + vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); + vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); + vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); + vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); + vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); + vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); + vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); + /* outch 3 */ + vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); + vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); + vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); + vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); + vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); + vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); + vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); +#else + /* outch 0 */ + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); + /* outch 1 */ + vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); + vo1 = 
vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); + vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); + vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); + /* outch 2 */ + vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); + vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); + vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); + /* outch 3 */ + vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); + vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); + vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); +#endif + + vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + } // w + } // h + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + /* load filter (1 outch x 1 height x 4 width) */ + float32x4_t vf00, vf01; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; + // input offset + index_t in_offset = h * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); +#else + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); +#endif + + vst1q_f32(out_ptr0_base + out_offset, vo0); + } // w + } // h + } // c + } + } // if + } // m + } // b + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, 
+ Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 4, + 1, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + if (m + 3 < out_channels) { + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; + const float + *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; + const float + *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; + const float + *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + /* load filter (4 outch x 4 height x 1 width) */ + float32x4_t vf00, vf01; + float32x4_t vf10, vf11; + float32x4_t vf20, vf21; + float32x4_t vf30, vf31; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + vf10 = vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + vf20 = vld1q_f32(filter_ptr2); + vf21 = vld1q_f32(filter_ptr2 + 3); + vf30 = vld1q_f32(filter_ptr3); + vf31 = vld1q_f32(filter_ptr3 + 3); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t w = 0; w < out_width; ++w) { + // load output + index_t out_offset = h * out_width + w; + // output (4 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo0 = {out_ptr0_base[out_offset], + out_ptr0_base[out_offset + out_width], + out_ptr0_base[out_offset + 2 * out_width], + out_ptr0_base[out_offset + 3 * out_width]}; + float32x4_t vo1 = {out_ptr1_base[out_offset], + out_ptr1_base[out_offset + out_width], + out_ptr1_base[out_offset + 2 * out_width], + out_ptr1_base[out_offset + 3 * out_width]}; + float32x4_t vo2 = {out_ptr2_base[out_offset], + out_ptr2_base[out_offset + out_width], + out_ptr2_base[out_offset + 2 * out_width], + out_ptr2_base[out_offset + 3 * out_width]}; + float32x4_t vo3 = {out_ptr3_base[out_offset], + out_ptr3_base[out_offset + out_width], + out_ptr3_base[out_offset + 2 * out_width], + 
out_ptr3_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + /* outch 0 */ + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); + /* outch 1 */ + vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); + vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); + vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); + vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); + vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); + vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); + vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); + /* outch 2 */ + vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); + vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); + vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); + vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); + vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); + vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); + vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); + /* outch 3 */ + vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); + vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); + vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); + vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); + vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); + vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); + vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); +#else + /* outch 0 */ + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); + /* outch 1 */ + vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); + vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); + vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); + /* outch 2 */ + vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); + vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); + vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); + /* outch 3 */ + vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi2, 
vget_high_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); + vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); + vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); +#endif + + out_ptr0_base[out_offset] = vo0[0]; + out_ptr0_base[out_offset + out_width] = vo0[1]; + out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; + out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + out_ptr1_base[out_offset] = vo1[0]; + out_ptr1_base[out_offset + out_width] = vo1[1]; + out_ptr1_base[out_offset + 2 * out_width] = vo1[2]; + out_ptr1_base[out_offset + 3 * out_width] = vo1[3]; + out_ptr2_base[out_offset] = vo2[0]; + out_ptr2_base[out_offset + out_width] = vo2[1]; + out_ptr2_base[out_offset + 2 * out_width] = vo2[2]; + out_ptr2_base[out_offset + 3 * out_width] = vo2[3]; + out_ptr3_base[out_offset] = vo3[0]; + out_ptr3_base[out_offset + out_width] = vo3[1]; + out_ptr3_base[out_offset + 2 * out_width] = vo3[2]; + out_ptr3_base[out_offset + 3 * out_width] = vo3[3]; + } // w + } // h + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf00, vf01; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t w = 0; w < out_width; ++w) { + // load output + index_t out_offset = h * out_width + w; + // output (1 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo0 = {out_ptr0_base[out_offset], + out_ptr0_base[out_offset + out_width], + out_ptr0_base[out_offset + 2 * out_width], + out_ptr0_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width], + in_ptr_base[in_offset + 10 * in_width], + in_ptr_base[in_offset + 11 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); +#else + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); 
+#endif + + out_ptr0_base[out_offset] = vo0[0]; + out_ptr0_base[out_offset + out_width] = vo0[1]; + out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; + out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + } // w + } // h + } // c + } + } // if + } // m + } // b + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + + +// ==== + +MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t tile_height = + out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2]; + +#pragma omp parallel for collapse(3) schedule(runtime) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t h = 0; h < out_shape[2]; h += tile_height) { + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + float *out_ptr_base = + output_data + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo; + // load output + index_t out_offset = (h + ht) * out_width + w; + vo = vld1q_f32(out_ptr_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, + vi10, vi11, vi12, vi13, vi14, vi16; + // input offset + index_t in_offset = (h + ht) * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi12 = vld1q_f32(in_ptr_base + in_offset + 12); + vi16 = vld1q_f32(in_ptr_base + in_offset + 16); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + vi7 = vextq_f32(vi4, vi8, 3); + vi9 = vextq_f32(vi8, vi12, 1); + vi10 = vextq_f32(vi8, vi12, 2); + vi11 = vextq_f32(vi8, vi12, 3); + vi13 = vextq_f32(vi12, vi16, 1); + 
vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); + + vst1q_f32(out_ptr_base + out_offset, vo); + } // w + } // ht + } // c + } // h + } // m + } // b + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 4, + 1, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t tile_width = + out_shape[1] < 4 ? 
RoundUpDiv4(out_shape[3]) : out_shape[3]; + +#pragma omp parallel for collapse(3) schedule(runtime) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t w = 0; w < out_shape[3]; w += tile_width) { + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + float *out_ptr_base = + output_data + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) { + // load output + index_t out_offset = h * out_width + w + wt; + // output (1 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo = {out_ptr_base[out_offset], + out_ptr_base[out_offset + out_width], + out_ptr_base[out_offset + 2 * out_width], + out_ptr_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w + wt; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width], + in_ptr_base[in_offset + 10 * in_width], + in_ptr_base[in_offset + 11 * in_width]}; + float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], + in_ptr_base[in_offset + 13 * in_width], + in_ptr_base[in_offset + 14 * in_width], + in_ptr_base[in_offset + 15 * in_width]}; + float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width], + in_ptr_base[in_offset + 17 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + float32x4_t vi7 = vextq_f32(vi4, vi8, 3); + float32x4_t vi9 = vextq_f32(vi8, vi12, 1); + float32x4_t vi10 = vextq_f32(vi8, vi12, 2); + float32x4_t vi11 = vextq_f32(vi8, vi12, 3); + float32x4_t vi13 = vextq_f32(vi12, vi16, 1); + float32x4_t vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, 
vi14, vget_high_f32(vf3), 1);
+
+              out_ptr_base[out_offset] = vo[0];
+              out_ptr_base[out_offset + out_width] = vo[1];
+              out_ptr_base[out_offset + 2 * out_width] = vo[2];
+              out_ptr_base[out_offset + 3 * out_width] = vo[3];
+            }  // wt
+          }  // h
+        }  // c
+      }  // w
+    }  // m
+  }  // b
+  UnPadOutput(*out_tensor, output);
+  return MaceStatus::MACE_SUCCESS;
+}
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/fp32/conv_2d_1xn.h b/mace/ops/arm/fp32/conv_2d_1xn.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4a5e8995f9ebf5b85c2622684c13e558eb2900f
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_2d_1xn.h
@@ -0,0 +1,86 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
+#define MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
+
+#include <vector>
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dK1x7S1 : public Conv2dBase {
+ public:
+  Conv2dK1x7S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK1x7S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK7x1S1 : public Conv2dBase {
+ public:
+  Conv2dK7x1S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK7x1S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK1x15S1 : public Conv2dBase {
+ public:
+  Conv2dK1x15S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK1x15S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK15x1S1 : public Conv2dBase {
+ public:
+  Conv2dK15x1S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK15x1S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
diff --git a/mace/ops/arm/conv_2d_neon_3x3.cc b/mace/ops/arm/fp32/conv_2d_3x3.cc
similarity index 85%
rename from mace/ops/arm/conv_2d_neon_3x3.cc
rename to mace/ops/arm/fp32/conv_2d_3x3.cc
index ecae6810696d07d82d688a183720c7acb3243f8d..a8ce5fa64074c08362d0e839a80d111221bc19cb 100644
--- a/mace/ops/arm/conv_2d_neon_3x3.cc
+++ b/mace/ops/arm/fp32/conv_2d_3x3.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,22 +12,49 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(MACE_ENABLE_NEON) #include -#endif - -#include "mace/core/macros.h" -#include "mace/ops/arm/conv_2d_neon.h" +#include +#include "mace/ops/arm/fp32/conv_2d_3x3.h" namespace mace { namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 2, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); -// Ho = 2, Wo = 4, Co = 2 -void Conv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -42,26 +69,26 @@ void Conv2dNeonK3x3S1(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 1 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 1) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr0 = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 9 + c * 9; + const float + *in_ptr0 = input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = filter_data + m * in_channels * 9 + c * 9; -#if defined(MACE_ENABLE_NEON) float *out_ptr1 = out_ptr1_base; const float *in_ptr1 = - input + b * in_batch_size + c * in_image_size + 1 * in_width; + input_data + b * in_batch_size + c * in_image_size + 1 * in_width; const float *in_ptr2 = - input + b * in_batch_size + c * in_image_size + 2 * in_width; + input_data + b * in_batch_size + c * in_image_size + 2 * in_width; const float *in_ptr3 = - input + b * in_batch_size + c * in_image_size + 3 * in_width; - const float *filter_ptr1 = filter + (m + 1) * in_channels * 9 + c * 9; -#endif -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + input_data + b * in_batch_size + c * in_image_size + 3 * in_width; + const float + *filter_ptr1 = filter_data + (m + 1) * in_channels * 9 + c * 9; + +#if defined(__aarch64__) float *out_ptr0 = out_ptr0_base; // load filter (2 outch x 3 height x 3 width): vf_outch_height @@ -179,7 +206,7 @@ void Conv2dNeonK3x3S1(const 
float *input, out_ptr0 += out_width; out_ptr1 += out_width; } // h -#elif defined(MACE_ENABLE_NEON) // arm v7 +#else // arm v7 float *out_ptr0 = out_ptr0_base; // load filter (2 outch x 3 height x 3 width): vf_outch_height @@ -301,32 +328,28 @@ void Conv2dNeonK3x3S1(const float *input, out_ptr0 += out_width; out_ptr1 += out_width; } // h -#else - for (index_t oc = 0; oc < 2; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0 + oc * in_channels * 9, - in_width, 3, 3, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } #endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr0 = - input + b * in_batch_size + c * in_image_size; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; const float *in_ptr1 = - input + b * in_batch_size + c * in_image_size + 1 * in_width; + input_data + b * in_batch_size + c * in_image_size + + 1 * in_width; const float *in_ptr2 = - input + b * in_batch_size + c * in_image_size + 2 * in_width; + input_data + b * in_batch_size + c * in_image_size + + 2 * in_width; const float *in_ptr3 = - input + b * in_batch_size + c * in_image_size + 3 * in_width; -#endif - const float *filter_ptr0 = filter + mm * in_channels * 9 + c * 9; + input_data + b * in_batch_size + c * in_image_size + + 3 * in_width; + const float + *filter_ptr0 = filter_data + mm * in_channels * 9 + c * 9; -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#if defined(__aarch64__) float *out_ptr0 = out_ptr0_base; // load filter (1 outch x 3 height x 3 width): vf_outch_height @@ -409,7 +432,7 @@ void Conv2dNeonK3x3S1(const float *input, out_ptr0 += out_width; } // h -#elif defined(MACE_ENABLE_NEON) // arm v7 +#else // arm v7 float *out_ptr0 = out_ptr0_base; // load filter (1 outch x 3 height x 3 width): vf_outch_height @@ -494,22 +517,52 @@ void Conv2dNeonK3x3S1(const float *input, out_ptr0 += out_width; } // h -#else - Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0, in_width, 3, 3, out_height, - out_width, out_ptr0_base, 1); #endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } -void Conv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -523,11 +576,12 @@ void 
Conv2dNeonK3x3S2(const float *input, const index_t in_width = in_shape[3]; const index_t out_height = out_shape[2]; const index_t out_width = out_shape[3]; - const float *in_base = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + m * in_channels * 9 + c * 9; - float *out_base = output + b * out_batch_size + m * out_image_size; + const float + *in_base = input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter_data + m * in_channels * 9 + c * 9; + float *out_base = output_data + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#if defined(__aarch64__) // load filter (1 outch x 3 height x 3 width): vf_outch_height float32x4_t vf00, vf01, vf02; vf00 = vld1q_f32(filter_ptr); @@ -587,7 +641,7 @@ void Conv2dNeonK3x3S2(const float *input, vst1q_f32(out_base + out_offset, vo); } // w } // h -#elif defined(MACE_ENABLE_NEON) // arm v7 +#else // arm v7 // load filter (1 outch x 3 height x 3 width): vf_outch_height float32x2_t vf01, vf23, vf45, vf67, vf78; vf01 = vld1_f32(filter_ptr); @@ -649,14 +703,16 @@ void Conv2dNeonK3x3S2(const float *input, vst1q_f32(out_base + out_offset, vo); } // w } // h -#else - Conv2dCPUKHxKWCalc(in_base, filter_ptr, in_width, 3, 3, out_height, - out_width, out_base, 2); #endif } // c } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_3x3.h b/mace/ops/arm/fp32/conv_2d_3x3.h new file mode 100644 index 0000000000000000000000000000000000000000..66d47801c39fee076ca0fd0bddff806a8e30c127 --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_3x3.h @@ -0,0 +1,60 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
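The Compute bodies above all share one skeleton: pad the input/output if needed, clear the output (the inner loops accumulate into it), run the NEON loops, then un-pad. A condensed fragment of that flow, with helper names taken from this diff; the <Tensor> template arguments and the /*out_tile_*/ parameter names are assumptions here (the 2, 4 arguments are read as the old "Ho = 2, Wo = 4" register tiling):

  // Condensed Compute() skeleton of the rewritten fp32 conv kernels (sketch).
  std::unique_ptr<Tensor> padded_input;
  std::unique_ptr<Tensor> padded_output;
  ResizeOutAndPadInOut(context, input, filter, output,
                       /*out_tile_h=*/2, /*out_tile_w=*/4,
                       &padded_input, &padded_output);
  const Tensor *in_tensor = padded_input ? padded_input.get() : input;
  Tensor *out_tensor = padded_output ? padded_output.get() : output;
  out_tensor->Clear();               // inner loops accumulate into the output
  // ... NEON inner loops read in_tensor and accumulate into out_tensor ...
  UnPadOutput(*out_tensor, output);  // copy the valid region back if padded
  return MaceStatus::MACE_SUCCESS;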
+
+#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
+#define MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
+
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dK3x3S1 : public Conv2dBase {
+ public:
+  Conv2dK3x3S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK3x3S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK3x3S2 : public Conv2dBase {
+ public:
+  Conv2dK3x3S2(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK3x3S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
diff --git a/mace/ops/arm/conv_winograd.cc b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc
similarity index 60%
rename from mace/ops/arm/conv_winograd.cc
rename to mace/ops/arm/fp32/conv_2d_3x3_winograd.cc
index 11d4fbf0d52eac3d8c7abab87a5f5b95693c5df5..b894a60a964ff9b149abc5d93852f76a658b9b94 100644
--- a/mace/ops/arm/conv_winograd.cc
+++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,20 +14,375 @@
 #include
-#include "mace/ops/arm/conv_winograd.h"
+#include "mace/ops/arm/fp32/conv_2d_3x3_winograd.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+#include "mace/utils/memory.h"
+#include "mace/utils/math.h"

 namespace mace {
 namespace ops {
+namespace arm {
+namespace fp32 {
+
+MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
+                                       const Tensor *input,
+                                       const Tensor *filter,
+                                       Tensor *output) {
+  const index_t batch = input->dim(0);
+  const index_t in_channels = input->dim(1);
+  const index_t in_height = input->dim(2);
+  const index_t in_width = input->dim(3);
+  const index_t out_channels = filter->dim(0);
+
+  // When size of input feature map is bigger than 16x16,
+  // set winograd out tile size to 6 to get higher performance.
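  // (Why 6 helps on larger maps: with out tile size m, F(m x m, 3x3)
  // consumes (m + 2) x (m + 2) input tiles, so m = 6 needs 64 per-tile
  // element GEMMs instead of 16 but produces 36 outputs per tile instead
  // of 4 - fewer tiles and less arithmetic per output once the transform
  // cost is amortized.)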
+ index_t out_tile_size = 2; + if (in_height > 16 && in_width > 16) { + out_tile_size = 6; + } + + std::vector output_shape; + std::vector in_pad_size; + std::vector out_pad_size; + CalOutputShapeAndPadSize(input, + filter, + out_tile_size, + out_tile_size, + &output_shape, + &in_pad_size, + &out_pad_size); + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + + const index_t out_height = output_shape[2]; + const index_t out_width = output_shape[3]; + const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1]; + const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3]; + const index_t + padded_out_height = out_height + out_pad_size[0] + out_pad_size[1]; + const index_t + padded_out_width = out_width + out_pad_size[2] + out_pad_size[3]; + const int pad_top = in_pad_size[0]; + const int pad_left = in_pad_size[2]; + + bool is_in_padded = + padded_in_height != in_height || padded_in_width != in_width; + bool is_out_padded = + padded_out_height != out_height || padded_out_width != out_width; + + const index_t + tile_height_count = padded_out_height / out_tile_size; + const index_t tile_width_count = padded_out_width / out_tile_size; + const index_t tile_count = tile_height_count * tile_width_count; + const index_t in_tile_area = (out_tile_size + 2) * (out_tile_size + 2); + + // pad input and transform input + auto scratch_buffer = context->device()->scratch_buffer(); + const index_t padded_in_size = is_in_padded ? PadAlignSize( + sizeof(float) * batch * in_channels * padded_in_height + * padded_in_width) : 0; + const index_t padded_out_size = is_out_padded ? PadAlignSize( + sizeof(float) * batch * out_channels * padded_out_height + * padded_out_width) : 0; + const index_t transformed_in_size = PadAlignSize( + sizeof(float) * batch * in_tile_area * in_channels * tile_count); + const index_t transformed_out_size = PadAlignSize( + sizeof(float) * batch * in_tile_area * out_channels * tile_count); + const index_t transformed_filter_size = + PadAlignSize(sizeof(float) * in_tile_area * out_channels * in_channels); + const index_t gemm_pack_size = + transformed_in_size + transformed_filter_size + transformed_filter_size; + + scratch_buffer->Rewind(); + scratch_buffer->GrowSize( + padded_in_size + padded_out_size + transformed_in_size + + transformed_out_size + gemm_pack_size); + + const Tensor *padded_in = input; + Tensor tmp_padded_in + (scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT); + if (is_in_padded) { + tmp_padded_in.Resize({batch, in_channels, padded_in_height, + padded_in_width}); + Tensor::MappingGuard guard(&tmp_padded_in); + PadInput(*input, pad_top, pad_left, &tmp_padded_in); + padded_in = &tmp_padded_in; + } + + Tensor *padded_out = output; + Tensor tmp_padded_out + (scratch_buffer->Scratch(padded_out_size), DataType::DT_FLOAT); + if (is_out_padded) { + padded_out = &tmp_padded_out; + padded_out->Resize({batch, out_channels, padded_out_height, + padded_out_width}); + } + + auto transformed_in = scratch_buffer->Scratch(transformed_in_size); + auto transformed_out = scratch_buffer->Scratch(transformed_out_size); + auto padded_in_data = padded_in->data(); + auto padded_out_data = padded_out->mutable_data(); + auto transformed_in_data = transformed_in.mutable_data(); + auto transformed_out_data = transformed_out.mutable_data(); + auto filter_data = filter->data(); + + if (!filter->is_weight() || out_tile_size 
!= out_tile_size_) { + out_tile_size_ = out_tile_size; + transformed_filter_.reset(new Tensor); + transformed_filter_->Resize({in_tile_area, out_channels, in_channels}); + auto transformed_filter_data = transformed_filter_->mutable_data(); + switch (out_tile_size) { + case 2: + TransformFilter4x4(filter_data, + in_channels, + out_channels, + transformed_filter_data); + break; + case 6: + TransformFilter8x8(filter_data, + in_channels, + out_channels, + transformed_filter_data); + break; + default:MACE_NOT_IMPLEMENTED; + } + } + + switch (out_tile_size) { + case 2: + TransformInput4x4(padded_in_data, + batch, + padded_in_height, + padded_in_width, + in_channels, + tile_count, + transformed_in_data); + break; + case 6: + TransformInput8x8(padded_in_data, + batch, + padded_in_height, + padded_in_width, + in_channels, + tile_count, + transformed_in_data); + break; + default:MACE_NOT_IMPLEMENTED; + } + + const index_t scratch_buffer_offset = scratch_buffer->offset(); + const index_t transformed_in_size_per_batch = + in_tile_area * in_channels * tile_count * sizeof(float); + const index_t transformed_out_size_per_batch = + in_tile_area * out_channels * tile_count * sizeof(float); + for (index_t b = 0; b < batch; ++b) { + scratch_buffer->Rewind(scratch_buffer_offset); + + BufferSlice transformed_in_slice(&transformed_in, + b * transformed_in_size_per_batch, + transformed_in_size_per_batch); + BufferSlice transformed_out_slice(&transformed_out, + b * transformed_out_size_per_batch, + transformed_out_size_per_batch); + + Tensor transformed_in_this_batch(transformed_in_slice, DataType::DT_FLOAT); + transformed_in_this_batch.Resize({in_tile_area, in_channels, tile_count}); + Tensor + transformed_out_this_batch(transformed_out_slice, DataType::DT_FLOAT); + transformed_out_this_batch.Resize({in_tile_area, out_channels, tile_count}); + + gemm_.Compute(context, + transformed_filter_.get(), + &transformed_in_this_batch, + in_tile_area, + out_channels, + in_channels, + in_channels, + tile_count, + false, + false, + false, + true, + true, + &transformed_out_this_batch); + } + + switch (out_tile_size) { + case 2: + TransformOutput4x4(transformed_out_data, + batch, + padded_out_height, + padded_out_width, + out_channels, + tile_count, + padded_out_data); + break; + case 6: + TransformOutput8x8(transformed_out_data, + batch, + padded_out_height, + padded_out_width, + out_channels, + tile_count, + padded_out_data); + break; + default:MACE_NOT_IMPLEMENTED; + } + + UnPadOutput(*padded_out, output); + + return MaceStatus::MACE_SUCCESS; +} + +// OCHW => TOC +void Conv2dK3x3Winograd::TransformFilter4x4(const float *filter, + const index_t in_channels, + const index_t out_channels, + float *output) { + const index_t stride = out_channels * in_channels; + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t m = 0; m < out_channels; ++m) { + for (index_t c = 0; c < in_channels; ++c) { + float g0, g1, g2, g3, g4, g5, g6, g7, g8; + float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + + // load filter + index_t filter_offset = (m * in_channels + c) * 9; + g0 = filter[filter_offset]; + g1 = filter[filter_offset + 1]; + g2 = filter[filter_offset + 2]; + g3 = filter[filter_offset + 3]; + g4 = filter[filter_offset + 4]; + g5 = filter[filter_offset + 5]; + g6 = filter[filter_offset + 6]; + g7 = filter[filter_offset + 7]; + g8 = filter[filter_offset + 8]; + + // s = G * g * GT + s0 = g0; + s1 = (g0 + g2 + g1) * 0.5f; + s2 = (g0 + g2 - g1) * 0.5f; + s3 = g2; + s4 = (g0 + g6 + g3) * 
0.5f; + s5 = ((g0 + g6 + g3) + (g2 + g8 + g5) + (g1 + g7 + g4)) * 0.25f; + s6 = ((g0 + g6 + g3) + (g2 + g8 + g5) - (g1 + g7 + g4)) * 0.25f; + s7 = (g2 + g8 + g5) * 0.5f; + s8 = (g0 + g6 - g3) * 0.5f; + s9 = ((g0 + g6 - g3) + (g2 + g8 - g5) + (g1 + g7 - g4)) * 0.25f; + s10 = ((g0 + g6 - g3) + (g2 + g8 - g5) - (g1 + g7 - g4)) * 0.25f; + s11 = (g2 + g8 - g5) * 0.5f; + s12 = g6; + s13 = (g6 + g8 + g7) * 0.5f; + s14 = (g6 + g8 - g7) * 0.5f; + s15 = g8; + + // store output + index_t output_offset = m * in_channels + c; + output[output_offset + 0 * stride] = s0; + output[output_offset + 1 * stride] = s1; + output[output_offset + 2 * stride] = s2; + output[output_offset + 3 * stride] = s3; + + output[output_offset + 4 * stride] = s4; + output[output_offset + 5 * stride] = s5; + output[output_offset + 6 * stride] = s6; + output[output_offset + 7 * stride] = s7; + + output[output_offset + 8 * stride] = s8; + output[output_offset + 9 * stride] = s9; + output[output_offset + 10 * stride] = s10; + output[output_offset + 11 * stride] = s11; + + output[output_offset + 12 * stride] = s12; + output[output_offset + 13 * stride] = s13; + output[output_offset + 14 * stride] = s14; + output[output_offset + 15 * stride] = s15; + } + } +} + +// OCHW => TOC +/** + * G = +⎡ 1 0 0 ⎤ +⎢ ⎥ +⎢-2/9 -2/9 -2/9 ⎥ +⎢ ⎥ +⎢-2/9 2/9 -2/9 ⎥ +⎢ ⎥ +⎢1/90 1/45 2/45 ⎥ +⎢ ⎥ +⎢1/90 -1/45 2/45 ⎥ +⎢ ⎥ +⎢1/45 1/90 1/180⎥ +⎢ ⎥ +⎢1/45 -1/90 1/180⎥ +⎢ ⎥ +⎣ 0 0 1 ⎦ + */ +void Conv2dK3x3Winograd::TransformFilter8x8(const float *filter, + const index_t in_channels, + const index_t out_channels, + float *output) { + const index_t stride = out_channels * in_channels; + + const float G[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t m = 0; m < out_channels; ++m) { + for (index_t c = 0; c < in_channels; ++c) { + // load filter + index_t filter_offset = (m * in_channels + c) * 9; + float g0, g1, g2, g3, g4, g5, g6, g7, g8; + g0 = filter[filter_offset]; + g1 = filter[filter_offset + 1]; + g2 = filter[filter_offset + 2]; + g3 = filter[filter_offset + 3]; + g4 = filter[filter_offset + 4]; + g5 = filter[filter_offset + 5]; + g6 = filter[filter_offset + 6]; + g7 = filter[filter_offset + 7]; + g8 = filter[filter_offset + 8]; + + float s[3][8]; + for (int i = 0; i < 8; ++i) { + s[0][i] = g0 * G[i][0] + g1 * G[i][1] + g2 * G[i][2]; + s[1][i] = g3 * G[i][0] + g4 * G[i][1] + g5 * G[i][2]; + s[2][i] = g6 * G[i][0] + g7 * G[i][1] + g8 * G[i][2]; + } + + // store output + index_t output_offset = m * in_channels + c; + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + output[output_offset + (i * 8 + j) * stride] = + G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j]; + } + } + } + } +} -namespace { // NCHW => NTCB (T: in tile pixels, B: tile indices) -void TransformInput4x4(const float *input, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t tile_count, - float *output) { +void Conv2dK3x3Winograd::TransformInput4x4(const float *input, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t tile_count, + float *output) { const index_t stride = in_channels * tile_count; const index_t in_height_width = in_height 
* in_width; const index_t input_batch_size = in_height_width * in_channels; @@ -46,7 +401,7 @@ void TransformInput4x4(const float *input, // load tile data const float *input_ptr = input + n * input_batch_size + - c * in_height_width + h * in_width + w; + c * in_height_width + h * in_width + w; d0 = input_ptr[0]; d1 = input_ptr[1]; d2 = input_ptr[2]; @@ -133,22 +488,14 @@ void TransformInput4x4(const float *input, ⎢0 -2 4 5/2 -5 -1/2 1 0⎥ ⎢ ⎥ ⎣0 -1 0 21/4 0 -21/4 0 1⎦ - - * @param input - * @param batch - * @param in_height - * @param in_width - * @param in_channels - * @param tile_count - * @param output */ -void TransformInput8x8(const float *input, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t tile_count, - float *output) { +void Conv2dK3x3Winograd::TransformInput8x8(const float *input, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t tile_count, + float *output) { const index_t stride = in_channels * tile_count; const index_t in_height_width = in_height * in_width; const index_t input_batch_size = in_height_width * in_channels; @@ -162,7 +509,7 @@ void TransformInput8x8(const float *input, for (index_t h = 0; h < in_height - 2; h += 6) { for (index_t w = 0; w < in_width - 2; w += 6) { const float *input_ptr = input + n * input_batch_size + - c * in_height_width + h * in_width + w; + c * in_height_width + h * in_width + w; for (int i = 0; i < 8; ++i) { float d0, d1, d2, d3, d4, d5, d6, d7; @@ -235,57 +582,14 @@ void TransformInput8x8(const float *input, } } -// TOC * NTCB => NTOB -void BatchGemm(const float *input, - const float *filter, - index_t batch, - index_t in_channels, - index_t out_channels, - index_t tile_count, - int out_tile_size, - float *output, - SGemm *sgemm, - ScratchBuffer *scratch_buffer) { - const int in_tile_area = (out_tile_size + 2) * (out_tile_size + 2); - const index_t in_batch_size = in_tile_area * in_channels * tile_count; - const index_t out_batch_size = in_tile_area * out_channels * tile_count; - - index_t scratch_buffer_offset = 0; - if (scratch_buffer) { - scratch_buffer_offset = scratch_buffer->offset(); - } - // 'batch' is not gemm batch, 'in_tile_area' is. gemm is not thread safe, - // so we loop batch using single thread. - // Scratch buffer should be rewind to the initial position to use same - // scratch memory for each batch. 
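The new Compute body keeps exactly this per-batch scratch discipline (see the scratch_buffer->Rewind inside its batch loop above); condensed, with the gemm call elided:

  // Per-batch scratch reuse: capture the offset once, rewind before each
  // batch so every iteration carves its slices from the same memory.
  const index_t scratch_buffer_offset = scratch_buffer->offset();
  for (index_t b = 0; b < batch; ++b) {
    scratch_buffer->Rewind(scratch_buffer_offset);
    // ... build per-batch transformed in/out views, then gemm_.Compute(...)
  }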
- for (int b = 0; b < batch; ++b) { - if (scratch_buffer) { - scratch_buffer->Rewind(scratch_buffer_offset); - } - sgemm->Run(filter, - input + b * in_batch_size, - in_tile_area, - out_channels, - in_channels, - in_channels, - tile_count, - false, - false, - true, - false, - output + b * out_batch_size, - scratch_buffer); - } -} - // NTOB => NToOB => NOHoWo -void TransformOutput4x4(const float *input, - index_t batch, - index_t out_height, - index_t out_width, - index_t out_channels, - index_t tile_count, - float *output) { +void Conv2dK3x3Winograd::TransformOutput4x4(const float *input, + index_t batch, + index_t out_height, + index_t out_width, + index_t out_channels, + index_t tile_count, + float *output) { const index_t stride = out_channels * tile_count; const index_t input_batch_size = 16 * stride; const index_t out_image_size = out_height * out_width; @@ -339,7 +643,7 @@ void TransformOutput4x4(const float *input, v3 = s3 - s5 - s7; float *output_ptr = output + n * output_batch_size + - m * out_image_size + h * out_width + w; + m * out_image_size + h * out_width + w; output_ptr[0] = v0; output_ptr[1] = v1; output_ptr[out_width] = v2; @@ -366,22 +670,14 @@ void TransformOutput4x4(const float *input, ⎢0 1 1 16 16 2 2 0⎥ ⎢ ⎥ ⎣0 1 -1 32 -32 1 -1 1⎦ - * - * @param input - * @param batch - * @param out_height - * @param out_width - * @param out_channels - * @param tile_count - * @param output */ -void TransformOutput8x8(const float *input, - index_t batch, - index_t out_height, - index_t out_width, - index_t out_channels, - index_t tile_count, - float *output) { +void Conv2dK3x3Winograd::TransformOutput8x8(const float *input, + index_t batch, + index_t out_height, + index_t out_width, + index_t out_channels, + index_t tile_count, + float *output) { const index_t stride = out_channels * tile_count; const index_t input_batch_size = 64 * stride; const index_t out_image_size = out_height * out_width; @@ -426,7 +722,7 @@ void TransformOutput8x8(const float *input, } float *output_ptr = output + n * output_batch_size + - m * out_image_size + h * out_width + w; + m * out_image_size + h * out_width + w; for (int i = 0; i < 6; ++i) { float d0, d1, d2, d3, d4, d5, d6, d7; @@ -460,291 +756,8 @@ void TransformOutput8x8(const float *input, } } } -} // namespace - -// OCHW => TOC -// no need to optimize, it will exist in converter -void TransformFilter4x4(const float *filter, - const index_t in_channels, - const index_t out_channels, - float *output) { - const index_t stride = out_channels * in_channels; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t m = 0; m < out_channels; ++m) { - for (index_t c = 0; c < in_channels; ++c) { - float g0, g1, g2, g3, g4, g5, g6, g7, g8; - float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, - s15; - - // load filter - index_t filter_offset = (m * in_channels + c) * 9; - g0 = filter[filter_offset]; - g1 = filter[filter_offset + 1]; - g2 = filter[filter_offset + 2]; - g3 = filter[filter_offset + 3]; - g4 = filter[filter_offset + 4]; - g5 = filter[filter_offset + 5]; - g6 = filter[filter_offset + 6]; - g7 = filter[filter_offset + 7]; - g8 = filter[filter_offset + 8]; - - // s = G * g * GT - s0 = g0; - s1 = (g0 + g2 + g1) * 0.5f; - s2 = (g0 + g2 - g1) * 0.5f; - s3 = g2; - s4 = (g0 + g6 + g3) * 0.5f; - s5 = ((g0 + g6 + g3) + (g2 + g8 + g5) + (g1 + g7 + g4)) * 0.25f; - s6 = ((g0 + g6 + g3) + (g2 + g8 + g5) - (g1 + g7 + g4)) * 0.25f; - s7 = (g2 + g8 + g5) * 0.5f; - s8 = (g0 + g6 - g3) * 0.5f; - s9 = ((g0 + g6 - g3) + (g2 + g8 - g5) 
+ (g1 + g7 - g4)) * 0.25f; - s10 = ((g0 + g6 - g3) + (g2 + g8 - g5) - (g1 + g7 - g4)) * 0.25f; - s11 = (g2 + g8 - g5) * 0.5f; - s12 = g6; - s13 = (g6 + g8 + g7) * 0.5f; - s14 = (g6 + g8 - g7) * 0.5f; - s15 = g8; - - // store output - index_t output_offset = m * in_channels + c; - output[output_offset + 0 * stride] = s0; - output[output_offset + 1 * stride] = s1; - output[output_offset + 2 * stride] = s2; - output[output_offset + 3 * stride] = s3; - - output[output_offset + 4 * stride] = s4; - output[output_offset + 5 * stride] = s5; - output[output_offset + 6 * stride] = s6; - output[output_offset + 7 * stride] = s7; - - output[output_offset + 8 * stride] = s8; - output[output_offset + 9 * stride] = s9; - output[output_offset + 10 * stride] = s10; - output[output_offset + 11 * stride] = s11; - - output[output_offset + 12 * stride] = s12; - output[output_offset + 13 * stride] = s13; - output[output_offset + 14 * stride] = s14; - output[output_offset + 15 * stride] = s15; - } - } -} - -// OCHW => TOC -// no need to optimize, it will exist in converter -/** - * G = -⎡ 1 0 0 ⎤ -⎢ ⎥ -⎢-2/9 -2/9 -2/9 ⎥ -⎢ ⎥ -⎢-2/9 2/9 -2/9 ⎥ -⎢ ⎥ -⎢1/90 1/45 2/45 ⎥ -⎢ ⎥ -⎢1/90 -1/45 2/45 ⎥ -⎢ ⎥ -⎢1/45 1/90 1/180⎥ -⎢ ⎥ -⎢1/45 -1/90 1/180⎥ -⎢ ⎥ -⎣ 0 0 1 ⎦ - * - * @param filter - * @param in_channels - * @param out_channels - * @param output - */ -void TransformFilter8x8(const float *filter, - const index_t in_channels, - const index_t out_channels, - float *output) { - const index_t stride = out_channels * in_channels; - - const float G[8][3] = {{1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {1.0f / 45, 1.0f / 90, 1.0f / 180}, - {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f}}; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t m = 0; m < out_channels; ++m) { - for (index_t c = 0; c < in_channels; ++c) { - // load filter - index_t filter_offset = (m * in_channels + c) * 9; - float g0, g1, g2, g3, g4, g5, g6, g7, g8; - g0 = filter[filter_offset]; - g1 = filter[filter_offset + 1]; - g2 = filter[filter_offset + 2]; - g3 = filter[filter_offset + 3]; - g4 = filter[filter_offset + 4]; - g5 = filter[filter_offset + 5]; - g6 = filter[filter_offset + 6]; - g7 = filter[filter_offset + 7]; - g8 = filter[filter_offset + 8]; - - float s[3][8]; - for (int i = 0; i < 8; ++i) { - s[0][i] = g0 * G[i][0] + g1 * G[i][1] + g2 * G[i][2]; - s[1][i] = g3 * G[i][0] + g4 * G[i][1] + g5 * G[i][2]; - s[2][i] = g6 * G[i][0] + g7 * G[i][1] + g8 * G[i][2]; - } - - // store output - index_t output_offset = m * in_channels + c; - for (int i = 0; i < 8; ++i) { - for (int j = 0; j < 8; ++j) { - output[output_offset + (i * 8 + j) * stride] = - G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j]; - } - } - } - } -} - -void WinoGradConv3x3s1(const float *input, - const float *transformed_filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_channels, - const int out_tile_size, - float *transformed_input, - float *transformed_output, - float *output, - SGemm *sgemm, - ScratchBuffer *scratch_buffer) { - index_t out_height = in_height - 2; - index_t out_width = in_width - 2; - index_t tile_height_count = - RoundUpDiv(out_height, static_cast(out_tile_size)); - index_t tile_width_count = - RoundUpDiv(out_width, static_cast(out_tile_size)); - index_t tile_count = tile_height_count * tile_width_count; - - switch 
(out_tile_size) { - case 2: - TransformInput4x4(input, batch, in_height, in_width, in_channels, - tile_count, transformed_input); - break; - case 6: - TransformInput8x8(input, batch, in_height, in_width, in_channels, - tile_count, transformed_input); - break; - default: - MACE_NOT_IMPLEMENTED; - } - - BatchGemm(transformed_input, transformed_filter, batch, in_channels, - out_channels, tile_count, out_tile_size, transformed_output, - sgemm, scratch_buffer); - - switch (out_tile_size) { - case 2: - TransformOutput4x4(transformed_output, batch, out_height, out_width, - out_channels, tile_count, output); - break; - case 6: - TransformOutput8x8(transformed_output, batch, out_height, out_width, - out_channels, tile_count, output); - break; - default: - MACE_NOT_IMPLEMENTED; - } -} - -void WinoGradConv3x3s1(const float *input, - const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_channels, - const int out_tile_size, - float *output, - SGemm *sgemm, - ScratchBuffer *scratch_buffer) { - index_t out_height = in_height - 2; - index_t out_width = in_width - 2; - index_t tile_height_count = - RoundUpDiv(out_height, static_cast(out_tile_size)); - index_t tile_width_count = - RoundUpDiv(out_width, static_cast(out_tile_size)); - index_t tile_count = tile_height_count * tile_width_count; - index_t in_tile_area = (out_tile_size + 2) * (out_tile_size + 2); - index_t transformed_input_size = - in_tile_area * batch * in_channels * tile_count; - index_t transformed_filter_size = in_tile_area * out_channels * in_channels; - index_t transformed_output_size = - in_tile_area * batch * out_channels * tile_count; - - float *transformed_input = new float[transformed_input_size]; // TNCB - float *transformed_filter = new float[transformed_filter_size]; // TOC - float *transformed_output = new float[transformed_output_size]; - - switch (out_tile_size) { - case 2: - TransformFilter4x4(filter, in_channels, out_channels, transformed_filter); - break; - case 6: - TransformFilter8x8(filter, in_channels, out_channels, transformed_filter); - break; - default: - MACE_NOT_IMPLEMENTED; - } - - WinoGradConv3x3s1(input, transformed_filter, batch, in_height, in_width, - in_channels, out_channels, out_tile_size, transformed_input, - transformed_output, output, sgemm, scratch_buffer); - - delete[] transformed_input; - delete[] transformed_filter; - delete[] transformed_output; -} - -void ConvRef3x3s1(const float *input, - const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_channels, - float *output) { - index_t out_height = in_height - 2; - index_t out_width = in_width - 2; - -#pragma omp parallel for collapse(4) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { - index_t out_offset = - ((b * out_channels + m) * out_height + h) * out_width + w; - output[out_offset] = 0; - for (index_t c = 0; c < in_channels; ++c) { - for (index_t kh = 0; kh < 3; ++kh) { - for (index_t kw = 0; kw < 3; ++kw) { - index_t ih = h + kh; - index_t iw = w + kw; - index_t in_offset = - ((b * in_channels + c) * in_height + ih) * in_width + iw; - index_t filter_offset = - (((m * in_channels) + c) * 3 + kh) * 3 + kw; - output[out_offset] += input[in_offset] * filter[filter_offset]; - } - } - } - } - } - } - } -} +} // namespace fp32 +} 
// namespace arm
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ed8646b17c12424a884611ac22698c6d3a9bf05
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h
@@ -0,0 +1,102 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
+#define MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/gemm.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dK3x3Winograd : public Conv2dBase {
+ public:
+  Conv2dK3x3Winograd(const std::vector<int> paddings,
+                     const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type),
+        gemm_(),
+        transformed_filter_(nullptr),
+        out_tile_size_(0) {}
+
+  virtual ~Conv2dK3x3Winograd() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+
+ private:
+  void TransformFilter4x4(const float *filter,
+                          const index_t in_channels,
+                          const index_t out_channels,
+                          float *output);
+
+  void TransformFilter8x8(const float *filter,
+                          const index_t in_channels,
+                          const index_t out_channels,
+                          float *output);
+
+  void TransformInput4x4(const float *input,
+                         const index_t batch,
+                         const index_t in_height,
+                         const index_t in_width,
+                         const index_t in_channels,
+                         const index_t tile_count,
+                         float *output);
+
+  void TransformInput8x8(const float *input,
+                         const index_t batch,
+                         const index_t in_height,
+                         const index_t in_width,
+                         const index_t in_channels,
+                         const index_t tile_count,
+                         float *output);
+
+  void TransformOutput4x4(const float *input,
+                          index_t batch,
+                          index_t out_height,
+                          index_t out_width,
+                          index_t out_channels,
+                          index_t tile_count,
+                          float *output);
+
+  void TransformOutput8x8(const float *input,
+                          index_t batch,
+                          index_t out_height,
+                          index_t out_width,
+                          index_t out_channels,
+                          index_t tile_count,
+                          float *output);
+
+  Gemm gemm_;
+  std::unique_ptr<Tensor> transformed_filter_;
+  index_t out_tile_size_;
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
diff --git a/mace/ops/arm/conv_2d_neon_5x5.cc b/mace/ops/arm/fp32/conv_2d_5x5.cc
similarity index 77%
rename from mace/ops/arm/conv_2d_neon_5x5.cc
rename to mace/ops/arm/fp32/conv_2d_5x5.cc
index 81d892975ae1c431708d986f5ff7f0666a399e9a..264e48fa13f91756c47fae6f5b9db9ed7f2cc57c 100644
--- a/mace/ops/arm/conv_2d_neon_5x5.cc
+++ b/mace/ops/arm/fp32/conv_2d_5x5.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(MACE_ENABLE_NEON) #include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" +#include +#include "mace/ops/arm/fp32/conv_2d_5x5.h" namespace mace { namespace ops { +namespace arm { +namespace fp32 { #define MACE_Conv2dNeonK5x5SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ @@ -76,12 +76,40 @@ namespace ops { vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \ vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1); -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK5x5S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -96,26 +124,26 @@ void Conv2dNeonK5x5S1(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 25 + c * 25; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 25 + c * 25; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 25 + c * 25; + filter_data + (m + 1) * in_channels * 25 + c * 25; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 25 + c * 25; + filter_data + (m + 2) * in_channels * 25 + c * 25; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 25 + c * 25; + filter_data + (m + 3) * in_channels * 25 + c * 25; for (index_t h = 0; h < out_height; ++h) { 
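          // (The w loop below keeps the removed "Ho = 1, Wo = 4, Co = 4"
          // tiling: each pass computes 4 adjacent output pixels for 4
          // output channels at once.)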
for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -158,23 +186,16 @@ void Conv2dNeonK5x5S1(const float *input, filter_ptr3 -= 25; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 25, - in_width, 5, 5, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 25 + c * 25; -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -204,16 +225,17 @@ void Conv2dNeonK5x5S1(const float *input, filter_ptr0 -= 25; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 5, 5, - out_height, out_width, out_ptr0_base, 1); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_5x5.h b/mace/ops/arm/fp32/conv_2d_5x5.h new file mode 100644 index 0000000000000000000000000000000000000000..154d74a849f38c5b114f70d897946a220a722d2c --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_5x5.h @@ -0,0 +1,48 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_FP32_CONV_2D_5X5_H_ +#define MACE_OPS_ARM_FP32_CONV_2D_5X5_H_ + +#include +#include "mace/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/core/op_context.h" +#include "mace/ops/arm/fp32/conv_2d.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +class Conv2dK5x5S1 : public Conv2dBase { + public: + Conv2dK5x5S1(const std::vector paddings, const Padding padding_type) + : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} + virtual ~Conv2dK5x5S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output); +}; + + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_FP32_CONV_2D_5X5_H_ diff --git a/mace/ops/arm/conv_2d_neon_7x7.cc b/mace/ops/arm/fp32/conv_2d_7x7.cc similarity index 78% rename from mace/ops/arm/conv_2d_neon_7x7.cc rename to mace/ops/arm/fp32/conv_2d_7x7.cc index 2411aad6761835970ad77e8cf980bd27f045d1e8..86d3e468f494bb42e3f5c3ecaf608adca72cea5a 100644 --- a/mace/ops/arm/conv_2d_neon_7x7.cc +++ b/mace/ops/arm/fp32/conv_2d_7x7.cc @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. 
All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(MACE_ENABLE_NEON) #include -#endif - -#include "mace/ops/arm/conv_2d_neon.h" +#include +#include "mace/ops/arm/fp32/conv_2d_7x7.h" namespace mace { namespace ops { +namespace arm { +namespace fp32 { #define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ @@ -153,12 +153,40 @@ namespace ops { vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \ vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK7x7S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -173,26 +201,25 @@ void Conv2dNeonK7x7S1(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * in_channels * 49 + c * 49; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * in_channels * 49 + c * 49; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 49 + c * 49; + filter_data + (m + 3) * in_channels * 49 + c 
* 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -243,23 +270,16 @@ void Conv2dNeonK7x7S1(const float *input, filter_ptr3 -= 49; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49, - in_width, 7, 7, out_height, out_width, - out_ptr0_base + oc * out_image_size, 1); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -297,23 +317,50 @@ void Conv2dNeonK7x7S1(const float *input, filter_ptr0 -= 49; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7, - out_height, out_width, out_ptr0_base, 1); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK7x7S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -328,26 +375,25 @@ void Conv2dNeonK7x7S2(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * 
in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * in_channels * 49 + c * 49; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * in_channels * 49 + c * 49; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 49 + c * 49; + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -403,23 +449,16 @@ void Conv2dNeonK7x7S2(const float *input, filter_ptr3 -= 49; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49, - in_width, 7, 7, out_height, out_width, - out_ptr0_base + oc * out_image_size, 2); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -462,23 +501,50 @@ void Conv2dNeonK7x7S2(const float *input, filter_ptr0 -= 49; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7, - out_height, out_width, out_ptr0_base, 2); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } -// Ho = 1, Wo = 4, Co = 4 -void Conv2dNeonK7x7S3(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { +MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = in_shape[1] * in_image_size; @@ -493,26 +559,25 @@ void Conv2dNeonK7x7S3(const float *input, const index_t in_channels = in_shape[1]; const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { - float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; -#if defined(MACE_ENABLE_NEON) + float 
*out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * out_batch_size + (m + 1) * out_image_size; float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * out_batch_size + (m + 2) * out_image_size; float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; -#endif + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; const float *filter_ptr1 = - filter + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * in_channels * 49 + c * 49; const float *filter_ptr2 = - filter + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * in_channels * 49 + c * 49; const float *filter_ptr3 = - filter + (m + 3) * in_channels * 49 + c * 49; + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -568,23 +633,16 @@ void Conv2dNeonK7x7S3(const float *input, filter_ptr3 -= 49; } // w } // h -#else - for (index_t oc = 0; oc < 4; ++oc) { - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49, - in_width, 7, 7, out_height, out_width, - out_ptr0_base + oc * out_image_size, 3); - } -#endif } // c } else { for (index_t mm = m; mm < out_channels; ++mm) { float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + mm * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; -#if defined(MACE_ENABLE_NEON) + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -627,16 +685,17 @@ void Conv2dNeonK7x7S3(const float *input, filter_ptr0 -= 49; } // w } // h -#else - Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7, - out_height, out_width, out_ptr0_base, 3); -#endif } // c } // mm } // if } // m } // b + + UnPadOutput(*out_tensor, output); + return MaceStatus::MACE_SUCCESS; } +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_7x7.h b/mace/ops/arm/fp32/conv_2d_7x7.h new file mode 100644 index 0000000000000000000000000000000000000000..e64780bab2bb4c22c2107da29d85b9040ef86460 --- /dev/null +++ b/mace/ops/arm/fp32/conv_2d_7x7.h @@ -0,0 +1,73 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
+#define MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
+
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dK7x7S1 : public Conv2dBase {
+ public:
+  Conv2dK7x7S1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK7x7S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK7x7S2 : public Conv2dBase {
+ public:
+  Conv2dK7x7S2(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK7x7S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+class Conv2dK7x7S3 : public Conv2dBase {
+ public:
+  Conv2dK7x7S3(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({3, 3}, {1, 1}, paddings, padding_type) {}
+  virtual ~Conv2dK7x7S3() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
diff --git a/mace/ops/arm/fp32/conv_general.cc b/mace/ops/arm/fp32/conv_general.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a12c5d53b83c275a470f04accdeee07d65317330
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_general.cc
@@ -0,0 +1,232 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
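conv_general.cc below is the new fallback for arbitrary filter sizes, strides, and dilations; it register-blocks 4 adjacent output pixels (and, where possible, 4 output channels) in plain scalar C rather than NEON intrinsics. Condensed to one output channel, with filter_h/filter_w standing in for filter_shape[2]/filter_shape[3]:

  // One output channel of the Conv2dGeneral inner loop (sketch of the body
  // that follows): vo0 accumulates 4 horizontally adjacent output pixels.
  float vo0[4];  // loaded from out_ptr0_base[out_offset + 0..3]
  for (index_t kh = 0; kh < filter_h; ++kh) {
    for (index_t kw = 0; kw < filter_w; ++kw) {
      const float f = filter_ptr0[kw];
      vo0[0] += in_ptr_base[in_offset + kw * dilation_w] * f;
      vo0[1] += in_ptr_base[in_offset + stride_w + kw * dilation_w] * f;
      vo0[2] += in_ptr_base[in_offset + 2 * stride_w + kw * dilation_w] * f;
      vo0[3] += in_ptr_base[in_offset + 3 * stride_w + kw * dilation_w] * f;
    }
    in_offset += dilation_h * in_width;  // step one (dilated) filter row down
    filter_ptr0 += filter_w;             // next filter row
  }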
+ +#include +#include "mace/ops/arm/fp32/conv_general.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Conv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { + std::unique_ptr padded_input; + std::unique_ptr padded_output; + + ResizeOutAndPadInOut(context, + input, + filter, + output, + 1, + 4, + &padded_input, + &padded_output); + + const Tensor *in_tensor = input; + if (padded_input.get() != nullptr) { + in_tensor = padded_input.get(); + } + Tensor *out_tensor = output; + if (padded_output.get() != nullptr) { + out_tensor = padded_output.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + auto filter_data = filter->data(); + auto input_data = in_tensor->data(); + auto output_data = out_tensor->mutable_data(); + + auto in_shape = in_tensor->shape(); + auto out_shape = out_tensor->shape(); + auto filter_shape = filter->shape(); + + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = filter_shape[1] * in_image_size; + const index_t out_batch_size = filter_shape[0] * out_image_size; + const index_t filter_size = filter_shape[2] * filter_shape[3]; + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t b = 0; b < in_shape[0]; b++) { + for (index_t m = 0; m < filter_shape[0]; m += 4) { + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t out_channels = filter_shape[0]; + const index_t in_channels = filter_shape[1]; + + const int stride_h = strides_[0]; + const int stride_w = strides_[1]; + const int dilation_h = dilations_[0]; + const int dilation_w = dilations_[1]; + if (m + 3 < out_channels) { + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = out_ptr0_base + out_image_size; + float *out_ptr2_base = out_ptr1_base + out_image_size; + float *out_ptr3_base = out_ptr2_base + out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = + filter_data + m * in_channels * filter_size + c * filter_size; + const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; + const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; + const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t ih = h * stride_h; + index_t iw = w * stride_w; + index_t in_offset = ih * in_width + iw; + // output (4 outch x 1 height x 4 width): vo_outch_height + float vo0[4], vo1[4], vo2[4], vo3[4]; + // load output + index_t out_offset = h * out_width + w; + for (index_t ow = 0; ow < 4; ++ow) { + vo0[ow] = out_ptr0_base[out_offset + ow]; + vo1[ow] = out_ptr1_base[out_offset + ow]; + vo2[ow] = out_ptr2_base[out_offset + ow]; + vo3[ow] = out_ptr3_base[out_offset + ow]; + } + // calc by row + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { + // outch 0 + vo0[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr0[kw]; + vo0[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[2] += in_ptr_base[in_offset + 2 * 
+                  // outch 1
+                  vo1[0] += in_ptr_base[in_offset
+                      + kw * dilation_w] * filter_ptr1[kw];
+                  vo1[1] += in_ptr_base[in_offset + stride_w
+                      + kw * dilation_w] * filter_ptr1[kw];
+                  vo1[2] += in_ptr_base[in_offset + 2 * stride_w
+                      + kw * dilation_w] * filter_ptr1[kw];
+                  vo1[3] += in_ptr_base[in_offset + 3 * stride_w
+                      + kw * dilation_w] * filter_ptr1[kw];
+                  // outch 2
+                  vo2[0] += in_ptr_base[in_offset
+                      + kw * dilation_w] * filter_ptr2[kw];
+                  vo2[1] += in_ptr_base[in_offset + stride_w
+                      + kw * dilation_w] * filter_ptr2[kw];
+                  vo2[2] += in_ptr_base[in_offset + 2 * stride_w
+                      + kw * dilation_w] * filter_ptr2[kw];
+                  vo2[3] += in_ptr_base[in_offset + 3 * stride_w
+                      + kw * dilation_w] * filter_ptr2[kw];
+                  // outch 3
+                  vo3[0] += in_ptr_base[in_offset
+                      + kw * dilation_w] * filter_ptr3[kw];
+                  vo3[1] += in_ptr_base[in_offset + stride_w
+                      + kw * dilation_w] * filter_ptr3[kw];
+                  vo3[2] += in_ptr_base[in_offset + 2 * stride_w
+                      + kw * dilation_w] * filter_ptr3[kw];
+                  vo3[3] += in_ptr_base[in_offset + 3 * stride_w
+                      + kw * dilation_w] * filter_ptr3[kw];
+                }  // kw
+
+                in_offset += dilation_h * in_width;
+                filter_ptr0 += filter_shape[3];
+                filter_ptr1 += filter_shape[3];
+                filter_ptr2 += filter_shape[3];
+                filter_ptr3 += filter_shape[3];
+              }  // kh
+
+              for (index_t ow = 0; ow < 4; ++ow) {
+                out_ptr0_base[out_offset + ow] = vo0[ow];
+                out_ptr1_base[out_offset + ow] = vo1[ow];
+                out_ptr2_base[out_offset + ow] = vo2[ow];
+                out_ptr3_base[out_offset + ow] = vo3[ow];
+              }
+
+              filter_ptr0 -= filter_size;
+              filter_ptr1 -= filter_size;
+              filter_ptr2 -= filter_size;
+              filter_ptr3 -= filter_size;
+            }  // w
+          }  // h
+        }  // c
+      } else {
+        for (index_t mm = m; mm < out_channels; ++mm) {
+          float *out_ptr0_base =
+              output_data + b * out_batch_size + mm * out_image_size;
+          for (index_t c = 0; c < in_channels; ++c) {
+            const float *in_ptr_base =
+                input_data + b * in_batch_size + c * in_image_size;
+            const float *filter_ptr0 =
+                filter_data + mm * in_channels * filter_size + c * filter_size;
+
+            for (index_t h = 0; h < out_height; ++h) {
+              for (index_t w = 0; w + 3 < out_width; w += 4) {
+                // input offset
+                index_t ih = h * stride_h;
+                index_t iw = w * stride_w;
+                index_t in_offset = ih * in_width + iw;
+                // output (1 outch x 1 height x 4 width): vo_outch_height
+                float vo0[4];
+                // load output
+                index_t out_offset = h * out_width + w;
+                for (index_t ow = 0; ow < 4; ++ow) {
+                  vo0[ow] = out_ptr0_base[out_offset + ow];
+                }
+
+                // calc by row
+                for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
+                  for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
+                    // outch 0
+                    vo0[0] += in_ptr_base[in_offset
+                        + kw * dilation_w] * filter_ptr0[kw];
+                    vo0[1] += in_ptr_base[in_offset + stride_w
+                        + kw * dilation_w] * filter_ptr0[kw];
+                    vo0[2] += in_ptr_base[in_offset + 2 * stride_w
+                        + kw * dilation_w] * filter_ptr0[kw];
+                    vo0[3] += in_ptr_base[in_offset + 3 * stride_w
+                        + kw * dilation_w] * filter_ptr0[kw];
+                  }  // kw
+
+                  in_offset += dilation_h * in_width;
+                  filter_ptr0 += filter_shape[3];
+                }  // kh
+
+                for (index_t ow = 0; ow < 4; ++ow) {
+                  out_ptr0_base[out_offset + ow] = vo0[ow];
+                }
+                filter_ptr0 -= filter_size;
+              }  // w
+            }  // h
+          }  // c
+        }  // mm
+      }  // if
+    }  // m
+  }  // b
+
+  UnPadOutput(*out_tensor, output);
+  return MaceStatus::MACE_SUCCESS;
+}
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/fp32/conv_general.h b/mace/ops/arm/fp32/conv_general.h
new file mode 100644
index 0000000000000000000000000000000000000000..01d019548a19fee9c79deb6d918dac9431110fac
--- /dev/null
+++ b/mace/ops/arm/fp32/conv_general.h
@@ -0,0 +1,50 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_CONV_GENERAL_H_
+#define MACE_OPS_ARM_FP32_CONV_GENERAL_H_
+
+#include <vector>
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/conv_2d.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Conv2dGeneral : public Conv2dBase {
+ public:
+  Conv2dGeneral(const std::vector<int> strides,
+                const std::vector<int> dilations,
+                const std::vector<int> paddings,
+                const Padding padding_type)
+      : Conv2dBase(strides, dilations, paddings, padding_type) {}
+  virtual ~Conv2dGeneral() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output);
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_CONV_GENERAL_H_
diff --git a/mace/ops/arm/fp32/gemm.h b/mace/ops/arm/fp32/gemm.h
index f4cfc42bb199161a877ab0329670004ef94a6b97..ce226c1a341d76d7f873cb527408688c2e538a8c 100644
--- a/mace/ops/arm/fp32/gemm.h
+++ b/mace/ops/arm/fp32/gemm.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include "mace/core/tensor.h"
 #include "mace/core/op_context.h"
 #include "mace/ops/common/matrix.h"
+#include "mace/utils/math.h"
 
 // This implements matrix-matrix multiplication.
 // In the case of matrix-vector multiplication, use gemv.h/gemv.cc instead
diff --git a/mace/ops/arm/fp32/gemv.cc b/mace/ops/arm/fp32/gemv.cc
index 703e39449663a66d8076d7b2500a9820c209938c..cd0f607fd63f16bb5c99ea0a369dc8423a6bf358 100644
--- a/mace/ops/arm/fp32/gemv.cc
+++ b/mace/ops/arm/fp32/gemv.cc
@@ -18,6 +18,8 @@
 #include <arm_neon.h>
 #include <algorithm>
 
+#include "mace/utils/math.h"
+
 #if !defined(__aarch64__)
 float vaddvq_f32(float32x4_t v) {
   float32x2_t _sum = vadd_f32(vget_low_f32(v), vget_high_f32(v));
@@ -258,11 +260,12 @@ MaceStatus Gemv::Compute(const OpContext *context,
         ++rhs_ptr;
       }
 
-      float32x4_t vbias = vdupq_n_f32(0);
       if (bias) {
+        float32x4_t vbias = vdupq_n_f32(0);
         vbias = vld1q_f32(bias_data + h_start);
+        vo = vaddq_f32(vo, vbias);
       }
-      vo = vaddq_f32(vo, vbias);
+
       vst1q_f32(ret_ptr, vo);
     } else {  // h_block_len < 4
 #endif  // MACE_GEMV_UNROLL
diff --git a/mace/ops/arm/fp32/gemv.h b/mace/ops/arm/fp32/gemv.h
index 3210def1dd50ecc5e4c45dbda0d4da67df55ee8e..1f406426fbe93ae965f23450eca2a5ba1c517db1 100644
--- a/mace/ops/arm/fp32/gemv.h
+++ b/mace/ops/arm/fp32/gemv.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/arm/q8/eltwise.cc b/mace/ops/arm/q8/eltwise.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f987da81373282f769f660e5f10e7795413b3be4
--- /dev/null
+++ b/mace/ops/arm/q8/eltwise.cc
@@ -0,0 +1,157 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/q8/eltwise.h"
+
+#include <arm_neon.h>
+#include <algorithm>
+
+#include "mace/ops/common/gemmlowp_util.h"
+#include "mace/utils/logging.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace q8 {
+
+MaceStatus Eltwise::Compute(const OpContext *context,
+                            const Tensor *input0,
+                            const Tensor *input1,
+                            Tensor *output) {
+  MACE_UNUSED(context);
+  MACE_CHECK(type_ == SUM || type_ == SUB,
+             "Quantized Elementwise only support SUM and SUB now.");
+
+  constexpr int left_shift = 20;
+  const double doubled_scale = 2 * std::max(input0->scale(), input1->scale());
+  const double adjusted_input0_scale = input0->scale() / doubled_scale;
+  const double adjusted_input1_scale = input1->scale() / doubled_scale;
+  const double adjusted_output_scale =
+      doubled_scale / ((1 << left_shift) * output->scale());
+
+  int32_t input0_multiplier;
+  int32_t input1_multiplier;
+  int32_t output_multiplier;
+  int32_t input0_shift;
+  int32_t input1_shift;
+  int32_t output_shift;
+  QuantizeMultiplier(adjusted_input0_scale,
+                     &input0_multiplier,
+                     &input0_shift);
+  QuantizeMultiplier(adjusted_input1_scale,
+                     &input1_multiplier,
+                     &input1_shift);
+  QuantizeMultiplier(adjusted_output_scale,
+                     &output_multiplier,
+                     &output_shift);
+
+  Tensor::MappingGuard input0_guard(input0);
+  Tensor::MappingGuard input1_guard(input1);
+  Tensor::MappingGuard output_guard(output);
+
+  auto input0_ptr = input0->data<uint8_t>();
+  auto input1_ptr = input1->data<uint8_t>();
+  auto output_ptr = output->mutable_data<uint8_t>();
+
+#pragma omp parallel for schedule(runtime)
+  for (index_t i = 0; i <= output->size() - 8; i += 8) {
+    const auto input0_val = vld1_u8(input0_ptr + i);
+    const auto input1_val = vld1_u8(input1_ptr + i);
+    const auto input0_val_s16 =
+        vreinterpretq_s16_u16(vmovl_u8(input0_val));
+    const auto input1_val_s16 =
+        vreinterpretq_s16_u16(vmovl_u8(input1_val));
+    const auto offset_input0 =
+        vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point()));
+    const auto offset_input1 =
+        vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point()));
+    auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0));
+    auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0));
+    auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1));
+    auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1));
+    const auto left_shift_dup = vdupq_n_s32(left_shift);
+    input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup);
+    input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup);
+    input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup);
+
+    input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup);
+    input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier);
+    input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier);
+    input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier);
+    input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier);
+    const auto input0_shift_dup = vdupq_n_s32(input0_shift);
+    const auto input1_shift_dup = vdupq_n_s32(input1_shift);
+    input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup);
+    input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup);
+    input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup);
+    input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup);
+    int32x4_t res_low, res_high;
+    if (type_ == SUM) {
+      res_low = vaddq_s32(input0_low_s32, input1_low_s32);
+      res_high = vaddq_s32(input0_high_s32, input1_high_s32);
+    } else {
+      res_low = vsubq_s32(input0_low_s32, input1_low_s32);
+      res_high = vsubq_s32(input0_high_s32, input1_high_s32);
+    }
+    res_low = vqrdmulhq_n_s32(res_low, output_multiplier);
+    res_high = vqrdmulhq_n_s32(res_high, output_multiplier);
+    res_low = gemmlowp::RoundingDivideByPOT(res_low, -output_shift);
+    res_high = gemmlowp::RoundingDivideByPOT(res_high, -output_shift);
+    const auto res_low_s16 = vmovn_s32(res_low);
+    const auto res_high_s16 = vmovn_s32(res_high);
+    const auto output_val = vaddq_s16(vcombine_s16(res_low_s16,
+                                                   res_high_s16),
+                                      vdupq_n_s16(output->zero_point()));
+    vst1_u8(output_ptr + i, vqmovun_s16(output_val));
+  }
+
+  index_t handled_output_size = output->size() - output->size() % 8;
+#pragma omp parallel for schedule(runtime)
+  for (index_t i = handled_output_size; i < output->size(); ++i) {
+    const int32_t offset_input0 = input0_ptr[i] - input0->zero_point();
+    const int32_t offset_input1 = input1_ptr[i] - input1->zero_point();
+    const int32_t shifted_input0 = offset_input0 * (1 << left_shift);
+    const int32_t shifted_input1 = offset_input1 * (1 << left_shift);
+    const int32_t multiplied_input0 =
+        gemmlowp::RoundingDivideByPOT(
+            gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0,
+                                                        input0_multiplier),
+            -input0_shift);
+    const int32_t multiplied_input1 =
+        gemmlowp::RoundingDivideByPOT(
+            gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1,
+                                                        input1_multiplier),
+            -input1_shift);
+
+    int32_t res;
+    if (type_ == SUM) {
+      res = multiplied_input0 + multiplied_input1;
+    } else {
+      res = multiplied_input0 - multiplied_input1;
+    }
+
+    const int32_t output_val =
+        gemmlowp::RoundingDivideByPOT(
+            gemmlowp::SaturatingRoundingDoublingHighMul(res,
+                                                        output_multiplier),
+            -output_shift) + output->zero_point();
+    output_ptr[i] = Saturate<uint8_t>(output_val);
+  }
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
+}  // namespace q8
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/q8/eltwise.h b/mace/ops/arm/q8/eltwise.h
new file mode 100644
index 0000000000000000000000000000000000000000..200b13cb2769787a92c2d03da40f1b2e10d65900
--- /dev/null
+++ b/mace/ops/arm/q8/eltwise.h
@@ -0,0 +1,48 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This implements quantized (uint8) element-wise computation;
+// currently only SUM and SUB are supported.
+
+#ifndef MACE_OPS_ARM_Q8_ELTWISE_H_
+#define MACE_OPS_ARM_Q8_ELTWISE_H_
+
+#include "mace/core/op_context.h"
+#include "mace/core/types.h"
+#include "mace/ops/common/eltwise_type.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace q8 {
+
+class Eltwise {
+ public:
+  explicit Eltwise(const EltwiseType type) : type_(type) {}
+
+  MaceStatus Compute(const OpContext *context,
+                     const Tensor *input0,
+                     const Tensor *input1,
+                     Tensor *output);
+
+ private:
+  EltwiseType type_;
+};
+
+}  // namespace q8
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_Q8_ELTWISE_H_
diff --git a/mace/ops/arm/q8/gemv.cc b/mace/ops/arm/q8/gemv.cc
index 790a1448a138074105cc5d710e7c327fb5bf1f14..ce102e7e3171ff3344b4535576c9187866305fcd 100644
--- a/mace/ops/arm/q8/gemv.cc
+++ b/mace/ops/arm/q8/gemv.cc
@@ -18,14 +18,12 @@
 #include <arm_neon.h>
 #include <algorithm>
 
-#include "mace/utils/utils.h"
+#include "mace/utils/math.h"
 #include "mace/utils/quantize.h"
 
 #if !defined(__aarch64__)
-#define vmlal_high_s16(c, a, b) vmlal_s16(c, vget_high_s16(a), vget_high_s16(b))
-
-#define vaddvq_s32(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3])
+#define vaddvq_u32(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3])
 
 #endif
 
@@ -47,17 +45,19 @@ MaceStatus Gemv<OUTPUT_TYPE>::Compute(const OpContext *context,
                                Tensor *output) {
   MACE_UNUSED(context);
 
-  bool is_output_type_uint8 =
-      DataTypeToEnum<OUTPUT_TYPE>::value == DataType::DT_UINT8;
   Tensor::MappingGuard lhs_guard(lhs);
   Tensor::MappingGuard rhs_guard(rhs);
   Tensor::MappingGuard bias_guard(bias);
   Tensor::MappingGuard output_guard(output);
 
+  const auto *lhs_data = lhs->data<uint8_t>();
+  const auto *rhs_data = rhs->data<uint8_t>();
+  OUTPUT_TYPE *output_data = output->mutable_data<OUTPUT_TYPE>();
+
   float output_multiplier_float = 0.0;
   int32_t output_multiplier = 0;
   int32_t output_shift = 0;
-  if (is_output_type_uint8) {
+  if (is_output_type_uint8_) {
     MACE_CHECK(output->scale() > 0, "output scale must not be zero");
     output_multiplier_float = lhs->scale() * rhs->scale() / output->scale();
     GetOutputMultiplierAndShift(lhs->scale(),
@@ -66,393 +66,110 @@ MaceStatus Gemv<OUTPUT_TYPE>::Compute(const OpContext *context,
                                 &output_multiplier,
                                 &output_shift);
   }
-  const index_t h_block_size = 4;
-  const index_t h_block_count = RoundUpDiv(lhs_height, h_block_size);
 
-#pragma omp parallel for collapse(2) schedule(runtime)
+  const int32_t lhs_zero_point = lhs->zero_point();
+  const int32_t rhs_zero_point = rhs->zero_point();
+
+  const index_t w_block_size = 16;
+  const index_t w_block_count = lhs_width / w_block_size;
+  const index_t w_block_remain = lhs_width - w_block_size * w_block_count;
+
   for (index_t b = 0; b < batch; ++b) {
-    for (index_t h_block_idx = 0; h_block_idx < h_block_count; ++h_block_idx) {
-      // TODO(liyin): it can be put it outside the loop,
-      // but openmp limits param count
-      const index_t w_block_size = 16;
-      const index_t w_block_count = lhs_width / w_block_size;
-      const index_t w_remain = lhs_width - w_block_size * w_block_count;
-
-      uint8_t lhs_zero_point = static_cast<uint8_t>(lhs->zero_point());
-
uint8_t rhs_zero_point = static_cast(rhs->zero_point()); - - const uint8_t *lhs_data = lhs->data(); - const uint8_t *rhs_data = rhs->data(); - const int32_t *bias_data = nullptr; - if (bias) { - bias_data = bias->data(); + const uint8_t *rhs_base = + rhs_data + static_cast(rhs_batched) * b * lhs_width; + uint32_t sum_rhs = 0; + for (index_t i = 0; i < lhs_width; ++i) { + sum_rhs += static_cast(rhs_base[i]); + } + +#pragma omp parallel for schedule(runtime) + for (index_t h = 0; h < lhs_height; ++h) { + const uint8_t *lhs_ptr = lhs_data + + static_cast(lhs_batched) * b * lhs_height * lhs_width + + h * lhs_width; + const uint8_t *rhs_ptr = rhs_base; + OUTPUT_TYPE *output_ptr = output_data + b * lhs_height + h; + + uint32_t dot = 0; + uint32_t sum_lhs = 0; + uint32x4_t vo0_high_u32 = vdupq_n_u32(0); + uint32x4_t vo0_low_u32 = vdupq_n_u32(0); + uint32x4_t vo1_high_u32 = vdupq_n_u32(0); + uint32x4_t vo1_low_u32 = vdupq_n_u32(0); + uint32x4_t sum_lhs_low_u32 = vdupq_n_u32(0); + uint32x4_t sum_lhs_high_u32 = vdupq_n_u32(0); + + for (index_t w_block_idx = 0; w_block_idx < w_block_count; + ++w_block_idx) { + uint8x8_t vl0_u8 = vld1_u8(lhs_ptr); + uint8x8_t vl1_u8 = vld1_u8(lhs_ptr + 8); + + uint8x8_t vr0_u8 = vld1_u8(rhs_ptr); + uint8x8_t vr1_u8 = vld1_u8(rhs_ptr + 8); + + uint16x8_t vl0_u16 = vmovl_u8(vl0_u8); + uint16x8_t vl1_u16 = vmovl_u8(vl1_u8); + + uint16x8_t vr0_u16 = vmovl_u8(vr0_u8); + uint16x8_t vr1_u16 = vmovl_u8(vr1_u8); + + vo0_high_u32 = vmlal_u16(vo0_high_u32, + vget_high_u16(vl0_u16), + vget_high_u16(vr0_u16)); + vo0_low_u32 = vmlal_u16(vo0_low_u32, + vget_low_u16(vl0_u16), + vget_low_u16(vr0_u16)); + vo1_high_u32 = vmlal_u16(vo1_high_u32, + vget_high_u16(vl1_u16), + vget_high_u16(vr1_u16)); + vo1_low_u32 = vmlal_u16(vo1_low_u32, + vget_low_u16(vl1_u16), + vget_low_u16(vr1_u16)); + + // It can be precuculated if lhs is const, but for this case + // computation is not bottleneck + sum_lhs_high_u32 += vaddl_u16(vget_high_u16(vl0_u16), + vget_high_u16(vl1_u16)); + sum_lhs_low_u32 += vaddl_u16(vget_low_u16(vl0_u16), + vget_low_u16(vl1_u16)); + + lhs_ptr += 16; + rhs_ptr += 16; } - OUTPUT_TYPE *output_data = output->mutable_data(); - int32x4_t voutput_multiplier = vdupq_n_s32(output_multiplier); - int32x4_t voutput_shift_left = vdupq_n_s32(-output_shift); + vo0_low_u32 = vaddq_u32(vo0_high_u32, vo0_low_u32); + vo1_low_u32 = vaddq_u32(vo1_high_u32, vo1_low_u32); + vo0_low_u32 = vaddq_u32(vo0_low_u32, vo1_low_u32); + dot += vaddvq_u32(vo0_low_u32); - uint8x8_t - vlhs_zero_point = vdup_n_u8(lhs_zero_point); - uint8x8_t - vrhs_zero_point = vdup_n_u8(rhs_zero_point); + sum_lhs_low_u32 = vaddq_u32(sum_lhs_high_u32, sum_lhs_low_u32); + sum_lhs = vaddvq_u32(sum_lhs_low_u32); - const uint8_t - *lhs_ptr = lhs_data - + static_cast(lhs_batched) * b * lhs_height * lhs_width - + lhs_width * h_block_idx * h_block_size; - const uint8_t *rhs_ptr = - rhs_data + static_cast(rhs_batched) * b * lhs_width; - OUTPUT_TYPE - *ret_ptr = output_data + b * lhs_height + h_block_idx * h_block_size; - - const index_t h_block_len = - std::min(h_block_size, lhs_height - h_block_idx * h_block_size); - const index_t h_offset = h_block_idx * h_block_size; - - if (h_block_len == 4) { - int32x4_t vo0 = vdupq_n_s32(0); - int32x4_t vo1 = vdupq_n_s32(0); - int32x4_t vo2 = vdupq_n_s32(0); - int32x4_t vo3 = vdupq_n_s32(0); - - index_t r_w_block_count = w_block_count; - // just make compiler happy - MACE_UNUSED(r_w_block_count); - - // Register layout: (4x16) x (16x1) - // - // +----+ - // |d16 | - // | . | - // | . | - // | . 
| - // Rhs +----+ - // |d17 | - // | . | - // | . | - // | . | - // +----+ - // |d18 | - // | . | - // | . | - // | . | - // +----+ - // |d19 | - // | . | - // | . | - // | . | - // +----+ - // - // | | - // - // Lhs | | - // - // +--------+--------+--------+--------+ - - - - +----+ - // | d0 ... | d1 ... | d2 ... | d3 ... | |vo0 | - // | d4 ... | d5 ... | d6 ... | d7 ... | |vo1 | - // | d8 ... | d9 ... | d10... | d11... | |vo2 | - // | d12... | d13... | d14... | d15... | |vo3 | - // +--------+--------+--------+--------+ - - - - +----+ - // - // Accumulator - // - -#if not defined(__aarch64__) - asm volatile( - "cmp %[r_w_block_count], #0\n" - "beq 0f\n" - - "mov r0, %[rhs_ptr]\n" - "mov r1, %[lhs_ptr]\n" - "add r2, r1, %[lhs_width]\n" - "add r3, r2, %[lhs_width]\n" - "add r4, r3, %[lhs_width]\n" - - "vdup.u8 d20, %[rhs_zero_point]\n" - "vdup.u8 d21, %[lhs_zero_point]\n" - - // prelogue - "vld1.8 d16, [r0]!\n" - "vld1.8 d18, [r0]!\n" - - "vld1.8 d0, [r1]!\n" - "vld1.8 d2, [r1]!\n" - "vld1.8 d4, [r2]!\n" - "vld1.8 d6, [r2]!\n" - "vld1.8 d8, [r3]!\n" - "vld1.8 d10, [r3]!\n" - "vld1.8 d12, [r4]!\n" - "vld1.8 d14, [r4]!\n" - - "subs %[r_w_block_count], #1\n" - "beq 1f\n" - - "2: \n" - "vsubl.u8 q8, d16, d20\n" - "vsubl.u8 q9, d18, d20\n" - - "vsubl.u8 q0, d0, d21\n" - "vsubl.u8 q1, d2, d21\n" - "vsubl.u8 q2, d4, d21\n" - "vsubl.u8 q3, d6, d21\n" - "vsubl.u8 q4, d8, d21\n" - "vsubl.u8 q5, d10, d21\n" - "vsubl.u8 q6, d12, d21\n" - "vsubl.u8 q7, d14, d21\n" - - "vmlal.s16 %q[vo0], d0, d16\n" - "vmlal.s16 %q[vo1], d4, d16\n" - "vmlal.s16 %q[vo2], d8, d16\n" - "vmlal.s16 %q[vo3], d12, d16\n" - - "vld1.8 d0, [r1]!\n" - "vld1.8 d4, [r2]!\n" - "vld1.8 d8, [r3]!\n" - "vld1.8 d12, [r4]!\n" - "vld1.8 d16, [r0]!\n" - - "vmlal.s16 %q[vo0], d2, d18\n" - "vmlal.s16 %q[vo1], d6, d18\n" - "vmlal.s16 %q[vo2], d10, d18\n" - "vmlal.s16 %q[vo3], d14, d18\n" - - "vld1.8 d2, [r1]!\n" - "vld1.8 d6, [r2]!\n" - "vld1.8 d10, [r3]!\n" - "vld1.8 d14, [r4]!\n" - "vld1.8 d18, [r0]!\n" - - "vmlal.s16 %q[vo0], d1, d17\n" - "vmlal.s16 %q[vo1], d5, d17\n" - "vmlal.s16 %q[vo2], d9, d17\n" - "vmlal.s16 %q[vo3], d13, d17\n" - - "subs %[r_w_block_count], #1\n" - "vmlal.s16 %q[vo0], d3, d19\n" - "vmlal.s16 %q[vo1], d7, d19\n" - "vmlal.s16 %q[vo2], d11, d19\n" - "vmlal.s16 %q[vo3], d15, d19\n" - - "bne 2b\n" - - // prologue - "1:\n" - "vsubl.u8 q8, d16, d20\n" - "vsubl.u8 q9, d18, d20\n" - - "vsubl.u8 q0, d0, d21\n" - "vsubl.u8 q1, d2, d21\n" - "vsubl.u8 q2, d4, d21\n" - "vsubl.u8 q3, d6, d21\n" - "vsubl.u8 q4, d8, d21\n" - "vsubl.u8 q5, d10, d21\n" - "vsubl.u8 q6, d12, d21\n" - "vsubl.u8 q7, d14, d21\n" - - "vmlal.s16 %q[vo0], d0, d16\n" - "vmlal.s16 %q[vo1], d4, d16\n" - "vmlal.s16 %q[vo2], d8, d16\n" - "vmlal.s16 %q[vo3], d12, d16\n" - - "vmlal.s16 %q[vo0], d1, d17\n" - "vmlal.s16 %q[vo1], d5, d17\n" - "vmlal.s16 %q[vo2], d9, d17\n" - "vmlal.s16 %q[vo3], d13, d17\n" - - "vmlal.s16 %q[vo0], d2, d18\n" - "vmlal.s16 %q[vo1], d6, d18\n" - "vmlal.s16 %q[vo2], d10, d18\n" - "vmlal.s16 %q[vo3], d14, d18\n" - - "vmlal.s16 %q[vo0], d3, d19\n" - "vmlal.s16 %q[vo1], d7, d19\n" - "vmlal.s16 %q[vo2], d11, d19\n" - "vmlal.s16 %q[vo3], d15, d19\n" - - "0:\n" - : // outputs - [vo0] "+w"(vo0), - [vo1] "+w"(vo1), - [vo2] "+w"(vo2), - [vo3] "+w"(vo3), - [r_w_block_count] "+r"(r_w_block_count) - : // inputs - [lhs_ptr] "r"(lhs_ptr), [rhs_ptr] "r"(rhs_ptr), - [lhs_width] "r"(lhs_width), - [lhs_zero_point] "r"(lhs_zero_point), - [rhs_zero_point] "r"(rhs_zero_point) - : // clobbers - "cc", "memory", "r0", "r1", "r2", "r3", "r4", - "d0", "d1", "d2", 
"d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", - "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", - "d21"); - - lhs_ptr += w_block_count * w_block_size; - rhs_ptr += w_block_count * w_block_size; -#else - for (index_t w_block_index = 0; w_block_index < w_block_count; - ++w_block_index) { - uint8x8_t vr0 = vld1_u8(rhs_ptr); - int16x8_t - vxr0 = vreinterpretq_s16_u16(vsubl_u8(vr0, vrhs_zero_point)); - uint8x8_t vr0n = vld1_u8(rhs_ptr + 8); - int16x8_t - vxr0n = vreinterpretq_s16_u16(vsubl_u8(vr0n, vrhs_zero_point)); - - uint8x8_t vl0 = vld1_u8(lhs_ptr); - int16x8_t - vxl0 = vreinterpretq_s16_u16(vsubl_u8(vl0, vlhs_zero_point)); - uint8x8_t vl0n = vld1_u8(lhs_ptr + 8); - int16x8_t - vxl0n = vreinterpretq_s16_u16(vsubl_u8(vl0n, vlhs_zero_point)); - - vo0 = vmlal_s16(vo0, vget_low_s16(vxl0), vget_low_s16(vxr0)); - vo0 = vmlal_high_s16(vo0, vxl0, vxr0); - vo0 = vmlal_s16(vo0, vget_low_s16(vxl0n), vget_low_s16(vxr0n)); - vo0 = vmlal_high_s16(vo0, vxl0n, vxr0n); - - const uint8_t *lhs_ptr1 = lhs_ptr + lhs_width; - - uint8x8_t vl1 = vld1_u8(lhs_ptr1); - int16x8_t - vxl1 = vreinterpretq_s16_u16(vsubl_u8(vl1, vlhs_zero_point)); - uint8x8_t vl1n = vld1_u8(lhs_ptr1 + 8); - int16x8_t - vxl1n = vreinterpretq_s16_u16(vsubl_u8(vl1n, vlhs_zero_point)); - - vo1 = vmlal_s16(vo1, vget_low_s16(vxl1), vget_low_s16(vxr0)); - vo1 = vmlal_high_s16(vo1, vxl1, vxr0); - vo1 = vmlal_s16(vo1, vget_low_s16(vxl1n), vget_low_s16(vxr0n)); - vo1 = vmlal_high_s16(vo1, vxl1n, vxr0n); - - const uint8_t *lhs_ptr2 = lhs_ptr1 + lhs_width; - - uint8x8_t vl2 = vld1_u8(lhs_ptr2); - int16x8_t - vxl2 = vreinterpretq_s16_u16(vsubl_u8(vl2, vlhs_zero_point)); - uint8x8_t vl2n = vld1_u8(lhs_ptr2 + 8); - int16x8_t - vxl2n = vreinterpretq_s16_u16(vsubl_u8(vl2n, vlhs_zero_point)); - - vo2 = vmlal_s16(vo2, vget_low_s16(vxl2), vget_low_s16(vxr0)); - vo2 = vmlal_high_s16(vo2, vxl2, vxr0); - vo2 = vmlal_s16(vo2, vget_low_s16(vxl2n), vget_low_s16(vxr0n)); - vo2 = vmlal_high_s16(vo2, vxl2n, vxr0n); - - const uint8_t *lhs_ptr3 = lhs_ptr2 + lhs_width; - - uint8x8_t vl3 = vld1_u8(lhs_ptr3); - int16x8_t - vxl3 = vreinterpretq_s16_u16(vsubl_u8(vl3, vlhs_zero_point)); - uint8x8_t vl3n = vld1_u8(lhs_ptr3 + 8); - int16x8_t - vxl3n = vreinterpretq_s16_u16(vsubl_u8(vl3n, vlhs_zero_point)); - - vo3 = vmlal_s16(vo3, vget_low_s16(vxl3), vget_low_s16(vxr0)); - vo3 = vmlal_high_s16(vo3, vxl3, vxr0); - vo3 = vmlal_s16(vo3, vget_low_s16(vxl3n), vget_low_s16(vxr0n)); - vo3 = vmlal_high_s16(vo3, vxl3n, vxr0n); - - lhs_ptr += 16; - rhs_ptr += 16; - } -#endif // __aarch64__ - int32x4_t vo = {vaddvq_s32(vo0), - vaddvq_s32(vo1), - vaddvq_s32(vo2), - vaddvq_s32(vo3)}; - - for (index_t w = 0; w < w_remain; ++w) { - vo[0] += - (lhs_ptr[0] - lhs_zero_point) * (rhs_ptr[0] - rhs_zero_point); - vo[1] += (lhs_ptr[lhs_width] - lhs_zero_point) - * (rhs_ptr[0] - rhs_zero_point); - vo[2] += (lhs_ptr[lhs_width * 2] - lhs_zero_point) - * (rhs_ptr[0] - rhs_zero_point); - vo[3] += (lhs_ptr[lhs_width * 3] - lhs_zero_point) - * (rhs_ptr[0] - rhs_zero_point); - ++lhs_ptr; - ++rhs_ptr; - } - - int32x4_t vbias = vdupq_n_s32(0); - if (bias) { - vbias = vld1q_s32(bias_data + h_offset); - } - vo = vaddq_s32(vo, vbias); - - if (is_output_type_uint8) { - int32x4_t vo_mul = vqrdmulhq_s32(vo, voutput_multiplier); - int32x4_t - fixup = vshrq_n_s32(vandq_s32(vo_mul, voutput_shift_left), 31); - int32x4_t fixed_up_x = vqaddq_s32(vo_mul, fixup); - int32x4_t - vo_rescale_int32 = vrshlq_s32(fixed_up_x, voutput_shift_left); - - int16x4_t vo_rescale_int16 = 
vqmovn_s32(vo_rescale_int32); - uint8x8_t vo_rescale_uint8 = - vqmovun_s16(vcombine_s16(vo_rescale_int16, vo_rescale_int16)); - - ret_ptr[0] = vo_rescale_uint8[0]; - ret_ptr[1] = vo_rescale_uint8[1]; - ret_ptr[2] = vo_rescale_uint8[2]; - ret_ptr[3] = vo_rescale_uint8[3]; - } else { - ret_ptr[0] = vo[0]; - ret_ptr[1] = vo[1]; - ret_ptr[2] = vo[2]; - ret_ptr[3] = vo[3]; - } - } else { // h_block_len < 4 - // TODO(liyin): handle here case by case (1,2,3) to accelerate - const uint8_t *tmp_lhs_ptr = lhs_ptr; - const uint8_t *tmp_rhs_ptr = rhs_ptr; - for (index_t h = 0; h < h_block_len; ++h) { - lhs_ptr = tmp_lhs_ptr + h * lhs_width; - rhs_ptr = tmp_rhs_ptr; - int32x4_t vo0 = vdupq_n_s32(0); - for (index_t w = 0; w < w_block_count; ++w) { - uint8x8_t vr0 = vld1_u8(rhs_ptr); - int16x8_t - vxr0 = vreinterpretq_s16_u16(vsubl_u8(vr0, vrhs_zero_point)); - uint8x8_t vr0n = vld1_u8(rhs_ptr + 8); - int16x8_t - vxr0n = vreinterpretq_s16_u16(vsubl_u8(vr0n, vrhs_zero_point)); - - uint8x8_t vl0 = vld1_u8(lhs_ptr); - int16x8_t - vxl0 = vreinterpretq_s16_u16(vsubl_u8(vl0, vlhs_zero_point)); - uint8x8_t vl0n = vld1_u8(lhs_ptr + 8); - int16x8_t - vxl0n = vreinterpretq_s16_u16(vsubl_u8(vl0n, vlhs_zero_point)); - - vo0 = vmlal_s16(vo0, vget_low_s16(vxl0), vget_low_s16(vxr0)); - vo0 = vmlal_high_s16(vo0, vxl0, vxr0); - vo0 = vmlal_s16(vo0, vget_low_s16(vxl0n), vget_low_s16(vxr0n)); - vo0 = vmlal_high_s16(vo0, vxl0n, vxr0n); - - lhs_ptr += 16; - rhs_ptr += 16; - } // w - int32_t s0 = vaddvq_s32(vo0) + (bias ? bias_data[h_offset + h] : 0); - for (index_t w = 0; w < w_remain; ++w) { - s0 += (lhs_ptr[0] - lhs_zero_point) * (rhs_ptr[0] - rhs_zero_point); - ++lhs_ptr; - ++rhs_ptr; - } // w - - if (is_output_type_uint8) { - ret_ptr[h] = - Saturate(std::roundf(s0 * output_multiplier_float)); - } else { - ret_ptr[h] = s0; - } - } // h - } // if - } // h_block_idx + for (index_t w = 0; w < w_block_remain; ++w) { + dot += (*lhs_ptr) * (*rhs_ptr); + sum_lhs += (*lhs_ptr); + ++lhs_ptr; + ++rhs_ptr; + } + + const auto zero_point_dot = + static_cast(lhs_zero_point * rhs_zero_point * lhs_width); + int32_t ret = dot - sum_lhs * rhs_zero_point - sum_rhs * lhs_zero_point + + zero_point_dot; + if (bias) { + ret += bias->data()[h]; + } + + if (is_output_type_uint8_) { + *output_ptr = + Saturate(std::roundf(ret * output_multiplier_float)); + } else { + *output_ptr = ret; + } + } // h } // b + return MaceStatus::MACE_SUCCESS; } @@ -466,7 +183,6 @@ class Gemv; } // namespace ops } // namespace mace -#if defined(vmlal_high_s16) -#undef vmlal_high_s16 -#undef vaddvq_s32 -#endif +#ifdef vaddvq_u32 +#undef vaddvq_u32 +#endif // vaddvq_u32 diff --git a/mace/ops/arm/q8/gemv.h b/mace/ops/arm/q8/gemv.h index adcb9590ebeff38eb8409ec49eb13a84044f64d8..21a275798a7dd9533c1645d606386aa89cf91a92 100644 --- a/mace/ops/arm/q8/gemv.h +++ b/mace/ops/arm/q8/gemv.h @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -30,7 +30,9 @@ namespace q8 {
 template<typename OUTPUT_TYPE>
 class Gemv {
  public:
-  Gemv() {}
+  Gemv() : is_output_type_uint8_(
+      DataTypeToEnum<OUTPUT_TYPE>::value == DataType::DT_UINT8) {
+  }
   ~Gemv() {}
   // Always row-major after transpose
   MaceStatus Compute(
@@ -44,6 +46,9 @@ class Gemv {
       const bool lhs_batched,
       const bool rhs_batched,
       Tensor *output);
+
+ private:
+  bool is_output_type_uint8_;
 };
 
 }  // namespace q8
diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc
index ee44ec59f7c329215f3a5ba95c8a6bf6e18f6399..469efe2e0c5eaac299d2622931a5e36154973d8e 100644
--- a/mace/ops/batch_norm.cc
+++ b/mace/ops/batch_norm.cc
@@ -22,6 +22,7 @@
 #include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/batch_norm.h"
 #endif  // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"
 
 namespace mace {
 namespace ops {
@@ -156,8 +157,8 @@ class BatchNormOp : public Operation {
     MemoryType mem_type;
     if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
-      kernel_.reset(new opencl::image::BatchNormKernel<T>(
-          epsilon, activation, relux_max_limit, leakyrelu_coefficient));
+      kernel_ = make_unique<opencl::image::BatchNormKernel<T>>(
+          epsilon, activation, relux_max_limit, leakyrelu_coefficient);
     } else {
       MACE_NOT_IMPLEMENTED;
     }
diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc
index 11bf4f6e74dbfd7141963add244c50f8b9b1ff35..74f7a013c14af8294aaabcddf5a7a29d8662edf1 100644
--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc
index 8d1e463c56b3510901d42d5d4370273d252ecbf2..cfd350d458429ea86a68e9176c41108e2469f392 100644
--- a/mace/ops/batch_to_space.cc
+++ b/mace/ops/batch_to_space.cc
@@ -19,6 +19,7 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/batch_to_space.h"
 #endif  // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"
 
 namespace mace {
 namespace ops {
@@ -266,7 +267,7 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase {
   explicit BatchToSpaceNDOp(OpConstructContext *context)
       : BatchToSpaceOpBase(context) {
     if (context->device()->gpu_runtime()->UseImageMemory()) {
-      kernel_.reset(new opencl::image::BatchToSpaceKernel<T>);
+      kernel_ = make_unique<opencl::image::BatchToSpaceKernel<T>>();
     } else {
       MACE_NOT_IMPLEMENTED;
     }
diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc
index 3552a0a31289cbb070bd761644d5711530ea3b80..a8883e1431205f46e5abbb2a78f4b45d8537cec7 100644
--- a/mace/ops/bias_add.cc
+++ b/mace/ops/bias_add.cc
@@ -22,6 +22,7 @@
 #include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/bias_add.h"
 #endif  // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"
 
 namespace mace {
 namespace ops {
@@ -34,8 +35,8 @@ class BiasAddOp : public Operation {
  public:
   explicit BiasAddOp(OpConstructContext *context)
       : Operation(context),
-        data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
-            "data_format", NHWC))) {}
+        has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0))
+        {}
 
   MaceStatus Run(OpContext *context) override {
     MACE_UNUSED(context);
@@ -56,7 +57,7 @@ class BiasAddOp : public Operation {
     const float *bias_ptr = bias->data<float>();
     float *output_ptr = output->mutable_data<float>();
 
-    if (input->dim_size() == 4 && data_format_ == NCHW) {
+    if (input->dim_size() == 4 && has_data_format_) {
       const index_t batch = input->dim(0);
       const index_t
channels = input->dim(1); const index_t height_width = input->dim(2) * input->dim(3); @@ -89,7 +90,7 @@ class BiasAddOp : public Operation { } private: - DataFormat data_format_; + int has_data_format_; }; #ifdef MACE_ENABLE_OPENCL @@ -98,12 +99,11 @@ class BiasAddOp : public Operation { public: explicit BiasAddOp(OpConstructContext *context) : Operation(context), - data_format_(static_cast(Operation::GetOptionalArg( - "data_format", NHWC))) { + has_data_format_(Operation::GetOptionalArg("has_data_format", 1)) { MemoryType mem_type; if (context->device()->gpu_runtime()->UseImageMemory()) { mem_type = MemoryType::GPU_IMAGE; - kernel_.reset(new opencl::image::BiasAddKernel); + kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; } @@ -120,13 +120,13 @@ class BiasAddOp : public Operation { Tensor *output = this->Output(0); MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC, + MACE_CHECK(input->dim_size() == 4 && has_data_format_, "gpu only support biasadd for 4-dimensional NHWC format tensor"); return kernel_->Compute(context, input, bias, output); } private: - DataFormat data_format_; + int has_data_format_; std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index 920a478f7202d6af7bef000ea4693cc8aa67c292..7de89dd2296829390eb1964911af5378c6edf9cc 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -42,7 +42,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { OpDefBuilder("BiasAdd", "BiasAddBM") .Input("Input") .Input("Bias") - .AddIntArg("data_format", data_format) + .AddIntArg("has_data_format", 1) .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index 92b918592f984692ccaed7744bb4f4cc9fb3a17e..2e4764cac8ad2cf1f303a2e53c64fda444023fa3 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -36,7 +36,7 @@ void BiasAddSimple() { OpDefBuilder("BiasAdd", "BiasAddTest") .Input("InputNCHW") .Input("Bias") - .AddIntArg("data_format", NCHW) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); // Run @@ -90,7 +90,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { OpDefBuilder("BiasAdd", "BiasAddTest") .Input("InputNCHW") .Input("Bias") - .AddIntArg("data_format", NCHW) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); @@ -139,7 +139,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { OpDefBuilder("BiasAdd", "BiasAddTest") .Input("InputNCHW") .Input("Bias") - .AddIntArg("data_format", NCHW) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc index d4e687b846f340cddbaf4cd3b50854f326b6eb44..8249c344bb4c7fed189aeae4afee3f42fce6c70c 100644 --- a/mace/ops/buffer_to_image_benchmark.cc +++ b/mace/ops/buffer_to_image_benchmark.cc @@ -50,8 +50,6 @@ void FilterBufferToImage(int iters, b2i_output); }; - // Warm-up - net.Setup(D); for (int i = 0; i < 5; ++i) { transform_func(); } diff --git a/mace/ops/buffer_transform.cc b/mace/ops/buffer_transform.cc index 15f6e7d323e7885f779a015d99403e9ed7fc6f2d..229d4eb9657432f7966368da759cb0b497972ee9 100644 --- a/mace/ops/buffer_transform.cc +++ b/mace/ops/buffer_transform.cc @@ -39,14 +39,14 @@ class BufferTransformOp : public Operation { 
auto type = static_cast(Operation::GetOptionalArg( "buffer_type", static_cast(CONV2D_FILTER))); - auto data_format = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); + bool has_data_format = Operation::GetOptionalArg("has_data_format", 0) + != 0; MemoryType in_mem_type = context->workspace()->GetTensor( operator_def_->input(0))->memory_type(); return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform( context, input, type, out_mem_type_, wino_blk_size_, - data_format, output); + has_data_format, output); } private: diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index 57607755cc034f364d07660924d6481e3d79793b..70e1811a07292af8eb0982caf46decb393f28325 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -18,6 +18,7 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/channel_shuffle.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -83,7 +84,7 @@ class ChannelShuffleOp : public Operation { : Operation(context) { const int groups = Operation::GetOptionalArg("group", 1); if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ChannelShuffleKernel(groups)); + kernel_ = make_unique>(groups); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/common/conv_pool_2d_util.cc b/mace/ops/common/conv_pool_2d_util.cc index 8634cf2cb8333d03a97b131692c84d5f5249cab5..ade33c59002d3924123eede8687269de3abb2119 100644 --- a/mace/ops/common/conv_pool_2d_util.cc +++ b/mace/ops/common/conv_pool_2d_util.cc @@ -24,7 +24,7 @@ namespace ops { void CalcPaddingAndOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const FilterDataFormat filter_format, + const DataFormat filter_format, const int *dilations, const int *strides, Padding padding, @@ -137,7 +137,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC void CalcOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const FilterDataFormat filter_format, + const DataFormat filter_format, const int *padding_size, const int *dilations, const int *strides, diff --git a/mace/ops/common/conv_pool_2d_util.h b/mace/ops/common/conv_pool_2d_util.h index db359ee92b02a88c48555ada851047f3ebe7f2e5..e8d0d335f1e0900cf1c265817cbcd73dd63c66b3 100644 --- a/mace/ops/common/conv_pool_2d_util.h +++ b/mace/ops/common/conv_pool_2d_util.h @@ -35,7 +35,7 @@ namespace ops { void CalcPaddingAndOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const FilterDataFormat filter_format, + const DataFormat filter_format, const int *dilations, const int *strides, Padding padding, @@ -61,7 +61,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, void CalcOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const FilterDataFormat filter_format, + const DataFormat filter_format, const int *padding_size, const int *dilations, const int *strides, diff --git a/mace/ops/common/eltwise_type.h b/mace/ops/common/eltwise_type.h new file mode 100644 index 0000000000000000000000000000000000000000..634c4919c18f221b255939a01d8411428b8f3476 --- /dev/null +++ b/mace/ops/common/eltwise_type.h @@ -0,0 +1,40 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_COMMON_ELTWISE_TYPE_H_ +#define MACE_OPS_COMMON_ELTWISE_TYPE_H_ + +namespace mace { +namespace ops { + +enum EltwiseType { + SUM = 0, + SUB = 1, + PROD = 2, + DIV = 3, + MIN = 4, + MAX = 5, + NEG = 6, + ABS = 7, + SQR_DIFF = 8, + POW = 9, + EQUAL = 10, + FLOOR_DIV = 11, + NONE = 12, +}; + +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_COMMON_ELTWISE_TYPE_H_ diff --git a/mace/ops/gemmlowp_util.h b/mace/ops/common/gemmlowp_util.h similarity index 96% rename from mace/ops/gemmlowp_util.h rename to mace/ops/common/gemmlowp_util.h index c7091544ef5d90ef5fa11cbaacb052744dbe0ef0..c7eed2ad275c9b51cc5cf55cf2f88f90edf3d500 100644 --- a/mace/ops/gemmlowp_util.h +++ b/mace/ops/common/gemmlowp_util.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_GEMMLOWP_UTIL_H_ -#define MACE_OPS_GEMMLOWP_UTIL_H_ +#ifndef MACE_OPS_COMMON_GEMMLOWP_UTIL_H_ +#define MACE_OPS_COMMON_GEMMLOWP_UTIL_H_ #include @@ -75,4 +75,4 @@ struct GemmlowpOutputPipeline { }; } // namespace mace -#endif // MACE_OPS_GEMMLOWP_UTIL_H_ +#endif // MACE_OPS_COMMON_GEMMLOWP_UTIL_H_ diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index 47e95a37190cbf2eb6aed08af544220ad9ce8643..6b2ac58a23e3ebbcb59e72300b682cd809263cca 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -16,6 +16,7 @@ #include "mace/core/operator.h" #include "mace/utils/quantize.h" +#include "mace/utils/memory.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/concat.h" @@ -59,9 +60,9 @@ class ConcatOp : public ConcatOpBase { MACE_UNUSED(context); if (!checked_) { Validate(); - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && this->Input(0)->dim_size() == 4) { + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df && this->Input(0)->dim_size() == 4) { if (axis_ == 3) axis_ = 1; else if (axis_ == 2) axis_ = 3; else if (axis_ == 1) axis_ = 2; @@ -199,7 +200,7 @@ class ConcatOp : public ConcatOpBase { explicit ConcatOp(OpConstructContext *context) : ConcatOpBase(context) { if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ConcatKernel(axis_)); + kernel_ = make_unique>(axis_); } else { MACE_NOT_IMPLEMENTED; } @@ -250,9 +251,12 @@ void RegisterConcat(OpRegistryBase *op_registry) { if (op->output_shape(0).dims_size() != 4) { return { DeviceType::CPU }; } else { + int has_data_format = + ProtoArgHelper::GetOptionalArg( + *op, "has_data_format", 0); int axis = ProtoArgHelper::GetOptionalArg( *op, "axis", 3); - if (axis != 3) { + if (!has_data_format || axis != 3) { return { DeviceType::CPU }; } bool divisible_four = true; diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc index 88061a7b19804b9fda948bdc7c556fd2b81638fa..22eb544f96f15465177170868bdf4e68bcf46ab4 100644 --- a/mace/ops/concat_benchmark.cc +++ b/mace/ops/concat_benchmark.cc @@ -91,6 +91,7 @@ void OpenCLConcatHelper(int iters, .Input("Input0") .Input("Input1") .AddIntArg("axis", concat_dim) + 
.AddIntArg("has_data_format", 1) .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc index fc57920b7fe7a7e3ca2d4aca8bb7fd80a2d76aa7..bc41b11e394835e22ad3670d49e67781ec4ea372 100644 --- a/mace/ops/concat_test.cc +++ b/mace/ops/concat_test.cc @@ -100,11 +100,12 @@ TEST_F(ConcatOpTest, CPUSimpleVertical) { } } -TEST_F(ConcatOpTest, CPURandom) { +namespace { +void CPURandomTest(int input_dim, int has_data_format) { static unsigned int seed = time(NULL); - int dim = 5; + int dim = input_dim; int num_inputs = 2 + rand_r(&seed) % 10; - int axis = 1; + int axis = 3; // Construct graph OpsTestNet net; auto builder = OpDefBuilder("Concat", "ConcatTest"); @@ -112,9 +113,13 @@ TEST_F(ConcatOpTest, CPURandom) { builder = builder.Input(MakeString("Input", i)); } builder.AddIntArg("axis", axis) + .AddIntArg("has_data_format", has_data_format) .Output("Output") .Finalize(net.NewOperatorDef()); + if (has_data_format) { + axis = 1; + } std::vector shape_data; GenerateRandomIntTypeData({dim}, &shape_data, 1, dim); std::vector> input_shapes(num_inputs, shape_data); @@ -152,6 +157,13 @@ TEST_F(ConcatOpTest, CPURandom) { } } } +} // namespace + +TEST_F(ConcatOpTest, CPURandom) { + CPURandomTest(5, 0); + CPURandomTest(4, 0); + CPURandomTest(4, 1); +} TEST_F(ConcatOpTest, QuantizedCPURandom) { static unsigned int seed = time(NULL); @@ -186,7 +198,7 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { builder = builder.Input(MakeString("Input", i)); } builder.AddIntArg("axis", axis_arg) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); @@ -248,7 +260,7 @@ namespace { template void OpenCLRandomTest(const std::vector> &shapes, const int axis, - DataFormat data_format) { + bool has_data_format) { srand(time(nullptr)); int num_inputs = shapes.size(); int concat_axis_size = 0; @@ -275,7 +287,7 @@ void OpenCLRandomTest(const std::vector> &shapes, builder.AddIntArg("axis", axis) .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("data_format", data_format) + .AddIntArg("has_data_format", has_data_format) .OutputShape(expected_shape) .Finalize(net.NewOperatorDef()); @@ -309,38 +321,37 @@ void OpenCLRandomTest(const std::vector> &shapes, } // namespace TEST_F(ConcatOpTest, OPENCLAligned) { - OpenCLRandomTest({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3, - DataFormat::NHWC); + OpenCLRandomTest({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3, 1); } TEST_F(ConcatOpTest, OPENCLHalfAligned) { - OpenCLRandomTest({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3, - DataFormat::NHWC); + OpenCLRandomTest({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3, 1); } TEST_F(ConcatOpTest, OPENCLUnAligned) { - OpenCLRandomTest({{3, 32, 32, 13}, {3, 32, 32, 17}}, 3, - DataFormat::NHWC); + OpenCLRandomTest({{3, 32, 32, 13}, {3, 32, 32, 17}}, 3, 1); } TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) { OpenCLRandomTest( {{3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}}, - 3, DataFormat::NHWC); + 3, 1); } TEST_F(ConcatOpTest, GPUFallbackToCPU2DInput) { - OpenCLRandomTest({{3, 4}, {3, 4}}, 1, DataFormat::DF_NONE); + OpenCLRandomTest({{3, 4}, {3, 4}}, 1, 0); } TEST_F(ConcatOpTest, GPUFallbackToCPUChanNotDivisibleBy4) { - OpenCLRandomTest({{1, 1, 4, 3}, {1, 1, 4, 3}}, 3, - DataFormat::DF_NONE); + OpenCLRandomTest({{1, 1, 4, 3}, {1, 1, 4, 3}}, 3, 0); +} + +TEST_F(ConcatOpTest, GPUFallbackToCPUNoDataFormat) { + OpenCLRandomTest({{1, 1, 4, 4}, {1, 1, 4, 4}}, 
3, 0); } TEST_F(ConcatOpTest, GPUFallbackToCPUAxis2) { - OpenCLRandomTest({{1, 1, 4, 3}, {1, 1, 4, 3}}, 2, - DataFormat::DF_NONE); + OpenCLRandomTest({{1, 1, 4, 3}, {1, 1, 4, 3}}, 2, 0); } } // namespace test diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 19794b38be56fe3a99deb0583b0967575de571ae..a6421f45fed1b0520e468acaae58c5439c8c03e3 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#if defined(MACE_ENABLE_NEON) #include #endif #include @@ -27,21 +27,26 @@ #include "mace/core/operator.h" #include "mace/core/tensor.h" #include "mace/ops/activation.h" -#include "mace/ops/arm/conv_2d_neon.h" -#include "mace/ops/arm/conv_winograd.h" #include "mace/ops/conv_pool_2d_base.h" #include "mace/ops/common/conv_pool_2d_util.h" -#include "mace/utils/utils.h" +#include "mace/utils/memory.h" +#include "mace/utils/math.h" #ifdef MACE_ENABLE_NEON #include "mace/ops/arm/fp32/conv_2d.h" #include "mace/ops/arm/fp32/conv_2d_1x1.h" -#else -#include "mace/ops/ref/conv_2d.h" +#include "mace/ops/arm/fp32/conv_2d_3x3.h" +#include "mace/ops/arm/fp32/conv_2d_3x3_winograd.h" +#include "mace/ops/arm/fp32/conv_2d_5x5.h" +#include "mace/ops/arm/fp32/conv_2d_7x7.h" +#include "mace/ops/arm/fp32/conv_2d_1xn.h" +#include "mace/ops/arm/fp32/conv_general.h" #endif // MACE_ENABLE_NEON +#include "mace/ops/ref/conv_2d.h" + #ifdef MACE_ENABLE_QUANTIZE -#include "mace/ops/gemmlowp_util.h" +#include "mace/ops/common/gemmlowp_util.h" #include "mace/ops/quantization_util.h" #endif // MACE_ENABLE_QUANTIZE @@ -54,22 +59,20 @@ namespace mace { namespace ops { -template +template class Conv2dOp; -template <> +template<> class Conv2dOp : public ConvPool2dOpBase { public: explicit Conv2dOp(OpConstructContext *context) : ConvPool2dOpBase(context), activation_(ops::StringToActivationType( Operation::GetOptionalArg("activation", - "NOOP"))), + "NOOP"))), relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), leakyrelu_coefficient_(Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)), - is_filter_transformed_(false), - conv2d_delegator_(nullptr) {} + "leakyrelu_coefficient", 0.0f)) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); @@ -77,445 +80,99 @@ class Conv2dOp : public ConvPool2dOpBase { const Tensor *bias = this->InputSize() >= 3 ? 
this->Input(BIAS) : nullptr; Tensor *output = this->Output(OUTPUT); - index_t input_batch = input->dim(0); - index_t input_channels = input->dim(1); - std::vector filter_shape(4); - filter_shape = filter->shape(); - - index_t stride_h = strides_[0]; - index_t stride_w = strides_[1]; - - index_t dilation_h = dilations_[0]; - index_t dilation_w = dilations_[1]; - - std::vector output_shape(4); - std::vector paddings(2); - if (paddings_.empty()) { - CalcNCHWPaddingAndOutputSize(input->shape().data(), - filter_shape.data(), - dilations_.data(), - strides_.data(), - padding_type_, - output_shape.data(), - paddings.data()); - } else { - paddings = paddings_; - CalcNCHWOutputSize(input->shape().data(), - filter_shape.data(), - paddings_.data(), - dilations_.data(), - strides_.data(), - RoundType::FLOOR, - output_shape.data()); - } - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - - index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t height = output->dim(2); - index_t width = output->dim(3); - - MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); - MACE_CHECK(filter_shape[0] == channels, filter_shape[0], " != ", channels); - MACE_CHECK(filter_shape[1] == input_channels, filter_shape[1], " != ", - input_channels); + const index_t channels = filter->dim(0); #ifdef MACE_ENABLE_NEON - index_t input_height = input->dim(2); - index_t input_width = input->dim(3); - index_t filter_h = filter->dim(2); - index_t filter_w = filter->dim(3); - - if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 - && dilation_h == 1 && dilation_w == 1) { - if (conv2d_delegator_.get() == nullptr) { - conv2d_delegator_.reset(new arm::fp32::Conv2dK1x1()); - } - conv2d_delegator_->Compute(context, input, filter, output); - } else { - // TODO(liyin): the code below needs to be refactored. - // delegate to each of kernels instead of ruling them all - index_t padded_input_height = input_height + paddings[0]; - index_t padded_input_width = input_width + paddings[1]; - index_t extra_input_height = padded_input_height; - index_t extra_input_width = padded_input_width; - index_t extra_output_height = height; - index_t extra_output_width = width; - - int pad_top = paddings[0] >> 1; - int pad_bottom = paddings[0] - pad_top; - int pad_left = paddings[1] >> 1; - int pad_right = paddings[1] - pad_left; - - Tensor::MappingGuard input_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard output_guard(output); - - auto filter_data = filter->data(); - auto output_data = output->mutable_data(); - - std::function conv_func; - - bool - use_winograd = filter_h == 3 && filter_w == 3 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1 - && input_channels >= 8 && channels >= 8; - bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3 + // the following params are used to decide which conv delegator to use + const index_t stride_h = strides_[0]; + const index_t stride_w = strides_[1]; + const index_t dilation_h = dilations_[0]; + const index_t dilation_w = dilations_[1]; + const index_t filter_h = filter->dim(2); + const index_t filter_w = filter->dim(3); + const index_t input_channels = input->dim(1); + + // NOTE: delegator is fixed after first round of running, + // although winograd depends on input params. + // We do not support changeable filter for now. 
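+      // The chain below tries the most specialized delegator first: 1x1,
+      // 3x3 (Winograd when both channel counts are >= 8, plain 3x3
+      // otherwise), 3x3/s2, 5x5, 7x7 (stride 1/2/3), then the 1x7/7x1
+      // variants; shapes matching none of these fall through to later
+      // branches (presumably ending in the Conv2dGeneral fallback
+      // included above).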
+ if (conv2d_delegator_.get() == nullptr) { + if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 + && dilation_h == 1 && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3 + && dilation_w == 1) { + if (input_channels >= 8 && channels >= 8) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } + } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_5x5_s1 = filter_h == 5 && filter_w == 5 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 5 && filter_w == 5 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_1x7_s1 = filter_h == 1 && filter_w == 7 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 7 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x1_s1 = filter_h == 7 && filter_w == 1 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x7_s1 = filter_h == 7 && filter_w == 7 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x7_s2 = filter_h == 7 && filter_w == 7 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 7 && stride_h == 2 && stride_w == 2 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_7x7_s3 = filter_h == 7 && filter_w == 7 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 7 && stride_h == 3 && stride_w == 3 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_1x15_s1 = filter_h == 1 && filter_w == 15 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 1 && filter_w == 7 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - bool use_neon_15x1_s1 = filter_h == 15 && filter_w == 1 + && dilation_w == 1) { + conv2d_delegator_ = make_unique( + paddings_, padding_type_); + } else if (filter_h == 7 && filter_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 - && dilation_w == 1; - - std::vector transformed_input_shape; - std::vector transformed_output_shape; - std::vector transformed_filter_shape; - - // When size of input feature map is bigger than 16x16, - // set winograd out tile size to 6 to get higher performance. 
- index_t winograd_out_tile_size = 2; - if (input_height > 16 && input_width > 16) { - winograd_out_tile_size = 6; - } - - if (use_winograd) { - extra_output_height = RoundUp(height, winograd_out_tile_size); - extra_input_height = - std::max(padded_input_height, extra_output_height + 2); - extra_output_width = RoundUp(width, winograd_out_tile_size); - extra_input_width = - std::max(padded_input_width, extra_output_width + 2); - if (extra_input_height != padded_input_height) { - pad_bottom += (extra_input_height - padded_input_height); - } - if (extra_input_width != padded_input_width) { - pad_right += (extra_input_width - padded_input_width); - } - - index_t - tile_height_count = extra_output_height / winograd_out_tile_size; - index_t tile_width_count = extra_output_width / winograd_out_tile_size; - index_t tile_count = tile_height_count * tile_width_count; - index_t in_tile_area = - (winograd_out_tile_size + 2) * (winograd_out_tile_size + 2); - - transformed_input_shape.insert(transformed_input_shape.end(), - {in_tile_area, batch, input_channels, - tile_count}); - transformed_output_shape.insert(transformed_output_shape.end(), - {in_tile_area, batch, channels, - tile_count}); - transformed_filter_shape.insert(transformed_filter_shape.end(), - {in_tile_area, channels, - input_channels}); - } else { - index_t tile_h, tile_w; - if (use_neon_3x3_s1) { - tile_h = 2; - tile_w = 4; - } else if (use_neon_7x1_s1 || use_neon_15x1_s1) { - tile_h = 4; - tile_w = 1; - } else { - tile_h = 1; - tile_w = 4; - } - extra_output_height = RoundUp(height, tile_h); - extra_input_height = - std::max(padded_input_height, (extra_output_height - 1) * stride_h - + (filter_h - 1) * dilation_h + 1); - extra_output_width = RoundUp(width, tile_w); - extra_input_width = - std::max(padded_input_width, (extra_output_width - 1) * stride_w - + (filter_w - 1) * dilation_w + 1); - if (extra_input_height != padded_input_height) { - pad_bottom += (extra_input_height - padded_input_height); - } - if (extra_input_width != padded_input_width) { - pad_right += (extra_input_width - padded_input_width); - } - } - - // decide scratch size before allocate it - index_t total_scratch_size = 0; - index_t transformed_input_size = 0; - index_t transformed_output_size = 0; - index_t padded_input_size = 0; - index_t padded_output_size = 0; - if (use_winograd) { - transformed_input_size = - std::accumulate(transformed_input_shape.begin(), - transformed_input_shape.end(), - 1, - std::multiplies()) * sizeof(float); - transformed_output_size = - std::accumulate(transformed_output_shape.begin(), - transformed_output_shape.end(), - 1, - std::multiplies()) * sizeof(float); - total_scratch_size += transformed_input_size + transformed_output_size; - } - if (extra_input_height != input_height - || extra_input_width != input_width) { - padded_input_size = - batch * input_channels * (input_height + pad_top + pad_bottom) - * (input_width + pad_left + pad_right) * sizeof(float) + - MACE_EXTRA_BUFFER_PAD_SIZE; - total_scratch_size += padded_input_size; - } - if (extra_output_height != height || extra_output_width != width) { - padded_output_size = - batch * channels * extra_output_height * extra_output_width - * sizeof(float); - total_scratch_size += padded_output_size; - } - - if (use_winograd) { - total_scratch_size += transformed_input_size + transformed_output_size; - } - - // Init scratch buffer - ScratchBuffer *scratch = context->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(total_scratch_size); - Tensor - 
transformed_input(scratch->Scratch(transformed_input_size), DT_FLOAT); - Tensor - transformed_output - (scratch->Scratch(transformed_output_size), DT_FLOAT); - Tensor padded_input(scratch->Scratch(padded_input_size), DT_FLOAT); - Tensor padded_output(scratch->Scratch(padded_output_size), DT_FLOAT); - const index_t extra_input_shape[4] = - {batch, input_channels, extra_input_height, extra_input_width}; - const index_t extra_output_shape[4] = - {batch, channels, extra_output_height, extra_output_width}; - - // make host compiler happy - MACE_UNUSED(extra_input_shape); - MACE_UNUSED(extra_output_shape); - - Tensor transformed_filter; - - // decide which convolution function to call - if (use_winograd) { - transformed_input.Reshape(transformed_input_shape); - transformed_output.Reshape(transformed_output_shape); - const float *transformed_filter_data = nullptr; - // filter only needs to be transformed once, set transformed_filter_data - // to null after the first run. - if (!is_filter_transformed_) { - transformed_filter.Resize(transformed_filter_shape); - switch (winograd_out_tile_size) { - case 2: - TransformFilter4x4(filter_data, - filter_shape[1], - filter_shape[0], - transformed_filter.mutable_data()); - break; - case 6: - TransformFilter8x8(filter_data, - filter_shape[1], - filter_shape[0], - transformed_filter.mutable_data()); - break; - default:MACE_NOT_IMPLEMENTED; - } - transformed_filter_data = transformed_filter.data(); - is_filter_transformed_ = true; - } - - float *transformed_input_data = transformed_input.mutable_data(); - float - *transformed_output_data = transformed_output.mutable_data(); - - conv_func = [=](const float *pad_input, float *pad_output) { - WinoGradConv3x3s1(pad_input, - transformed_filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - channels, - winograd_out_tile_size, - transformed_input_data, - transformed_output_data, - pad_output, - &sgemm_, - scratch); - }; - } else if (use_neon_3x3_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK3x3S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_3x3_s2) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK3x3S2(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_5x5_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK5x5S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_1x7_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK1x7S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x1_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x1S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x7_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x7S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x7_s2) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x7S2(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); - }; - } else if (use_neon_7x7_s3) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK7x7S3(pad_input, - filter_data, - extra_input_shape, - 
extra_output_shape,
-                          pad_output);
-        };
-      } else if (use_neon_1x15_s1) {
-        conv_func = [=](const float *pad_input, float *pad_output) {
-          Conv2dNeonK1x15S1(pad_input,
-                            filter_data,
-                            extra_input_shape,
-                            extra_output_shape,
-                            pad_output);
-        };
-      } else if (use_neon_15x1_s1) {
-        conv_func = [=](const float *pad_input, float *pad_output) {
-          Conv2dNeonK15x1S1(pad_input,
-                            filter_data,
-                            extra_input_shape,
-                            extra_output_shape,
-                            pad_output);
-        };
-      } else {
-        conv_func = [=](const float *pad_input, float *pad_output) {
-          Conv2dGeneral(pad_input,
-                        filter_data,
-                        extra_input_shape,
-                        extra_output_shape,
-                        filter_shape.data(),
-                        strides_.data(),
-                        dilations_.data(),
-                        pad_output);
-        };
-      }
-
-      // pad input and output
-      const Tensor *pad_input_ptr = input;
-      if (extra_input_height != input_height
-          || extra_input_width != input_width) {
-        MACE_RETURN_IF_ERROR(ConstructNCHWInputWithSpecificPadding(
-            input, pad_top, pad_bottom, pad_left, pad_right, &padded_input));
-        pad_input_ptr = &padded_input;
-      }
-
-      // TODO(libin): don't need clear after bias is integrated in each conv
-      Tensor *pad_output_ptr = output;
-      if (extra_output_height != height || extra_output_width != width) {
-        padded_output.Reshape({batch, channels, extra_output_height,
-                               extra_output_width});
-        padded_output.Clear();
-        pad_output_ptr = &padded_output;
+          && dilation_w == 1) {
+        conv2d_delegator_ = make_unique<arm::fp32::Conv2dK7x1S1>(
+            paddings_, padding_type_);
+      } else if (filter_h == 1 && filter_w == 15
+          && stride_h == 1 && stride_w == 1 && dilation_h == 1
+          && dilation_w == 1) {
+        conv2d_delegator_ = make_unique<arm::fp32::Conv2dK1x15S1>(
+            paddings_, padding_type_);
+      } else if (filter_h == 15 && filter_w == 1
+          && stride_h == 1 && stride_w == 1 && dilation_h == 1
+          && dilation_w == 1) {
+        conv2d_delegator_ = make_unique<arm::fp32::Conv2dK15x1S1>(
+            paddings_, padding_type_);
       } else {
-        output->Clear();
-      }
-
-      const float *pad_input_data = pad_input_ptr->data<float>();
-      float *pad_output_data = pad_output_ptr->mutable_data<float>();
-
-      conv_func(pad_input_data, pad_output_data);
-
-      // unpack output
-      if (extra_output_height != height || extra_output_width != width) {
-#pragma omp parallel for collapse(2) schedule(runtime)
-        for (index_t b = 0; b < batch; ++b) {
-          for (index_t c = 0; c < channels; ++c) {
-            for (index_t h = 0; h < height; ++h) {
-              memcpy(
-                  output_data + b * channels * height * width
-                      + c * height * width
-                      + h * width,
-                  pad_output_data
-                      + b * channels * extra_output_height * extra_output_width
-                      + c * extra_output_height * extra_output_width
-                      + h * extra_output_width,
-                  sizeof(float) * width);
-            }
-          }
-        }
+        conv2d_delegator_ = make_unique<arm::fp32::Conv2dGeneral>(
+            strides_,
+            dilations_,
+            paddings_,
+            padding_type_);
       }
     }
+
+    conv2d_delegator_->Compute(context, input, filter, output);
 #else
-    if (conv2d_delegator_.get() == nullptr) {
-      conv2d_delegator_.reset(new ref::Conv2d<float>(paddings[0],
-                                                     paddings[1],
-                                                     stride_h,
-                                                     stride_w,
-                                                     dilation_h,
-                                                     dilation_w));
+    if (ref_conv2d_delegator_.get() == nullptr) {
+      ref_conv2d_delegator_ = make_unique<ref::Conv2d<float>>(strides_,
+                                                              dilations_,
+                                                              paddings_,
+                                                              padding_type_);
     }
-    conv2d_delegator_->Compute(context, input, filter, output);
+    ref_conv2d_delegator_->Compute(context, input, filter, output);
 #endif
 
     Tensor::MappingGuard bias_guard(bias);
@@ -523,6 +180,9 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
     auto bias_data = bias == nullptr ?
nullptr : bias->data(); auto output_data = output->mutable_data(); if (bias_data != nullptr) { + const index_t batch = input->dim(0); + const index_t height = output->dim(2); + const index_t width = output->dim(3); const index_t image_size = height * width; #pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { @@ -555,188 +215,13 @@ class Conv2dOp : public ConvPool2dOpBase { } private: - void Conv2dGeneral(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - const index_t *filter_shape, - const int *stride_hw, - const int *dilation_hw, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = filter_shape[1] * in_image_size; - const index_t out_batch_size = filter_shape[0] * out_image_size; - const index_t filter_size = filter_shape[2] * filter_shape[3]; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < in_shape[0]; b++) { - for (index_t m = 0; m < filter_shape[0]; m += 4) { - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t out_channels = filter_shape[0]; - const index_t in_channels = filter_shape[1]; - - const int stride_h = stride_hw[0]; - const int stride_w = stride_hw[1]; - const int dilation_h = dilation_hw[0]; - const int dilation_w = dilation_hw[1]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = out_ptr0_base + out_image_size; - float *out_ptr2_base = out_ptr1_base + out_image_size; - float *out_ptr3_base = out_ptr2_base + out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = - filter + m * in_channels * filter_size + c * filter_size; - const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; - const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; - const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t ih = h * stride_h; - index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; - // output (4 outch x 1 height x 4 width): vo_outch_height - float vo0[4], vo1[4], vo2[4], vo3[4]; - // load output - index_t out_offset = h * out_width + w; - for (index_t ow = 0; ow < 4; ++ow) { - vo0[ow] = out_ptr0_base[out_offset + ow]; - vo1[ow] = out_ptr1_base[out_offset + ow]; - vo2[ow] = out_ptr2_base[out_offset + ow]; - vo3[ow] = out_ptr3_base[out_offset + ow]; - } - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; ++kw) { - // outch 0 - vo0[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr0[kw]; - vo0[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - // outch 1 - vo1[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr1[kw]; - vo1[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[3] += 
in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - // outch 2 - vo2[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr2[kw]; - vo2[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - // outch 3 - vo3[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr3[kw]; - vo3[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - } // kw - - in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; - filter_ptr1 += filter_shape[3]; - filter_ptr2 += filter_shape[3]; - filter_ptr3 += filter_shape[3]; - } // kh - - for (index_t ow = 0; ow < 4; ++ow) { - out_ptr0_base[out_offset + ow] = vo0[ow]; - out_ptr1_base[out_offset + ow] = vo1[ow]; - out_ptr2_base[out_offset + ow] = vo2[ow]; - out_ptr3_base[out_offset + ow] = vo3[ow]; - } - - filter_ptr0 -= filter_size; - filter_ptr1 -= filter_size; - filter_ptr2 -= filter_size; - filter_ptr3 -= filter_size; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { - float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = - filter + mm * in_channels * filter_size + c * filter_size; - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t ih = h * stride_h; - index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; - // output (1 outch x 1 height x 4 width): vo_outch_height - float vo0[4]; - // load output - index_t out_offset = h * out_width + w; - for (index_t ow = 0; ow < 4; ++ow) { - vo0[ow] = out_ptr0_base[out_offset + ow]; - } - - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; ++kw) { - // outch 0 - vo0[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr0[kw]; - vo0[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - } // kw - - in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; - } // kh - - for (index_t ow = 0; ow < 4; ++ow) { - out_ptr0_base[out_offset + ow] = vo0[ow]; - } - filter_ptr0 -= filter_size; - } // w - } // h - } // c - } // mm - } // if - } // m - } // b - } const ActivationType activation_; const float relux_max_limit_; const float leakyrelu_coefficient_; - bool is_filter_transformed_; - SGemm sgemm_; #ifdef MACE_ENABLE_NEON std::unique_ptr conv2d_delegator_; #else - std::unique_ptr> conv2d_delegator_; + std::unique_ptr> ref_conv2d_delegator_; #endif // MACE_ENABLE_NEON private: @@ -744,7 +229,6 @@ class Conv2dOp : public ConvPool2dOpBase { MACE_OP_OUTPUT_TAGS(OUTPUT); }; - #ifdef MACE_ENABLE_QUANTIZE template <> class Conv2dOp : public ConvPool2dOpBase { @@ -848,7 +332,7 @@ class Conv2dOp : public ConvPool2dOpBase { ScratchBuffer *scratch = context->device()->scratch_buffer(); scratch->Rewind(); 
scratch->GrowSize(im2col_size);
-      im2col.reset(new Tensor(scratch->Scratch(im2col_size), DT_UINT8));
+      im2col = make_unique<Tensor>(scratch->Scratch(im2col_size), DT_UINT8);
       uint8_t *im2col_data = im2col->mutable_data<uint8_t>();
       Im2col(input_data, input->shape(), filter_h, filter_w, stride_h,
              stride_w, static_cast<uint8_t>(input->zero_point()),
@@ -993,10 +477,10 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
     MemoryType mem_type;
     if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
-      kernel_.reset(new opencl::image::Conv2dKernel<T>);
+      kernel_ = make_unique<opencl::image::Conv2dKernel<T>>();
     } else {
       mem_type = MemoryType::GPU_BUFFER;
-      kernel_.reset(new opencl::buffer::Conv2dKernel<T>);
+      kernel_ = make_unique<opencl::buffer::Conv2dKernel<T>>();
     }
     context->set_output_mem_type(mem_type);
     // Transform filter tensor to target format
@@ -1051,7 +535,6 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
 };
 #endif  // MACE_ENABLE_OPENCL
 
-
 void RegisterConv2D(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
                    DeviceType::CPU, float);
diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc
index 0d41845795ecf6b50a9016c99e4e84e0c05d120c..3dda169dd80f02a258d854ce88c7f511beab0167 100644
--- a/mace/ops/crop.cc
+++ b/mace/ops/crop.cc
@@ -15,6 +15,8 @@
 #include <vector>
 
 #include "mace/core/operator.h"
+#include "mace/utils/math.h"
+#include "mace/utils/memory.h"
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/crop.h"
 #endif  // MACE_ENABLE_OPENCL
@@ -23,12 +25,24 @@ namespace mace {
 namespace ops {
 
 template <DeviceType D, class T>
-class CropOp : public Operation {
+class CropOp;
+
+template <class T>
+class CropOp<DeviceType::CPU, T> : public Operation {
  public:
   explicit CropOp(OpConstructContext *context)
       : Operation(context),
-        axis_(Operation::GetOptionalArg<int>("axis", 2)),
-        offset_(Operation::GetRepeatedArgs<int>("offset")) {}
+        offset_(Operation::GetRepeatedArgs<int>("offset")) {
+    MACE_CHECK(offset_.size() == 4,
+               "Crop op only supports 4-dimensional inputs now.");
+    auto has_df = Operation::GetOptionalArg<int>(
+        "has_data_format", 0);
+    if (has_df) {
+      // NHWC -> NCHW
+      offset_ = TransposeShape<int, int>(offset_, {0, 3, 1, 2});
+    }
+  }
+
   MaceStatus Run(OpContext *context) override {
     MACE_UNUSED(context);
@@ -46,21 +60,13 @@ class CropOp<DeviceType::CPU, T> : public Operation {
     std::vector<index_t> output_shape(input0->shape());
 
     for (index_t i = 0; i < in0_dims; ++i) {
-      int32_t crop_offset = 0;
-      index_t new_size = input0->dim(i);
-      if (i >= axis_) {
-        new_size = input1->dim(i);
-        if (offset_.size() == 1) {
-          crop_offset = offset_[0];
-        } else if (offset_.size() > 1) {
-          crop_offset = offset_[i - axis_];
-        }
-        MACE_CHECK(input0->dim(i) - crop_offset >= input1->dim(i))
-            << "the crop for dimension" << i << "is out of bound with size"
-            << input1->dim(i) << "and offset" << crop_offset;
+      if (offset_[i] >= 0) {
+        output_shape[i] = input1->dim(i);
+        offsets[i] = offset_[i];
+        MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i))
+            << "the crop for dimension " << i << " is out of bound with size "
+            << input1->dim(i) << " and offset " << offsets[i];
       }
-      output_shape[i] = new_size;
-      offsets[i] = crop_offset;
     }
     MACE_RETURN_IF_ERROR(output->Resize(output_shape));
     T *output_data = output->mutable_data<T>();
@@ -102,7 +108,6 @@ class CropOp<DeviceType::CPU, T> : public Operation {
   }
 
  private:
-  const int axis_;
   std::vector<int> offset_;
 };
 
@@ -112,10 +117,9 @@ class CropOp<DeviceType::GPU, T> : public Operation {
  public:
   explicit CropOp(OpConstructContext *context)
       : Operation(context) {
-    const int axis = Operation::GetOptionalArg<int>("axis", 2);
     if (context->device()->gpu_runtime()->UseImageMemory()) {
-      kernel_.reset(new opencl::image::CropKernel(
-          axis, Operation::GetRepeatedArgs<int>("offset")));
+
kernel_ = make_unique>( + Operation::GetRepeatedArgs("offset")); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/crop_benchmark.cc b/mace/ops/crop_benchmark.cc index 4ca25b15a3cd607e9b8394bc090e502486cc93e7..724d8ca2958360e991031b003af59f4a3f27b183 100644 --- a/mace/ops/crop_benchmark.cc +++ b/mace/ops/crop_benchmark.cc @@ -21,107 +21,80 @@ namespace test { namespace { template -void CropHelper(int iters, int crop_axis, int dim1, int offset) { +void CropHelper(int iters, + const std::vector &shape0, + const std::vector &shape1, + int crop_axis, + int offset) { mace::testing::StopTiming(); OpsTestNet net; - OpDefBuilder("Crop", "CropBM") - .Input("Input0") - .Input("Input1") - .AddIntArg("axis", crop_axis) - .AddIntsArg("offset", {offset}) - .Output("Output") - .Finalize(net.NewOperatorDef()); - // Add input data - const int kDim0 = 100; - net.AddRandomInput("Input0", {1, kDim0, dim1, dim1, }); - net.AddRandomInput("Input1", - {1, kDim0 / 2, dim1 / 2, dim1 / 2}); + std::vector offsets(4, -1); - // Warm-up - for (int i = 0; i < 5; ++i) { - net.RunOp(D); + for (int i = crop_axis; i < 4; ++i) { + offsets[i] = offset; } - const int64_t tot = static_cast(iters) * kDim0 * dim1 * dim1; - testing::BytesProcessed(tot * sizeof(T)); - mace::testing::StartTiming(); - while (iters--) { - net.RunOp(D); - } -} -} // namespace - -#define MACE_BM_CROP_CPU_MACRO(AXIS, DIM, OFFSET) \ - static void MACE_BM_CROP_CPU_##AXIS##_##DIM##_##OFFSET(int iters) { \ - CropHelper(iters, AXIS, DIM, OFFSET); \ - } \ - MACE_BENCHMARK(MACE_BM_CROP_CPU_##AXIS##_##DIM##_##OFFSET) - -MACE_BM_CROP_CPU_MACRO(1, 256, 3); -MACE_BM_CROP_CPU_MACRO(2, 256, 3); -MACE_BM_CROP_CPU_MACRO(3, 512, 3); -MACE_BM_CROP_CPU_MACRO(2, 512, 6); - -namespace { -template -void OpenCLCropHelper(int iters, - const std::vector &shape0, - const std::vector &shape1, - int crop_axis, - int offset) { - mace::testing::StopTiming(); - - OpsTestNet net; - // Add input data - net.AddRandomInput("Input0", shape0); - net.AddRandomInput("Input1", shape1); + if (D == DeviceType::CPU) { + auto input_shape0 = TransposeShape(shape0, {0, 3, 1, 2}); + auto input_shape1 = TransposeShape(shape1, {0, 3, 1, 2}); + net.AddRandomInput("Input0", input_shape0); + net.AddRandomInput("Input1", input_shape1); + } else if (D == DeviceType::GPU) { + // Add input data + net.AddRandomInput("Input0", shape0); + net.AddRandomInput("Input1", shape1); + } else { + MACE_NOT_IMPLEMENTED; + } OpDefBuilder("Crop", "CropBM") .Input("Input0") .Input("Input1") - .AddIntArg("axis", crop_axis) - .AddIntsArg("offset", {offset}) + .AddIntsArg("offset", offsets) + .AddIntArg("has_data_format", 1) .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Warm-up - for (int i = 0; i < 5; ++i) { - net.RunOp(DeviceType::GPU); + net.Setup(D); + for (int i = 0; i < 1; ++i) { + net.Run(); } const int64_t tot = static_cast(iters) * - (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size()); + (net.GetTensor("Input0")->size()); testing::BytesProcessed(tot * sizeof(T)); mace::testing::StartTiming(); while (iters--) { - net.RunOp(DeviceType::GPU); + net.Run(); } } } // namespace -#define MACE_BM_CROP_GPU_MACRO(N, H, W, C, AXIS, OFFSET, TYPE) \ - static void MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET##\ - _##TYPE(int iters) { \ - std::vector shape0 = {N, H, W, C}; \ - std::vector shape1 = {N / 2, H / 2, W / 2, C / 2}; \ - OpenCLCropHelper(iters, shape0, shape1, AXIS, OFFSET); \ - } \ - 
MACE_BENCHMARK(MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\ - ##_##TYPE) - -MACE_BM_CROP_GPU_MACRO(4, 32, 32, 32, 2, 4, float); -MACE_BM_CROP_GPU_MACRO(8, 32, 32, 64, 1, 0, float); -MACE_BM_CROP_GPU_MACRO(8, 32, 32, 128, 0, 0, float); -MACE_BM_CROP_GPU_MACRO(8, 32, 32, 256, 2, 4, float); +#define MACE_BM_CROP_MACRO(N, H, W, C, AXIS, OFFSET, DEVICE, TYPE) \ + static void MACE_BM_CROP_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET## \ + _##DEVICE##_##TYPE(int iters) { \ + std::vector shape0 = {N, H, W, C}; \ + std::vector shape1 = {N / 2, H / 2, W / 2, C / 2}; \ + CropHelper(iters, shape0, shape1, AXIS, OFFSET); \ + } \ + MACE_BENCHMARK(MACE_BM_CROP_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\ + ##_##DEVICE##_##TYPE) + +#define MACE_BM_CROP(N, H, W, C, AXIS, OFFSET) \ + MACE_BM_CROP_MACRO(N, H, W, C, AXIS, OFFSET, CPU, float); \ + MACE_BM_CROP_MACRO(N, H, W, C, AXIS, OFFSET, GPU, float); \ + MACE_BM_CROP_MACRO(N, H, W, C, AXIS, OFFSET, GPU, half); + +MACE_BM_CROP(4, 32, 32, 32, 2, 4); +MACE_BM_CROP(8, 32, 32, 64, 1, 0); +MACE_BM_CROP(8, 32, 32, 128, 0, 0); +MACE_BM_CROP(8, 32, 32, 256, 2, 4); -MACE_BM_CROP_GPU_MACRO(4, 32, 32, 32, 2, 4, half); -MACE_BM_CROP_GPU_MACRO(8, 32, 32, 64, 1, 0, half); -MACE_BM_CROP_GPU_MACRO(8, 32, 32, 128, 0, 0, half); -MACE_BM_CROP_GPU_MACRO(8, 32, 32, 256, 2, 4, half); } // namespace test } // namespace ops diff --git a/mace/ops/crop_test.cc b/mace/ops/crop_test.cc index 872d3154491e22a63ed6e98621a63476ea70ebb5..213b8ce89a58b5745c4e5685c6a825442b5826ce 100644 --- a/mace/ops/crop_test.cc +++ b/mace/ops/crop_test.cc @@ -26,7 +26,6 @@ void RunCrop(const std::vector &input_shape, const std::vector &input_data, const std::vector &input_shape2, const std::vector &offset, - const int axis, const std::vector &expected_shape, const std::vector &expected_data) { OpsTestNet net; @@ -39,7 +38,7 @@ void RunCrop(const std::vector &input_shape, .Input("Input1") .Output("Output") .AddIntsArg("offset", offset) - .AddIntArg("axis", axis) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); } else if (D == CPU) { net.TransformDataFormat("Input0", @@ -55,7 +54,7 @@ void RunCrop(const std::vector &input_shape, .Input("InputNCHW1") .Output("OutputNCHW") .AddIntsArg("offset", offset) - .AddIntArg("axis", axis) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); } @@ -113,7 +112,7 @@ TEST_F(CropTest, SimpleCPU) { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, - 4.0, 4.0, 4.0}, {1, 5, 5, 3}, {2, 2}, 2, + 4.0, 4.0, 4.0}, {1, 5, 5, 3}, {-1, 2, 2, -1}, {1, 5, 5, 3}, {1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, @@ -168,7 +167,7 @@ TEST_F(CropTest, SimpleGPU) { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, - 4.0, 4.0, 4.0}, {1, 5, 5, 3}, {2, 2}, 2, + 4.0, 4.0, 4.0}, {1, 5, 5, 3}, {-1, 2, 2, -1}, {1, 5, 5, 3}, {1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, diff --git a/mace/ops/cumsum.cc b/mace/ops/cumsum.cc new file mode 100644 index 0000000000000000000000000000000000000000..f0117270c80ce25bda50ab8e8461302b521c484e --- /dev/null +++ b/mace/ops/cumsum.cc @@ -0,0 +1,152 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <numeric>
+
+#include "mace/core/operator.h"
+
+namespace mace {
+namespace ops {
+
+template <DeviceType D, class T>
+class CumsumOp;
+
+template <class T>
+class CumsumOp<DeviceType::CPU, T> : public Operation {
+ public:
+  explicit CumsumOp(OpConstructContext *context)
+      : Operation(context),
+        axis_(Operation::GetOptionalArg<int>("axis", 0)),
+        exclusive_(Operation::GetOptionalArg<bool>("exclusive", false)),
+        reverse_(Operation::GetOptionalArg<bool>("reverse", false)),
+        checked_(false) {}
+
+  void Validate() {
+    const int32_t input_dims = this->Input(0)->dim_size();
+    axis_ =
+        axis_ < 0 ? axis_ + input_dims : axis_;
+    MACE_CHECK((0 <= axis_ && axis_ < input_dims),
+               "Expected cumsum axis in the range [", -input_dims, ", ",
+               input_dims, "], but got ", axis_);
+  }
+
+  MaceStatus Run(OpContext *context) override {
+    MACE_UNUSED(context);
+    if (!checked_) {
+      Validate();
+      bool has_data_format = Operation::GetOptionalArg<int>(
+          "has_data_format", 0);
+      if (has_data_format && this->Input(0)->dim_size() == 4) {
+        if (axis_ == 3) axis_ = 1;
+        else if (axis_ == 2) axis_ = 3;
+        else if (axis_ == 1) axis_ = 2;
+      }
+      checked_ = true;
+    }
+
+    const Tensor *input = this->Input(0);
+    const std::vector<index_t> input_shape = input->shape();
+
+    Tensor *output = this->Output(0);
+    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
+
+    Tensor::MappingGuard input_mapper(input);
+    Tensor::MappingGuard output_mapper(output);
+
+    const float *input_ptr = input->data<float>();
+    float *output_ptr = output->mutable_data<float>();
+
+    const index_t outer_size = std::accumulate(input_shape.begin(),
+                                               input_shape.begin() + axis_,
+                                               1,
+                                               std::multiplies<index_t>());
+    const index_t inner_size = std::accumulate(input_shape.begin() + axis_ + 1,
+                                               input_shape.end(),
+                                               1,
+                                               std::multiplies<index_t>());
+    const index_t cum_size = input_shape[axis_];
+
+    if (!reverse_) {
+#pragma omp parallel for
+      for (index_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
+        index_t start_idx = outer_idx * cum_size * inner_size;
+        for (index_t cum_idx = 0; cum_idx < cum_size; ++cum_idx) {
+          if (cum_idx == 0) {
+            if (exclusive_) {
+              std::memset(output_ptr + start_idx,
+                          0,
+                          sizeof(T) * inner_size);
+            } else {
+              std::memcpy(output_ptr + start_idx,
+                          input_ptr + start_idx,
+                          sizeof(T) * inner_size);
+            }
+          } else {
+            index_t cur_idx = start_idx + cum_idx * inner_size;
+            index_t pre_idx = start_idx + (cum_idx - 1) * inner_size;
+            index_t input_idx = exclusive_ ?
pre_idx : cur_idx; + for (index_t inner_idx = 0; inner_idx < inner_size; ++inner_idx) { + output_ptr[cur_idx + inner_idx] = + output_ptr[pre_idx + inner_idx] + + input_ptr[input_idx + inner_idx]; + } + } + } + } + } else { +#pragma omp parallel for + for (index_t outer_idx = outer_size - 1; outer_idx >= 0; --outer_idx) { + index_t start_idx = outer_idx * cum_size * inner_size; + for (index_t cum_idx = cum_size - 1; cum_idx >= 0; --cum_idx) { + index_t cur_idx = start_idx + cum_idx * inner_size; + if (cum_idx == cum_size - 1) { + if (exclusive_) { + std::memset(output_ptr + cur_idx, + 0, + sizeof(T) * inner_size); + } else { + std::memcpy(output_ptr + cur_idx, + input_ptr + cur_idx, + sizeof(T) * inner_size); + } + } else { + index_t pre_idx = start_idx + (cum_idx + 1) * inner_size; + index_t input_idx = exclusive_ ? pre_idx : cur_idx; + for (index_t inner_idx = 0; inner_idx < inner_size; ++inner_idx) { + output_ptr[cur_idx + inner_idx] = + output_ptr[pre_idx + inner_idx] + + input_ptr[input_idx + inner_idx]; + } + } + } + } + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + int32_t axis_; + bool exclusive_; + bool reverse_; + bool checked_; +}; + +void RegisterCumsum(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Cumsum", CumsumOp, + DeviceType::CPU, float); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/cumsum_benchmark.cc b/mace/ops/cumsum_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..8ca59fa0501fe92a35fbe0a02141cdd23a7c1198 --- /dev/null +++ b/mace/ops/cumsum_benchmark.cc @@ -0,0 +1,90 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
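
For reference, the ``exclusive`` and ``reverse`` flags implemented above compose as follows. A self-contained 1-D sketch (an illustration, not code from this patch) showing all four flag combinations for the input {1, 2, 3}:

.. code:: cpp

    #include <cassert>
    #include <vector>

    // Reference 1-D cumulative sum mirroring the exclusive/reverse flags above.
    std::vector<float> Cumsum1D(const std::vector<float> &x,
                                bool exclusive, bool reverse) {
      const int n = static_cast<int>(x.size());
      std::vector<float> y(n, 0.f);
      float acc = 0.f;
      for (int k = 0; k < n; ++k) {
        const int i = reverse ? n - 1 - k : k;  // walk back to front if reversed
        y[i] = exclusive ? acc : acc + x[i];    // exclusive drops the own element
        acc += x[i];
      }
      return y;
    }

    int main() {
      const std::vector<float> x{1, 2, 3};
      assert((Cumsum1D(x, false, false) == std::vector<float>{1, 3, 6}));
      assert((Cumsum1D(x, true, false) == std::vector<float>{0, 1, 3}));
      assert((Cumsum1D(x, false, true) == std::vector<float>{6, 5, 3}));
      assert((Cumsum1D(x, true, true) == std::vector<float>{5, 3, 0}));
      return 0;
    }
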
+ +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class CumsumOpTest : public OpsTestBase {}; + +namespace { +template +void Cumsum(int iters, int batch, int channels, int height, int width) { + mace::testing::StopTiming(); + + // Construct graph + OpsTestNet net; + + // Add input data + if (D == DeviceType::CPU) { + net.AddRandomInput("Input", {batch, channels, height, width}); + } else { + MACE_NOT_IMPLEMENTED; + } + + OpDefBuilder("Cumsum", "CumsumTest") + .Input("Input") + .Output("Output") + .AddIntArg("axis", 0) + .AddIntArg("exclusive", 0) + .AddIntArg("reverse", 0) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} +} // namespace + +#define MACE_BM_CUMSUM_MACRO(N, C, H, W, TYPE, DEVICE) \ + static void MACE_BM_CUMSUM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + Cumsum(iters, N, C, H, W); \ + } \ + MACE_BENCHMARK(MACE_BM_CUMSUM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE) + +#define MACE_BM_CUMSUM(N, C, H, W) \ + MACE_BM_CUMSUM_MACRO(N, C, H, W, float, CPU); + +MACE_BM_CUMSUM(1, 1, 512, 512); +MACE_BM_CUMSUM(1, 3, 128, 128); +MACE_BM_CUMSUM(1, 3, 512, 512); +MACE_BM_CUMSUM(1, 32, 112, 112); +MACE_BM_CUMSUM(1, 64, 256, 256); +MACE_BM_CUMSUM(1, 64, 512, 512); +MACE_BM_CUMSUM(1, 128, 56, 56); +MACE_BM_CUMSUM(1, 128, 256, 256); +MACE_BM_CUMSUM(1, 256, 14, 14); +MACE_BM_CUMSUM(1, 512, 14, 14); +MACE_BM_CUMSUM(1, 1024, 7, 7); +MACE_BM_CUMSUM(32, 1, 256, 256); +MACE_BM_CUMSUM(32, 3, 256, 256); + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/cumsum_test.cc b/mace/ops/cumsum_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8b111540c9040a391ae419d86e3c042b23954b5e --- /dev/null +++ b/mace/ops/cumsum_test.cc @@ -0,0 +1,91 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
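
The tests that follow pass ``axis``, ``exclusive``, and ``reverse`` against NCHW-transformed inputs, relying on the axis remap the CPU op performs when ``has_data_format`` is set on a 4-D input (NHWC axis 3 maps to NCHW axis 1, axis 1 to 2, and axis 2 to 3). A standalone restatement of that mapping, for reference only:

.. code:: cpp

    // NHWC -> NCHW axis remap used by the CPU Cumsum op when
    // has_data_format is set on a 4-D input.
    // Dim order: NHWC = {N, H, W, C}, NCHW = {N, C, H, W}.
    inline int RemapAxisNhwcToNchw(int axis) {
      switch (axis) {
        case 0: return 0;  // N stays first
        case 1: return 2;  // H moves from index 1 to index 2
        case 2: return 3;  // W moves from index 2 to index 3
        case 3: return 1;  // C moves from index 3 to index 1
        default: return axis;
      }
    }

So a user-visible axis of 3 (channels in NHWC) becomes axis 1 once the data has been transformed to NCHW, which is what ``SimpleTestWithDataFormat`` below exercises.
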
+ +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class CumsumOpTest : public OpsTestBase {}; + +namespace { +template +void SimpleTestWithDataFormat(const std::vector &shape, + const std::vector &input, + const int axis, + const int exclusive, + const int reverse, + const std::vector &output) { + // Construct graph + OpsTestNet net; + + net.AddInputFromArray("Input", shape, input); + net.TransformDataFormat("Input", NHWC, "InputNCHW", + NCHW); + + OpDefBuilder("Cumsum", "CumsumTest") + .Input("InputNCHW") + .Output("OutputNCHW") + .AddIntArg("axis", axis) + .AddIntArg("exclusive", exclusive) + .AddIntArg("reverse", reverse) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("has_data_format", 1) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(DeviceType::CPU); + + net.TransformDataFormat("OutputNCHW", NCHW, "Output", + NHWC); + + net.AddInputFromArray("ExpectedOutput", shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} +} // namespace + +TEST_F(CumsumOpTest, HasDataFormatCPU) { + SimpleTestWithDataFormat( + {2, 2, 2, 2}, + {0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}, + 0, 0, 0, + {0., 1., 2., 3., 4., 5., 6., 7., 8., 10., 12., 14., 16., 18., 20., 22.}); + SimpleTestWithDataFormat( + {2, 2, 2, 2}, + {0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}, + 1, 0, 0, + {0., 1., 2., 3., 4., 6., 8., 10., 8., 9., 10., 11., 20., 22., 24., 26.}); + SimpleTestWithDataFormat( + {2, 2, 2, 2}, + {0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}, + 0, 1, 0, + {0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 3., 4., 5., 6., 7.}); + SimpleTestWithDataFormat( + {2, 2, 2, 2}, + {0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}, + 0, 0, 1, + {8., 10., 12., 14., 16., 18., 20., 22., 8., 9., 10., 11., 12., 13., 14., + 15.}); + SimpleTestWithDataFormat( + {2, 2, 2, 2}, + {0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}, + 1, 1, 1, + {4., 5., 6., 7., 0., 0., 0., 0., 12., 13., 14., 15., 0., 0., 0., 0.}); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 22fa5c5bb6f95c637e4d9b96652293302697c769..6e9a0fa8db36209887f86d0fdc75d5c5d1a5c2bc 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -28,7 +28,8 @@ #include "mace/core/tensor.h" #include "mace/ops/activation.h" #include "mace/ops/arm/deconv_2d_neon.h" -#include "mace/utils/utils.h" +#include "mace/utils/memory.h" +#include "mace/utils/math.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/deconv_2d.h" @@ -362,7 +363,7 @@ class Deconv2dOp : public Deconv2dOpBase { : Deconv2dOpBase(context) { MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::Deconv2dKernel); + kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index ed9cdb539445b17810eaa685135ad12fbfc1a3ba..2460d75a258068c4e0f08576311bf93ace6b3289 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -19,6 +19,7 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/depth_to_space.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -97,7 +98,7 @@ class DepthToSpaceOp : public Operation { : 
Operation(context) { int block_size = Operation::GetOptionalArg("block_size", 1); if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::DepthToSpaceKernel(block_size)); + kernel_ = make_unique>(block_size); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index c61f13049d51a6ce6c3fe624c345052316f4a6d3..22b13c268de07a10ffc4dfc06fdad69c953a37f5 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -33,6 +33,7 @@ #include "mace/ops/arm/depthwise_conv2d_neon.h" #include "mace/ops/conv_pool_2d_base.h" #include "mace/public/mace.h" +#include "mace/utils/memory.h" #include "mace/utils/quantize.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" @@ -493,19 +494,25 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { MemoryType mem_type; if (context->device()->gpu_runtime()->UseImageMemory()) { mem_type = MemoryType::GPU_IMAGE; - kernel_.reset(new opencl::image::DepthwiseConv2dKernel); + kernel_ = make_unique>(); } else { mem_type = MemoryType::GPU_BUFFER; - kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel); + kernel_ = make_unique>(); } context->set_output_mem_type(mem_type); - // Transform filter tensor to target format - MACE_CHECK(TransformFilter( - context, - operator_def_.get(), - 1, - OpenCLBufferType::DW_CONV2D_FILTER, - mem_type) == MaceStatus::MACE_SUCCESS); + Tensor *filter_tensor = context->workspace()->GetTensor( + operator_def_->input(1)); + if (filter_tensor != nullptr && filter_tensor->is_weight()) { + // Transform filter tensor to target format + MACE_CHECK(TransformFilter( + context, + operator_def_.get(), + 1, + OpenCLBufferType::DW_CONV2D_FILTER, + mem_type) == MaceStatus::MACE_SUCCESS); + } else { + context->SetInputOpenCLBufferType(1, OpenCLBufferType::DW_CONV2D_FILTER); + } if (operator_def_->input_size() > 2) { MACE_CHECK(TransformFilter( context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index 06c55ab27a2f831bb681bb3ef2c39d96b44922b1..3d203cfa5678c1ca407b6db2d441890bc00785a5 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -26,8 +26,9 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/ops/arm/depthwise_deconv2d_neon.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" #include "mace/public/mace.h" +#include "mace/utils/memory.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/depthwise_deconv2d.h" @@ -36,7 +37,7 @@ namespace mace { namespace ops { -template +template class DepthwiseDeconv2dOp; template<> @@ -91,10 +92,11 @@ class DepthwiseDeconv2dOp const index_t pad_top = out_paddings[1] / 2; index_t padded_out_size = - std::accumulate(padded_out_shape.begin(), - padded_out_shape.end(), - 1, - std::multiplies()) * sizeof(float); + PadAlignSize(std::accumulate(padded_out_shape.begin(), + padded_out_shape.end(), + 1, + std::multiplies()) + * sizeof(float) + MACE_EXTRA_BUFFER_PAD_SIZE); ScratchBuffer *scratch = context->device()->scratch_buffer(); scratch->Rewind(); scratch->GrowSize(padded_out_size); @@ -252,7 +254,6 @@ class DepthwiseDeconv2dOp padded_out_shape.data(), out_data); - if (!no_pad) { CropPadOut(out_data, padded_out_shape.data(), @@ -383,7 +384,7 @@ class DepthwiseDeconv2dOp const index_t out_offset = i * strides[0] * out_width + j * strides[1]; for (int q = 0; q < in_channels_g; 
++q) { - const index_t in_base = + const index_t in_base = ((b * group + g) * in_channels_g + q) * in_img_size; const index_t in_offset = in_base + i * in_width + j; @@ -412,7 +413,7 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { : Deconv2dOpBase(context) { MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel); + kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index f035eeee579907fea2ddb77d04ca5c982c903b67..92864ae1016fad410ce054887babd09ee2557c59 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -12,6 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef MACE_ENABLE_NEON +#ifdef MACE_ENABLE_QUANTIZE +#include "mace/ops/arm/q8/eltwise.h" +#endif // MACE_ENABLE_QUANTIZE +#endif // MACE_ENABLE_NEON + #include "mace/ops/eltwise.h" #include @@ -24,6 +30,7 @@ #include "mace/core/future.h" #include "mace/core/operator.h" #include "mace/core/tensor.h" +#include "mace/utils/memory.h" #include "mace/utils/quantize.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" @@ -890,8 +897,8 @@ class EltwiseOp : public Operation { scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), scalar_input_index_(Operation::GetOptionalArg( "scalar_input_index", 1)), - data_format_(static_cast(Operation::GetOptionalArg( - "data_format", 0))) {} + has_data_format_(Operation::GetOptionalArg( + "has_data_format", 0)) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -920,7 +927,9 @@ class EltwiseOp : public Operation { const Tensor *input1, Tensor *output) { bool swapped = false; - if (input0->size() < input1->size()) { + if (input0->dim_size() < input1->dim_size() + || (input0->dim_size() == input1->dim_size() + && input0->size() < input1->size())) { std::swap(input0, input1); swapped = true; } @@ -931,7 +940,7 @@ class EltwiseOp : public Operation { // check if we can broadcast tensor uint32_t rank_diff = static_cast(input0->dim_size() - input1->dim_size()); - if (data_format_ == NCHW) { + if (has_data_format_) { MACE_CHECK( (input0->dim_size() == 4) && ((input1->dim_size() == 0) || @@ -956,7 +965,7 @@ class EltwiseOp : public Operation { const T *input0_ptr = input0->data(); const T *input1_ptr = input1->data(); - if (data_format_ == NCHW && input1->dim_size() > 0) { + if (has_data_format_ && input1->dim_size() > 0) { MACE_RETURN_IF_ERROR(output->ResizeLike(input0)); Tensor::MappingGuard output_guard(output); DstType *output_ptr = output->mutable_data(); @@ -1018,7 +1027,7 @@ class EltwiseOp : public Operation { std::vector coeff_; float scalar_input_; int32_t scalar_input_index_; - DataFormat data_format_; + int has_data_format_; Tensor scalar_tensor_; }; @@ -1033,21 +1042,30 @@ class EltwiseOp : public Operation { coeff_(Operation::GetRepeatedArgs("coeff")), scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), scalar_input_index_(Operation::GetOptionalArg( - "scalar_input_index", 1)), - data_format_(static_cast(Operation::GetOptionalArg( - "data_format", 0))) {} + "scalar_input_index", 1)) +#ifdef MACE_ENABLE_NEON + , eltwise_(static_cast(Operation::GetOptionalArg( + "type", static_cast(ops::EltwiseType::NONE)))) +#endif + {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); const Tensor *input0 = this->Input(0); - const Tensor *input1 = this->InputSize() == 2 ? 
this->Input(1) : nullptr;
+    MACE_CHECK(this->InputSize() == 2,
+               "Quantized Elementwise doesn't support broadcast now.");
+    const Tensor *input1 = this->Input(1);
     Tensor *output = this->Output(0);
-    MACE_CHECK(type_ == SUM, "Only support Elementwise SUM now. ");
+    MACE_CHECK(type_ == SUM || type_ == SUB,
+               "Quantized Elementwise only supports SUM and SUB now.");
     MACE_CHECK(input0->size() == input1->size(),
                "input0 and input1 must have the same shape.");
     MACE_CHECK(output->scale() != 0);
     MACE_RETURN_IF_ERROR(output->Resize(input0->shape()));
 
+#ifdef MACE_ENABLE_NEON
+    eltwise_.Compute(context, input0, input1, output);
+#else
     constexpr int left_shift = 20;
     const double doubled_scale = 2 * std::max(input0->scale(),
                                               input1->scale());
     const double adjusted_input0_scale = input0->scale() / doubled_scale;
@@ -1078,57 +1096,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
     auto input0_ptr = input0->data<uint8_t>();
     auto input1_ptr = input1->data<uint8_t>();
     auto output_ptr = output->mutable_data<uint8_t>();
-
-    index_t handled_output_size = 0;
-#ifdef MACE_ENABLE_NEON
-#pragma omp parallel for schedule(runtime)
-    for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) {
-      const auto input0_val = vld1_u8(input0_ptr + i);
-      const auto input1_val = vld1_u8(input1_ptr + i);
-      const auto input0_val_s16 =
-          vreinterpretq_s16_u16(vmovl_u8(input0_val));
-      const auto input1_val_s16 =
-          vreinterpretq_s16_u16(vmovl_u8(input1_val));
-      const auto offset_input0 =
-          vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point()));
-      const auto offset_input1 =
-          vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point()));
-      auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0));
-      auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0));
-      auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1));
-      auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1));
-      const auto left_shift_dup = vdupq_n_s32(left_shift);
-      input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup);
-      input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup);
-      input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup);
-      input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup);
-      input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier);
-      input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier);
-      input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier);
-      input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier);
-      const auto input0_shift_dup = vdupq_n_s32(input0_shift);
-      const auto input1_shift_dup = vdupq_n_s32(input1_shift);
-      input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup);
-      input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup);
-      input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup);
-      input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup);
-      auto sum_low = vaddq_s32(input0_low_s32, input1_low_s32);
-      auto sum_high = vaddq_s32(input0_high_s32, input1_high_s32);
-      sum_low = vqrdmulhq_n_s32(sum_low, output_multiplier);
-      sum_high = vqrdmulhq_n_s32(sum_high, output_multiplier);
-      sum_low = gemmlowp::RoundingDivideByPOT(sum_low, -output_shift);
-      sum_high = gemmlowp::RoundingDivideByPOT(sum_high, -output_shift);
-      const auto sum_low_s16 = vmovn_s32(sum_low);
-      const auto sum_high_s16 = vmovn_s32(sum_high);
-      const auto output_val = vaddq_s16(vcombine_s16(sum_low_s16,
-                                                     sum_high_s16),
-                                        vdupq_n_s16(output->zero_point()));
-      vst1_u8(output_ptr + i, vqmovun_s16(output_val));
-    }
-    handled_output_size = output->size() - output->size() % 8;
-#endif  //
NEON #pragma omp parallel for schedule(runtime) - for (index_t i = handled_output_size; i < output->size(); ++i) { + for (index_t i = 0; i < output->size(); ++i) { const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); const int32_t shifted_input0 = offset_input0 * (1 << left_shift); @@ -1143,14 +1112,22 @@ class EltwiseOp : public Operation { gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, input1_multiplier), -input1_shift); - const int32_t sum = multiplied_input0 + multiplied_input1; + + int32_t res; + if (type_ == SUM) { + res = multiplied_input0 + multiplied_input1; + } else { + res = multiplied_input0 - multiplied_input1; + } + const int32_t output_val = gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(sum, + gemmlowp::SaturatingRoundingDoublingHighMul(res, output_multiplier), -output_shift) + output->zero_point(); output_ptr[i] = Saturate(output_val); } +#endif // NEON return MaceStatus::MACE_SUCCESS; } @@ -1160,8 +1137,10 @@ class EltwiseOp : public Operation { std::vector coeff_; float scalar_input_; int32_t scalar_input_index_; - DataFormat data_format_; Tensor scalar_tensor_; +#ifdef MACE_ENABLE_NEON + arm::q8::Eltwise eltwise_; +#endif }; #endif // MACE_ENABLE_QUANTIZE @@ -1181,8 +1160,8 @@ class EltwiseOp : public Operation { MemoryType mem_type; if (context->device()->gpu_runtime()->UseImageMemory()) { mem_type = MemoryType::GPU_IMAGE; - kernel_.reset(new opencl::image::EltwiseKernel( - type, coeff, scalar_input, scalar_input_index)); + kernel_ = make_unique>( + type, coeff, scalar_input, scalar_input_index); } else { MACE_NOT_IMPLEMENTED; } @@ -1192,12 +1171,23 @@ class EltwiseOp : public Operation { for (int i = 0; i < input_size; ++i) { if (ws->HasTensor(operator_def_->input(i)) && ws->GetTensor(operator_def_->input(i))->is_weight()) { - MACE_CHECK(TransformFilter( - context, - operator_def_.get(), - i, - OpenCLBufferType::ARGUMENT, - mem_type) == MaceStatus::MACE_SUCCESS); + if (ws->GetTensor(operator_def_->input(i))->dim_size() == 1) { + MACE_CHECK(TransformFilter( + context, + operator_def_.get(), + i, + OpenCLBufferType::ARGUMENT, + mem_type) == MaceStatus::MACE_SUCCESS); + } else if (ws->GetTensor(operator_def_->input(i))->dim_size() == 4) { + MACE_CHECK(TransformFilter( + context, + operator_def_.get(), + i, + OpenCLBufferType::IN_OUT_CHANNEL, + mem_type) == MaceStatus::MACE_SUCCESS); + } else { + MACE_NOT_IMPLEMENTED; + } } } } diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h index c79c6c27abfb3cef4ed02abfacc3dea5384e1bd3..208d7f26549b6642502dcf6022983ad4f0f52622 100644 --- a/mace/ops/eltwise.h +++ b/mace/ops/eltwise.h @@ -15,25 +15,11 @@ #ifndef MACE_OPS_ELTWISE_H_ #define MACE_OPS_ELTWISE_H_ +#include "mace/ops/common/eltwise_type.h" + namespace mace { namespace ops { -enum EltwiseType { - SUM = 0, - SUB = 1, - PROD = 2, - DIV = 3, - MIN = 4, - MAX = 5, - NEG = 6, - ABS = 7, - SQR_DIFF = 8, - POW = 9, - EQUAL = 10, - FLOOR_DIV = 11, - NONE = 12, -}; - inline bool IsLogicalType(EltwiseType type) { return type == EQUAL; } } // namespace ops diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc index 0bfb666f70d3fd606703e32bcd3a4baf3f788fa6..a1959e9df5c388dd6a3605538e83558f3d4e563d 100644 --- a/mace/ops/eltwise_benchmark.cc +++ b/mace/ops/eltwise_benchmark.cc @@ -30,12 +30,12 @@ void EltwiseBenchmark( OpsTestNet net; // Add input data - if (D == DeviceType::GPU) { - net.AddRandomInput("Input0", {n, h, w, c}); - 
net.AddRandomInput("Input1", {n, h, w, c}); - } else { + if (D == DeviceType::CPU && DataTypeToEnum::value != DT_UINT8) { net.AddRandomInput("Input0", {n, c, h, w}); net.AddRandomInput("Input1", {n, c, h, w}); + } else { + net.AddRandomInput("Input0", {n, h, w, c}); + net.AddRandomInput("Input1", {n, h, w, c}); } OpDefBuilder("Eltwise", "EltwiseTest") @@ -44,18 +44,25 @@ void EltwiseBenchmark( .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", {1.2, 2.1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); + net.Setup(D); + + if (D == DeviceType::CPU && DataTypeToEnum::value == DT_UINT8) { + net.GetTensor("Output")->SetScale(0.1); + } + // Warm-up for (int i = 0; i < 5; ++i) { - net.RunOp(D); + net.Run(); net.Sync(); } mace::testing::StartTiming(); while (iters--) { - net.RunOp(D); + net.Run(); net.Sync(); } } @@ -86,6 +93,9 @@ MACE_BM_ELTWISE(0, 1, 240, 240, 256); MACE_BM_ELTWISE(5, 1, 128, 128, 32); MACE_BM_ELTWISE(5, 1, 240, 240, 256); +MACE_BM_ELTWISE_MACRO(0, 1, 128, 128, 32, uint8_t, CPU); +MACE_BM_ELTWISE_MACRO(1, 1, 128, 128, 32, uint8_t, CPU); + } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index 7ca799e2e8701b8adb439218c17ce10d8fbd0f56..58306b625a5ce8e38b0b129c230a4401d3a06ae9 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -75,7 +75,7 @@ void SimpleTensorScalar(const ops::EltwiseType type, .AddIntArg("T", DataTypeToEnum::v()) .AddIntArg("type", static_cast(type)) .AddFloatArg("scalar_input", x) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("has_data_format", 1) .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT}) .Output("TOutput") .Finalize(net.NewOperatorDef()); @@ -120,7 +120,7 @@ void SimpleTensorEltwise(const ops::EltwiseType type, .AddIntArg("T", DataTypeToEnum::v()) .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", coeff) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("has_data_format", 1) .OutputType({ops::IsLogicalType(type) ? 
DT_INT32 : DT_FLOAT}) .Output("TOutput"); if (shape0.size() > 1) { @@ -642,7 +642,7 @@ void RandomTensorScalar(const ops::EltwiseType type, .Input("TInput") .AddIntArg("type", static_cast(type)) .AddFloatArg("scalar_input", 0.1) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("has_data_format", 1) .Output("TOutput") .Finalize(net.NewOperatorDef()); // Run @@ -699,7 +699,7 @@ void RandomTensorEltwise(const ops::EltwiseType type, .Input("TInput1") .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", coeff) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("has_data_format", 1) .Output("TOutput") .Finalize(net.NewOperatorDef()); @@ -729,7 +729,8 @@ void RandomTensorEltwise(const ops::EltwiseType type, } } -void QuantizedSum(const std::vector &shape) { +void Quantized(const std::vector &shape, + const ops::EltwiseType type) { // Construct graph OpsTestNet net; @@ -753,8 +754,8 @@ void QuantizedSum(const std::vector &shape) { OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput0") .Input("TInput1") - .AddIntArg("type", static_cast(ops::EltwiseType::SUM)) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("type", static_cast(type)) + .AddIntArg("has_data_format", 1) .Output("TOutput") .Finalize(net.NewOperatorDef()); @@ -794,7 +795,7 @@ void QuantizedSum(const std::vector &shape) { .Input("QuantizedInput0") .Input("QuantizedInput1") .Output("QuantizedOutput") - .AddIntArg("type", static_cast(ops::EltwiseType::SUM)) + .AddIntArg("type", static_cast(type)) .AddIntArg("T", static_cast(DT_UINT8)) .Finalize(net.NewOperatorDef()); net.Setup(DeviceType::CPU); @@ -1009,9 +1010,11 @@ TEST_F(EltwiseOpTest, TensorGeneralBroadcastGPU) { {1, 1, 2, 1}, {2, 3}, {1, 1, 2, 5}, {4, 1, 0, 1, 4, 4, 9, 16, 25, 36}); } -TEST_F(EltwiseOpTest, QuantizedSum) { - QuantizedSum({1, 32, 32, 16}); - QuantizedSum({1, 31, 31, 17}); +TEST_F(EltwiseOpTest, Quantized) { + Quantized({1, 32, 32, 16}, ops::EltwiseType::SUM); + Quantized({1, 31, 31, 17}, ops::EltwiseType::SUM); + Quantized({1, 32, 32, 16}, ops::EltwiseType::SUB); + Quantized({1, 31, 31, 17}, ops::EltwiseType::SUB); } } // namespace test diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index c82aa8ff5332c850b70100b97b0c6c1cfb3c33d3..22d45ea7c5de05eff05f2ad1fa30c9bcd92f6b7d 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -38,6 +38,8 @@ #include "mace/ops/opencl/image/fully_connected.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" + namespace mace { namespace ops { @@ -186,7 +188,7 @@ class FullyConnectedOp : public FullyConnectedOpBase { MemoryType mem_type; if (context->device()->gpu_runtime()->UseImageMemory()) { mem_type = MemoryType::GPU_IMAGE; - kernel_.reset(new opencl::image::FullyConnectedKernel); + kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/gather_test.cc b/mace/ops/gather_test.cc index 32f849c7abf69318fe2fdcb9dcacb97bc437aec0..2c0f474ca7aa9437328f0319ba6538c11f538d3d 100644 --- a/mace/ops/gather_test.cc +++ b/mace/ops/gather_test.cc @@ -23,53 +23,67 @@ namespace test { class GatherOpTest : public OpsTestBase {}; namespace { +template void TestGather(const std::vector &weight_shape, - const std::vector &weight, + const std::vector &weight, const std::vector &input_shape, const std::vector &input, const int axis, const std::vector &output_shape, - const std::vector &output) { + const std::vector &output) { OpsTestNet net; - net.AddInputFromArray("Params", weight_shape, weight); + net.AddInputFromArray("Params", 
weight_shape, weight); net.AddInputFromArray("Indices", input_shape, input); OpDefBuilder("Gather", "GatherTest") .Input("Params") .Input("Indices") + .AddIntArg("T", DataTypeToEnum::v()) .AddIntArg("axis", axis) .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(CPU); - auto expected = net.CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace TEST_F(GatherOpTest, CPUScalarIndex) { - TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, + {}, {5}, 0, {2}, {10, 11}); + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, {}, {5}, 0, {2}, {10, 11}); } TEST_F(GatherOpTest, CPURank1Index) { - TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, + {3}, {2, 4, 6}, 0, {3, 2}, {4, 5, 8, 9, 12, 13}); + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, {3}, {2, 4, 6}, 0, {3, 2}, {4, 5, 8, 9, 12, 13}); } TEST_F(GatherOpTest, CPURank1IndexWithAxis1) { - TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, + {1}, {1}, 1, {10, 1}, {1, 3, 5, 7, 9, 11, 13, 15, 17, 19}); + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, {1}, {1}, 1, {10, 1}, {1, 3, 5, 7, 9, 11, 13, 15, 17, 19}); } TEST_F(GatherOpTest, CPURankHighIndex) { - TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, + {1, 3}, {2, 4, 6}, 0, {1, 3, 2}, {4, 5, 8, 9, 12, 13}); + TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, {1, 3}, {2, 4, 6}, 0, {1, 3, 2}, {4, 5, 8, 9, 12, 13}); } diff --git a/mace/ops/infer_conv2d_shape.cc b/mace/ops/infer_conv2d_shape.cc index cd0d96b8cef49abad2e97cd60a81619065d51ebb..38f711f57ad824f146a4cd0abf306300b5122735 100644 --- a/mace/ops/infer_conv2d_shape.cc +++ b/mace/ops/infer_conv2d_shape.cc @@ -34,9 +34,10 @@ class InferConv2dShapeOp : public Operation { Tensor::MappingGuard output_guard(output); int32_t *output_data = output->mutable_data(); - const int32_t data_format = - Operation::GetOptionalArg("data_format", 0); - const bool isNCHW = data_format == 1; + auto has_data_format = + Operation::GetOptionalArg("has_data_format", 0); + const bool isNCHW = (has_data_format && + input->data_format() == DataFormat::NCHW); Padding padding_type = static_cast(Operation::GetOptionalArg( diff --git a/mace/ops/infer_conv2d_shape_test.cc b/mace/ops/infer_conv2d_shape_test.cc index feaaecff8364d9f1a3270105bc03ddb36e3f5be2..333baaf944b34d8e2e0d78cd4e3d84aefa950163 100644 --- a/mace/ops/infer_conv2d_shape_test.cc +++ b/mace/ops/infer_conv2d_shape_test.cc @@ -57,8 +57,8 @@ void TestInferConv2dShapeOp(const std::vector &input_shape, } // namespace TEST_F(InferConv2dShapeOpTest, TestInferConv2dShape) { -TestInferConv2dShapeOp({3, 640, 480, 16}, 1, {3, 640, 480, 3}); -TestInferConv2dShapeOp({3, 640, 480, 16}, 2, {3, 320, 240, 3}); + TestInferConv2dShapeOp({3, 640, 480, 16}, 1, {3, 640, 480, 3}); + TestInferConv2dShapeOp({3, 640, 480, 16}, 2, {3, 320, 240, 3}); } } // namespace test diff --git 
a/mace/ops/lstm_cell.cc b/mace/ops/lstm_cell.cc index a342cef812847070f63ed048045159185d28f0a5..82ed9053b6d05a40c2e31e6854c0ec16c62f7ae8 100644 --- a/mace/ops/lstm_cell.cc +++ b/mace/ops/lstm_cell.cc @@ -18,6 +18,7 @@ #include "mace/core/operator.h" #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/lstm_cell.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -25,6 +26,7 @@ namespace ops { template class LSTMCellOp; +#ifdef MACE_ENABLE_OPENCL template class LSTMCellOp : public Operation { public: @@ -35,7 +37,7 @@ class LSTMCellOp : public Operation { 0.0)); MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::LSTMCellKernel(forget_bias)); + kernel_ = make_unique>(forget_bias); } else { MACE_NOT_IMPLEMENTED; } @@ -88,6 +90,7 @@ class LSTMCellOp : public Operation { MACE_OP_INPUT_TAGS(INPUT, PRE_OUTPUT, WEIGHT, BIAS, PRE_CELL); MACE_OP_OUTPUT_TAGS(CELL, OUTPUT); }; +#endif void RegisterLSTMCell(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp, diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index a3aebcb49abe323a24bc792f857577481be19f35..3b0913de574607660b807ea133f3e797a30aca71 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -21,8 +21,7 @@ #include "mace/core/operator.h" #include "mace/core/tensor.h" -#include "mace/ops/sgemm.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" #ifdef MACE_ENABLE_NEON #include "mace/ops/arm/fp32/gemm.h" @@ -38,7 +37,7 @@ #endif // MACE_ENABLE_NEON #ifdef MACE_ENABLE_QUANTIZE -#include "mace/ops/gemmlowp_util.h" +#include "mace/ops/common/gemmlowp_util.h" #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL @@ -233,8 +232,8 @@ class MatMulFixpointImpl { const index_t height, const index_t K, const index_t width, - const bool lhs_bached, - const bool rhs_bached, + const bool lhs_batched, + const bool rhs_batched, Tensor *C) { #if defined(MACE_ENABLE_NEON) if (width == 1 && AOrder == gemmlowp::MapOrder::RowMajor) { @@ -245,8 +244,8 @@ class MatMulFixpointImpl { batch, height, K, - true, - true, + lhs_batched, + rhs_batched, C); } else if (height == 1 && BOrder == gemmlowp::MapOrder::ColMajor) { gemv_kernel_.Compute(context, @@ -256,8 +255,8 @@ class MatMulFixpointImpl { batch, width, K, - true, - true, + lhs_batched, + rhs_batched, C); } else { #endif // MACE_ENABLE_NEON @@ -281,11 +280,13 @@ class MatMulFixpointImpl { for (index_t i = 0; i < batch; ++i) { gemmlowp::MatrixMap - a_matrix(a_ptr_base + static_cast(lhs_bached) * i * a_size, + a_matrix(a_ptr_base + + static_cast(lhs_batched) * i * a_size, height, K); gemmlowp::MatrixMap - b_matrix(b_ptr_base + static_cast(rhs_bached) * i * b_size, + b_matrix(b_ptr_base + + static_cast(rhs_batched) * i * b_size, K, width); gemmlowp::MatrixMap @@ -315,8 +316,8 @@ class MatMulFixpointImpl { const index_t height, const index_t K, const index_t width, - const bool lhs_bached, - const bool rhs_bached, + const bool lhs_batched, + const bool rhs_batched, Tensor *C) { C->SetScale(A->scale() * B->scale()); C->SetZeroPoint(0); @@ -330,8 +331,8 @@ class MatMulFixpointImpl { batch, height, K, - lhs_bached, - rhs_bached, + lhs_batched, + rhs_batched, C); } else if (height == 1 && BOrder == gemmlowp::MapOrder::ColMajor) { gemv_kernel_.Compute(context, @@ -341,8 +342,8 @@ class MatMulFixpointImpl { batch, width, K, - lhs_bached, - rhs_bached, + lhs_batched, + rhs_batched, C); } else { #endif // MACE_ENABLE_NEON @@ -366,12 +367,12 @@ 
class MatMulFixpointImpl { for (index_t i = 0; i < batch; ++i) { gemmlowp::MatrixMap a_matrix - (a_ptr_base + static_cast(lhs_bached) * i * a_size, + (a_ptr_base + static_cast(lhs_batched) * i * a_size, height, K); gemmlowp::MatrixMap b_matrix - (b_ptr_base + static_cast(rhs_bached) * i * b_size, + (b_ptr_base + static_cast(rhs_batched) * i * b_size, K, width); gemmlowp::MatrixMap diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc index acacdc7ff91bbda1945415f8ed668a4e0fb63bbd..308113ffcc380d67fd39f89bcb487fce628d77e9 100644 --- a/mace/ops/matmul_benchmark.cc +++ b/mace/ops/matmul_benchmark.cc @@ -21,7 +21,6 @@ #include "public/gemmlowp.h" #include "mace/benchmark/statistics.h" #include "mace/core/testing/test_benchmark.h" -#include "mace/ops/sgemm.h" #include "mace/ops/ops_test_util.h" namespace gemmlowp { @@ -94,32 +93,6 @@ namespace test { namespace { -// Matmul with (m, k) x (k, n) -void MatmulBenchmark_Mace_SGemm(int iters, int m, int k, int n) { - mace::testing::StopTiming(); - std::vector lhs(m * k); - std::vector rhs(k * n); - std::vector result(m * n); - - ops::SGemmMatrixMap - matrix_lhs(1, m, k, SGemmRowMajor, lhs.data(), - true); - ops::SGemmMatrixMap - matrix_rhs(1, k, n, SGemmRowMajor, rhs.data(), - true); - ops::SGemmMatrixMap - matrix_result(1, m, n, SGemmRowMajor, result.data()); - - ops::SGemm sgemm; - - sgemm(matrix_lhs, matrix_rhs, &matrix_result); - - mace::testing::StartTiming(); - while (iters--) { - sgemm(matrix_lhs, matrix_rhs, &matrix_result); - } -} - void MatmulBenchmark_Eigen(int iters, int m, int k, int n) { mace::testing::StopTiming(); Eigen::MatrixXf lhs = Eigen::MatrixXf::Random(m, k); @@ -223,7 +196,6 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) { MACE_BENCHMARK(MACE_BM_MATMUL_##M##_##K##_##N##_##FUNC) #define MACE_BM_MATMUL(M, K, N) \ - MACE_BM_MATMUL_FUNC(M, K, N, Mace_SGemm, float); \ MACE_BM_MATMUL_FUNC(M, K, N, Eigen, float); \ MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_uint8, uint8_t); \ MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_int32, uint8_t); diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc index 741393ffea45d435b52156f38b3a3ddc4d0e5b84..f88ac39435e328ad2a4ada6b3c41a73558fdb791 100644 --- a/mace/ops/matmul_test.cc +++ b/mace/ops/matmul_test.cc @@ -135,7 +135,8 @@ void Complex(const std::vector &batch, rhs_batched, &expected_output_tensor); - ExpectTensorNear(expected_output_tensor, *net.GetTensor("Output")); + ExpectTensorNear(expected_output_tensor, *net.GetTensor("Output"), + 1e-4, 1e-2); } } // namespace diff --git a/mace/ops/opencl/activation.h b/mace/ops/opencl/activation.h index 6eecb6416659f899e4926332a02695597782ee62..6e9b92242b499906fb3304fdcedfc1e739e9abb4 100644 --- a/mace/ops/opencl/activation.h +++ b/mace/ops/opencl/activation.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_ACTIVATION_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/addn.h b/mace/ops/opencl/addn.h index b78a7099646f77f40e5f2c058d90dcd414cb0dec..ba161ba641ed6d1c041e2e41aa547f1c45071e48 100644 --- a/mace/ops/opencl/addn.h +++ b/mace/ops/opencl/addn.h @@ -18,7 +18,7 @@ #include #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/batch_norm.h b/mace/ops/opencl/batch_norm.h index 6f91f95d67d4efe23ae9b8eb57142ea7ba1f3acd..bf49c994e127910632dfd2b2ce9d76a4855a29dc 100644 --- a/mace/ops/opencl/batch_norm.h +++ b/mace/ops/opencl/batch_norm.h @@ 
-16,7 +16,7 @@ #define MACE_OPS_OPENCL_BATCH_NORM_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/batch_to_space.h b/mace/ops/opencl/batch_to_space.h index 4cf8db94399d13ed77391d80f8d7447b8edff59a..9bb62f7052d64ba437d44f5d0ed6403c475c3dde 100644 --- a/mace/ops/opencl/batch_to_space.h +++ b/mace/ops/opencl/batch_to_space.h @@ -19,7 +19,7 @@ #include "mace/core/types.h" #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/bias_add.h b/mace/ops/opencl/bias_add.h index d0b4469dd0154ae7234f984771ffa70509abeb32..80a86423c50922bec581b445206445cf2df83d41 100644 --- a/mace/ops/opencl/bias_add.h +++ b/mace/ops/opencl/bias_add.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_BIAS_ADD_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/buffer/conv_2d.h b/mace/ops/opencl/buffer/conv_2d.h index 736ecb2a420af7941490224b6f0c390abbb3bac9..4ef8d79d9304143d29ba35125ad0b0970af310cb 100644 --- a/mace/ops/opencl/buffer/conv_2d.h +++ b/mace/ops/opencl/buffer/conv_2d.h @@ -22,6 +22,7 @@ #include "mace/ops/opencl/buffer/utils.h" #include "mace/ops/opencl/helper.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -211,8 +212,8 @@ MaceStatus Conv2dKernel::Compute( old_scratch_size_ = scratch->size(); } - padded_input.reset(new Tensor(scratch->Scratch(padded_input_size), - input->dtype())); + padded_input = make_unique(scratch->Scratch(padded_input_size), + input->dtype()); padded_input->Resize(padded_input_shape); PadInput(context, &kernels_[0], input, pad_top, pad_left, diff --git a/mace/ops/opencl/buffer/depthwise_conv2d.h b/mace/ops/opencl/buffer/depthwise_conv2d.h index 74a3cb945158382fb9b546cdfee6d0091c1892c7..6a46334a787378441d84d020cf578042e6bd24b9 100644 --- a/mace/ops/opencl/buffer/depthwise_conv2d.h +++ b/mace/ops/opencl/buffer/depthwise_conv2d.h @@ -22,6 +22,7 @@ #include "mace/ops/opencl/buffer/utils.h" #include "mace/ops/opencl/helper.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -165,8 +166,8 @@ MaceStatus DepthwiseConv2dKernel::Compute( old_scratch_size_ = scratch->size(); } - padded_input.reset(new Tensor(scratch->Scratch(padded_input_size), - input->dtype())); + padded_input = make_unique(scratch->Scratch(padded_input_size), + input->dtype()); padded_input->Resize(padded_input_shape); PadInput(context, &kernels_[0], input, pad_top, pad_left, diff --git a/mace/ops/opencl/buffer/pooling.h b/mace/ops/opencl/buffer/pooling.h index ab1e6f85929298483339944d7eb97d0781023a04..4f153e4acfff75ab179e567803e05e14f67ceebf 100644 --- a/mace/ops/opencl/buffer/pooling.h +++ b/mace/ops/opencl/buffer/pooling.h @@ -24,6 +24,7 @@ #include "mace/ops/opencl/buffer/utils.h" #include "mace/ops/opencl/helper.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -124,8 +125,8 @@ MaceStatus PoolingKernel::Compute( old_scratch_size_ = scratch->size(); } - padded_input.reset(new Tensor(scratch->Scratch(padded_input_size), - input->dtype())); + padded_input = make_unique(scratch->Scratch(padded_input_size), + input->dtype()); padded_input->Resize(padded_input_shape); PadInput(context, &kernels_[0], input, 0, 0, diff --git a/mace/ops/opencl/buffer_transform_kernel.h b/mace/ops/opencl/buffer_transform_kernel.h index 4269b67d22ca157f28fcde4a0f607f9ae6e9a5df..47f1cbaf10f4cf70c0a1d9014ba0ad77261414fe 100644 --- 
a/mace/ops/opencl/buffer_transform_kernel.h +++ b/mace/ops/opencl/buffer_transform_kernel.h @@ -17,7 +17,7 @@ #include "mace/core/runtime/opencl/opencl_util.h" #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { class OpContext; diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h index e65ae3701efe51068bb81a39e533f170502c792e..dbb6eab64c22f2941c2710f6a2730a527149f6c3 100644 --- a/mace/ops/opencl/buffer_transformer.h +++ b/mace/ops/opencl/buffer_transformer.h @@ -24,6 +24,7 @@ #include "mace/ops/opencl/image/image_to_buffer.h" #include "mace/ops/opencl/buffer/buffer_transform.h" #include "mace/ops/common/transpose.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -34,11 +35,11 @@ class OpenCLBufferTransformer { OpenCLBufferTransformer(const MemoryType in_mem_type, const MemoryType out_mem_type) { if (out_mem_type == MemoryType::GPU_IMAGE) { - kernel_.reset(new opencl::image::BufferToImage); + kernel_ = make_unique>(); } else if (in_mem_type == MemoryType::GPU_IMAGE) { - kernel_.reset(new opencl::image::ImageToBuffer); + kernel_ = make_unique>(); } else { - kernel_.reset(new opencl::buffer::BufferTransform); + kernel_ = make_unique>(); } } @@ -47,7 +48,7 @@ class OpenCLBufferTransformer { const OpenCLBufferType type, const MemoryType out_mem_type, const int wino_blk_size, - const DataFormat data_format, + bool has_data_format, Tensor *output) { Workspace *ws = context->workspace(); DataType dt = DataTypeToEnum::value; @@ -66,13 +67,14 @@ class OpenCLBufferTransformer { VLOG(2) << "Transform CPU Buffer " << input->name() << " to GPU Buffer " << internal_tensor->name() << " with data type " << dt; - if (data_format == DataFormat::NHWC && input->shape().size() == 4) { + if (has_data_format && input->shape().size() == 4) { // 1. 
(NCHW -> NHWC) std::vector dst_dims = {0, 2, 3, 1}; std::vector output_shape = TransposeShape(input->shape(), dst_dims); internal_tensor->Resize(output_shape); + internal_tensor->set_data_format(DataFormat::NHWC); // TODO(liuqi): Only support float now const float *input_ptr = input->data(); Tensor::MappingGuard guard(internal_tensor); @@ -104,13 +106,13 @@ class OpenCLBufferTransformer { VLOG(2) << "Transform GPU Buffer " << internal_tensor.name() << " to CPU Buffer " << output->name() << " with data type " << dt; - if (data_format == DataFormat::NHWC && - internal_tensor.shape().size() == 4) { + if (has_data_format && internal_tensor.shape().size() == 4) { // NHWC -> NCHW std::vector dst_dims = {0, 3, 1, 2}; std::vector output_shape = TransposeShape(internal_tensor.shape(), dst_dims); + output->set_data_format(DataFormat::NCHW); Tensor::MappingGuard guard(&internal_tensor); const float *internal_ptr = internal_tensor.data(); output->Resize(output_shape); diff --git a/mace/ops/opencl/channel_shuffle.h b/mace/ops/opencl/channel_shuffle.h index 86634d75bc0bb0e13254ec0a9c82714f7b746fda..df4a4b0f8e7fd92a0e4663643aeb1ef66de04e79 100644 --- a/mace/ops/opencl/channel_shuffle.h +++ b/mace/ops/opencl/channel_shuffle.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_CHANNEL_SHUFFLE_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/concat.h b/mace/ops/opencl/concat.h index d657ffbe9bdd2f69301b9f519491433531822f8d..abeec7c62e25299ac4de95e0b0dadc61bdb35900 100644 --- a/mace/ops/opencl/concat.h +++ b/mace/ops/opencl/concat.h @@ -18,7 +18,7 @@ #include #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/crop.h b/mace/ops/opencl/crop.h index 88aceea6f93845ece318b8b26b45e25eaf24dbfc..b12c5ee00fed43c0f954921bac11a26fa21e0f7e 100644 --- a/mace/ops/opencl/crop.h +++ b/mace/ops/opencl/crop.h @@ -18,7 +18,7 @@ #include #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/depth_to_space.h b/mace/ops/opencl/depth_to_space.h index 17c03d453593ccb7ca1c1ae58890f80b01a5c706..9d2d4fcba65fe01545c2588cdb9d667f53408af7 100644 --- a/mace/ops/opencl/depth_to_space.h +++ b/mace/ops/opencl/depth_to_space.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_DEPTH_TO_SPACE_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { class OpContext; diff --git a/mace/ops/opencl/eltwise.h b/mace/ops/opencl/eltwise.h index dec2b150d79a372a05895482a7db1819766b20e5..52156f06e908a394ae910abbeefb6da23e6cb236 100644 --- a/mace/ops/opencl/eltwise.h +++ b/mace/ops/opencl/eltwise.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_ELTWISE_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/fully_connected.h b/mace/ops/opencl/fully_connected.h index 8e421ad2f20510a76dcc0c5c841745d1832ac688..416aed6c8692ceaf45da1d1eb36f82b3753c8729 100644 --- a/mace/ops/opencl/fully_connected.h +++ b/mace/ops/opencl/fully_connected.h @@ -18,7 +18,7 @@ #include "mace/ops/activation.h" #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/helper.cc b/mace/ops/opencl/helper.cc index 8f3cd289bae5da40365cdefc9397c58eb0e7b1d1..46d4fd5b288d8463bfb44a5a879d9a93a5aebc70 100644 --- a/mace/ops/opencl/helper.cc +++ 
b/mace/ops/opencl/helper.cc @@ -19,7 +19,7 @@ #include #include "mace/utils/tuner.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/helper.h b/mace/ops/opencl/helper.h index 33ea688b51ab9cbc958af1e489959681061c3239..a4a49b4e15a021f1fa55fbd39c514777f03005bd 100644 --- a/mace/ops/opencl/helper.h +++ b/mace/ops/opencl/helper.h @@ -21,12 +21,13 @@ #include #include "mace/core/future.h" -#include "mace/core/macros.h" +#include "mace/utils/macros.h" #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_util.h" #include "mace/core/types.h" -#include "mace/utils/utils.h" +#include "mace/utils/memory.h" +#include "mace/utils/math.h" namespace mace { namespace ops { @@ -41,8 +42,8 @@ namespace ops { #define MACE_OUT_OF_RANGE_INIT(kernel) \ if (runtime->IsOutOfRangeCheckEnabled()) { \ - oorc_flag = std::move(std::unique_ptr( \ - new Buffer((context)->device()->allocator()))); \ + oorc_flag = make_unique( \ + (context)->device()->allocator()); \ MACE_RETURN_IF_ERROR((oorc_flag)->Allocate(sizeof(int)));\ oorc_flag->Map(nullptr); \ *(oorc_flag->mutable_data()) = 0; \ diff --git a/mace/ops/opencl/image/conv_2d_3x3.cc b/mace/ops/opencl/image/conv_2d_3x3.cc index db63300eb7607dead1cc9661533e0e7d463e5e4b..125a973ae7de4409b31fa2a716c35409d5955d0e 100644 --- a/mace/ops/opencl/image/conv_2d_3x3.cc +++ b/mace/ops/opencl/image/conv_2d_3x3.cc @@ -16,7 +16,7 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/ops/common/activation_type.h" #include "mace/ops/opencl/helper.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/conv_2d_general.cc b/mace/ops/opencl/image/conv_2d_general.cc index 08568a5d9e39d671a2e3d84de8fc1fa22c588f95..7f0250cbc4ebc73cfa52c6041c9da8c95b7e3892 100644 --- a/mace/ops/opencl/image/conv_2d_general.cc +++ b/mace/ops/opencl/image/conv_2d_general.cc @@ -16,7 +16,7 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/ops/opencl/helper.h" #include "mace/ops/common/activation_type.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/crop.h b/mace/ops/opencl/image/crop.h index 3ffb4fba69a8a79f46d188fbe9ddd9a2540759f1..e390a6ca69a2712dc1959c75ece199255011a173 100644 --- a/mace/ops/opencl/image/crop.h +++ b/mace/ops/opencl/image/crop.h @@ -34,16 +34,14 @@ template class CropKernel : public OpenCLCropKernel { public: explicit CropKernel( - const int axis, const std::vector &offset) - : axis_(axis), offset_(offset) {} + : offset_(offset) {} MaceStatus Compute( OpContext *context, const std::vector &input_list, Tensor *output) override; private: - const int axis_; std::vector offset_; cl::Kernel kernel_; uint32_t kwg_size_; @@ -68,57 +66,14 @@ MaceStatus CropKernel::Compute( std::vector offsets(4, 0); std::vector output_shape(input0->shape()); - switch (axis_) { - case 0: - if (offset_.size() == 1) { - offsets[0] = offset_[0]; - offsets[1] = offset_[0]; - offsets[2] = offset_[0]; - offsets[3] = offset_[0]; - } else if (offset_.size() == 4) { - offsets[0] = offset_[0]; - offsets[1] = offset_[2]; - offsets[2] = offset_[3]; - offsets[3] = offset_[1]; - } - for (int i = 0; i < 4; ++i) { - output_shape[i] = input1->dim(i); - } - break; - case 1: - if (offset_.size() == 1) { - offsets[1] = offset_[0]; - offsets[2] = offset_[0]; - offsets[3] = offset_[0]; 
- } else if (offset_.size() == 3) { - offsets[1] = offset_[1]; - offsets[2] = offset_[2]; - offsets[3] = offset_[0]; - } - for (int i = 1; i < 4; ++i) { - output_shape[i] = input1->dim(i); - } - break; - case 2: - if (offset_.size() == 1) { - offsets[1] = offset_[0]; - offsets[2] = offset_[0]; - } else if (offset_.size() == 2) { - offsets[1] = offset_[0]; - offsets[2] = offset_[1]; - } - output_shape[1] = input1->dim(1); - output_shape[2] = input1->dim(2); - break; - case 3: - if (offset_.size() == 1) { - offsets[2] = offset_[0]; - } - output_shape[2] = input1->dim(2); - break; - default: - MACE_CHECK(axis_ >= 0 && axis_ < 4, "axis is out of boundary."); - break; + for (index_t i = 0; i < in0_dims; ++i) { + if (offset_[i] >= 0) { + output_shape[i] = input1->dim(i); + offsets[i] = offset_[i]; + MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i)) + << "the crop for dimension " << i << " is out of bound with size " + << input1->dim(i) << " and offset " << offsets[i]; + } } MACE_CHECK(offsets[3] % 4 == 0, "MACE opencl only supports cropping channel" diff --git a/mace/ops/opencl/image/winograd_conv2d.cc b/mace/ops/opencl/image/winograd_conv2d.cc index 527d6cc87f0b8e5023100a9d403f363d66db5871..27a0bc30533f4538a537dc6c3084178ee1d5d3cd 100644 --- a/mace/ops/opencl/image/winograd_conv2d.cc +++ b/mace/ops/opencl/image/winograd_conv2d.cc @@ -17,7 +17,8 @@ #include "mace/ops/common/activation_type.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/opencl/helper.h" -#include "mace/utils/utils.h" +#include "mace/utils/memory.h" +#include "mace/utils/math.h" namespace mace { namespace ops { @@ -264,9 +265,9 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, OpenCLBufferType::IN_OUT_HEIGHT, &t_input_image_shape); ScratchImage transformed_input_image(scratch_manager); - std::unique_ptr transformed_input(new Tensor( + std::unique_ptr transformed_input = make_unique( transformed_input_image.Scratch(context->device()->allocator(), - t_input_image_shape, dt), dt)); + t_input_image_shape, dt), dt); MACE_RETURN_IF_ERROR(transformed_input->ResizeImage(t_input_shape, t_input_image_shape)); MACE_RETURN_IF_ERROR(WinogradInputTransform( @@ -289,9 +290,9 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, &mm_output_image_shape); ScratchImage mm_output_image(scratch_manager); - std::unique_ptr mm_output(new Tensor( + std::unique_ptr mm_output = make_unique( mm_output_image.Scratch(context->device()->allocator(), - mm_output_image_shape, dt), dt)); + mm_output_image_shape, dt), dt); MACE_RETURN_IF_ERROR(mm_output->ResizeImage(mm_output_shape, mm_output_image_shape)); diff --git a/mace/ops/opencl/lstm_cell.h b/mace/ops/opencl/lstm_cell.h index 07ea2e65551092d5c8dcfe561a2dcaecc8c9261a..4dee034c12fa858fea262e2977d5383b5863e9b3 100644 --- a/mace/ops/opencl/lstm_cell.h +++ b/mace/ops/opencl/lstm_cell.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_LSTM_CELL_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/matmul.h b/mace/ops/opencl/matmul.h index c51c83c146fdc7834de05522f8ab6939f4974673..05879f8ae2ed8623652316d13e0526e48a584b3b 100644 --- a/mace/ops/opencl/matmul.h +++ b/mace/ops/opencl/matmul.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_MATMUL_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/out_of_range_check_test.cc b/mace/ops/opencl/out_of_range_check_test.cc index 
61e19808d1dad91045876e75e9b525c042c78427..8909f35113c5a77d78cf614970d9d027019f111c 100644 --- a/mace/ops/opencl/out_of_range_check_test.cc +++ b/mace/ops/opencl/out_of_range_check_test.cc @@ -22,6 +22,7 @@ #include "mace/core/tensor.h" #include "mace/core/workspace.h" #include "mace/ops/opencl/helper.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -130,7 +131,8 @@ TEST(OutOfRangeCheckTest, RandomTest) { index_t channels = 11; GPUContext gpu_context; - std::unique_ptr device(new GPUDevice(gpu_context.opencl_tuner())); + std::unique_ptr device = make_unique( + gpu_context.opencl_tuner()); Workspace ws; OpContext context(&ws, device.get()); diff --git a/mace/ops/opencl/pad.h b/mace/ops/opencl/pad.h index cfc7edb3a1351e1b9bf8d1a152e4d906c6f09d47..640137691964b9e57cca9e69ee0c73a8d85420f0 100644 --- a/mace/ops/opencl/pad.h +++ b/mace/ops/opencl/pad.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_PAD_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { class OpContext; diff --git a/mace/ops/opencl/reduce.h b/mace/ops/opencl/reduce.h index 4f6aa2187561a22ac0e6758b45738a73a6bf9fa7..f653f8b02805dfb387d38b617eed2256e70255d9 100644 --- a/mace/ops/opencl/reduce.h +++ b/mace/ops/opencl/reduce.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_REDUCE_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/resize_bicubic.h b/mace/ops/opencl/resize_bicubic.h index 4fde45453b6335e2ca1a1eab8c57f909b253e97b..b7fd71a0dbcddc85322fae1b2a8973a1b63af1b5 100644 --- a/mace/ops/opencl/resize_bicubic.h +++ b/mace/ops/opencl/resize_bicubic.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_RESIZE_BICUBIC_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" #include "mace/core/types.h" namespace mace { diff --git a/mace/ops/opencl/resize_bilinear.h b/mace/ops/opencl/resize_bilinear.h index 18dd312845b0bcba5ed91a1f9ad5aa0311e2279a..66035d8511136952eeefe92efce0a3fd614aad5e 100644 --- a/mace/ops/opencl/resize_bilinear.h +++ b/mace/ops/opencl/resize_bilinear.h @@ -17,7 +17,7 @@ #include "mace/core/types.h" #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/resize_nearest_neighbor.h b/mace/ops/opencl/resize_nearest_neighbor.h index fda220aee9704228d435a304001a5f679f2d28e3..b0178827ac6190d413b179b4a98c367d1e5f9c37 100644 --- a/mace/ops/opencl/resize_nearest_neighbor.h +++ b/mace/ops/opencl/resize_nearest_neighbor.h @@ -17,7 +17,7 @@ #include "mace/core/types.h" #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/softmax.h b/mace/ops/opencl/softmax.h index caca5dc693a6d2e8735edc91a0f5c9f0feab65c4..a4a439ec1501db36a417c2d533d14cf1bea103e5 100644 --- a/mace/ops/opencl/softmax.h +++ b/mace/ops/opencl/softmax.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_SOFTMAX_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/space_to_batch.h b/mace/ops/opencl/space_to_batch.h index 350bf120975c6e3747e8cb5c848d9eb88f646d71..9f73ff5acdfaaa7956219bff51c42c4beb9c40b4 100644 --- a/mace/ops/opencl/space_to_batch.h +++ b/mace/ops/opencl/space_to_batch.h @@ -19,7 +19,7 @@ #include "mace/core/types.h" #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff 
--git a/mace/ops/opencl/space_to_depth.h b/mace/ops/opencl/space_to_depth.h index 69e40c82a0e3d7d06cd181d52bb71f7f1b7bd8e0..454cb686e7d1589c3c1329cc97d7a60b3c9ed663 100644 --- a/mace/ops/opencl/space_to_depth.h +++ b/mace/ops/opencl/space_to_depth.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_SPACE_TO_DEPTH_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/split.h b/mace/ops/opencl/split.h index 61f75d85c6f2c3032eacbeb908bb79edf61c26ea..8c7ac5636b77a1d07449d6f8ce09c77a6934b537 100644 --- a/mace/ops/opencl/split.h +++ b/mace/ops/opencl/split.h @@ -18,7 +18,7 @@ #include #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/sqrdiff_mean.h b/mace/ops/opencl/sqrdiff_mean.h index 822b992f0009726210126bcb19d06f480dcbf7ca..781a08c56568bf6f27230abd109d53ab24faa7e3 100644 --- a/mace/ops/opencl/sqrdiff_mean.h +++ b/mace/ops/opencl/sqrdiff_mean.h @@ -16,7 +16,7 @@ #define MACE_OPS_OPENCL_SQRDIFF_MEAN_H_ #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/math.h" namespace mace { class OpContext; diff --git a/mace/ops/ops_registry.cc b/mace/ops/ops_registry.cc index 58f6572756ac1a02b53bdac6c7de1c1622679684..7fc3545883ee855a578c001ac3ff75ff574261b6 100644 --- a/mace/ops/ops_registry.cc +++ b/mace/ops/ops_registry.cc @@ -29,6 +29,7 @@ extern void RegisterChannelShuffle(OpRegistryBase *op_registry); extern void RegisterConcat(OpRegistryBase *op_registry); extern void RegisterConv2D(OpRegistryBase *op_registry); extern void RegisterCrop(OpRegistryBase *op_registry); +extern void RegisterCumsum(OpRegistryBase *op_registry); extern void RegisterDeconv2D(OpRegistryBase *op_registry); extern void RegisterDepthToSpace(OpRegistryBase *op_registry); extern void RegisterDepthwiseConv2d(OpRegistryBase *op_registry); @@ -44,6 +45,7 @@ extern void RegisterLocalResponseNorm(OpRegistryBase *op_registry); extern void RegisterMatMul(OpRegistryBase *op_registry); extern void RegisterOneHot(OpRegistryBase *op_registry); extern void RegisterPad(OpRegistryBase *op_registry); +extern void RegisterPNorm(OpRegistryBase *op_registry); extern void RegisterPooling(OpRegistryBase *op_registry); extern void RegisterReduce(OpRegistryBase *op_registry); extern void RegisterPriorBox(OpRegistryBase *op_registry); @@ -54,14 +56,19 @@ extern void RegisterResizeNearestNeighbor(OpRegistryBase *op_registry); extern void RegisterReverse(OpRegistryBase *op_registry); extern void RegisterScalarMath(OpRegistryBase *op_registry); extern void RegisterShape(OpRegistryBase *op_registry); +extern void RegisterSlice(OpRegistryBase *op_registry); extern void RegisterSoftmax(OpRegistryBase *op_registry); extern void RegisterSpaceToBatchND(OpRegistryBase *op_registry); extern void RegisterSpaceToDepth(OpRegistryBase *op_registry); +extern void RegisterSplice(OpRegistryBase *op_registry); extern void RegisterSplit(OpRegistryBase *op_registry); extern void RegisterSqrDiffMean(OpRegistryBase *op_registry); extern void RegisterSqueeze(OpRegistryBase *op_registry); extern void RegisterStack(OpRegistryBase *op_registry); extern void RegisterStridedSlice(OpRegistryBase *op_registry); +extern void RegisterSumGroup(OpRegistryBase *op_registry); +extern void RegisterTargetRMSNorm(OpRegistryBase *op_registry); +extern void RegisterTimeOffset(OpRegistryBase *op_registry); extern void RegisterTranspose(OpRegistryBase *op_registry); extern void 
RegisterUnstack(OpRegistryBase *op_registry); @@ -90,6 +97,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() { ops::RegisterConcat(this); ops::RegisterConv2D(this); ops::RegisterCrop(this); + ops::RegisterCumsum(this); ops::RegisterDeconv2D(this); ops::RegisterDepthToSpace(this); ops::RegisterDepthwiseConv2d(this); @@ -105,6 +113,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() { ops::RegisterMatMul(this); ops::RegisterOneHot(this); ops::RegisterPad(this); + ops::RegisterPNorm(this); ops::RegisterPooling(this); ops::RegisterReduce(this); ops::RegisterPriorBox(this); @@ -115,14 +124,19 @@ OpRegistry::OpRegistry() : OpRegistryBase() { ops::RegisterReverse(this); ops::RegisterScalarMath(this); ops::RegisterShape(this); + ops::RegisterSlice(this); ops::RegisterSoftmax(this); ops::RegisterSpaceToBatchND(this); ops::RegisterSpaceToDepth(this); + ops::RegisterSplice(this); ops::RegisterSplit(this); ops::RegisterStack(this); ops::RegisterStridedSlice(this); ops::RegisterSqrDiffMean(this); ops::RegisterSqueeze(this); + ops::RegisterSumGroup(this); + ops::RegisterTargetRMSNorm(this); + ops::RegisterTimeOffset(this); ops::RegisterTranspose(this); ops::RegisterUnstack(this); diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc index ce9c1bbde07ddd8857f33718f06eb47d1fb34fa9..25de146a59db15f456a0941c14222fc30a5a54e7 100644 --- a/mace/ops/ops_test_util.cc +++ b/mace/ops/ops_test_util.cc @@ -14,6 +14,7 @@ #include "mace/ops/ops_test_util.h" #include "mace/core/memory_optimizer.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -120,17 +121,16 @@ OpTestContext *OpTestContext::Get(int num_threads, OpTestContext::OpTestContext(int num_threads, CPUAffinityPolicy cpu_affinity_policy, bool use_gemmlowp) - : gpu_context_(new GPUContext(GetStoragePathFromEnv())), + : gpu_context_(std::make_shared(GetStoragePathFromEnv())), opencl_mem_types_({MemoryType::GPU_IMAGE}) { - device_map_[DeviceType::CPU] = std::unique_ptr( - new CPUDevice(num_threads, - cpu_affinity_policy, - use_gemmlowp)); - - device_map_[DeviceType::GPU] = std::unique_ptr( - new GPUDevice(gpu_context_->opencl_tuner(), - gpu_context_->opencl_cache_storage(), - GPUPriorityHint::PRIORITY_NORMAL)); + device_map_[DeviceType::CPU] = make_unique( + num_threads, cpu_affinity_policy, use_gemmlowp); + + device_map_[DeviceType::GPU] = make_unique( + gpu_context_->opencl_tuner(), + gpu_context_->opencl_cache_storage(), + GPUPriorityHint::PRIORITY_NORMAL, + GPUPerfHint::PERF_HIGH); } std::shared_ptr OpTestContext::gpu_context() const { @@ -167,9 +167,20 @@ bool OpsTestNet::Setup(mace::DeviceType device) { !ws_.GetTensor(input)->is_weight()) { auto input_info = net_def.add_input_info(); input_info->set_name(input); - auto data_format = ProtoArgHelper::GetOptionalArg( - op_def, "data_format", DataFormat::DF_NONE); - input_info->set_data_format(data_format); + auto has_data_format = ProtoArgHelper::GetOptionalArg( + op_def, "has_data_format", 1); + auto is_quantized_op = ProtoArgHelper::GetOptionalArg( + op_def, "T", static_cast(DT_FLOAT)) + == static_cast(DT_UINT8); + if (has_data_format) { + if (is_quantized_op || device == DeviceType::GPU) { + input_info->set_data_format(NHWC); + } else { + input_info->set_data_format(NCHW); + } + } else { + input_info->set_data_format(DataFormat::DF_NONE); + } auto &shape = ws_.GetTensor(input)->shape(); for (auto d : shape) { input_info->add_dims(static_cast(d)); @@ -177,24 +188,26 @@ bool OpsTestNet::Setup(mace::DeviceType device) { } } } - auto op_def = op_defs_.back(); - for (int i = 0; i < 
op_def.output_size(); ++i) { - ws_.RemoveTensor(op_def.output(i)); - auto output_info = net_def.add_output_info(); - output_info->set_name(op_def.output(i)); - if (op_def.output_type_size() == op_def.output_size()) { - output_info->set_data_type(op_def.output_type(i)); - } else { - output_info->set_data_type(DataType::DT_FLOAT); + if (!op_defs_.empty()) { + auto op_def = op_defs_.back(); + for (int i = 0; i < op_def.output_size(); ++i) { + ws_.RemoveTensor(op_def.output(i)); + auto output_info = net_def.add_output_info(); + output_info->set_name(op_def.output(i)); + if (op_def.output_type_size() == op_def.output_size()) { + output_info->set_data_type(op_def.output_type(i)); + } else { + output_info->set_data_type(DataType::DT_FLOAT); + } } } MemoryOptimizer mem_optimizer; - net_ = std::unique_ptr(new SerialNet( + net_ = make_unique( op_registry_.get(), &net_def, &ws_, OpTestContext::Get()->GetDevice(device), - &mem_optimizer)); + &mem_optimizer); MaceStatus status = (ws_.PreallocateOutputTensor( net_def, &mem_optimizer, @@ -236,12 +249,12 @@ MaceStatus OpsTestNet::RunNet(const mace::NetDef &net_def, const mace::DeviceType device) { device_type_ = device; MemoryOptimizer mem_optimizer; - net_ = std::unique_ptr(new SerialNet( + net_ = make_unique( op_registry_.get(), &net_def, &ws_, OpTestContext::Get()->GetDevice(device), - &mem_optimizer)); + &mem_optimizer); MACE_RETURN_IF_ERROR(ws_.PreallocateOutputTensor( net_def, &mem_optimizer, diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 07cbad06bdb57381ca3befada4baf1e1f11b5bed..8226079711535766f30e06626b80110c4883b82a 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -34,7 +34,8 @@ #include "mace/core/workspace.h" #include "mace/ops/ops_registry.h" #include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/utils/memory.h" +#include "mace/utils/math.h" #include "mace/utils/quantize.h" #include "mace/ops/testing/test_utils.h" @@ -97,7 +98,7 @@ class OpTestContext { class OpsTestNet { public: OpsTestNet() : - op_registry_(new OpRegistry()) {} + op_registry_(make_unique()) {} template void AddInputFromArray(const std::string &name, @@ -258,9 +259,9 @@ class OpsTestNet { template void TransformFilterDataFormat(const std::string &src_name, - const FilterDataFormat src_format, + const DataFormat src_format, const std::string &dst_name, - const FilterDataFormat dst_format) { + const DataFormat dst_format) { Tensor *input = ws_.GetTensor(src_name); Tensor *output = ws_.CreateTensor( dst_name, @@ -355,9 +356,9 @@ class OpsTestNet { std::unique_ptr CreateTensor( const std::vector &shape = {}, const std::vector &data = {}) { - std::unique_ptr res( - new Tensor(OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v())); + std::unique_ptr res = make_unique( + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); if (!data.empty()) { res->Resize(shape); T *input_data = res->mutable_data(); diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index 0dfdf673b21f49ce231030251ed78004971e0b3f..aaa6b230f4b5237dc88d16e369dcf289a8fe9df6 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -20,6 +20,8 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/pad.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" +#include "mace/utils/math.h" namespace mace { namespace ops { @@ -39,9 +41,9 @@ class PadOp : public Operation { constant_value_(Operation::GetOptionalArg( "constant_value", 0.0)) { MACE_CHECK(paddings_.size() == 8); - auto df = 
static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC) { + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df) { paddings_ = TransposeShape(paddings_, {0, 1, 6, 7, 2, 3, 4, 5}); } } @@ -54,11 +56,9 @@ class PadOp : public Operation { this->paddings_.size() == static_cast(input->dim_size()) * 2); auto input_shape = input->shape(); for (size_t i = 0; i < paddings_.size(); ++i) { - if (type_ == PadType::REFLECT) { - MACE_CHECK(paddings_[i] < input_shape[i / 2]); - - } else if (type_ == PadType::SYMMETRIC) { - MACE_CHECK(paddings_[i] <= input_shape[i / 2]); + if (type_ == PadType::REFLECT || type_ == PadType::SYMMETRIC) { + MACE_CHECK(paddings_[i] < input_shape[i / 2], paddings_[i], + " vs ", input_shape[i / 2]); } MACE_CHECK(paddings_[i] >= 0); } @@ -182,8 +182,8 @@ class PadOp : public Operation { float constant_value = Operation::GetOptionalArg( "constant_value", 0.0); if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::PadKernel( - type, paddings, constant_value)); + kernel_ = make_unique>( + type, paddings, constant_value); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc index 0466aa6be486d5f5917f4397006e5cdc4619179e..b449e02f9166c21620daf289baac89b34c25b37f 100644 --- a/mace/ops/pad_benchmark.cc +++ b/mace/ops/pad_benchmark.cc @@ -29,7 +29,11 @@ void Pad(int iters, int batch, int height, OpsTestNet net; // Add input data - net.AddRandomInput("Input", {batch, height, width, channels}); + if (D == DeviceType::CPU) { + net.AddRandomInput("Input", {batch, channels, height, width}); + } else { + net.AddRandomInput("Input", {batch, height, width, channels}); + } const std::vector paddings = {0, 0, pad, pad, pad, pad, 0, 0}; OpDefBuilder("Pad", "PadTest") @@ -37,6 +41,7 @@ void Pad(int iters, int batch, int height, .Output("Output") .AddIntsArg("paddings", paddings) .AddIntArg("pad_type", pad_type) + .AddIntArg("has_data_format", 1) .AddFloatArg("constant_value", 1.0) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc index 63bb449f25057ae8335dc95a6d52042dec2186c6..e68e8eb8d06b864b9c9173ada5fbb2312ec0566c 100644 --- a/mace/ops/pad_test.cc +++ b/mace/ops/pad_test.cc @@ -39,7 +39,7 @@ void SimpleConstant() { .Output("Output") .AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0}) .AddFloatArg("constant_value", 1.0) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run @@ -52,7 +52,7 @@ void SimpleConstant() { .Output("TOutput") .AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0}) .AddFloatArg("constant_value", 1.0) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run @@ -101,7 +101,7 @@ void Result(const std::vector &input_shape, .Output(t_output) .AddIntsArg("paddings", paddings) .AddIntArg("pad_type", static_cast(pad_type)) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run @@ -179,7 +179,7 @@ TEST_F(PadTest, ComplexCPU) { .Output("TOutput") .AddIntsArg("paddings", {0, 0, 1, 1, 1, 1, 1, 1}) .AddFloatArg("constant_value", 1.0) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run @@ -217,7 +217,7 @@ void Complex(const std::vector &input_shape, .AddIntsArg("paddings", 
paddings)
          .AddIntArg("pad_type", pad_type)
          .AddFloatArg("constant_value", 1.0)
-         .AddIntArg("data_format", DataFormat::NHWC)
+         .AddIntArg("has_data_format", 1)
          .Finalize(net.NewOperatorDef());

     // Run
@@ -234,7 +234,7 @@ void Complex(const std::vector<index_t> &input_shape,
          .AddIntsArg("paddings", paddings)
          .AddIntArg("pad_type", pad_type)
          .AddFloatArg("constant_value", 1.0)
-         .AddIntArg("data_format", DataFormat::NHWC)
+         .AddIntArg("has_data_format", 1)
          .Finalize(net.NewOperatorDef());

     // Run
diff --git a/mace/ops/pnorm.cc b/mace/ops/pnorm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8742a3b4492cb36aab4deece867a2021c4afd106
--- /dev/null
+++ b/mace/ops/pnorm.cc
@@ -0,0 +1,133 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This Op is for PNormComponent in Kaldi.
+// The last dim of the input must be divisible by output_dim:
+// each row is split into output_dim groups of
+// group = input_dim / output_dim consecutive elements.
+// For each row:
+// p is 0: output[i] = sum(abs(input[i * group + j]) > 0)
+// p is 1: output[i] = sum(abs(input[i * group + j]))
+// p is 2: output[i] = sqrt(sum(input[i * group + j] * input[i * group + j])),
+// for j = (0 : group - 1)
+// p's default value is 2.
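+//
+// A worked example (a sketch derived from the SimpleTest case in
+// pnorm_test.cc below): with p = 2, output_dim = 5 and the input row
+// {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, each group holds 2 elements, so
+// output = {sqrt(1 + 4), sqrt(9 + 16), sqrt(25 + 36), sqrt(49 + 64),
+// sqrt(81 + 100)} ~= {2.236, 5, 7.810, 10.630, 13.454}.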
+
+#include <cmath>
+#include <functional>
+
+#include "mace/core/operator.h"
+
+namespace mace {
+namespace ops {
+
+template <DeviceType D, typename T>
+class PNormOp;
+
+template <typename T>
+class PNormOp<DeviceType::CPU, T> : public Operation {
+ public:
+  explicit PNormOp(OpConstructContext *context)
+      : Operation(context),
+        p_(Operation::GetOptionalArg<int>("p", 2)),
+        output_dim_(Operation::GetOptionalArg<int>("output_dim", 0)) {}
+
+  MaceStatus Run(OpContext *context) override {
+    MACE_UNUSED(context);
+    const Tensor *input = this->Input(0);
+    Tensor *output = this->Output(0);
+
+    const std::vector<index_t> &input_shape = input->shape();
+    const index_t dim_size = input_shape.size();
+    MACE_CHECK(dim_size >= 1, "PNorm only supports input dim size >= 1");
+    std::vector<index_t> output_shape(input_shape);
+    const index_t input_dim = input_shape[dim_size - 1];
+    MACE_CHECK(output_dim_ > 0,
+               "Output dim should be greater than zero.");
+    MACE_CHECK(input_dim % output_dim_ == 0 && output_dim_ < input_dim,
+               "PNorm's input dim should be a multiple of output dim.");
+    const index_t group_size = input_dim / output_dim_;
+    output_shape[dim_size - 1] = output_dim_;
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+    Tensor::MappingGuard guard_input(input);
+    Tensor::MappingGuard guard_output(output);
+
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>();
+    const index_t bh =
+        std::accumulate(input->shape().begin(), input->shape().end() - 1, 1,
+                        std::multiplies<index_t>());
+    if (p_ == 0) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (index_t i = 0; i < bh; ++i) {
+        for (index_t j = 0; j < output_dim_; ++j) {
+          const T *in_base = input_data + i * input_dim + j * group_size;
+          T *out_base = output_data + i * output_dim_;
+          T temp_result = 0;
+          for (index_t g = 0; g < group_size; ++g) {
+            T value =
+                (std::fabs(in_base[g])
+                    > std::numeric_limits<T>::epsilon()) ? 1.0f : 0.0f;
+            temp_result += value;
+          }
+          out_base[j] = temp_result;
+        }
+      }
+    } else if (p_ == 1) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (index_t i = 0; i < bh; ++i) {
+        for (index_t j = 0; j < output_dim_; ++j) {
+          const T *in_base = input_data + i * input_dim + j * group_size;
+          T *out_base = output_data + i * output_dim_;
+          T temp_result = 0;
+          for (index_t g = 0; g < group_size; ++g) {
+            temp_result += std::abs(in_base[g]);
+          }
+          out_base[j] = temp_result;
+        }
+      }
+    } else if (p_ == 2) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (index_t i = 0; i < bh; ++i) {
+        for (index_t j = 0; j < output_dim_; ++j) {
+          const T *in_base = input_data + i * input_dim + j * group_size;
+          T *out_base = output_data + i * output_dim_;
+          T temp_result = 0;
+          for (index_t g = 0; g < group_size; ++g) {
+            temp_result += in_base[g] * in_base[g];
+          }
+          out_base[j] = std::sqrt(temp_result);
+        }
+      }
+    } else {
+      LOG(FATAL) << "PNorm's p should be 0, 1 or 2, here p is: " << p_;
+    }
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  int p_;
+  int output_dim_;
+};
+
+void RegisterPNorm(OpRegistryBase *op_registry) {
+  MACE_REGISTER_OP(op_registry, "PNorm", PNormOp,
+                   DeviceType::CPU, float);
+}
+
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/pnorm_benchmark.cc b/mace/ops/pnorm_benchmark.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e3af765cd22f3589abd602dc6e28cd96acc2ee0f
--- /dev/null
+++ b/mace/ops/pnorm_benchmark.cc
@@ -0,0 +1,77 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "mace/core/testing/test_benchmark.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+namespace {
+template <DeviceType D, typename T>
+void PNormBenchmark(int iters, int n, int h, int w, int p, int ow) {
+  mace::testing::StopTiming();
+
+  OpsTestNet net;
+  // Add input data
+  net.AddRandomInput<D, T>("Input", {n, h, w});
+
+  OpDefBuilder("PNorm", "PNormBM")
+      .Input("Input")
+      .AddIntArg("p", p)
+      .AddIntArg("output_dim", ow)
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  // Warm-up
+  for (int i = 0; i < 5; ++i) {
+    net.RunOp(D);
+  }
+  net.Sync();
+
+  mace::testing::StartTiming();
+  while (iters--) {
+    net.RunOp(D);
+    net.Sync();
+  }
+}
+}  // namespace
+
+#define MACE_BM_PNORM_MACRO(N, H, W, P, OW, TYPE, DEVICE)              \
+  static void                                                          \
+      MACE_BM_PNORM_##N##_##H##_##W##_##P##_##OW##_##TYPE##_##DEVICE(  \
+          int iters) {                                                 \
+    const int64_t tot = static_cast<int64_t>(iters) * N * H * W;       \
+    mace::testing::BytesProcessed(tot * (sizeof(TYPE)));               \
+    PNormBenchmark<DEVICE, TYPE>(iters, N, H, W, P, OW);               \
+  }                                                                    \
+  MACE_BENCHMARK(                                                      \
+      MACE_BM_PNORM_##N##_##H##_##W##_##P##_##OW##_##TYPE##_##DEVICE)
+
+#define MACE_BM_PNORM(N, H, W, P, OW) \
+  MACE_BM_PNORM_MACRO(N, H, W, P, OW, float, CPU);
+
+MACE_BM_PNORM(1, 10, 256, 0, 128);
+MACE_BM_PNORM(1, 20, 128, 1, 64);
+MACE_BM_PNORM(1, 10, 128, 2, 64);
+MACE_BM_PNORM(1, 16, 256, 0, 128);
+MACE_BM_PNORM(1, 32, 128, 1, 64);
+MACE_BM_PNORM(1, 10, 512, 2, 256);
+
+}  // namespace test
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/pnorm_test.cc b/mace/ops/pnorm_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3510868250cadd655ffe345855973e39d2a0e534
--- /dev/null
+++ b/mace/ops/pnorm_test.cc
@@ -0,0 +1,70 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+class PNormOpTest : public OpsTestBase {};
+
+namespace {
+template <typename T>
+void TestPNorm(const std::vector<index_t> &input_shape,
+               const std::vector<T> &input,
+               const int p,
+               const int output_dim,
+               const std::vector<index_t> &output_shape,
+               const std::vector<T> &output) {
+  OpsTestNet net;
+  net.AddInputFromArray<DeviceType::CPU, T>(MakeString("Input"),
+                                            input_shape,
+                                            input);
+
+  OpDefBuilder("PNorm", "PNormTest")
+      .Input("Input")
+      .AddIntArg("p", p)
+      .AddIntArg("output_dim", output_dim)
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  net.RunOp();
+
+  net.AddInputFromArray<DeviceType::CPU, T>("ExpectedOutput", output_shape,
+                                            output);
+  ExpectTensorNear<T>(*net.GetOutput("ExpectedOutput"),
+                      *net.GetOutput("Output"));
+}
+}  // namespace
+
+TEST_F(PNormOpTest, SimpleTest) {
+  TestPNorm<float>(
+      {1, 5, 10},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+       3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+       5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+       7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+       9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
+      2, 5,
+      {1, 5, 5},
+      {2.236067977, 5, 7.810249676, 10.630145813, 13.453624047,
+       5, 7.810249676, 10.630145813, 13.453624047, 16.278820596,
+       7.810249676, 10.630145813, 13.453624047, 16.278820596, 19.104973175,
+       10.630145813, 13.453624047, 16.278820596, 19.104973175, 21.931712199,
+       13.453624047, 16.278820596, 19.104973175, 21.931712199, 24.758836806});
+}
+
+}  // namespace test
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc
index 8fd87cdfa38771a56636fd7bd54894ea1cbe042e..969f2774e3bb5a5fcf35e37e5f613f2f87b9f19b 100644
--- a/mace/ops/pooling.cc
+++ b/mace/ops/pooling.cc
@@ -32,6 +32,7 @@
 #include "mace/ops/opencl/image/pooling.h"
 #include "mace/ops/opencl/buffer/pooling.h"
 #endif  // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"

 namespace mace {
 namespace ops {
@@ -433,10 +434,10 @@ class PoolingOp<DeviceType::GPU, T> : public PoolingOpBase {
   explicit PoolingOp(OpConstructContext *context)
       : PoolingOpBase(context) {
     if (context->device()->gpu_runtime()->UseImageMemory()) {
-      kernel_.reset(new opencl::image::PoolingKernel<T>);
+      kernel_ = make_unique<opencl::image::PoolingKernel<T>>();
     } else {
       context->set_output_mem_type(MemoryType::GPU_BUFFER);
-      kernel_.reset(new opencl::buffer::PoolingKernel<T>);
+      kernel_ = make_unique<opencl::buffer::PoolingKernel<T>>();
     }
   }
   MaceStatus Run(OpContext *context) override {
diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc
index f4a147cc7b8191f5323cf38acd532830a44948c9..068212f204d85a3129d1f7ad9e9cbe0cfca06491 100644
--- a/mace/ops/reduce.cc
+++ b/mace/ops/reduce.cc
@@ -25,6 +25,7 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/reduce.h"
 #endif  // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"

 namespace mace {
 namespace ops {
@@ -84,7 +85,7 @@ class ReduceOp : public ReduceOpBase {
  private:
  void Simplify(const Tensor *input) {
    std::vector<bool> bitmap(static_cast<size_t>(input->dim_size()), false);
-    if (axis_.size() == 0) {
+    if (axis_.empty()) {
      for (int i = 0; i < input->dim_size(); ++i) {
        bitmap[i] = true;
      }
@@ -93,9 +94,9 @@
      int index = axis_[i] >= 0 ?
axis_[i] : axis_[i] + input->dim_size(); - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && DataTypeToEnum::value != DT_UINT8 + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df && DataTypeToEnum::value != DT_UINT8 && input->dim_size() == 4) { if (index == 1 || index == 2) index = index + 1; else if (index == 3) index = 1; @@ -847,9 +848,9 @@ class ReduceOp : public ReduceOpBase { explicit ReduceOp(OpConstructContext *context) : ReduceOpBase(context) { if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ReduceKernel(reduce_type_, - axis_, - keep_dims_)); + kernel_ = make_unique>(reduce_type_, + axis_, + keep_dims_); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/reduce_benchmark.cc b/mace/ops/reduce_benchmark.cc index d97131672c2fba7d988b0e5118a410b54acc571a..1d5fbe33ccb10dc7ffbef9b00353ed93889691fd 100644 --- a/mace/ops/reduce_benchmark.cc +++ b/mace/ops/reduce_benchmark.cc @@ -38,6 +38,7 @@ void Reduce(int iters, int batch, int channels, .Input("Input") .AddIntsArg("axis", axis) .Output("OutputImage") + .AddIntArg("has_data_format", 1) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/reduce_test.cc b/mace/ops/reduce_test.cc index 78a9f9345a8ca4da9eae0a0beedcb8dd1fbed49c..fc284084b25dfe7aac2c6fb936953dbe98e75212 100644 --- a/mace/ops/reduce_test.cc +++ b/mace/ops/reduce_test.cc @@ -44,7 +44,7 @@ void Simple(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", keepdims ? 1 : 0) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); // Run @@ -56,7 +56,7 @@ void Simple(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", keepdims ? 1 : 0) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); // Run @@ -84,7 +84,7 @@ void Simple3D(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", keepdims ? 
1 : 0) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); // Run @@ -588,7 +588,7 @@ void RandomTest(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); // Run @@ -600,7 +600,7 @@ void RandomTest(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("OPENCLOutput") .Finalize(net.NewOperatorDef()); // Run @@ -662,7 +662,7 @@ void TestQuant(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .AddIntArg("T", DT_FLOAT) .Finalize(net.NewOperatorDef()); @@ -687,7 +687,7 @@ void TestQuant(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .AddIntArg("T", DT_UINT8) .Finalize(net.NewOperatorDef()); net.RunOp(); diff --git a/mace/ops/ref/conv_2d.cc b/mace/ops/ref/conv_2d.cc index 4707d9229bd9ce5cac322bcc8b0294521e061062..e5b7952a334b8fb5bcc4d13d8264fc6f76d8c41d 100644 --- a/mace/ops/ref/conv_2d.cc +++ b/mace/ops/ref/conv_2d.cc @@ -16,7 +16,6 @@ #include "mace/ops/ref/conv_2d.h" #include -#include "mace/ops/common/conv_pool_2d_util.h" namespace mace { namespace ops { @@ -30,31 +29,36 @@ MaceStatus Conv2d::Compute(const OpContext *context, const std::vector in_shape = input->shape(); const std::vector filter_shape = filter->shape(); - const std::vector out_shape = output->shape(); - const std::vector stride_hw{stride_h_, stride_w_}; - const std::vector dilation_hw{dilation_h_, dilation_w_}; - const std::vector paddings{pad_h_, pad_w_}; - const index_t pad_top = pad_h_ >> 1; - const index_t pad_left = pad_w_ >> 1; - - std::vector output_shape(4); - - CalcOutputSize(in_shape.data(), - NCHW, - filter_shape.data(), - OIHW, - paddings.data(), - dilation_hw.data(), - stride_hw.data(), - RoundType::FLOOR, - output_shape.data()); - output->Resize(output_shape); - + std::vector out_shape(4); + + std::vector paddings(2); + if (paddings_.empty()) { + CalcNCHWPaddingAndOutputSize(input->shape().data(), + filter->shape().data(), + dilations_.data(), + strides_.data(), + padding_type_, + out_shape.data(), + paddings.data()); + } else { + paddings = paddings_; + CalcNCHWOutputSize(input->shape().data(), + filter->shape().data(), + paddings_.data(), + dilations_.data(), + strides_.data(), + RoundType::FLOOR, + out_shape.data()); + } + const index_t pad_top = paddings[0] >> 1; + const index_t pad_left = paddings[1] >> 1; + output->Resize(out_shape); const index_t in_image_size = in_shape[2] * in_shape[3]; const index_t out_image_size = out_shape[2] * out_shape[3]; const index_t in_batch_size = filter_shape[1] * in_image_size; const index_t out_batch_size = filter_shape[0] * out_image_size; const index_t filter_size = filter_shape[2] * filter_shape[3]; + Tensor::MappingGuard input_guard(input); Tensor::MappingGuard filter_guard(filter); Tensor::MappingGuard output_guard(output); @@ -86,8 +90,10 @@ MaceStatus Conv2d::Compute(const OpContext *context, for 
(index_t kh = 0; kh < filter_shape[2]; ++kh) { for (index_t kw = 0; kw < filter_shape[3]; ++kw) { - const index_t ih = -pad_top + h * stride_h_ + kh * dilation_h_; - const index_t iw = -pad_left + w * stride_w_ + kw * dilation_w_; + const index_t + ih = -pad_top + h * strides_[0] + kh * dilations_[0]; + const index_t + iw = -pad_left + w * strides_[1] + kw * dilations_[1]; if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { sum += in_ptr_base[ih * in_width + iw] * filter_ptr[kw]; } diff --git a/mace/ops/ref/conv_2d.h b/mace/ops/ref/conv_2d.h index e99af5cf0093dd7ab419d7b321ed36bf941bfeb3..10baac8cb86abcdd1f88993ae12fb752f589fcb7 100644 --- a/mace/ops/ref/conv_2d.h +++ b/mace/ops/ref/conv_2d.h @@ -16,9 +16,12 @@ #ifndef MACE_OPS_REF_CONV_2D_H_ #define MACE_OPS_REF_CONV_2D_H_ +#include + #include "mace/public/mace.h" #include "mace/core/tensor.h" #include "mace/core/op_context.h" +#include "mace/ops/common/conv_pool_2d_util.h" namespace mace { namespace ops { @@ -27,30 +30,39 @@ namespace ref { template class Conv2d { public: - Conv2d(int stride_h, int stride_w, int dilation_h, int dilation_w); + Conv2d(const std::vector strides, + const std::vector dilations, + const std::vector paddings, + const Padding padding_type) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + padding_type_(padding_type) {} ~Conv2d() {} MaceStatus Compute( const OpContext *context, const Tensor *input, const Tensor *filter, Tensor *output); + + private: + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; }; template<> class Conv2d { public: - Conv2d(int pad_h, - int pad_w, - int stride_h, - int stride_w, - int dilation_h, - int dilation_w) - : pad_h_(pad_h), - pad_w_(pad_w), - stride_h_(stride_h), - stride_w_(stride_w), - dilation_h_(dilation_h), - dilation_w_(dilation_w) {} + Conv2d(const std::vector strides, + const std::vector dilations, + const std::vector paddings, + const Padding padding_type) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + padding_type_(padding_type) {} ~Conv2d() {} // Always row-major after transpose MaceStatus Compute( @@ -60,12 +72,10 @@ class Conv2d { Tensor *output); private: - int pad_h_; - int pad_w_; - int stride_h_; - int stride_w_; - int dilation_h_; - int dilation_w_; + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; }; } // namespace ref diff --git a/mace/ops/reshape.cc b/mace/ops/reshape.cc index f082cf31a9dbf35aad4ce2ca65c5f4cb6d5679e7..98ea215e7678b32170bf98d415b0c88ec23a60e6 100644 --- a/mace/ops/reshape.cc +++ b/mace/ops/reshape.cc @@ -15,6 +15,7 @@ #include #include "mace/core/operator.h" +#include "mace/utils/math.h" namespace mace { namespace ops { @@ -23,16 +24,12 @@ template class ReshapeOp : public Operation { public: explicit ReshapeOp(OpConstructContext *context) - : Operation(context) {} + : Operation(context), + has_df_(Operation::GetOptionalArg("has_data_format", 0)) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); const Tensor *input = this->Input(INPUT); - const std::vector &input_shape = input->shape(); - int axis = Operation::GetOptionalArg("reshape_axis", 0); - int num_axes = Operation::GetOptionalArg("num_axes", -1); - MACE_CHECK(axis == 0 && num_axes == -1, - "Only support axis = 0 and num_axes = -1"); const Tensor *shape = this->Input(SHAPE); const index_t num_dims = shape->dim_size() == 0 ? 
0 : shape->dim(0); Tensor::MappingGuard shape_guard(shape); @@ -40,20 +37,16 @@ class ReshapeOp : public Operation { int unknown_idx = -1; index_t product = 1; - std::vector out_shape; + std::vector out_shape(num_dims); index_t n = 0; for (int i = 0; i < num_dims; ++i) { if (shape_data[i] == -1) { MACE_CHECK(unknown_idx == -1, "Only one input size may be -1"); unknown_idx = i; - out_shape.push_back(1); - } else if (shape_data[i] == 0) { - MACE_CHECK(shape_data[i] == 0, "Shape should be 0"); - out_shape.push_back(input_shape[i]); - product *= input_shape[i]; + out_shape[i] = 1; } else { - MACE_CHECK(shape_data[i] > 0, "Shape must be non-negative: ", + MACE_CHECK(shape_data[i] >= 0, "Shape must be non-negative: ", shape_data[i]); if (shape_data[i] == 0) { MACE_CHECK(i < input->dim_size(), @@ -62,7 +55,7 @@ class ReshapeOp : public Operation { } else { n = shape_data[i]; } - out_shape.push_back(n); + out_shape[i] = n; product *= n; } } @@ -77,14 +70,13 @@ class ReshapeOp : public Operation { } Tensor *output = this->Output(OUTPUT); // NHWC -> NCHW - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && D == DeviceType::CPU + + if (has_df_ && D == DeviceType::CPU && out_shape.size() == 4 && shape->is_weight()) { std::vector dst_dims = {0, 3, 1, 2}; - std::vector out_shape_gpu = TransposeShape( + std::vector trans_shape = TransposeShape( out_shape, dst_dims); - out_shape = out_shape_gpu; + out_shape = trans_shape; } output->ReuseTensorBuffer(*input); @@ -93,6 +85,9 @@ class ReshapeOp : public Operation { return MaceStatus::MACE_SUCCESS; } + private: + bool has_df_; + private: MACE_OP_INPUT_TAGS(INPUT, SHAPE); MACE_OP_OUTPUT_TAGS(OUTPUT); diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc index 9334e850fa214ab710969e7f5e7b3e28f17b303d..236e670f1d26b97471e219ba746102d777a008b5 100644 --- a/mace/ops/resize_bicubic.cc +++ b/mace/ops/resize_bicubic.cc @@ -23,6 +23,7 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_bicubic.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -197,9 +198,8 @@ class ResizeBicubicOp : public Operation { "size", {-1, -1}); MACE_CHECK(size.size() == 2); if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ResizeBicubicKernel(align_corners, - size[0], - size[1])); + kernel_ = make_unique>( + align_corners, size[0], size[1]); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index e4c2f3fc3c64bb08410c709bd2f8b405363dcdd5..46720b3c29d32d01f82902a0bfcc49071aa6aa2a 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -19,6 +19,7 @@ #include #include "mace/core/operator.h" +#include "mace/utils/memory.h" #include "mace/utils/quantize.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_bilinear.h" @@ -332,9 +333,8 @@ class ResizeBilinearOp : public Operation { "size", {-1, -1}); MACE_CHECK(size.size() == 2); if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ResizeBilinearKernel(align_corners, - size[0], - size[1])); + kernel_ = make_unique>( + align_corners, size[0], size[1]); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/resize_nearest_neighbor.cc b/mace/ops/resize_nearest_neighbor.cc index c40fd46dce86d382df5dec340fbd66cf143f782d..5cdbf07fa101881c4b1c5a4b66476a01199cacee 100644 --- a/mace/ops/resize_nearest_neighbor.cc +++ 
b/mace/ops/resize_nearest_neighbor.cc @@ -22,6 +22,7 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_nearest_neighbor.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -142,8 +143,8 @@ class ResizeNearestNeighborOp : public Operation { bool align_corners = Operation::GetOptionalArg( "align_corners", false); if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ResizeNearestNeighborKernel( - align_corners)); + kernel_ = make_unique>( + align_corners); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/sgemm.cc b/mace/ops/sgemm.cc deleted file mode 100644 index 1601aac2cd774d9b35406d30dceea56e27469c93..0000000000000000000000000000000000000000 --- a/mace/ops/sgemm.cc +++ /dev/null @@ -1,1182 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include - -#include "mace/ops/sgemm.h" -#include "mace/core/runtime/cpu/cpu_runtime.h" - -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) -#define vaddvq_f32(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3]) -#endif - -namespace mace { -namespace ops { - -void SGemm::operator()(const SGemmMatrixMap &lhs, - const SGemmMatrixMap &rhs, - SGemmMatrixMap *result, - ScratchBuffer *scratch_buffer) { - if (lhs.is_const() && !rhs.is_const()) { - SGemmMatrixMap lhs_transpose = lhs.transpose(); - SGemmMatrixMap rhs_transpose = rhs.transpose(); - SGemmMatrixMap result_transpose = result->transpose(); - return operator()(rhs_transpose, - lhs_transpose, - &result_transpose, - scratch_buffer); - } - - if (scratch_buffer != nullptr) { - index_t total_size = result->size(); - if (!lhs.is_const()) { - total_size += lhs.size(); - } - if (!rhs.is_const()) { - total_size += rhs.size(); - } - scratch_buffer->GrowSize(total_size * sizeof(float)); - - if (!lhs.is_const()) { - packed_lhs_.reset(new Tensor(scratch_buffer->Scratch( - lhs.size() * sizeof(float)), DT_FLOAT)); - } - if (!rhs.is_const()) { - packed_rhs_.reset(new Tensor(scratch_buffer->Scratch( - rhs.size() * sizeof(float)), DT_FLOAT)); - } - packed_result_.reset(new Tensor(scratch_buffer->Scratch( - result->size() * sizeof(float)), DT_FLOAT)); - } - - if (packed_lhs_.get() == nullptr) { - packed_lhs_.reset(new Tensor(GetCPUAllocator(), DT_FLOAT)); - packed_lhs_->Resize({lhs.size()}); - } - if (packed_rhs_.get() == nullptr) { - packed_rhs_.reset(new Tensor(GetCPUAllocator(), DT_FLOAT)); - packed_rhs_->Resize({rhs.size()}); - } - if (packed_result_.get() == nullptr) { - packed_result_.reset(new Tensor(GetCPUAllocator(), DT_FLOAT)); - packed_result_->Resize({result->size()}); - } - - if (!lhs.is_const() || !packed_) { - PackLhs(lhs, packed_lhs_.get()); - if (lhs.is_const()) { - AdviseFree(reinterpret_cast(const_cast(lhs.data())), - lhs.size() * sizeof(float)); - } - } - if (!rhs.is_const() || !packed_) { - PackRhs(rhs, packed_rhs_.get()); - if (rhs.is_const()) { - 
AdviseFree(reinterpret_cast(const_cast(rhs.data())), - rhs.size() * sizeof(float)); - } - } - packed_ = true; - - RunInternal(*packed_lhs_, - *packed_rhs_, - lhs.batch(), - lhs.row(), - lhs.col(), - rhs.col(), - packed_result_.get()); - - UnPack(*packed_result_, result); -} - -void SGemm::Run(const float *A, - const float *B, - const index_t batch, - const index_t height_a, - const index_t width_a, - const index_t height_b, - const index_t width_b, - const bool transpose_a, - const bool transpose_b, - const bool is_a_weight, - const bool is_b_weight, - float *C, - ScratchBuffer *scratch_buffer) { - index_t height_c = height_a; - index_t width_c = width_b; - if (transpose_a) { - height_c = width_a; - } - if (transpose_b) { - width_c = height_b; - } - - SGemmMatrixMap matrix_a = - SGemmMatrixMap(batch, - height_a, - width_a, - ops::SGemmRowMajor, - A, - is_a_weight); - SGemmMatrixMap matrix_b = - ops::SGemmMatrixMap(batch, - height_b, - width_b, - ops::SGemmRowMajor, - B, - is_b_weight); - if (transpose_a) { - matrix_a = matrix_a.transpose(); - } - if (transpose_b) { - matrix_b = matrix_b.transpose(); - } - SGemmMatrixMap - matrix_c(batch, height_c, width_c, ops::SGemmRowMajor, C); - operator()(matrix_a, matrix_b, &matrix_c, scratch_buffer); -} - -#if defined(MACE_ENABLE_NEON) -#if defined(__aarch64__) - -// calculate 8 rows, 4 cols for each depth -#define MACE_SGEMM_PART_CAL_R8_C4_D1(D, VD, VDN) \ - c0 = vfmaq_laneq_f32(c0, b##D, a##VD, 0); \ - c1 = vfmaq_laneq_f32(c1, b##D, a##VD, 1); \ - c2 = vfmaq_laneq_f32(c2, b##D, a##VD, 2); \ - c3 = vfmaq_laneq_f32(c3, b##D, a##VD, 3); \ - c4 = vfmaq_laneq_f32(c4, b##D, a##VDN, 0); \ - c5 = vfmaq_laneq_f32(c5, b##D, a##VDN, 1); \ - c6 = vfmaq_laneq_f32(c6, b##D, a##VDN, 2); \ - c7 = vfmaq_laneq_f32(c7, b##D, a##VDN, 3); - -// calculate 4 rows, 4 cols for each depth -#define MACE_SGEMM_PART_CAL_R4_C4_D1(D) \ - c0 = vfmaq_laneq_f32(c0, b##D, a##D, 0); \ - c1 = vfmaq_laneq_f32(c1, b##D, a##D, 1); \ - c2 = vfmaq_laneq_f32(c2, b##D, a##D, 2); \ - c3 = vfmaq_laneq_f32(c3, b##D, a##D, 3); - -// calculate 4 cols for 8 depths for each row -#define MACE_SGEMM_PART_CAL_R1_C4_D8(R, VR, VRN) \ - c##R = vfmaq_laneq_f32(c##R, b0, a##VR, 0); \ - c##R = vfmaq_laneq_f32(c##R, b1, a##VR, 1); \ - c##R = vfmaq_laneq_f32(c##R, b2, a##VR, 2); \ - c##R = vfmaq_laneq_f32(c##R, b3, a##VR, 3); \ - c##R = vfmaq_laneq_f32(c##R, b4, a##VRN, 0); \ - c##R = vfmaq_laneq_f32(c##R, b5, a##VRN, 1); \ - c##R = vfmaq_laneq_f32(c##R, b6, a##VRN, 2); \ - c##R = vfmaq_laneq_f32(c##R, b7, a##VRN, 3); - -// calculate 4 cols for 4 depths for each row -#define MACE_SGEMM_PART_CAL_R1_C4_D4(R) \ - c##R = vfmaq_laneq_f32(c##R, b0, a##R, 0); \ - c##R = vfmaq_laneq_f32(c##R, b1, a##R, 1); \ - c##R = vfmaq_laneq_f32(c##R, b2, a##R, 2); \ - c##R = vfmaq_laneq_f32(c##R, b3, a##R, 3); - -// calculate 8 cols for 4 depths for each row -#define MACE_SGEMM_PART_CAL_R1_C8_D4(VR, VRN, R) \ - c##VR = vfmaq_laneq_f32(c##VR, b0, a##R, 0); \ - c##VR = vfmaq_laneq_f32(c##VR, b2, a##R, 1); \ - c##VR = vfmaq_laneq_f32(c##VR, b4, a##R, 2); \ - c##VR = vfmaq_laneq_f32(c##VR, b6, a##R, 3); \ - c##VRN = vfmaq_laneq_f32(c##VRN, b1, a##R, 0); \ - c##VRN = vfmaq_laneq_f32(c##VRN, b3, a##R, 1); \ - c##VRN = vfmaq_laneq_f32(c##VRN, b5, a##R, 2); \ - c##VRN = vfmaq_laneq_f32(c##VRN, b7, a##R, 3); - -#else - -#define MACE_SGEMM_PART_CAL_R8_C4_D1(D, VD, VDN) \ - c0 = vmlaq_lane_f32(c0, b##D, vget_low_f32(a##VD), 0); \ - c1 = vmlaq_lane_f32(c1, b##D, vget_low_f32(a##VD), 1); \ - c2 = vmlaq_lane_f32(c2, b##D, 
vget_high_f32(a##VD), 0); \ - c3 = vmlaq_lane_f32(c3, b##D, vget_high_f32(a##VD), 1); \ - c4 = vmlaq_lane_f32(c4, b##D, vget_low_f32(a##VDN), 0); \ - c5 = vmlaq_lane_f32(c5, b##D, vget_low_f32(a##VDN), 1); \ - c6 = vmlaq_lane_f32(c6, b##D, vget_high_f32(a##VDN), 0); \ - c7 = vmlaq_lane_f32(c7, b##D, vget_high_f32(a##VDN), 1); - -#define MACE_SGEMM_PART_CAL_R4_C4_D1(D) \ - c0 = vmlaq_lane_f32(c0, b##D, vget_low_f32(a##D), 0); \ - c1 = vmlaq_lane_f32(c1, b##D, vget_low_f32(a##D), 1); \ - c2 = vmlaq_lane_f32(c2, b##D, vget_high_f32(a##D), 0); \ - c3 = vmlaq_lane_f32(c3, b##D, vget_high_f32(a##D), 1); - -#define MACE_SGEMM_PART_CAL_R1_C4_D8(R, VR, VRN) \ - c##R = vmlaq_lane_f32(c##R, b0, vget_low_f32(a##VR), 0); \ - c##R = vmlaq_lane_f32(c##R, b1, vget_low_f32(a##VR), 1); \ - c##R = vmlaq_lane_f32(c##R, b2, vget_high_f32(a##VR), 0); \ - c##R = vmlaq_lane_f32(c##R, b3, vget_high_f32(a##VR), 1); \ - c##R = vmlaq_lane_f32(c##R, b4, vget_low_f32(a##VRN), 0); \ - c##R = vmlaq_lane_f32(c##R, b5, vget_low_f32(a##VRN), 1); \ - c##R = vmlaq_lane_f32(c##R, b6, vget_high_f32(a##VRN), 0); \ - c##R = vmlaq_lane_f32(c##R, b7, vget_high_f32(a##VRN), 1); - -#define MACE_SGEMM_PART_CAL_R1_C4_D4(R) \ - c##R = vmlaq_lane_f32(c##R, b0, vget_low_f32(a##R), 0); \ - c##R = vmlaq_lane_f32(c##R, b1, vget_low_f32(a##R), 1); \ - c##R = vmlaq_lane_f32(c##R, b2, vget_high_f32(a##R), 0); \ - c##R = vmlaq_lane_f32(c##R, b3, vget_high_f32(a##R), 1); - -#endif // __aarch64__ -#endif // MACE_ENABLE_NEON - -void SGemm::RunInternal(const PackedBlock &lhs, - const PackedBlock &rhs, - const index_t batch, - const index_t height, - const index_t depth, - const index_t width, - PackedBlock *result) { - const float *lhs_data = lhs.data(); - const float *rhs_data = rhs.data(); - float *result_data = result->mutable_data(); - -#define MACE_SGEMM_RUN_PER_BATCH \ - for (index_t b = 0; b < batch; ++b) { \ - RunPerBatch(lhs_data + b * height * depth, \ - rhs_data + b * depth * width, \ - height, \ - depth, \ - width, \ - result_data + b * height * width); \ - } - - if (batch >= MaceOpenMPThreadCount) { -#pragma omp parallel for schedule(runtime) - MACE_SGEMM_RUN_PER_BATCH - } else { - MACE_SGEMM_RUN_PER_BATCH - } - -#undef MACE_SGEMM_RUN_PER_BATCH -} - -void SGemm::RunPerBatch(const float *lhs_data, - const float *rhs_data, - const index_t height, - const index_t depth, - const index_t width, - float *result_data) { -#if defined(MACE_ENABLE_NEON) - const index_t block_w = width >> 2; - const index_t remain_w = width - (block_w << 2); -#else - const index_t remain_w = width; -#endif - -#if defined(MACE_ENABLE_NEON) - // TODO(liyin): make better use l2(l1) cache, try to fit as much lhs data as - // as possible to cache, by tiling lhs by height and rhs by width. 
- - // w: 4 -#pragma omp parallel for schedule(runtime) - for (index_t bw = 0; bw < block_w; ++bw) { - index_t remain_h = height; - index_t block_h = 0; - - const float *lhs_ptr = lhs_data; - float *res_ptr = result_data + height * (bw << 2); - -#if defined(__aarch64__) - block_h = remain_h >> 3; - remain_h -= (block_h << 3); - - // h: 8 - for (index_t bh = 0; bh < block_h; ++bh) { - const float *rhs_ptr = rhs_data + depth * (bw << 2); - - index_t remain_d = depth; - index_t block_d = remain_d >> 3; - remain_d -= (block_d << 3); - - float32x4_t c0, c1, c2, c3, c4, c5, c6, c7; - c0 = vdupq_n_f32(0.f); - c1 = vdupq_n_f32(0.f); - c2 = vdupq_n_f32(0.f); - c3 = vdupq_n_f32(0.f); - c4 = vdupq_n_f32(0.f); - c5 = vdupq_n_f32(0.f); - c6 = vdupq_n_f32(0.f); - c7 = vdupq_n_f32(0.f); - - // d: 8 - for (index_t bd = 0; bd < block_d; ++bd) { - // 8.8.4 - float32x4_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, - a14, a15; - float32x4_t b0, b1, b2, b3, b4, b5, b6, b7; - - a0 = vld1q_f32(lhs_ptr); - a1 = vld1q_f32(lhs_ptr + 4); - a2 = vld1q_f32(lhs_ptr + 8); - a3 = vld1q_f32(lhs_ptr + 12); - a4 = vld1q_f32(lhs_ptr + 16); - a5 = vld1q_f32(lhs_ptr + 20); - a6 = vld1q_f32(lhs_ptr + 24); - a7 = vld1q_f32(lhs_ptr + 28); - a8 = vld1q_f32(lhs_ptr + 32); - a9 = vld1q_f32(lhs_ptr + 36); - a10 = vld1q_f32(lhs_ptr + 40); - a11 = vld1q_f32(lhs_ptr + 44); - a12 = vld1q_f32(lhs_ptr + 48); - a13 = vld1q_f32(lhs_ptr + 52); - a14 = vld1q_f32(lhs_ptr + 56); - a15 = vld1q_f32(lhs_ptr + 60); - - b0 = vld1q_f32(rhs_ptr); - b1 = vld1q_f32(rhs_ptr + 4); - b2 = vld1q_f32(rhs_ptr + 8); - b3 = vld1q_f32(rhs_ptr + 12); - b4 = vld1q_f32(rhs_ptr + 16); - b5 = vld1q_f32(rhs_ptr + 20); - b6 = vld1q_f32(rhs_ptr + 24); - b7 = vld1q_f32(rhs_ptr + 28); - - MACE_SGEMM_PART_CAL_R8_C4_D1(0, 0, 1); // d = 1 - MACE_SGEMM_PART_CAL_R8_C4_D1(1, 2, 3); // d = 2 - MACE_SGEMM_PART_CAL_R8_C4_D1(2, 4, 5); - MACE_SGEMM_PART_CAL_R8_C4_D1(3, 6, 7); - MACE_SGEMM_PART_CAL_R8_C4_D1(4, 8, 9); - MACE_SGEMM_PART_CAL_R8_C4_D1(5, 10, 11); - MACE_SGEMM_PART_CAL_R8_C4_D1(6, 12, 13); - MACE_SGEMM_PART_CAL_R8_C4_D1(7, 14, 15); - - lhs_ptr += 64; - rhs_ptr += 32; - } - - block_d = remain_d >> 2; - remain_d -= (block_d << 2); - - // d: 4 - for (index_t bd = 0; bd < block_d; ++bd) { - // 8.4.4 - float32x4_t a0, a1, a2, a3, a4, a5, a6, a7; - float32x4_t b0, b1, b2, b3; - - a0 = vld1q_f32(lhs_ptr); - a1 = vld1q_f32(lhs_ptr + 4); - a2 = vld1q_f32(lhs_ptr + 8); - a3 = vld1q_f32(lhs_ptr + 12); - a4 = vld1q_f32(lhs_ptr + 16); - a5 = vld1q_f32(lhs_ptr + 20); - a6 = vld1q_f32(lhs_ptr + 24); - a7 = vld1q_f32(lhs_ptr + 28); - - b0 = vld1q_f32(rhs_ptr); - b1 = vld1q_f32(rhs_ptr + 4); - b2 = vld1q_f32(rhs_ptr + 8); - b3 = vld1q_f32(rhs_ptr + 12); - - MACE_SGEMM_PART_CAL_R8_C4_D1(0, 0, 1); // d = 1 - MACE_SGEMM_PART_CAL_R8_C4_D1(1, 2, 3); // d = 2 - MACE_SGEMM_PART_CAL_R8_C4_D1(2, 4, 5); - MACE_SGEMM_PART_CAL_R8_C4_D1(3, 6, 7); - - lhs_ptr += 32; - rhs_ptr += 16; - } - - // TODO(liyin): handle remain by each case - // d: remain - for (index_t d = 0; d < remain_d; ++d) { - // 8.1.4 - float32x4_t a0, a1; - float32x4_t b0; - - a0 = vld1q_f32(lhs_ptr); - a1 = vld1q_f32(lhs_ptr + 4); - - b0 = vld1q_f32(rhs_ptr); - - MACE_SGEMM_PART_CAL_R8_C4_D1(0, 0, 1); // d = 1 - - lhs_ptr += 8; - rhs_ptr += 4; - } - - vst1q_f32(res_ptr, c0); - vst1q_f32(res_ptr + 4, c1); - vst1q_f32(res_ptr + 8, c2); - vst1q_f32(res_ptr + 12, c3); - vst1q_f32(res_ptr + 16, c4); - vst1q_f32(res_ptr + 20, c5); - vst1q_f32(res_ptr + 24, c6); - vst1q_f32(res_ptr + 28, c7); - - res_ptr += 32; - } // bh: 8 
-#endif // __aarch64__ - - // h: 4 - block_h = remain_h >> 2; - remain_h -= (block_h << 2); - - for (index_t bh = 0; bh < block_h; ++bh) { - const float *rhs_ptr = rhs_data + depth * (bw << 2); - - index_t remain_d = depth; - index_t block_d = 0; - - float32x4_t c0, c1, c2, c3; - c0 = vdupq_n_f32(0.f); - c1 = vdupq_n_f32(0.f); - c2 = vdupq_n_f32(0.f); - c3 = vdupq_n_f32(0.f); - - // d: 8 - block_d = remain_d >> 3; - remain_d -= (block_d << 3); - -#if defined(__aarch64__) - for (index_t bd = 0; bd < block_d; ++bd) { - // 4.8.4 - float32x4_t a0, a1, a2, a3, a4, a5, a6, a7; - float32x4_t b0, b1, b2, b3, b4, b5, b6, b7; - - a0 = vld1q_f32(lhs_ptr); - a1 = vld1q_f32(lhs_ptr + 4); - a2 = vld1q_f32(lhs_ptr + 8); - a3 = vld1q_f32(lhs_ptr + 12); - a4 = vld1q_f32(lhs_ptr + 16); - a5 = vld1q_f32(lhs_ptr + 20); - a6 = vld1q_f32(lhs_ptr + 24); - a7 = vld1q_f32(lhs_ptr + 28); - - b0 = vld1q_f32(rhs_ptr); - b1 = vld1q_f32(rhs_ptr + 4); - b2 = vld1q_f32(rhs_ptr + 8); - b3 = vld1q_f32(rhs_ptr + 12); - b4 = vld1q_f32(rhs_ptr + 16); - b5 = vld1q_f32(rhs_ptr + 20); - b6 = vld1q_f32(rhs_ptr + 24); - b7 = vld1q_f32(rhs_ptr + 28); - - MACE_SGEMM_PART_CAL_R4_C4_D1(0); // d = 1 - MACE_SGEMM_PART_CAL_R4_C4_D1(1); // d = 2 - MACE_SGEMM_PART_CAL_R4_C4_D1(2); - MACE_SGEMM_PART_CAL_R4_C4_D1(3); - MACE_SGEMM_PART_CAL_R4_C4_D1(4); - MACE_SGEMM_PART_CAL_R4_C4_D1(5); - MACE_SGEMM_PART_CAL_R4_C4_D1(6); - MACE_SGEMM_PART_CAL_R4_C4_D1(7); - - lhs_ptr += 32; - rhs_ptr += 32; - } -#else // arm v7 - // 4.8.4 - if (block_d > 0) { - asm volatile( - "0: \n" - - "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" - "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" - "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" - - "vld1.f32 {d20-d21}, [%[rhs_ptr]]! \n" - "vld1.f32 {d22-d23}, [%[rhs_ptr]]! \n" - "vld1.f32 {d24-d25}, [%[rhs_ptr]]! \n" - - "vmla.f32 %q[c0], q10, d0[0] \n" - "vmla.f32 %q[c1], q10, d0[1] \n" - "vmla.f32 %q[c2], q10, d1[0] \n" - "vmla.f32 %q[c3], q10, d1[1] \n" - - "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" - "vld1.f32 {d26-d27}, [%[rhs_ptr]]! \n" - - "vmla.f32 %q[c0], q11, d2[0] \n" - "vmla.f32 %q[c1], q11, d2[1] \n" - "vmla.f32 %q[c2], q11, d3[0] \n" - "vmla.f32 %q[c3], q11, d3[1] \n" - - "vld1.f32 {d8-d9}, [%[lhs_ptr]]! \n" - "vld1.f32 {d28-d29}, [%[rhs_ptr]]! \n" - - "vmla.f32 %q[c0], q12, d4[0] \n" - "vmla.f32 %q[c1], q12, d4[1] \n" - "vmla.f32 %q[c2], q12, d5[0] \n" - "vmla.f32 %q[c3], q12, d5[1] \n" - - "vld1.f32 {d10-d11}, [%[lhs_ptr]]! \n" - "vld1.f32 {d30-d31}, [%[rhs_ptr]]! \n" - - "vmla.f32 %q[c0], q13, d6[0] \n" - "vmla.f32 %q[c1], q13, d6[1] \n" - "vmla.f32 %q[c2], q13, d7[0] \n" - "vmla.f32 %q[c3], q13, d7[1] \n" - - "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" - "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" - - "vld1.f32 {d20-d21}, [%[rhs_ptr]]! \n" - "vld1.f32 {d22-d23}, [%[rhs_ptr]]! 
\n" - - "vmla.f32 %q[c0], q14, d8[0] \n" - "vmla.f32 %q[c1], q14, d8[1] \n" - "vmla.f32 %q[c2], q14, d9[0] \n" - "vmla.f32 %q[c3], q14, d9[1] \n" - - "vmla.f32 %q[c0], q15, d10[0] \n" - "vmla.f32 %q[c1], q15, d10[1] \n" - "vmla.f32 %q[c2], q15, d11[0] \n" - "vmla.f32 %q[c3], q15, d11[1] \n" - - "vmla.f32 %q[c0], q10, d0[0] \n" - "vmla.f32 %q[c1], q10, d0[1] \n" - "vmla.f32 %q[c2], q10, d1[0] \n" - "vmla.f32 %q[c3], q10, d1[1] \n" - - "subs %[block_d], %[block_d], #1 \n" - - "vmla.f32 %q[c0], q11, d2[0] \n" - "vmla.f32 %q[c1], q11, d2[1] \n" - "vmla.f32 %q[c2], q11, d3[0] \n" - "vmla.f32 %q[c3], q11, d3[1] \n" - - "bne 0b \n" - : // outputs - [lhs_ptr] "+r"(lhs_ptr), - [rhs_ptr] "+r"(rhs_ptr), - [res_ptr] "+r"(res_ptr), - [block_d] "+r"(block_d), - [c0] "+w"(c0), - [c1] "+w"(c1), - [c2] "+w"(c2), - [c3] "+w"(c3) - : // inputs - : // clabbers - "cc", "memory", - "q0", "q1", "q2", "q3", "q4", "q5", - "q10", "q11", "q12", "q13", "q14", "q15"); - } -#endif // __aarch64__ - - // d: 4 - block_d = remain_d >> 2; - remain_d -= (block_d << 2); - - for (index_t bd = 0; bd < block_d; ++bd) { - // 4.4.4 - float32x4_t a0, a1, a2, a3; - float32x4_t b0, b1, b2, b3; - - a0 = vld1q_f32(lhs_ptr); - a1 = vld1q_f32(lhs_ptr + 4); - a2 = vld1q_f32(lhs_ptr + 8); - a3 = vld1q_f32(lhs_ptr + 12); - - b0 = vld1q_f32(rhs_ptr); - b1 = vld1q_f32(rhs_ptr + 4); - b2 = vld1q_f32(rhs_ptr + 8); - b3 = vld1q_f32(rhs_ptr + 12); - - MACE_SGEMM_PART_CAL_R4_C4_D1(0); // d = 1 - MACE_SGEMM_PART_CAL_R4_C4_D1(1); // d = 2 - MACE_SGEMM_PART_CAL_R4_C4_D1(2); - MACE_SGEMM_PART_CAL_R4_C4_D1(3); - - lhs_ptr += 16; - rhs_ptr += 16; - } - - // d: remain - for (index_t d = 0; d < remain_d; ++d) { - // 4.1.4 - float32x4_t a0; - float32x4_t b0; - - a0 = vld1q_f32(lhs_ptr); - - b0 = vld1q_f32(rhs_ptr); - - MACE_SGEMM_PART_CAL_R4_C4_D1(0); // d = 1 - - lhs_ptr += 4; - rhs_ptr += 4; - } - vst1q_f32(res_ptr, c0); - vst1q_f32(res_ptr + 4, c1); - vst1q_f32(res_ptr + 8, c2); - vst1q_f32(res_ptr + 12, c3); - - res_ptr += 16; - } // bh: 4 - - // h: 1 - for (index_t h = 0; h < remain_h; ++h) { - const float *rhs_ptr = rhs_data + depth * (bw << 2); - - index_t remain_d = depth; - index_t block_d = 0; - - float32x4_t c0 = vdupq_n_f32(0.f); - - // d: 8 - block_d = remain_d >> 3; - remain_d -= (block_d << 3); - - for (index_t bd = 0; bd < block_d; ++bd) { - // 1.8.4 - float32x4_t a0, a1; - float32x4_t b0, b1, b2, b3, b4, b5, b6, b7; - - a0 = vld1q_f32(lhs_ptr); - a1 = vld1q_f32(lhs_ptr + 4); - - b0 = vld1q_f32(rhs_ptr); - b1 = vld1q_f32(rhs_ptr + 4); - b2 = vld1q_f32(rhs_ptr + 8); - b3 = vld1q_f32(rhs_ptr + 12); - b4 = vld1q_f32(rhs_ptr + 16); - b5 = vld1q_f32(rhs_ptr + 20); - b6 = vld1q_f32(rhs_ptr + 24); - b7 = vld1q_f32(rhs_ptr + 28); - - MACE_SGEMM_PART_CAL_R1_C4_D8(0, 0, 1); - - lhs_ptr += 8; - rhs_ptr += 32; - } - - block_d = remain_d >> 2; - remain_d -= (block_d << 2); - - // d: 4 - for (index_t bd = 0; bd < block_d; ++bd) { - // 1.4.4 - float32x4_t a0; - float32x4_t b0, b1, b2, b3; - - a0 = vld1q_f32(lhs_ptr); - - b0 = vld1q_f32(rhs_ptr); - b1 = vld1q_f32(rhs_ptr + 4); - b2 = vld1q_f32(rhs_ptr + 8); - b3 = vld1q_f32(rhs_ptr + 12); - - MACE_SGEMM_PART_CAL_R1_C4_D4(0); - - lhs_ptr += 4; - rhs_ptr += 16; - } - - // d: remain - float s0 = 0; - float s1 = 0; - float s2 = 0; - float s3 = 0; - for (index_t d = 0; d < remain_d; ++d) { - // 1.1.4 - s0 += lhs_ptr[0] * rhs_ptr[0]; - s1 += lhs_ptr[0] * rhs_ptr[1]; - s2 += lhs_ptr[0] * rhs_ptr[2]; - s3 += lhs_ptr[0] * rhs_ptr[3]; - lhs_ptr += 1; - rhs_ptr += 4; - } - float32x4_t c0_remain = {s0, s1, s2, s3}; - 
c0 += c0_remain; - - vst1q_f32(res_ptr, c0); - res_ptr += 4; - } // bh: remain - } // bw - -#endif // MACE_ENABLE_NEON - - // ========================== remain width =========================== - - result_data += (width - remain_w) * height; - rhs_data += (width - remain_w) * depth; - - // w: 1 -#pragma omp parallel for schedule(runtime) - for (index_t bw = 0; bw < remain_w; ++bw) { - index_t remain_h = height; - - const float *lhs_ptr = lhs_data; - float *res_ptr = result_data + height * bw; - -#if defined(MACE_ENABLE_NEON) - index_t block_h = 0; -#if defined(__aarch64__) - block_h = remain_h >> 3; - remain_h -= (block_h << 3); - - // h: 8 - for (index_t bh = 0; bh < block_h; ++bh) { - const float *rhs_ptr = rhs_data + depth * bw; - - index_t remain_d = depth; - - float32x4_t c0, c1; - c0 = vdupq_n_f32(0.f); - c1 = vdupq_n_f32(0.f); - - index_t block_d = remain_d >> 2; - remain_d -= (block_d << 2); - - // d: 4 - for (index_t bd = 0; bd < block_d; ++bd) { - // 8.4.1 - float32x4_t b0, b1, b2, b3, b4, b5, b6, b7; - float32x4_t a0; - - b0 = vld1q_f32(lhs_ptr); - b1 = vld1q_f32(lhs_ptr + 4); - b2 = vld1q_f32(lhs_ptr + 8); - b3 = vld1q_f32(lhs_ptr + 12); - b4 = vld1q_f32(lhs_ptr + 16); - b5 = vld1q_f32(lhs_ptr + 20); - b6 = vld1q_f32(lhs_ptr + 24); - b7 = vld1q_f32(lhs_ptr + 28); - - a0 = vld1q_f32(rhs_ptr); - - MACE_SGEMM_PART_CAL_R1_C8_D4(0, 1, 0); - - lhs_ptr += 32; - rhs_ptr += 4; - } - - // d: remain - for (index_t d = 0; d < remain_d; ++d) { - // 8.1.1 - float32x4_t b0, b1; - float32x4_t a0 = vdupq_n_f32(rhs_ptr[0]); - - b0 = vld1q_f32(lhs_ptr); - b1 = vld1q_f32(lhs_ptr + 4); - - c0 = vfmaq_laneq_f32(c0, b0, a0, 0); - c1 = vfmaq_laneq_f32(c1, b1, a0, 0); - - lhs_ptr += 8; - rhs_ptr += 1; - } - - vst1q_f32(res_ptr, c0); - vst1q_f32(res_ptr + 4, c1); - - res_ptr += 8; - } // bh: 8 -#endif - - // h: 4 - block_h = remain_h >> 2; - remain_h -= (block_h << 2); - - for (index_t bh = 0; bh < block_h; ++bh) { - const float *rhs_ptr = rhs_data + depth * bw; - - index_t remain_d = depth; - index_t block_d = 0; - - float32x4_t c0 = vdupq_n_f32(0.f); - - block_d = remain_d >> 2; - remain_d -= (block_d << 2); - - // d: 4 - for (index_t bd = 0; bd < block_d; ++bd) { - // 4.4.1 - float32x4_t b0, b1, b2, b3; - float32x4_t a0; - - b0 = vld1q_f32(lhs_ptr); - b1 = vld1q_f32(lhs_ptr + 4); - b2 = vld1q_f32(lhs_ptr + 8); - b3 = vld1q_f32(lhs_ptr + 12); - - a0 = vld1q_f32(rhs_ptr); - - MACE_SGEMM_PART_CAL_R1_C4_D4(0); - - lhs_ptr += 16; - rhs_ptr += 4; - } - - // d: remain - for (index_t d = 0; d < remain_d; ++d) { - // 4.1.1 - float32x4_t b0; - float32x2_t a0 = vdup_n_f32(rhs_ptr[0]); - - b0 = vld1q_f32(lhs_ptr); - - c0 = vmlaq_lane_f32(c0, b0, a0, 0); - - lhs_ptr += 4; - rhs_ptr += 1; - } - vst1q_f32(res_ptr, c0); - - res_ptr += 4; - } // bh: 4 - -#endif // MACE_ENABLE_NEON - - // h: 1 - for (index_t h = 0; h < remain_h; ++h) { - const float *rhs_ptr = rhs_data + depth * bw; - - index_t remain_d = depth; - - float sum = 0.f; - -#if defined(MACE_ENABLE_NEON) - index_t block_d = 0; - - float32x4_t c0, c1; - c0 = vdupq_n_f32(0.f); - c1 = vdupq_n_f32(0.f); - - block_d = remain_d >> 3; - remain_d -= (block_d << 3); - - // d: 8 - for (index_t bd = 0; bd < block_d; ++bd) { - // 1.8.1 - float32x4_t a0, a1; - float32x4_t b0, b1; - - a0 = vld1q_f32(lhs_ptr); - a1 = vld1q_f32(lhs_ptr + 4); - b0 = vld1q_f32(rhs_ptr); - b1 = vld1q_f32(rhs_ptr + 4); - - c0 = vmlaq_f32(c0, a0, b0); - c1 = vmlaq_f32(c1, a1, b1); - - lhs_ptr += 8; - rhs_ptr += 8; - } - - block_d = remain_d >> 2; - remain_d -= (block_d << 2); - - // d: 4 - for 
(index_t bd = 0; bd < block_d; ++bd) { - // 1.4.1 - float32x4_t a0; - float32x4_t b0; - - a0 = vld1q_f32(lhs_ptr); - b0 = vld1q_f32(rhs_ptr); - - c0 = vmlaq_f32(c0, a0, b0); - - lhs_ptr += 4; - rhs_ptr += 4; - } - sum += vaddvq_f32(c0); - sum += vaddvq_f32(c1); -#endif // MACE_ENABLE_NEON - - // d: remain - for (index_t d = 0; d < remain_d; ++d) { - // 1.1.1 - sum += lhs_ptr[0] * rhs_ptr[0]; - lhs_ptr += 1; - rhs_ptr += 1; - } - - *res_ptr = sum; - ++res_ptr; - } // bh: remain - } // bw -} - -void SGemm::PackLhs(const SGemmMatrixMap &lhs, - PackedBlock *packed_block) { - Pack(lhs, PackOrder::SGemmColMajor, packed_block); -} - -void SGemm::PackRhs(const SGemmMatrixMap &rhs, - PackedBlock *packed_block) { - Pack(rhs, PackOrder::SGemmRowMajor, packed_block); -} - -void SGemm::Pack(const SGemmMatrixMap &src, - const PackOrder order, - PackedBlock *packed_block) { - MACE_CHECK_NOTNULL(packed_block); - - const index_t height = src.row(); - const index_t width = src.col(); - auto packed_data = packed_block->mutable_data(); - -#define MACE_SGEMM_PACK_PER_BATCH \ - for (index_t b = 0; b < src.batch(); ++b) { \ - PackPerBatch(src, order, b, packed_data + b * height * width); \ - } - if (src.batch() >= MaceOpenMPThreadCount) { -#pragma omp parallel for schedule(runtime) - MACE_SGEMM_PACK_PER_BATCH - } else { - MACE_SGEMM_PACK_PER_BATCH - } -#undef MACE_SGEMM_PACK_PER_BATCH -} - -void SGemm::UnPack(const PackedBlock &packed_result, - SGemmMatrixMap *matrix_map) { - MACE_CHECK_NOTNULL(matrix_map); - - const index_t height = matrix_map->row(); - const index_t width = matrix_map->col(); - auto packed_data = packed_result.data(); - -#define MACE_SGEMM_UNPACK_PER_BATCH \ - for (index_t b = 0; b < matrix_map->batch(); ++b) { \ - UnPackPerBatch(packed_data + b * height * width, b, matrix_map); \ - } - - if (matrix_map->batch() >= MaceOpenMPThreadCount) { -#pragma omp parallel for schedule(runtime) - MACE_SGEMM_UNPACK_PER_BATCH - } else { - MACE_SGEMM_UNPACK_PER_BATCH - } -#undef MACE_SGEMM_UNPACK_PER_BATCH -} - -void SGemm::PackPerBatch(const SGemmMatrixMap &src, - const PackOrder order, - const index_t batch_index, - float *packed_data) { - MACE_CHECK_NOTNULL(packed_data); - - const index_t height = src.row(); - const index_t width = src.col(); - auto src_data = src.batch_data(batch_index); - - if (src.map_major() == Major::SGemmRowMajor - && order == PackOrder::SGemmColMajor) { - // This is for packing no-transpose lhs. 
- index_t h = 0; -#if defined(MACE_ENABLE_NEON) -#if defined(__aarch64__) -#pragma omp parallel for schedule(runtime) - for (index_t ih = h; ih <= height - 8; ih += 8) { - const float *src_data_ptr = src_data + ih * width; - float *packed_data_ptr = packed_data + ih * width; - for (index_t w = 0; w < width; ++w) { - const index_t src_offset = w; - const index_t packed_offset = w * 8; - float32x4_t vs0 = {src_data_ptr[src_offset], - src_data_ptr[src_offset + width], - src_data_ptr[src_offset + 2 * width], - src_data_ptr[src_offset + 3 * width]}; - float32x4_t vs1 = {src_data_ptr[src_offset + 4 * width], - src_data_ptr[src_offset + 5 * width], - src_data_ptr[src_offset + 6 * width], - src_data_ptr[src_offset + 7 * width]}; - vst1q_f32(packed_data_ptr + packed_offset, vs0); - vst1q_f32(packed_data_ptr + packed_offset + 4, vs1); - } - } - h += (height - h) / 8 * 8; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t ih = h; ih <= height - 4; ih += 4) { - const float *src_data_ptr = src_data + ih * width; - float *packed_data_ptr = packed_data + ih * width; - for (index_t w = 0; w < width; ++w) { - const index_t src_offset = w; - const index_t packed_offset = w * 4; - float32x4_t vs = {src_data_ptr[src_offset], - src_data_ptr[src_offset + width], - src_data_ptr[src_offset + 2 * width], - src_data_ptr[src_offset + 3 * width]}; - vst1q_f32(packed_data_ptr + packed_offset, vs); - } - } - h += (height - h) / 4 * 4; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t ih = h; ih < height; ++ih) { - std::copy_n(src_data + ih * width, width, packed_data + ih * width); - } - } else if (src.map_major() == Major::SGemmColMajor && - order == PackOrder::SGemmColMajor) { - // This is for packing transpose-needed lhs. - index_t h = 0; -#if defined(MACE_ENABLE_NEON) -#if defined(__aarch64__) -#pragma omp parallel for schedule(runtime) - for (index_t ih = h; ih <= height - 8; ih += 8) { - const float *src_data_ptr = src_data + ih; - float *packed_data_ptr = packed_data + ih * width; - for (index_t w = 0; w < width; ++w) { - const index_t src_offset = w * height; - const index_t packed_offset = w * 8; - float32x4_t vs0 = vld1q_f32(src_data_ptr + src_offset); - float32x4_t vs1 = vld1q_f32(src_data_ptr + src_offset + 4); - vst1q_f32(packed_data_ptr + packed_offset, vs0); - vst1q_f32(packed_data_ptr + packed_offset + 4, vs1); - } - } - h += (height - h) / 8 * 8; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t ih = h; ih <= height - 4; ih += 4) { - const float *src_data_ptr = src_data + ih; - float *packed_data_ptr = packed_data + ih * width; - for (index_t w = 0; w < width; ++w) { - const index_t src_offset = w * height; - const index_t packed_offset = w * 4; - float32x4_t vs = vld1q_f32(src_data_ptr + src_offset); - vst1q_f32(packed_data_ptr + packed_offset, vs); - } - } - h += (height - h) / 4 * 4; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t ih = h; ih < height; ++ih) { - const float *src_data_ptr = src_data + ih; - float *packed_data_ptr = packed_data + ih * width; - for (index_t w = 0; w < width; ++w) { - packed_data_ptr[w] = src_data_ptr[w * height]; - } - } - } else if (src.map_major() == Major::SGemmRowMajor && - order == PackOrder::SGemmRowMajor) { - // This is for packing no-transpose rhs. 
- index_t w = 0; -#if defined(MACE_ENABLE_NEON) -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw <= width - 4; iw += 4) { - const float *src_data_ptr = src_data + iw; - float *packed_data_ptr = packed_data + iw * height; - for (index_t h = 0; h < height; ++h) { - const index_t src_offset = h * width; - const index_t packed_offset = h * 4; - float32x4_t vs = vld1q_f32(src_data_ptr + src_offset); - vst1q_f32(packed_data_ptr + packed_offset, vs); - } - } - w += (width - w) / 4 * 4; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw < width; ++iw) { - const float *src_data_ptr = src_data + iw; - float *packed_data_ptr = packed_data + iw * height; - for (index_t h = 0; h < height; ++h) { - packed_data_ptr[h] = src_data_ptr[h * width]; - } - } - } else if (src.map_major() == Major::SGemmColMajor && - order == PackOrder::SGemmRowMajor) { - // This is for packing transpose-needed rhs. - index_t w = 0; -#if defined(MACE_ENABLE_NEON) -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw <= width - 4; iw += 4) { - const float *src_data_ptr = src_data + iw * height; - float *packed_data_ptr = packed_data + iw * height; - for (index_t h = 0; h < height; ++h) { - const index_t src_offset = h; - const index_t packed_offset = h * 4; - float32x4_t vs = {src_data_ptr[src_offset], - src_data_ptr[src_offset + height], - src_data_ptr[src_offset + 2 * height], - src_data_ptr[src_offset + 3 * height]}; - vst1q_f32(packed_data_ptr + packed_offset, vs); - } - } - w += (width - w) / 4 * 4; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw < width; ++iw) { - std::copy_n(src_data + iw * height, height, packed_data + iw * height); - } - } -} - -void SGemm::UnPackPerBatch(const float *packed_data, - const index_t batch_index, - SGemmMatrixMap *matrix_map) { - MACE_CHECK_NOTNULL(matrix_map); - - const index_t height = matrix_map->row(); - const index_t width = matrix_map->col(); - auto unpacked_data = matrix_map->batch_data(batch_index); - - if (matrix_map->map_major() == Major::SGemmRowMajor) { - // This is for non-transposed result - index_t w = 0; -#if defined(MACE_ENABLE_NEON) -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw <= width - 4; iw += 4) { - const float *packed_data_ptr = packed_data + iw * height; - float *unpacked_data_ptr = unpacked_data + iw; - for (index_t h = 0; h < height; ++h) { - const index_t packed_offset = h * 4; - const index_t unpacked_offset = h * width; - float32x4_t vs = vld1q_f32(packed_data_ptr + packed_offset); - vst1q_f32(unpacked_data_ptr + unpacked_offset, vs); - } - } - w += (width - w) / 4 * 4; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw < width; ++iw) { - const float *packed_data_ptr = packed_data + iw * height; - float *unpacked_data_ptr = unpacked_data + iw; - for (index_t h = 0; h < height; ++h) { - unpacked_data_ptr[h * width] = packed_data_ptr[h]; - } - } - } else { - // This is for transposed result - index_t w = 0; -#if defined(MACE_ENABLE_NEON) -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw <= width - 4; iw += 4) { - const float *packed_data_ptr = packed_data + iw * height; - float *unpacked_data_ptr = unpacked_data + iw * height; - for (index_t h = 0; h < height; ++h) { - const index_t packed_offset = h * 4; - const index_t unpacked_offset = h; - float32x4_t vs = vld1q_f32(packed_data_ptr + packed_offset); - unpacked_data_ptr[unpacked_offset] = vs[0]; - unpacked_data_ptr[unpacked_offset + height] = vs[1]; 
- unpacked_data_ptr[unpacked_offset + 2 * height] = vs[2]; - unpacked_data_ptr[unpacked_offset + 3 * height] = vs[3]; - } - } - w += (width - w) / 4 * 4; -#endif -#pragma omp parallel for schedule(runtime) - for (index_t iw = w; iw < width; ++iw) { - std::copy_n( - packed_data + iw * height, height, unpacked_data + iw * height); - } - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/sgemm.h b/mace/ops/sgemm.h deleted file mode 100644 index 1320d1bef77710f9b9f4d662ed53c213be83d4c2..0000000000000000000000000000000000000000 --- a/mace/ops/sgemm.h +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This implementation is deprecated. use mace/ops/arm/fp32/gemm.h instead. - -#ifndef MACE_OPS_SGEMM_H_ -#define MACE_OPS_SGEMM_H_ - -#include -#include - -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#include "mace/core/types.h" -#include "mace/core/allocator.h" -#include "mace/core/tensor.h" - -namespace mace { -namespace ops { - -enum Major { - SGemmRowMajor, - SGemmColMajor -}; - -template -class SGemmMatrixMap { - public: - SGemmMatrixMap() {} - - SGemmMatrixMap(const index_t batch, - const index_t row, - const index_t col, - const Major major, - T *data, - const bool is_const = false) : - batch_(batch), - row_(row), - col_(col), - stride_(major == SGemmRowMajor ? col : row), - major_(major), - data_(data), - is_const_(is_const) {} - - SGemmMatrixMap transpose() const { - Major transpose_major = - major_ == SGemmRowMajor ? 
SGemmColMajor : SGemmRowMajor; - return SGemmMatrixMap(batch_, - col_, - row_, - transpose_major, - data_, - is_const_); - } - - index_t batch() const { - return batch_; - } - - index_t row() const { - return row_; - } - - index_t col() const { - return col_; - } - - index_t stride() const { - return stride_; - } - - Major map_major() const { - return major_; - } - - T *data() const { - return data_; - } - - T *batch_data(index_t batch) const { - return data_ + batch * row_ * col_; - } - - index_t size() const { - return batch_ * row_ * col_; - } - - bool is_const() const { - return is_const_; - } - - private: - index_t batch_; - index_t row_; - index_t col_; - index_t stride_; - Major major_; - T *data_; - bool is_const_; -}; - -typedef Major PackOrder; -typedef Tensor PackedBlock; - -class SGemm { - public: - SGemm() - : packed_lhs_(nullptr), - packed_rhs_(nullptr), - packed_(false) {} - - void operator()(const SGemmMatrixMap &lhs, - const SGemmMatrixMap &rhs, - SGemmMatrixMap *result, - ScratchBuffer *scratch_buffer = nullptr); - - void Run(const float *A, - const float *B, - const index_t batch, - const index_t height_a, - const index_t width_a, - const index_t height_b, - const index_t width_b, - const bool transpose_a, - const bool transpose_b, - const bool is_a_weight, - const bool is_b_weight, - float *C, - ScratchBuffer *scratch_buffer = nullptr); - - void PackLhs(const SGemmMatrixMap &lhs, - PackedBlock *packed_block); - - void PackRhs(const SGemmMatrixMap &rhs, - PackedBlock *packed_block); - - void UnPack(const PackedBlock &packed_result, - SGemmMatrixMap *matrix_map); - - private: - void Pack(const SGemmMatrixMap &src, - const PackOrder order, - PackedBlock *packed_block); - - void PackPerBatch(const SGemmMatrixMap &src, - const PackOrder order, - const index_t batch_index, - float *packed_data); - - void UnPackPerBatch(const float *packed_data, - const index_t batch_index, - SGemmMatrixMap *matrix_map); - - void RunInternal(const PackedBlock &lhs, - const PackedBlock &rhs, - const index_t batch, - const index_t height, - const index_t depth, - const index_t width, - PackedBlock *result); - - void RunPerBatch(const float *lhs, - const float *rhs, - const index_t height, - const index_t depth, - const index_t width, - float *result); - - std::unique_ptr packed_lhs_; - std::unique_ptr packed_rhs_; - std::unique_ptr packed_result_; - - bool packed_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_SGEMM_H_ diff --git a/mace/ops/sgemm_pack_test.cc b/mace/ops/sgemm_pack_test.cc deleted file mode 100644 index 69766cb9eaf706d31f9e637d93809404108073ba..0000000000000000000000000000000000000000 --- a/mace/ops/sgemm_pack_test.cc +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include - -#include "mace/ops/sgemm.h" - -namespace mace { -namespace ops { -namespace test { - -namespace { -void TestPack(const std::vector &data, - const std::vector &expected_data, - const index_t height, - const index_t width, - Major src_order, - PackOrder pack_order) { - SGemm sg; - SGemmMatrixMap - src_matrix(1, height, width, src_order, data.data()); - PackedBlock packed; - packed.Resize({height, width}); - if (pack_order == PackOrder::SGemmColMajor) { - sg.PackLhs(src_matrix, &packed); - } else { - sg.PackRhs(src_matrix, &packed); - } - - auto packed_data = packed.data(); - for (index_t i = 0; i < packed.size(); ++i) { - EXPECT_EQ(expected_data[i], packed_data[i]); - } -} - -void TestUnPack(const index_t height, - const index_t width, - Major src_order, - PackOrder pack_order) { - static auto seed = static_cast(time(nullptr)); - const index_t matrix_size = height * width; - std::vector data(matrix_size); - for (int i = 0; i < matrix_size; ++i) { - data[i] = rand_r(&seed); - } - - SGemmMatrixMap - src_matrix(1, height, width, src_order, data.data()); - PackedBlock packed; - packed.Resize({height, width}); - SGemm sg; - if (pack_order == PackOrder::SGemmColMajor) { - sg.PackLhs(src_matrix, &packed); - } else { - sg.PackRhs(src_matrix, &packed); - } - - std::vector unpacked(matrix_size); - SGemmMatrixMap - unpacked_matrix(1, height, width, src_order, unpacked.data()); - sg.UnPack(packed, &unpacked_matrix); - auto unpacked_data = unpacked.data(); - for (index_t i = 0; i < packed.size(); ++i) { - EXPECT_EQ(data[i], unpacked_data[i]); - } -} -} // namespace - - -TEST(SGemmPackTest, Pack) { - std::vector data = - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36}; - - // For no-transpose lhs - TestPack(data, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, - 3, 4, Major::SGemmRowMajor, PackOrder::SGemmColMajor); -#if defined(MACE_ENABLE_NEON) - TestPack(data, - {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16}, - 4, 4, Major::SGemmRowMajor, PackOrder::SGemmColMajor); - TestPack(data, - {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16, 17, 18, 19, - 20}, - 5, 4, Major::SGemmRowMajor, PackOrder::SGemmColMajor); -#if defined(__aarch64__) - TestPack(data, - {1, 5, 9, 13, 17, 21, 25, 29, 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, - 15, 19, 23, 27, 31, 4, 8, 12, 16, 20, 24, 28, 32, 33, 34, 35, 36}, - 9, 4, Major::SGemmRowMajor, PackOrder::SGemmColMajor); -#endif -#endif - // For transpose-needed lhs - TestPack(data, - {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12}, - 3, 4, Major::SGemmColMajor, PackOrder::SGemmColMajor); -#if defined(MACE_ENABLE_NEON) - TestPack(data, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, - 4, 4, Major::SGemmColMajor, PackOrder::SGemmColMajor); - TestPack(data, - {1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 5, 10, 15, - 20}, - 5, 4, Major::SGemmColMajor, PackOrder::SGemmColMajor); -#if defined(__aarch64__) - TestPack(data, - {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, - 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 9, 18, 27, 36}, - 9, 4, Major::SGemmColMajor, PackOrder::SGemmColMajor); -#endif -#endif - // For no-transpose rhs - TestPack(data, - {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12}, - 4, 3, Major::SGemmRowMajor, PackOrder::SGemmRowMajor); -#if defined(MACE_ENABLE_NEON) - TestPack(data, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, - 4, 4, Major::SGemmRowMajor, 
           PackOrder::SGemmRowMajor);
-  TestPack(data,
-           {1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 5, 10, 15,
-            20},
-           4, 5, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-#endif
-  // For transpose-needed rhs
-  TestPack(data,
-           {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
-           4, 3, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-#if defined(MACE_ENABLE_NEON)
-  TestPack(data,
-           {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16},
-           4, 4, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-  TestPack(data,
-           {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16, 17, 18, 19,
-            20},
-           4, 5, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-#endif
-}
-
-TEST(SGemmPackTest, UnPack) {
-  TestUnPack(4, 3, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 4, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 5, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 100, Major::SGemmRowMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 3, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 4, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 5, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-  TestUnPack(4, 100, Major::SGemmColMajor, PackOrder::SGemmRowMajor);
-}
-
-}  // namespace test
-}  // namespace ops
-}  // namespace mace
-
diff --git a/mace/ops/shape.cc b/mace/ops/shape.cc
index 79d05bdcaf27bd9dc8cc49f14254d5c1316beaa2..dcca202f3229f616a3ce89dddcd008cf998a1a69 100644
--- a/mace/ops/shape.cc
+++ b/mace/ops/shape.cc
@@ -35,11 +35,10 @@ class ShapeOp : public Operation {
     Tensor::MappingGuard output_guard(output);
     int32_t *output_data = output->mutable_data<int32_t>();
 
-    const int data_format =
-        Operation::GetOptionalArg<int>("data_format", 0);
-    if (input->dim_size() == 4 &&
-        D == DeviceType::CPU &&
-        data_format == DataFormat::NCHW) {
+    auto has_df = Operation::GetOptionalArg<int>(
+        "has_data_format", 0);
+    if (has_df && input->data_format() == DataFormat::NCHW &&
+        input->dim_size() == 4) {
       // transpose NCHW to NHWC for cpu runtime
       output_data[0] = static_cast<int32_t>(input->dim(0));
       output_data[1] = static_cast<int32_t>(input->dim(2));
diff --git a/mace/ops/slice.cc b/mace/ops/slice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f38a2a32a861a2ca20882268bc98d96fca55d6d7
--- /dev/null
+++ b/mace/ops/slice.cc
@@ -0,0 +1,94 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <functional>
+#include <numeric>
+
+#include "mace/core/operator.h"
+
+namespace mace {
+namespace ops {
+
+template <DeviceType D, typename T>
+class SliceOp;
+
+template <typename T>
+class SliceOp<DeviceType::CPU, T> : public Operation {
+ public:
+  explicit SliceOp(OpConstructContext *context)
+      : Operation(context),
+        axes_(Operation::GetRepeatedArgs<int>("axes")),
+        starts_(Operation::GetRepeatedArgs<int>("starts")),
+        ends_(Operation::GetRepeatedArgs<int>("ends")) {}
+
+  MaceStatus Run(OpContext *context) override {
+    MACE_UNUSED(context);
+    const Tensor *input = this->Input(0);
+    Tensor *output = this->Output(0);
+
+    const index_t rank = input->dim_size();
+    MACE_CHECK(rank >= 1)
+        << "The input dim size should be >= 1";
+    MACE_CHECK(starts_.size() == 1 && ends_.size() == 1 && axes_.size() == 1,
+               "only support slicing at one axis.");
+    MACE_CHECK(axes_[0] == -1 || axes_[0] == rank - 1,
+               "only support slicing at the last axis.");
+    const index_t input_dim = input->dim(rank - 1);
+    const index_t offset = starts_[0];
+    const index_t output_dim = ends_[0] - starts_[0];
+
+    MACE_CHECK(output_dim >= 0, "output_dim should be >= 0");
+    MACE_CHECK(starts_[0] < input_dim
+                   && output_dim <= input_dim
+                   && ends_[0] <= input_dim)
+        << "The starts and ends caused an out-of-range error.";
+
+    const index_t frames =
+        std::accumulate(input->shape().begin(), input->shape().end() - 1, 1,
+                        std::multiplies<index_t>());
+
+    std::vector<index_t> output_shape = input->shape();
+    output_shape[rank - 1] = output_dim;
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+    Tensor::MappingGuard input_guard(input);
+    Tensor::MappingGuard output_guard(output);
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>();
+
+#pragma omp parallel for schedule(runtime)
+    for (index_t i = 0; i < frames; ++i) {
+      const T *input_base =
+          input_data + i * input_dim + offset;
+      T *output_base =
+          output_data + i * output_dim;
+      memcpy(output_base, input_base, output_dim * sizeof(T));
+    }
+
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  std::vector<int> axes_;
+  std::vector<int> starts_;
+  std::vector<int> ends_;
+};
+
+void RegisterSlice(OpRegistryBase *op_registry) {
+  MACE_REGISTER_OP(op_registry, "Slice", SliceOp,
+                   DeviceType::CPU, float);
+}
+
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/slice_test.cc b/mace/ops/slice_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a5f82cc18d52120715b57f4388e4ce77dbb1a7d7
--- /dev/null
+++ b/mace/ops/slice_test.cc
@@ -0,0 +1,71 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class SliceOpTest : public OpsTestBase {}; + +namespace { +template +void TestSlice(const std::vector &input_shape, + const std::vector &input, + const int offset, + const int output_dim, + const std::vector &output_shape, + const std::vector &output) { + OpsTestNet net; + net.AddInputFromArray(MakeString("Input"), + input_shape, + input); + + OpDefBuilder("Slice", "SliceTest") + .Input("Input") + .Output("Output") + .AddIntsArg("axes", {-1}) + .AddIntsArg("starts", {offset}) + .AddIntsArg("ends", {offset + output_dim}) + .Finalize(net.NewOperatorDef()); + + net.RunOp(); + + net.AddInputFromArray("ExpectedOutput", output_shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} +} // namespace + +TEST_F(SliceOpTest, Simple2Dim) { + TestSlice( + {3, 5}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + 2, 3, {3, 3}, + {3, 4, 5, 8, 9, 10, 13, 14, 15}); +} + +TEST_F(SliceOpTest, Simple3Dim) { + TestSlice( + {2, 3, 5}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + 1, 2, {2, 3, 2}, + {2, 3, 7, 8, 12, 13, 2, 3, 7, 8, 12, 13}); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index 54f3e55bbaf07d04026ed28de0ed361bd9ff2061..cbab37adf5ebe9e0a3195483cecc287be5931bd0 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -22,7 +22,7 @@ #ifdef MACE_ENABLE_QUANTIZE #include "mace/ops/fixpoint.h" -#include "mace/ops/gemmlowp_util.h" +#include "mace/ops/common/gemmlowp_util.h" #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL @@ -30,6 +30,8 @@ #include "mace/ops/opencl/buffer/softmax.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" + namespace mace { namespace ops { @@ -132,10 +134,10 @@ class SoftmaxOp : public Operation { } }; +#ifdef MACE_ENABLE_QUANTIZE static const int kInputDeltaIntBits = 6; static const int kSumExpIntBits = 12; -#ifdef MACE_ENABLE_QUANTIZE template <> class SoftmaxOp : public Operation { public: @@ -374,10 +376,10 @@ class SoftmaxOp : public Operation { explicit SoftmaxOp(OpConstructContext *context) : Operation(context) { if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SoftmaxKernel); + kernel_ = make_unique>(); } else { context->set_output_mem_type(MemoryType::GPU_BUFFER); - kernel_.reset(new opencl::buffer::SoftmaxKernel); + kernel_ = make_unique>(); } } MaceStatus Run(OpContext *context) override { diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index e1da96664abe010a84bd287cc9b2cd940ed7e736..ece9b6f61dd25e0fe4c6d2f5aff1aeea4ed55302 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -19,6 +19,7 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/space_to_batch.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -309,7 +310,7 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { explicit SpaceToBatchNDOp(OpConstructContext *context) : SpaceToBatchOpBase(context) { if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SpaceToBatchKernel); + kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc index fb98de71dd118448d02c64f06fb1a79f9d3a8302..4e40227c5b5857d065195d509bcafe55fbef1c59 100644 --- 
a/mace/ops/space_to_depth.cc
+++ b/mace/ops/space_to_depth.cc
@@ -19,6 +19,7 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/space_to_depth.h"
 #endif  // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"
 
 namespace mace {
 namespace ops {
@@ -95,7 +96,7 @@ class SpaceToDepthOp : public Operation {
     : Operation(context) {
     int block_size = Operation::GetOptionalArg<int>("block_size", 1);
     if (context->device()->gpu_runtime()->UseImageMemory()) {
-      kernel_.reset(new opencl::image::SpaceToDepthKernel<T>(block_size));
+      kernel_ = make_unique<opencl::image::SpaceToDepthKernel<T>>(block_size);
     } else {
       MACE_NOT_IMPLEMENTED;
     }
diff --git a/mace/ops/splice.cc b/mace/ops/splice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bf1dfe36b41e8c79675f3d75b0578fa2ce76816e
--- /dev/null
+++ b/mace/ops/splice.cc
@@ -0,0 +1,121 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This Op is for SpliceComponent in Kaldi.
+// It splices a context window of frames together [over time]
+// (it copies and appends each frame whose time-index is in context_).
+// The context_ values indicate which frames (over time) to splice:
+// if a context value points before the first time-index,
+// the first frame's data is copied and appended;
+// if it points past the last frame,
+// the last frame's data is copied and appended.
+// E.g., given input data [[1, 2, 3], [4, 5, 6]]
+// with input-dim = 3, frame count = 2 and context = [-1, 0, 1],
+// the output should be:
+// [1, 2, 3, 1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6, 4, 5, 6]
+// If const_component_dim_ != 0, const_dim_ will be used to determine which
+// row of "in" we copy the last part of each row of "out" from (this part is
+// not subject to splicing; it's assumed constant for each frame of "input").
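A compact way to state the time-index rule described in the comment above: for output frame i and context value c, the source frame is clamp(i + c, 0, frames - 1). A hypothetical helper plus the worked numbers from the comment's own example:

    #include <algorithm>
    int SpliceSourceFrame(int i, int c, int frames) {
      return std::max(0, std::min(i + c, frames - 1));
    }
    // frames = 2, context = {-1, 0, 1}: output frame 0 reads source frames
    // {0, 0, 1} and output frame 1 reads {0, 1, 1}, reproducing the two
    // output rows shown in the comment above.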
+ +#include +#include + +#include "mace/core/operator.h" +#include "mace/utils/math.h" + +namespace mace { +namespace ops { + +template +class SpliceOp; + +template +class SpliceOp : public Operation { + public: + explicit SpliceOp(OpConstructContext *context) + : Operation(context), + context_(Operation::GetRepeatedArgs("context")), + const_dim_( + Operation::GetOptionalArg("const_component_dim", 0)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + MACE_CHECK(context_.size() > 0) + << "The context param should not be empty in Splice Op."; + + Tensor *output = this->Output(0); + const std::vector &input_shape = input->shape(); + + const index_t frames = + std::accumulate(input->shape().begin(), input->shape().end() - 1, 1, + std::multiplies()); + + const index_t rank = input->dim_size(); + const index_t input_dim = input_shape[rank - 1]; + + const index_t num_splice = static_cast(context_.size()); + const index_t dim = input_dim - const_dim_; + MACE_CHECK(input_dim > const_dim_, + "input dim should be greater than const dim."); + const index_t output_dim = dim * num_splice + const_dim_; + + std::vector output_shape = input->shape(); + output_shape[rank - 1] = output_dim; + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + const T *input_data = input->data(); + T *output_data = output->mutable_data(); + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t i = 0; i < frames; ++i) { + for (index_t c = 0; c < num_splice; ++c) { + const index_t offset = + Clamp(context_[c] + i, 0, frames - 1); + T *output_base = output_data + i * output_dim + c * dim; + const T *input_base = input_data + offset * input_dim; + memcpy(output_base, input_base, dim * sizeof(T)); + } + } + + if (const_dim_ > 0) { + const index_t output_offset = output_dim - const_dim_; + const index_t input_offset = dim; +#pragma omp parallel for schedule(runtime) + for (index_t i = 0; i < frames; ++i) { + index_t offset = i + context_[0] >= 0 ? i + context_[0] : 0; + T *output_base = output_data + i * output_dim; + const T *input_base = input_data + offset * input_dim; + memcpy(output_base + output_offset, + input_base + input_offset, + const_dim_ * sizeof(T)); + } + } + return MaceStatus::MACE_SUCCESS; + } + + private: + std::vector context_; + int const_dim_; +}; + +void RegisterSplice(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Splice", SpliceOp, + DeviceType::CPU, float); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/splice_benchmark.cc b/mace/ops/splice_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..253808b8385e1526432cfdc3cd5befd98f70736b --- /dev/null +++ b/mace/ops/splice_benchmark.cc @@ -0,0 +1,92 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
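A worked shape calculation for the const-dim branch of SpliceOp above (numbers taken from the WithConstDim test further below): with input_dim = 10 and const_component_dim = 7, the spliced part uses dim = 10 - 7 = 3 columns per frame; with context = {-2, -1, 0, 1, 2} (num_splice = 5) the output dim is 3 * 5 + 7 = 22, matching that test's {1, 5, 22} output shape. The trailing 7 columns of output row i are copied once from input row max(i + context[0], 0).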
+ +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +namespace { +template +void BMSpliceHelper(int iters, + const std::vector &input_shape, + const index_t left_context, + const index_t right_context, + const int const_component_dim) { + mace::testing::StopTiming(); + + // Construct graph + OpsTestNet net; + + const int num_splice = left_context + right_context + 1; + std::vector contexts(num_splice); + for (int i = 0; i < num_splice; ++i) { + contexts[i] = left_context + i; + } + const index_t input_size = std::accumulate(input_shape.begin(), + input_shape.end(), + 1, + std::multiplies()); + std::vector input_data(input_size); + GenerateRandomRealTypeData(input_shape, &input_data); + net.AddInputFromArray("Input", input_shape, input_data); + + OpDefBuilder("Splice", "SpliceTest") + .Input("Input") + .Output("Output") + .AddIntsArg("context", contexts) + .AddIntArg("const_component_dim", const_component_dim) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + net.Sync(); + } + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + net.Sync(); + } +} +} // namespace + +#define MACE_BM_SPLICE_MACRO(N, H, W, L, R, C, TYPE, DEVICE) \ + static void \ + MACE_BM_SPLICE_##N##_##H##_##W##_##L##_##R##_##C##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * H * W; \ + mace::testing::MacsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BMSpliceHelper(iters, {N, H, W}, L, R, C); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_SPLICE_##N##_##H##_##W##_##L##_##R##_##C##_##TYPE##_##DEVICE) + +#define MACE_BM_SPLICE(N, H, W, L, R, C) \ + MACE_BM_SPLICE_MACRO(N, H, W, L, R, C, float, CPU); + +MACE_BM_SPLICE(1, 32, 32, 5, 5, 10); +MACE_BM_SPLICE(1, 32, 32, 7, 7, 5); +MACE_BM_SPLICE(1, 32, 32, 3, 3, 20); +MACE_BM_SPLICE(1, 128, 128, 9, 9, 100); +MACE_BM_SPLICE(1, 128, 128, 7, 7, 100); + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/splice_test.cc b/mace/ops/splice_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..60e1652a394d7d1a7b88c0b1f537ec5fc688d613 --- /dev/null +++ b/mace/ops/splice_test.cc @@ -0,0 +1,84 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class SpliceOpTest : public OpsTestBase {}; + +namespace { +template +void TestSplice(const std::vector &input_shape, + const std::vector &input, + const std::vector &context, + const int const_dim, + const std::vector &output_shape, + const std::vector &output) { + OpsTestNet net; + net.AddInputFromArray(MakeString("Input"), + input_shape, + input); + + OpDefBuilder("Splice", "SpliceTest") + .Input("Input") + .Output("Output") + .AddIntsArg("context", context) + .AddIntArg("const_component_dim", const_dim) + .Finalize(net.NewOperatorDef()); + + net.RunOp(); + + net.AddInputFromArray("ExpectedOutput", output_shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} +} // namespace + +TEST_F(SpliceOpTest, WithoutConstDim) { + TestSplice( + {1, 7, 2}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {-2, -1, 0, 1, 2}, 0, + {1, 7, 10}, + {1, 2, 1, 2, 1, 2, 3, 4, 5, 6, + 1, 2, 1, 2, 3, 4, 5, 6, 7, 8, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 7, 8, 9, 10, 11, 12, 13, 14, 13, 14, + 9, 10, 11, 12, 13, 14, 13, 14, 13, 14}); +} + +TEST_F(SpliceOpTest, WithConstDim) { + TestSplice( + {1, 5, 10}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {-2, -1, 0, 1, 2}, 7, + {1, 5, 22}, + {1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 4, 5, 6, 7, 8, 9, 10, + 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, + 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 5, 6, 7, 5, 6, 7, 8, 9, 10, 11, + 3, 4, 5, 4, 5, 6, 5, 6, 7, 5, 6, 7, 5, 6, 7, 6, 7, 8, 9, 10, 11, 12}); +} +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/split.cc b/mace/ops/split.cc index 7fe05be1edf474cc92ee8c049f27e8a265ca7219..7c920d4c115f9650973ab62a2c79d29b677faf83 100644 --- a/mace/ops/split.cc +++ b/mace/ops/split.cc @@ -19,6 +19,7 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/split.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -35,9 +36,9 @@ class SplitOp : public Operation { checked_(false) {} void Validate() { - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && this->Input(0)->dim_size() == 4) { + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df && this->Input(0)->dim_size() == 4) { if (axis_ == 3) axis_ = 1; else if (axis_ == 2) axis_ = 3; else if (axis_ == 1) axis_ = 2; @@ -108,7 +109,7 @@ class SplitOp : public Operation { : Operation(context) { int32_t axis = Operation::GetOptionalArg("axis", 3); if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SplitKernel(axis)); + kernel_ = make_unique>(axis); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/split_benchmark.cc b/mace/ops/split_benchmark.cc index 45331685059228b32ef92f7abffbc98791d90d0b..17584778a8ae93994530bdbad9f8a53d476b1e18 100644 --- a/mace/ops/split_benchmark.cc +++ b/mace/ops/split_benchmark.cc @@ -44,6 +44,7 @@ void BMSplitHelper(int iters, } builder .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Warm-up diff --git a/mace/ops/split_test.cc 
b/mace/ops/split_test.cc index 726d12e6fae54054d504b1a5a07fb9aa70a4e8e5..b693fd0cd3da81e00c5627a852ef6e1c7b97b4c7 100644 --- a/mace/ops/split_test.cc +++ b/mace/ops/split_test.cc @@ -54,7 +54,7 @@ void RandomTest(const int num_outputs, int axis) { builder = builder.Output(MakeString("Output", i)); } builder.AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc index 1bd8a2e33e872715f57b712102643b411b142fbb..b937b259322615abcbb929e4c17c0f41e3844167 100644 --- a/mace/ops/sqrdiff_mean.cc +++ b/mace/ops/sqrdiff_mean.cc @@ -19,6 +19,7 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/sqrdiff_mean.h" #endif // MACE_ENABLE_OPENCL +#include "mace/utils/memory.h" namespace mace { namespace ops { @@ -83,7 +84,7 @@ class SqrDiffMeanOp : public Operation { explicit SqrDiffMeanOp(OpConstructContext *context) : Operation(context) { if (context->device()->gpu_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SqrDiffMeanKernel()); + kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/ops/squeeze.cc b/mace/ops/squeeze.cc index e67d2672b4df63795cb63bbce9b0e4960d33fa43..15c3408c2bbbfbc6832af699045036d1580152c7 100644 --- a/mace/ops/squeeze.cc +++ b/mace/ops/squeeze.cc @@ -32,9 +32,9 @@ class SqueezeOp : public Operation { MACE_UNUSED(context); if (!checked_ && D == DeviceType::CPU && DataTypeToEnum::value != DT_UINT8) { - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && this->Input(0)->dim_size() == 4) { + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df && this->Input(0)->dim_size() == 4) { if (axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2) { axis_[0] = 2; axis_[1] = 3; diff --git a/mace/ops/squeeze_test.cc b/mace/ops/squeeze_test.cc index 3c27f6b9c0ca127726c04599012698a8d4a5d236..8cd829794c16c71b3df1853fedc79eed75d317a8 100644 --- a/mace/ops/squeeze_test.cc +++ b/mace/ops/squeeze_test.cc @@ -30,7 +30,7 @@ void TestSqueeze(const std::vector &org_shape, OpDefBuilder("Squeeze", "SqueezeTest") .Input("Input") .AddIntsArg("axis", axis) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/strided_slice.cc b/mace/ops/strided_slice.cc index 221a75d46442afd1b3f385350b6ddd943bdb5db9..c10914f27fb87e7e1159749eb990a66bb6506f42 100644 --- a/mace/ops/strided_slice.cc +++ b/mace/ops/strided_slice.cc @@ -17,6 +17,7 @@ #include #include "mace/core/operator.h" +#include "mace/utils/math.h" namespace mace { namespace ops { @@ -32,21 +33,69 @@ class StridedSliceOp : public Operation { new_axis_mask_(Operation::GetOptionalArg("new_axis_mask", 0)), shrink_axis_mask_( Operation::GetOptionalArg("shrink_axis_mask", 0)), - is_slice_(Operation::GetOptionalArg("slice", false)) { + is_slice_(Operation::GetOptionalArg("slice", false)), + has_data_format_(Operation::GetOptionalArg("has_data_format", 0)), + checked_(false) { MACE_CHECK(ellipsis_mask_ == 0 && new_axis_mask_ == 0, "ellipsis_mask and new_axis_mask are not supported yet."); } + void TransposeMaskValueFromNHWCToNCHW(int* mask_value) { + size_t dims[4]; + int count; + for (count = 0; count < 4; ++count) { + dims[count] = *mask_value & 1; + *mask_value >>= 1; + } + size_t new_dims[4] = {dims[0], dims[3], dims[1], dims[2]}; + for (count = 3; 
count >= 0; --count) { + *mask_value <<= 1; + *mask_value += new_dims[count]; + } + } + + void TransposeDimsFromNHWCToNCHW(std::vector* dims) { + int32_t h = (*dims)[1]; + int32_t w = (*dims)[2]; + int32_t c = (*dims)[3]; + + (*dims)[1] = c; + (*dims)[2] = h; + (*dims)[3] = w; + } + + void TransposeDimsFromNCHWToNHWC(std::vector* dims) { + int32_t c = (*dims)[1]; + int32_t h = (*dims)[2]; + int32_t w = (*dims)[3]; + + (*dims)[1] = h; + (*dims)[2] = w; + (*dims)[3] = c; + } + MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); + + if (!checked_) { + if (has_data_format_ && this->Input(0)->dim_size() == 4) { + TransposeMaskValueFromNHWCToNCHW(&begin_mask_); + TransposeMaskValueFromNHWCToNCHW(&end_mask_); + TransposeMaskValueFromNHWCToNCHW(&ellipsis_mask_); + TransposeMaskValueFromNHWCToNCHW(&new_axis_mask_); + TransposeMaskValueFromNHWCToNCHW(&shrink_axis_mask_); + } + checked_ = true; + } + const Tensor *input = this->Input(INPUT); const Tensor *begin_indices = this->Input(BEGIN); const Tensor *end_indices = this->Input(END); const Tensor *strides = nullptr; + if (this->InputSize() > 3) { strides = this->Input(STRIDES); } - Tensor *output = this->Output(OUTPUT); if (strides == nullptr) { tmp_strides_tensor_.Resize({begin_indices->size()}); Tensor::MappingGuard strides_guard(&tmp_strides_tensor_); @@ -55,6 +104,11 @@ class StridedSliceOp : public Operation { strides = &tmp_strides_tensor_; } + MACE_CHECK(begin_indices->dim_size() == 1 && + end_indices->dim_size() == 1 && + strides->dim_size() == 1, + "Expected begin, end, and strides to be 1D tensor"); + Tensor::MappingGuard input_guard(input); Tensor::MappingGuard begin_indices_guard(begin_indices); Tensor::MappingGuard end_indices_guard(end_indices); @@ -63,88 +117,145 @@ class StridedSliceOp : public Operation { const int32_t *begin_indices_data = begin_indices->data(); const int32_t *end_indices_data = end_indices->data(); const int32_t *strides_data = strides->data(); - std::vector pad_begin_indices(input->dim_size(), 0); - std::vector pad_end_indices(input->dim_size(), 0); - std::vector pad_strides_indices(input->dim_size(), 1); - - if (begin_indices->size() < input->dim_size()) { - for (index_t i = 0; i < begin_indices->size(); ++i) { - pad_begin_indices[i] = begin_indices_data[i]; - pad_end_indices[i] = end_indices_data[i]; - pad_strides_indices[i] = strides_data[i]; - } - for (index_t i = begin_indices->size(); i < input->dim_size(); ++i) { - pad_end_indices[i] = input->dim(i); - } - begin_indices_data = pad_begin_indices.data(); - end_indices_data = pad_end_indices.data(); - strides_data = pad_strides_indices.data(); - } - std::vector slice_end_data; + std::vector begin_indices_vec( + begin_indices_data, begin_indices_data + begin_indices->size()); + std::vector end_indices_vec( + end_indices_data, end_indices_data + end_indices->size()); + std::vector strides_indices_vec( + strides_data, strides_data + strides->size()); + + MACE_CHECK(input->size() > 0 && input->dim_size() > 0 && + input->dim_size() <= 4, + "The input size should larger than 0." 
+ " And input dims should be an integer in (0, 4]."); + + std::vector output_shape = {}; + + const size_t input_dims = input->dim_size(); if (is_slice_) { - // if this op is slice, the end_indices_data is size actually - slice_end_data.resize(end_indices->size()); - for (size_t i = 0; i < slice_end_data.size(); ++i) { - if (end_indices_data[i] == -1) { - slice_end_data[i] = input->dim(i); - } else { - slice_end_data[i] = begin_indices_data[i] + end_indices_data[i]; + MACE_CHECK(begin_indices_vec.size() == input_dims && + end_indices_vec.size() == input_dims, + "In slice, begin and size elements num should be equal"); + + // transpose + if (has_data_format_ && this->Input(0)->dim_size() == 4) { + TransposeDimsFromNHWCToNCHW(&begin_indices_vec); + TransposeDimsFromNHWCToNCHW(&end_indices_vec); + TransposeDimsFromNHWCToNCHW(&strides_indices_vec); + } + + for (size_t i = 0; i < input_dims; ++i) { + if (end_indices_vec[i] == -1) { + end_indices_vec[i] = input->dim(i) - begin_indices_vec[i]; } } - end_indices_data = slice_end_data.data(); - } - std::vector output_shape; - std::vector real_begin_indices(input->dim_size(), 0); - std::vector real_end_indices(input->dim_size(), 0); - for (index_t d = 0; d < input->dim_size(); ++d) { - index_t dim_len = input->dim(d); - if (begin_mask_ & (1 << d)) { - real_begin_indices[d] = strides_data[d] > 0 ? 0 : dim_len - 1; - } else { - real_begin_indices[d] = (begin_indices_data[d] + dim_len) % dim_len; + for (size_t i = 0; i < input_dims; ++i) { + int32_t b = begin_indices_vec[i]; + int32_t s = end_indices_vec[i]; + int32_t input_i = input->dim(i); + MACE_CHECK(0 <= b && b <= input_i, + "In Slice, expected begin[", i, "] in [0, ", input_i, + "], but got ", b); + MACE_CHECK(0 <= s && b + s <= input_i, + "In Slice, expected size[", i, "] in [0, ", + input_i - b, "], but got", s); + end_indices_vec[i] = b + s; + output_shape.push_back(s); } - if (end_mask_ & (1 << d)) { - real_end_indices[d] = strides_data[d] > 0 ? dim_len : -1; - } else { - real_end_indices[d] = - end_indices_data[d] < -dim_len - ? -1 - : (end_indices_data[d] < 0 - ? 
(end_indices_data[d] + dim_len) - : std::min(static_cast(end_indices_data[d]), - dim_len)); + } else { + MACE_CHECK(begin_indices_vec.size() == end_indices_vec.size() && + end_indices_vec.size() == strides_indices_vec.size(), + "In strided_slice, expected begin, end, and strides to be", + " equal size tensors"); + for (index_t i = 0; i < strides->size(); ++i) { + MACE_CHECK(strides_indices_vec[i] != 0, "strides data cannot be 0!"); } - int32_t out_dim_len = std::max( - 0.f, std::ceil((real_end_indices[d] - real_begin_indices[d]) / - static_cast(strides_data[d]))); - if (!(shrink_axis_mask_ & (1 << d))) { - output_shape.push_back(out_dim_len); - } else { - MACE_CHECK(out_dim_len == 1, - "cannot shrink axis that has len > 1, dim(", d, "): [", - real_begin_indices[d], ", ", real_end_indices[d], "]"); + // pad + begin_indices_vec.resize(input_dims, 0); + strides_indices_vec.resize(input_dims, 1); + std::vector tmp_input_dims(input->shape().begin(), + input->shape().end()); + if (has_data_format_ && input_dims == 4) { + TransposeDimsFromNCHWToNHWC(&tmp_input_dims); + } + for (size_t i = end_indices_vec.size(); i < input_dims; ++i) { + end_indices_vec.push_back(tmp_input_dims[i]); + } + + // transpose + if (has_data_format_ && this->Input(0)->dim_size() == 4) { + TransposeDimsFromNHWCToNCHW(&begin_indices_vec); + TransposeDimsFromNHWCToNCHW(&end_indices_vec); + TransposeDimsFromNHWCToNCHW(&strides_indices_vec); + } + + // mask and shrink + for (index_t d = 0; d < input->dim_size(); ++d) { + index_t dim_len = input->dim(d); + const std::vector valid_range = { + strides_indices_vec[d] > 0 ? 0 : -1, + strides_indices_vec[d] > 0 ? dim_len : dim_len - 1}; + + auto format_indices = [valid_range, d, dim_len](index_t indice) { + index_t forward = indice < 0 ? indice + dim_len : indice; + return Clamp(forward, valid_range[0], valid_range[1]); + }; + + if (!(shrink_axis_mask_ & (1 << d))) { + if (begin_mask_ & (1 << d)) { + begin_indices_vec[d] = strides_indices_vec[d] > 0 ? 0 : dim_len - 1; + } else { + begin_indices_vec[d] = format_indices(begin_indices_vec[d]); + } + if (end_mask_ & (1 << d)) { + end_indices_vec[d] = strides_indices_vec[d] > 0 ? dim_len : -1; + } else { + end_indices_vec[d] = format_indices(end_indices_vec[d]); + } + + int32_t out_dim_len = std::max( + 0.f, std::ceil((end_indices_vec[d] - begin_indices_vec[d]) / + static_cast(strides_indices_vec[d]))); + output_shape.push_back(out_dim_len); + } else { + begin_indices_vec[d] = begin_indices_vec[d] < 0 + ? 
begin_indices_vec[d] + dim_len + : begin_indices_vec[d]; + end_indices_vec[d] = begin_indices_vec[d] + 1; + MACE_CHECK( + begin_indices_vec[d] >= 0 && begin_indices_vec[d] < dim_len, + "slice begin indice of dimension '", d, "': ", + begin_indices_vec[d], ", is out of bound"); + } } } + for (size_t i = 0; i < output_shape.size(); ++i) { + MACE_CHECK(output_shape[i] > 0, + "Expected output_shape[", i, "] larger than 0, but got ", + output_shape[i]); + } + std::vector dim_stride(input->dim_size(), 1); for (index_t d = input->dim_size() - 2; d >= 0; --d) { dim_stride[d] = dim_stride[d + 1] * input->dim(d + 1); } + Tensor *output = this->Output(OUTPUT); MACE_RETURN_IF_ERROR(output->Resize(output_shape)); Tensor::MappingGuard output_guard(output); T *output_data = output->mutable_data(); bool slice_by_first_axis = true; - if (strides_data[0] != 1) { + if (strides_indices_vec[0] != 1) { slice_by_first_axis = false; } else { for (index_t d = 1; d < input->dim_size(); ++d) { - if (strides_data[d] != 1 || real_begin_indices[d] != 0 || - real_end_indices[d] != input->dim(d)) { + if (strides_indices_vec[d] != 1 || begin_indices_vec[d] != 0 || + end_indices_vec[d] != input->dim(d)) { slice_by_first_axis = false; break; } @@ -152,47 +263,71 @@ class StridedSliceOp : public Operation { } if (slice_by_first_axis) { - memcpy(output_data, input_data + real_begin_indices[0] * dim_stride[0], - sizeof(T) * (real_end_indices[0] - real_begin_indices[0]) * + memcpy(output_data, input_data + begin_indices_vec[0] * dim_stride[0], + sizeof(T) * (end_indices_vec[0] - begin_indices_vec[0]) * dim_stride[0]); } else { if (input->dim_size() == 1) { - for (index_t i = real_begin_indices[0]; - strides_data[0] > 0 ? i < real_end_indices[0] - : i > real_end_indices[0]; - i += strides_data[0]) { + for (index_t i = begin_indices_vec[0]; + strides_indices_vec[0] > 0 ? i < end_indices_vec[0] + : i > end_indices_vec[0]; + i += strides_indices_vec[0]) { *output_data++ = input_data[i]; } } else if (input->dim_size() == 2) { - for (index_t i = real_begin_indices[0]; - strides_data[0] > 0 ? i < real_end_indices[0] - : i > real_end_indices[0]; - i += strides_data[0]) { - for (index_t j = real_begin_indices[1]; - strides_data[1] > 0 ? j < real_end_indices[1] - : j > real_end_indices[1]; - j += strides_data[1]) { + for (index_t i = begin_indices_vec[0]; + strides_indices_vec[0] > 0 ? i < end_indices_vec[0] + : i > end_indices_vec[0]; + i += strides_indices_vec[0]) { + for (index_t j = begin_indices_vec[1]; + strides_indices_vec[1] > 0 ? j < end_indices_vec[1] + : j > end_indices_vec[1]; + j += strides_indices_vec[1]) { *output_data++ = input_data[i * input->dim(1) + j]; } } } else if (input->dim_size() == 3) { - for (index_t i = real_begin_indices[0]; - strides_data[0] > 0 ? i < real_end_indices[0] - : i > real_end_indices[0]; - i += strides_data[0]) { - for (index_t j = real_begin_indices[1]; - strides_data[1] > 0 ? j < real_end_indices[1] - : j > real_end_indices[1]; - j += strides_data[1]) { - for (index_t k = real_begin_indices[2]; - strides_data[2] > 0 ? k < real_end_indices[2] - : k > real_end_indices[2]; - k += strides_data[2]) { + for (index_t i = begin_indices_vec[0]; + strides_indices_vec[0] > 0 ? i < end_indices_vec[0] + : i > end_indices_vec[0]; + i += strides_indices_vec[0]) { + for (index_t j = begin_indices_vec[1]; + strides_indices_vec[1] > 0 ? j < end_indices_vec[1] + : j > end_indices_vec[1]; + j += strides_indices_vec[1]) { + for (index_t k = begin_indices_vec[2]; + strides_indices_vec[2] > 0 ? 
k < end_indices_vec[2] + : k > end_indices_vec[2]; + k += strides_indices_vec[2]) { *output_data++ = input_data[(i * input->dim(1) + j) * input->dim(2) + k]; } } } + } else if (input->dim_size() == 4) { + for (index_t i = begin_indices_vec[0]; + strides_indices_vec[0] > 0 ? i < end_indices_vec[0] + : i > end_indices_vec[0]; + i += strides_indices_vec[0]) { + for (index_t j = begin_indices_vec[1]; + strides_indices_vec[1] > 0 ? j < end_indices_vec[1] + : j > end_indices_vec[1]; + j += strides_indices_vec[1]) { + for (index_t k = begin_indices_vec[2]; + strides_indices_vec[2] > 0 ? k < end_indices_vec[2] + : k > end_indices_vec[2]; + k += strides_indices_vec[2]) { + for (index_t l = begin_indices_vec[3]; + strides_indices_vec[3] > 0 ? l < end_indices_vec[3] + : l > end_indices_vec[3]; + l += strides_indices_vec[3]) { + *output_data++ = + input_data[((i * input->dim(1) + j) * input->dim(2) + k) + * input->dim(3) + l]; + } + } + } + } } else { MACE_NOT_IMPLEMENTED; } @@ -207,6 +342,8 @@ class StridedSliceOp : public Operation { int new_axis_mask_; int shrink_axis_mask_; bool is_slice_; + int has_data_format_; + bool checked_; Tensor tmp_strides_tensor_; MACE_OP_INPUT_TAGS(INPUT, BEGIN, END, STRIDES); diff --git a/mace/ops/strided_slice_test.cc b/mace/ops/strided_slice_test.cc index df691ce682f2a0a55db2f93a9077a265f61cbef0..8b085fe532694f7c343e0cfda735d91332aea294 100644 --- a/mace/ops/strided_slice_test.cc +++ b/mace/ops/strided_slice_test.cc @@ -64,6 +64,54 @@ void TestStridedSlice(const std::vector &input_shape, *net.GetOutput("Output")); } +void TestStridedSliceWithDataFormat(const std::vector &input_shape, + const std::vector &input, + const std::vector &begin_indices, + const std::vector &end_indices, + const std::vector &strides, + const int begin_mask, + const int end_mask, + const int ellipsis_mask, + const int new_axis_mask, + const int shrink_axis_mask, + const std::vector &output_shape, + const std::vector &output) { + OpsTestNet net; + net.AddInputFromArray("Input", input_shape, input); + net.AddInputFromArray( + "BeginIndices", {static_cast(begin_indices.size())}, + begin_indices); + net.AddInputFromArray( + "EndIndices", {static_cast(end_indices.size())}, end_indices); + net.AddInputFromArray( + "Strides", {static_cast(strides.size())}, strides); + + net.TransformDataFormat("Input", NHWC, "InputNCHW", + NCHW); + + OpDefBuilder("StridedSlice", "StridedSliceOpTest") + .Input("InputNCHW") + .Input("BeginIndices") + .Input("EndIndices") + .Input("Strides") + .Output("OutputNCHW") + .AddIntArg("begin_mask", begin_mask) + .AddIntArg("end_mask", end_mask) + .AddIntArg("ellipsis_mask", ellipsis_mask) + .AddIntArg("new_axis_mask", new_axis_mask) + .AddIntArg("shrink_axis_mask", shrink_axis_mask) + .AddIntArg("has_data_format", 1) + .Finalize(net.NewOperatorDef()); + + net.RunOp(); + + net.TransformDataFormat("OutputNCHW", NCHW, "Output", + NHWC); + net.AddInputFromArray("ExpectedOutput", output_shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} + void TestSlice(const std::vector &input_shape, const std::vector &input, const std::vector &begin_indices, @@ -92,6 +140,41 @@ void TestSlice(const std::vector &input_shape, *net.GetOutput("Output")); } +void TestSliceWithDataFormat(const std::vector &input_shape, + const std::vector &input, + const std::vector &begin_indices, + const std::vector &indices_size, + const std::vector &output_shape, + const std::vector &output) { + OpsTestNet net; + net.AddInputFromArray("Input", input_shape, input); 
+ net.AddInputFromArray( + "BeginIndices", {static_cast(input_shape.size())}, + begin_indices); + net.AddInputFromArray( + "IndicesSize", {static_cast(indices_size.size())}, indices_size); + + net.TransformDataFormat("Input", NHWC, "InputNCHW", + NCHW); + + OpDefBuilder("StridedSlice", "StridedSliceOpTest") + .Input("InputNCHW") + .Input("BeginIndices") + .Input("IndicesSize") + .Output("OutputNCHW") + .AddIntArg("slice", 1) + .AddIntArg("has_data_format", 1) + .Finalize(net.NewOperatorDef()); + + net.RunOp(); + + net.TransformDataFormat("OutputNCHW", NCHW, "Output", + NHWC); + net.AddInputFromArray("ExpectedOutput", output_shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} + } // namespace TEST_F(StridedSliceOpTest, TestStridedSliceByFirstAxis) { @@ -157,6 +240,66 @@ TEST_F(StridedSliceOpTest, TestStridedSliceRank3) { 1, 2}, {1, 1, 3, 3}); } + +TEST_F(StridedSliceOpTest, TestStridedSliceRank4) { + TestStridedSlice({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 0, 0, 0, 0, 0, {1, 2, 1, 2}, + {15, 16, 21, 22}); + TestStridedSlice({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 3, 0, 0, 0, 0, {2, 2, 1, 2}, + {3, 4, 9, 10, 15, 16, 21, 22}); + TestStridedSlice({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 0, 8, 0, 0, 0, {1, 2, 1, 3}, + {15, 16, 17, 21, 22, 23}); + TestStridedSlice({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 0, 8, 0, 0, 8, {1, 2, 1}, + {15, 21}); + TestStridedSlice({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 0, 8, 0, 0, 15, {}, {15}); + TestStridedSlice({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {-1, 2, 1, 3}, + {0, 0, 0, 0}, {-1, -1, -1, -1}, 0, 0, 0, 0, 0, {1, 1, 1, 2}, + {23, 22}); +} + +TEST_F(StridedSliceOpTest, TestStridedSliceWithDataFormat) { + TestStridedSliceWithDataFormat( + {2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 0, 0, 0, 0, 0, {1, 2, 1, 2}, + {15, 16, 21, 22}); + TestStridedSliceWithDataFormat( + {2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 3, 0, 0, 0, 0, {2, 2, 1, 2}, + {3, 4, 9, 10, 15, 16, 21, 22}); + TestStridedSliceWithDataFormat( + {2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0, 1, 0}, + {2, 2, 2, 2}, {1, 1, 1, 1}, 0, 8, 0, 0, 0, {1, 2, 1, 3}, + {15, 16, 17, 21, 22, 23}); + TestStridedSliceWithDataFormat( + {2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0}, + {2, 1}, {1, 1}, 0, 8, 0, 0, 0, {1, 1, 2, 3}, + {12, 13, 14, 15, 16, 17}); + TestStridedSliceWithDataFormat( + {2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {1, 0}, + {2, 1}, {1, 1}, 0, 2, 0, 0, 0, {1, 2, 2, 3}, + {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}); + TestStridedSliceWithDataFormat( + {2, 2, 2, 
3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {-1, 2, 1, 3}, + {0, 0, 0, 0}, {-1, -1, -1, -1}, 0, 0, 0, 0, 0, {1, 1, 1, 2}, + {23, 22}); +} + TEST_F(StridedSliceOpTest, TestSlice) { TestSlice({2, 3}, {1, 2, 3, 4, 5, 6}, {0, 0}, {2, 3}, {2, 3}, {1, 2, 3, 4, 5, 6}); @@ -166,6 +309,17 @@ TEST_F(StridedSliceOpTest, TestSlice) { TestSlice({2, 3}, {1, 2, 3, 4, 5, 6}, {0, 1}, {2, -1}, {2, 2}, {2, 3, 5, 6}); } +TEST_F(StridedSliceOpTest, TestSliceWithDataFormat) { + TestSliceWithDataFormat({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, + {1, 0, 1, 0}, {1, 2, 1, 2}, {1, 2, 1, 2}, + {15, 16, 21, 22}); + TestSliceWithDataFormat({2, 2, 2, 3}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, + {1, 0, 1, 0}, {-1, -1, -1, -1}, {1, 2, 1, 3}, + {15, 16, 17, 21, 22, 23}); +} + } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/sum_group.cc b/mace/ops/sum_group.cc new file mode 100644 index 0000000000000000000000000000000000000000..21c83b68f98b791a9a061fb1226b6b86edfceba6 --- /dev/null +++ b/mace/ops/sum_group.cc @@ -0,0 +1,107 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This Op is for SumGroupComponent in Kaldi. +// It's used to sum up groups of posteriors, +// and to introduce a kind of Gaussian-mixture-model-like +// idea into neural nets. + +#include +#include + +#include "mace/core/operator.h" + +namespace mace { +namespace ops { + +template +class SumGroupOp; + +template +class SumGroupOp : public Operation { + public: + explicit SumGroupOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + MACE_CHECK(this->InputSize() >= 2, + "SumGroup should have at least 2 inputs."); + const Tensor *input = this->Input(0); + // Sizes-input gets a vector saying, for + // each output-dim, how many + // inputs data were summed over. 
+ const Tensor *sizes = this->Input(1); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim_size() >= 1, + "SumGroup's input's rank should be >= 1."); + MACE_CHECK(sizes->dim_size() == 1, + "SumGroup's sizes input should be a vector."); + + const std::vector &input_shape = input->shape(); + const index_t bh = + std::accumulate(input_shape.begin(), input_shape.end() - 1, 1, + std::multiplies()); + std::vector output_shape(input_shape); + const index_t output_dim = sizes->dim(0); + const index_t dim_size = input->dim_size(); + const index_t input_dim = input_shape[dim_size -1]; + output_shape[dim_size - 1] = output_dim; + + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + Tensor::MappingGuard guard_input(input); + Tensor::MappingGuard guard_sizes(sizes); + Tensor::MappingGuard guard_output(output); + const T *input_data = input->data(); + const int *sizes_data = sizes->data(); + T *output_data = output->mutable_data(); + + std::vector> + sum_indexes(static_cast(output_dim)); + + int cur_index = 0; + for (index_t i = 0; i < output_dim; ++i) { + int size_value = sizes_data[i]; + MACE_CHECK(size_value > 0, "size value should be > 0"); + sum_indexes[i].first = cur_index; + cur_index += size_value; + sum_indexes[i].second = cur_index; + MACE_CHECK(cur_index <= input_dim) + << "size value over-ranged:" << cur_index << "<=" << input_dim; + } + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t i = 0; i < bh; ++i) { + for (index_t j = 0; j < output_dim; ++j) { + int start_col = sum_indexes[j].first; + int end_col = sum_indexes[j].second; + T sum = 0; + for (int src_col = start_col; src_col < end_col; ++src_col) { + sum += input_data[i * input_dim + src_col]; + } + output_data[i * output_dim + j] = sum; + } + } + + return MaceStatus::MACE_SUCCESS; + } +}; + +void RegisterSumGroup(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "SumGroup", SumGroupOp, + DeviceType::CPU, float); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/sum_group_benchmark.cc b/mace/ops/sum_group_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..bb3b20e855b23a0babd9d31cee840425cce9545c --- /dev/null +++ b/mace/ops/sum_group_benchmark.cc @@ -0,0 +1,75 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
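To illustrate how SumGroup's sizes input above is turned into per-output column ranges, here is a hedged sketch of the sum_indexes loop (names are hypothetical, not from the patch):

    #include <utility>
    #include <vector>
    // sizes = {2, 1, 2} over input_dim = 5 yields ranges {[0,2), [2,3), [3,5)};
    // output column j is the sum of the input columns in range j.
    std::vector<std::pair<int, int>> GroupRanges(const std::vector<int> &sizes) {
      std::vector<std::pair<int, int>> ranges;
      int cur = 0;
      for (int s : sizes) {
        ranges.emplace_back(cur, cur + s);
        cur += s;
      }
      return ranges;
    }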
+ +#include + +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +namespace { +template +void SumGroupBenchmark(int iters, int n, int h, int w) { + mace::testing::StopTiming(); + OpsTestNet net; + // Add input data + net.AddRandomInput("Input", {n, h, w}); + net.AddRepeatedInput("Sizes", + {w / 2}, + 2); + OpDefBuilder("SumGroup", "SumGroupBM") + .Input("Input") + .Input("Sizes") + .Output("Output") + .Finalize(net.NewOperatorDef()); + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + net.Sync(); + } +} +} // namespace + +#define MACE_BM_SUMGROUP_MACRO(N, H, W, TYPE, DEVICE) \ + static void \ + MACE_BM_SUMGROUP_##N##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * H * W; \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + SumGroupBenchmark(iters, N, H, W); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_SUMGROUP_##N##_##H##_##W##_##TYPE##_##DEVICE) + +#define MACE_BM_SUMGROUP(N, H, W) \ + MACE_BM_SUMGROUP_MACRO(N, H, W, float, CPU); + +MACE_BM_SUMGROUP(1, 10, 256); +MACE_BM_SUMGROUP(1, 20, 128); +MACE_BM_SUMGROUP(1, 10, 128); +MACE_BM_SUMGROUP(1, 20, 512); + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/sum_group_test.cc b/mace/ops/sum_group_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e5a4ef904871e38688fe18d699955379aeeaf539 --- /dev/null +++ b/mace/ops/sum_group_test.cc @@ -0,0 +1,71 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
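A quick arithmetic check of the SimpleTest expectation below: for the first input row {1..10} with sizes = {2, 1, 2, 3, 2}, the grouped sums are {1+2, 3, 4+5, 6+7+8, 9+10} = {3, 3, 9, 21, 19}, which is the first expected output row.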
+ +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class SumGroupOpTest : public OpsTestBase {}; + +namespace { +template +void TestSumGroup(const std::vector &input_shape, + const std::vector &input, + const std::vector &sizes, + const std::vector &output_shape, + const std::vector &output) { + OpsTestNet net; + net.AddInputFromArray(MakeString("Input"), + input_shape, + input); + const index_t output_dim = sizes.size(); + net.AddInputFromArray(MakeString("Sizes"), + {output_dim}, + sizes); + + OpDefBuilder("SumGroup", "SumGroupTest") + .Input("Input") + .Input("Sizes") + .Output("Output") + .Finalize(net.NewOperatorDef()); + + net.RunOp(); + + net.AddInputFromArray("ExpectedOutput", output_shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} +} // namespace + +TEST_F(SumGroupOpTest, SimpleTest) { + TestSumGroup( + {1, 5, 10}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {2, 1, 2, 3, 2}, + {1, 5, 5}, + {3, 3, 9, 21, 19, + 5, 4, 11, 24, 21, + 7, 5, 13, 27, 23, + 9, 6, 15, 30, 25, + 11, 7, 17, 33, 27}); +} +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/target_rms_norm.cc b/mace/ops/target_rms_norm.cc new file mode 100644 index 0000000000000000000000000000000000000000..7b769fe712c35cc39cf282731f2a5d64d21d8695 --- /dev/null +++ b/mace/ops/target_rms_norm.cc @@ -0,0 +1,116 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This op is implemented for kaldi's NormalizeComponent. +// The output y_i = scale * x_i, +// and we want the RMS value of the y_i equals to target_rms, +// so y^t y = Dim * target_rms^2 (if y is one row of the input). +// Dim is the length of a row. +// we need the scale = 1.0 / sqrt(x^t x / (Dim * target_rms^2)). 
+ +#include +#include + +#include "mace/core/operator.h" + +namespace mace { +namespace ops { + +template +class TargetRMSNormOp; + +template +class TargetRMSNormOp : public Operation { + public: + explicit TargetRMSNormOp(OpConstructContext *context) + : Operation(context), + target_rms_(Operation::GetOptionalArg("target_rms", 1.0)) {} + + // Calculate the square sum of an array + float SquareSum(const float *data, const index_t data_len) { + const int num_parts = 4; + float result = 0.0f; + if (data_len <= 2 * num_parts) { + for (index_t i = 0; i < data_len; ++i) { + result += data[i] * data[i]; + } + } else { + const index_t part_len = data_len / num_parts; + const index_t left_len = data_len % num_parts; + float results[4] = {0.f, 0.f, 0.f, 0.f}; + for (index_t i = 0; i < num_parts; ++i) { + for (index_t j = 0; j < part_len; ++j) { + results[i] += data[i * part_len + j] * data[i * part_len + j]; + } + } + for (index_t k = 0; k < left_len; ++k) { + float d = data[num_parts * part_len + k]; + results[3] += d * d; + } + + for (index_t i = 0; i < num_parts; ++i) { + result += results[i]; + } + } + + return result; + } + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + const std::vector &input_shape = input->shape(); + const index_t dim_size = input->dim_size(); + MACE_CHECK(dim_size >= 1, + "TargetRMSNorm's input dim size should be >= 1."); + const index_t dim = input_shape[dim_size -1]; + MACE_CHECK(dim > 0 && target_rms_ > 0, + "Both input dim and target rms should be greater than zero."); + const index_t bh = + std::accumulate(input_shape.begin(), input_shape.end() - 1, 1, + std::multiplies()); + const float d_scale = dim * target_rms_ * target_rms_; + + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + Tensor::MappingGuard guard_input(input); + Tensor::MappingGuard guard_output(output); + + const float *input_data = input->data(); + float *output_data = output->mutable_data(); + +#pragma omp parallel for schedule(runtime) + for (index_t i = 0; i < bh; ++i) { + float scale = SquareSum(input_data + i * dim, dim); + scale = static_cast(1.0 / std::sqrt(scale / d_scale)); + for (index_t j = 0; j < dim; ++j) { + output_data[i * dim + j] = input_data[i * dim + j] * scale; + } + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + float target_rms_; +}; + +void RegisterTargetRMSNorm(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "TargetRMSNorm", TargetRMSNormOp, + DeviceType::CPU, float); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/target_rms_norm_benchmark.cc b/mace/ops/target_rms_norm_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..d496bb8b101bc8f083e077bc0c8754c1cf932b0f --- /dev/null +++ b/mace/ops/target_rms_norm_benchmark.cc @@ -0,0 +1,74 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
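The SquareSum above splits long rows into four partial accumulators, presumably to expose instruction-level parallelism; a naive single-accumulator reference that computes the same value (a comparison sketch, not from the patch):

    #include <cstddef>
    float SquareSumRef(const float *data, std::size_t n) {
      float acc = 0.f;
      for (std::size_t i = 0; i < n; ++i) {
        acc += data[i] * data[i];  // sum of squares, single accumulator
      }
      return acc;
    }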
+ +#include + +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +namespace { +template +void TargetRMSNormBenchmark(int iters, int n, int h, int w, float target_rms) { + mace::testing::StopTiming(); + + OpsTestNet net; + // Add input data + net.AddRandomInput("Input", {n, h, w}); + + OpDefBuilder("TargetRMSNorm", "TargetRMSNormBM") + .Input("Input") + .AddFloatArg("target_rms", target_rms) + .Output("Output") + .Finalize(net.NewOperatorDef()); + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + net.Sync(); + } +} +} // namespace + +#define MACE_BM_TARGETRMSNORM_MACRO(N, H, W, RMS, TYPE, DEVICE) \ + static void \ + MACE_BM_TARGETRMSNORM_##N##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * H * W; \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + TargetRMSNormBenchmark(iters, N, H, W, RMS); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_TARGETRMSNORM_##N##_##H##_##W##_##TYPE##_##DEVICE) + +#define MACE_BM_TARGETRMSNORM(N, H, W, RMS) \ + MACE_BM_TARGETRMSNORM_MACRO(N, H, W, RMS, float, CPU); + +MACE_BM_TARGETRMSNORM(1, 10, 256, 1.0); +MACE_BM_TARGETRMSNORM(1, 20, 128, 2.0); +MACE_BM_TARGETRMSNORM(1, 10, 128, 0.5); +MACE_BM_TARGETRMSNORM(1, 20, 512, 1.0); + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/target_rms_norm_test.cc b/mace/ops/target_rms_norm_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..950824470cac136a77ef274f5fc2895876b73213 --- /dev/null +++ b/mace/ops/target_rms_norm_test.cc @@ -0,0 +1,62 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class TargetRMSNormOpTest : public OpsTestBase {}; + +namespace { +template +void TestTargetRMSNorm(const std::vector &input_shape, + const std::vector &input, + const float target_rms, + const std::vector &output) { + OpsTestNet net; + net.AddInputFromArray(MakeString("Input"), + input_shape, + input); + + OpDefBuilder("TargetRMSNorm", "TargetRMSNormTest") + .Input("Input") + .AddFloatArg("target_rms", target_rms) + .Output("Output") + .Finalize(net.NewOperatorDef()); + + net.RunOp(); + + net.AddInputFromArray("ExpectedOutput", input_shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} +} // namespace + +TEST_F(TargetRMSNormOpTest, SimpleTest) { + TestTargetRMSNorm( + {1, 3, 3}, + {1, 2, 3, + 2, 3, 4, + 3, 4, 5}, + 1.0, + {0.46291, 0.92582, 1.38873, + 0.64327, 0.9649, 1.28654, + 0.734847, 0.979796, 1.224745}); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/testing/test_utils.h b/mace/ops/testing/test_utils.h index 59e71448f6421d51a8d67f77d17e49420fe7a915..6a0a045b6326a67689f9755bc911a2f54fbc798a 100644 --- a/mace/ops/testing/test_utils.h +++ b/mace/ops/testing/test_utils.h @@ -27,6 +27,7 @@ #include #include "mace/core/tensor.h" +#include "gtest/gtest.h" namespace mace { namespace ops { diff --git a/mace/ops/time_offset.cc b/mace/ops/time_offset.cc new file mode 100644 index 0000000000000000000000000000000000000000..d9343fc327438a965fe4b3e98a583783a6d4993a --- /dev/null +++ b/mace/ops/time_offset.cc @@ -0,0 +1,81 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This Op is for offset descriptor in Kaldi. +// It defines time offset. 
+ +#include +#include + +#include "mace/core/operator.h" +#include "mace/utils/math.h" + +namespace mace { +namespace ops { + +template +class TimeOffsetOp; + +template +class TimeOffsetOp : public Operation { + public: + explicit TimeOffsetOp(OpConstructContext *context) + : Operation(context), + offset_(Operation::GetOptionalArg("offset", 0)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + index_t rank = input->dim_size(); + MACE_CHECK(rank >= 2, "input's rank should >= 2."); + const std::vector &input_shape = input->shape(); + const index_t batch = + std::accumulate(input_shape.begin(), input_shape.end() - 2, 1, + std::multiplies()); + const index_t frames = input_shape[rank - 2]; + const index_t input_dim = input_shape[rank - 1]; + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + const T *input_data = input->data(); + T *output_data = output->mutable_data(); + +#pragma omp parallel for collapse(2) schedule(runtime) + for (index_t i = 0; i < batch; ++i) { + for (index_t j = 0; j < frames; ++j) { + index_t time_index = offset_ + j; + index_t index = Clamp(time_index, 0, frames - 1); + T *output_base = output_data + (i * frames + j) * input_dim; + const T *input_base = input_data + (i * frames + index) * input_dim; + memcpy(output_base, input_base, input_dim * sizeof(T)); + } + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + int offset_; +}; + +void RegisterTimeOffset(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "TimeOffset", TimeOffsetOp, + DeviceType::CPU, float); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/time_offset_benchmark.cc b/mace/ops/time_offset_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..82ea9967a9bd95542f012666593e81005cd64c48 --- /dev/null +++ b/mace/ops/time_offset_benchmark.cc @@ -0,0 +1,78 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <string>
+#include <vector>
+
+#include "mace/core/testing/test_benchmark.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+namespace {
+template <DeviceType D, typename T>
+void TimeOffsetBenchmark(int iters,
+                         std::vector<index_t> shape,
+                         int offset) {
+  mace::testing::StopTiming();
+
+  OpsTestNet net;
+
+  // Add input data
+  net.AddRandomInput<D, T>("Input", shape);
+
+  OpDefBuilder("TimeOffset", "TimeOffsetBM")
+      .Input("Input")
+      .Output("Output")
+      .AddIntArg("offset", offset)
+      .Finalize(net.NewOperatorDef());
+
+  // Warm-up
+  for (int i = 0; i < 5; ++i) {
+    net.RunOp(D);
+  }
+  net.Sync();
+
+  mace::testing::StartTiming();
+  while (iters--) {
+    net.RunOp(D);
+  }
+  net.Sync();
+}
+} // namespace
+
+#define MACE_BM_TIMEOFFSET2D_MACRO(H, W, TYPE, DEVICE) \
+  static void MACE_BM_TIMEOFFSET2D_##H##_##W##_##TYPE##_##DEVICE(\
+      int iters) { \
+    const int64_t tot = static_cast<int64_t>(iters) * H * W; \
+    mace::testing::BytesProcessed(tot * (sizeof(TYPE))); \
+    TimeOffsetBenchmark<DEVICE, TYPE>(iters, {H, W}, 1); \
+  } \
+  MACE_BENCHMARK(MACE_BM_TIMEOFFSET2D_##H##_##W##_##TYPE##_##DEVICE)
+
+#define MACE_BM_TIMEOFFSET2D(H, W) \
+  MACE_BM_TIMEOFFSET2D_MACRO(H, W, float, CPU);
+
+
+MACE_BM_TIMEOFFSET2D(20, 128);
+MACE_BM_TIMEOFFSET2D(40, 512);
+MACE_BM_TIMEOFFSET2D(1, 1024);
+MACE_BM_TIMEOFFSET2D(20, 2048);
+MACE_BM_TIMEOFFSET2D(20, 512);
+
+} // namespace test
+} // namespace ops
+} // namespace mace
diff --git a/mace/ops/time_offset_test.cc b/mace/ops/time_offset_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b32b8c52acf3b8af715dac74f92d4a87efe1a102
--- /dev/null
+++ b/mace/ops/time_offset_test.cc
@@ -0,0 +1,125 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
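+
+// The cases below exercise the boundary clamping in TimeOffsetOp: with
+// offset -2 every output row of a chunk is copied from input row 0 (the
+// first frame repeated), while offset +2 pins every row to the last frame.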
+
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+class TimeOffsetOpTest : public OpsTestBase {};
+
+namespace {
+template <typename T>
+void TestTimeOffset(const std::vector<index_t> &input_shape,
+                    const std::vector<T> &input,
+                    const int offset,
+                    const std::vector<T> &output) {
+  OpsTestNet net;
+  net.AddInputFromArray<DeviceType::CPU, T>(MakeString("Input"),
+                                            input_shape,
+                                            input);
+
+  OpDefBuilder("TimeOffset", "TimeOffsetTest")
+      .Input("Input")
+      .Output("Output")
+      .AddIntArg("offset", offset)
+      .Finalize(net.NewOperatorDef());
+
+  net.RunOp();
+
+  net.AddInputFromArray<DeviceType::CPU, T>("ExpectedOutput", input_shape,
+                                            output);
+  ExpectTensorNear<T>(*net.GetOutput("ExpectedOutput"),
+                      *net.GetOutput("Output"));
+}
+} // namespace
+
+TEST_F(TimeOffsetOpTest, Simple2Dim) {
+  TestTimeOffset<float>(
+    {3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    -2,
+    {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
+
+  TestTimeOffset<float>(
+    {3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    -1,
+    {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+
+  TestTimeOffset<float>(
+    {3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    0,
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
+
+  TestTimeOffset<float>(
+    {3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    1,
+    {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15});
+
+  TestTimeOffset<float>(
+    {3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    2,
+    {11, 12, 13, 14, 15, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15});
+}
+
+
+TEST_F(TimeOffsetOpTest, Simple3Dim) {
+  TestTimeOffset<float>(
+    {2, 3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    -2,
+    {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5,
+     1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
+
+  TestTimeOffset<float>(
+    {2, 3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    -1,
+    {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+     1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+
+  TestTimeOffset<float>(
+    {2, 3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    0,
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
+
+  TestTimeOffset<float>(
+    {2, 3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    1,
+    {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15,
+     6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15});
+
+  TestTimeOffset<float>(
+    {2, 3, 5},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    2,
+    {11, 12, 13, 14, 15, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15,
+     11, 12, 13, 14, 15, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15});
+}
+
+} // namespace test
+} // namespace ops
+} // namespace mace
diff --git a/mace/port/BUILD.bazel b/mace/port/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..d23633a6a6290f109c1061191bcbf48d81aa2fa9
--- /dev/null
+++ b/mace/port/BUILD.bazel
@@ -0,0 +1,52 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"]) # Apache 2.0
+
+cc_library(
+    name = "port",
+    deps = [
+        "//mace/port/android:port_android",
+        "//mace/port/darwin:port_darwin",
+        "//mace/port/linux:port_linux",
+    ],
+)
+
+cc_library(
+    name = "port_api",
+    hdrs = [
+        "env.h",
+        "file_system.h",
+        "logger.h",
+    ],
+    deps = [
+        "//mace/public",
+    ],
+)
+
+cc_library(
+    name = "port_base",
+    srcs = [
+        "env.cc",
+        "logger.cc",
+    ],
+    deps = [
+        ":port_api",
+        "//mace/utils",
+    ],
+)
+
+cc_test(
+    name = "port_test",
+    testonly = 1,
+    srcs = glob([
+        "*_test.cc",
+    ]),
+    linkstatic = 1,
+    deps = [
+        ":port",
+        "@gtest//:gtest",
+        "@gtest//:gtest_main",
+    ],
+)
diff --git a/mace/port/README.md b/mace/port/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9ecfff01571db8bcfa4733e1f4e4979763b81d8d
--- /dev/null
+++ b/mace/port/README.md
@@ -0,0 +1,14 @@
+# port
+
+This module contains the interface and implementations for different
+platforms. All platform-specific code should go here; other modules must
+not use non-standard headers.
+
+The module is split into `port_api` and `port`. `port_api` is the interface,
+and it must not depend on any other module, including `utils`.
+
+If the code base grows large in the future, it should be split into core and
+test parts to keep the footprint of the production libraries small.
+
+Currently Linux and Darwin (macOS, iOS, etc.) are both treated as POSIX;
+they can be handled separately later if needed.
diff --git a/mace/port/android/BUILD.bazel b/mace/port/android/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..fd5aacc51f3653a32a6fa4b5f5752772d6dd20bc
--- /dev/null
+++ b/mace/port/android/BUILD.bazel
@@ -0,0 +1,22 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"]) # Apache 2.0
+
+load("//mace:mace.bzl", "if_android")
+
+cc_library(
+    name = "port_android",
+    srcs = if_android(glob([
+        "*.cc",
+    ])),
+    hdrs = if_android(glob([
+        "*.h",
+    ])),
+    deps = [
+        "//mace/port:port_base",
+        "//mace/port/posix:port_posix",
+    ],
+    alwayslink = 1,
+)
diff --git a/mace/port/android/env.cc b/mace/port/android/env.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2940d344cf3a2d8f3b2fdafe72ef85904e4db442
--- /dev/null
+++ b/mace/port/android/env.cc
@@ -0,0 +1,204 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
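+
+// AndroidEnv implements the mace::port::Env interface declared in
+// mace/port/env.h. As described in mace/port/README.md, other modules reach
+// this code only through Env::Default() and the inline wrappers in env.h,
+// e.g. (illustrative) mace::NowMicros().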
+ +#include "mace/port/android/env.h" + +#include +#include +#include +#include +#include +#include + +#ifdef __hexagon__ +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "mace/port/android/malloc_logger.h" +#include "mace/port/posix/time.h" +#include "mace/utils/macros.h" +#include "mace/utils/memory.h" +#include "mace/utils/logging.h" + +namespace mace { +namespace port { + +int64_t AndroidEnv::NowMicros() { +#ifdef __hexagon__ + return HAP_perf_get_time_us(); +#else + return mace::port::posix::NowMicros(); +#endif +} + +FileSystem *AndroidEnv::GetFileSystem() { + return &posix_file_system_; +} + +LogWriter *AndroidEnv::GetLogWriter() { + return &log_writer_; +} + +namespace { + +int GetCPUCount() { + int cpu_count = 0; + std::string cpu_sys_conf = "/proc/cpuinfo"; + std::ifstream f(cpu_sys_conf); + if (!f.is_open()) { + LOG(ERROR) << "failed to open " << cpu_sys_conf; + return -1; + } + std::string line; + const std::string processor_key = "processor"; + while (std::getline(f, line)) { + if (line.size() >= processor_key.size() + && line.compare(0, processor_key.size(), processor_key) == 0) { + ++cpu_count; + } + } + if (f.bad()) { + LOG(ERROR) << "failed to read " << cpu_sys_conf; + } + if (!f.eof()) { + LOG(ERROR) << "failed to read end of " << cpu_sys_conf; + } + f.close(); + VLOG(1) << "CPU cores: " << cpu_count; + return cpu_count; +} + +struct BacktraceState { + void** current; + void** end; +}; + +_Unwind_Reason_Code UnwindCallback(struct _Unwind_Context* context, void* arg) { + BacktraceState* state = static_cast(arg); + uintptr_t pc = _Unwind_GetIP(context); + if (pc) { + if (state->current == state->end) { + return _URC_END_OF_STACK; + } else { + *state->current++ = reinterpret_cast(pc); + } + } + return _URC_NO_REASON; +} + +size_t BackTrace(void** buffer, size_t max) { + BacktraceState state = {buffer, buffer + max}; + _Unwind_Backtrace(UnwindCallback, &state); + + return state.current - buffer; +} + +} // namespace + +MaceStatus AndroidEnv::GetCPUMaxFreq(std::vector *max_freqs) { + MACE_CHECK_NOTNULL(max_freqs); + int cpu_count = GetCPUCount(); + if (cpu_count < 0) { + return MaceStatus::MACE_RUNTIME_ERROR; + } + for (int cpu_id = 0; cpu_id < cpu_count; ++cpu_id) { + std::string cpuinfo_max_freq_sys_conf = MakeString( + "/sys/devices/system/cpu/cpu", + cpu_id, + "/cpufreq/cpuinfo_max_freq"); + std::ifstream f(cpuinfo_max_freq_sys_conf); + if (!f.is_open()) { + LOG(ERROR) << "failed to open " << cpuinfo_max_freq_sys_conf; + return MaceStatus::MACE_RUNTIME_ERROR; + } + std::string line; + if (std::getline(f, line)) { + float freq = strtof(line.c_str(), nullptr); + max_freqs->push_back(freq); + } + if (f.bad()) { + LOG(ERROR) << "failed to read " << cpuinfo_max_freq_sys_conf; + } + f.close(); + } + + VLOG(1) << "CPU freq: " << MakeString(*max_freqs); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus AndroidEnv::SchedSetAffinity(const std::vector &cpu_ids) { + // compute mask + cpu_set_t mask; + CPU_ZERO(&mask); + for (auto cpu_id : cpu_ids) { + CPU_SET(cpu_id, &mask); + } + pid_t pid = gettid(); + int err = sched_setaffinity(pid, sizeof(mask), &mask); + if (err) { + LOG(WARNING) << "SchedSetAffinity failed: " << strerror(errno); + return MaceStatus(MaceStatus::MACE_INVALID_ARGS, + "SchedSetAffinity failed: " + + std::string(strerror(errno))); + } + + return MaceStatus::MACE_SUCCESS; +} + +std::vector AndroidEnv::GetBackTraceUnsafe(int max_steps) { + std::vector buffer(max_steps, 0); + int steps = 
BackTrace(buffer.data(), max_steps);
+
+  std::vector<std::string> bt;
+  for (int i = 0; i < steps; ++i) {
+    std::ostringstream os;
+
+    const void* addr = buffer[i];
+    const char* symbol = "";
+    Dl_info info;
+    if (dladdr(addr, &info) && info.dli_sname) {
+      symbol = info.dli_sname;
+    }
+
+    os << "pc " << addr << " " << symbol;
+
+    bt.push_back(os.str());
+  }
+
+  return bt;
+}
+
+std::unique_ptr<MallocLogger> AndroidEnv::NewMallocLogger(
+    std::ostringstream *oss,
+    const std::string &name) {
+  return make_unique<AndroidMallocLogger>(oss, name);
+}
+
+Env *Env::Default() {
+  static AndroidEnv android_env;
+  return &android_env;
+}
+
+} // namespace port
+} // namespace mace
diff --git a/mace/port/android/env.h b/mace/port/android/env.h
new file mode 100644
index 0000000000000000000000000000000000000000..c51c57727d999ee2709fa14302ac51a7dbe021cf
--- /dev/null
+++ b/mace/port/android/env.h
@@ -0,0 +1,49 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_PORT_ANDROID_ENV_H_
+#define MACE_PORT_ANDROID_ENV_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "mace/port/android/logger.h"
+#include "mace/port/posix/file_system.h"
+#include "mace/port/env.h"
+
+namespace mace {
+namespace port {
+
+class AndroidEnv : public Env {
+ public:
+  int64_t NowMicros() override;
+  MaceStatus GetCPUMaxFreq(std::vector<float> *max_freqs) override;
+  MaceStatus SchedSetAffinity(const std::vector<size_t> &cpu_ids) override;
+  FileSystem *GetFileSystem() override;
+  LogWriter *GetLogWriter() override;
+  std::vector<std::string> GetBackTraceUnsafe(int max_steps) override;
+  std::unique_ptr<MallocLogger> NewMallocLogger(
+      std::ostringstream *oss,
+      const std::string &name) override;
+
+ private:
+  PosixFileSystem posix_file_system_;
+  AndroidLogWriter log_writer_;
+};
+
+} // namespace port
+} // namespace mace
+
+#endif // MACE_PORT_ANDROID_ENV_H_
diff --git a/mace/port/android/logger.cc b/mace/port/android/logger.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e6f57f4336f3313ef7624d1e32615ad7ba725d9a
--- /dev/null
+++ b/mace/port/android/logger.cc
@@ -0,0 +1,58 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
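+
+// Maps MACE severities onto Android log priorities, then delegates to the
+// base LogWriter so each message is also mirrored to the console for
+// standalone (non-logcat) runs.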
+ +#include "mace/port/android/logger.h" + +#include + +#include + +namespace mace { +namespace port { + +void AndroidLogWriter::WriteLogMessage(const char *fname, + const int line, + const LogLevel severity, + const char *message) { + int android_log_level; + switch (severity) { + case INFO: + android_log_level = ANDROID_LOG_INFO; + break; + case WARNING: + android_log_level = ANDROID_LOG_WARN; + break; + case ERROR: + android_log_level = ANDROID_LOG_ERROR; + break; + case FATAL: + android_log_level = ANDROID_LOG_FATAL; + break; + default: + android_log_level = ANDROID_LOG_ERROR; + break; + } + + std::stringstream ss; + const char *const partial_name = strrchr(fname, '/'); + ss << (partial_name != nullptr ? partial_name + 1 : fname) << ":" << line + << " " << message; + __android_log_write(android_log_level, "MACE", ss.str().c_str()); + + // Also log to stderr (for standalone Android apps) and abort. + LogWriter::WriteLogMessage(fname, line, severity, message); +} + +} // namespace port +} // namespace mace diff --git a/mace/port/android/logger.h b/mace/port/android/logger.h new file mode 100644 index 0000000000000000000000000000000000000000..fccfb83515c360bf61245dfcabfe73776b7702a7 --- /dev/null +++ b/mace/port/android/logger.h @@ -0,0 +1,34 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_PORT_ANDROID_LOGGER_H_ +#define MACE_PORT_ANDROID_LOGGER_H_ + +#include "mace/port/logger.h" + +namespace mace { +namespace port { + +class AndroidLogWriter : public LogWriter { + protected: + void WriteLogMessage(const char *fname, + const int line, + const LogLevel severity, + const char *message) override; +}; + +} // namespace port +} // namespace mace + +#endif // MACE_PORT_ANDROID_LOGGER_H_ diff --git a/mace/port/android/malloc_logger.cc b/mace/port/android/malloc_logger.cc new file mode 100644 index 0000000000000000000000000000000000000000..afaef724309d103ed15ac584b8f41c49d92c363d --- /dev/null +++ b/mace/port/android/malloc_logger.cc @@ -0,0 +1,100 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/port/android/malloc_logger.h" + +#include + +#include +#include + +namespace mace { +namespace port { + +namespace { +struct mallinfo LogMallinfoChange(std::ostringstream *oss, + const std::string &name, + const struct mallinfo curr, + const struct mallinfo prev) { + if (prev.arena != curr.arena) { + (*oss) << "[" << name << "] " + << "Non-mmapped space allocated (bytes): " << curr.arena + << ", diff: " << ((int64_t)curr.arena - (int64_t)prev.arena); + } + if (prev.ordblks != curr.ordblks) { + (*oss) << "[" << name << "] " + << "Number of free chunks: " << curr.ordblks << ", diff: " + << ((int64_t)curr.ordblks - (int64_t)prev.ordblks); + } + if (prev.smblks != curr.smblks) { + (*oss) << "[" << name << "] " + << "Number of free fastbin blocks: " << curr.smblks + << ", diff: " << ((int64_t)curr.smblks - (int64_t)prev.smblks); + } + if (prev.hblks != curr.hblks) { + (*oss) << "[" << name << "] " + << "Number of mmapped regions: " << curr.hblks + << ", diff: " << ((int64_t)curr.hblks - (int64_t)prev.hblks); + } + if (prev.hblkhd != curr.hblkhd) { + (*oss) << "[" << name << "] " + << "Space allocated in mmapped regions (bytes): " << curr.hblkhd + << ", diff: " << ((int64_t)curr.hblkhd - (int64_t)prev.hblkhd); + } + if (prev.usmblks != curr.usmblks) { + (*oss) << "[" << name << "] " + << "Maximum total allocated space (bytes): " << curr.usmblks + << ", diff: " + << ((int64_t)curr.usmblks - (int64_t)prev.usmblks); + } + if (prev.fsmblks != curr.fsmblks) { + (*oss) << "[" << name << "] " + << "Space in freed fastbin blocks (bytes): " << curr.fsmblks + << ", diff: " + << ((int64_t)curr.fsmblks - (int64_t)prev.fsmblks); + } + if (prev.uordblks != curr.uordblks) { + (*oss) << "[" << name << "] " + << "Total allocated space (bytes): " << curr.uordblks + << ", diff: " + << ((int64_t)curr.uordblks - (int64_t)prev.uordblks); + } + if (prev.fordblks != curr.fordblks) { + (*oss) << "[" << name << "] " + << "Total free space (bytes): " << curr.fordblks << ", diff: " + << ((int64_t)curr.fordblks - (int64_t)prev.fordblks); + } + if (prev.keepcost != curr.keepcost) { + (*oss) << "[" << name << "] " + << "Top-most, releasable space (bytes): " << curr.keepcost + << ", diff: " + << ((int64_t)curr.keepcost - (int64_t)prev.keepcost); + } + return curr; +} +} // namespace + +AndroidMallocLogger::AndroidMallocLogger(std::ostringstream *oss, + const std::string &name) : + oss_(oss), name_(name) { + prev_ = mallinfo(); +} + +AndroidMallocLogger::~AndroidMallocLogger() { + struct mallinfo curr = mallinfo(); + LogMallinfoChange(oss_, name_, curr, prev_); +} + +} // namespace port +} // namespace mace diff --git a/mace/port/android/malloc_logger.h b/mace/port/android/malloc_logger.h new file mode 100644 index 0000000000000000000000000000000000000000..9bc7052455b1a8445fa8b0719a82c956a6436ea4 --- /dev/null +++ b/mace/port/android/malloc_logger.h @@ -0,0 +1,43 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_PORT_ANDROID_MALLOC_LOGGER_H_ +#define MACE_PORT_ANDROID_MALLOC_LOGGER_H_ + +#include + +#include + +#include "mace/port/env.h" + +namespace mace { +namespace port { + +class AndroidMallocLogger : public MallocLogger { + public: + explicit AndroidMallocLogger(std::ostringstream *oss, + const std::string &name); + ~AndroidMallocLogger() override; + + private: + std::ostringstream *oss_; + const std::string name_; + struct mallinfo prev_; +}; + +} // namespace port +} // namespace mace + + +#endif // MACE_PORT_ANDROID_MALLOC_LOGGER_H_ diff --git a/mace/port/darwin/BUILD.bazel b/mace/port/darwin/BUILD.bazel new file mode 100644 index 0000000000000000000000000000000000000000..987dafd16ea22f3f8b5b97052d0672f18c81c98d --- /dev/null +++ b/mace/port/darwin/BUILD.bazel @@ -0,0 +1,22 @@ +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) # Apache 2.0 + +load("//mace:mace.bzl", "if_darwin") + +cc_library( + name = "port_darwin", + srcs = if_darwin(glob([ + "*.cc", + ])), + hdrs = if_darwin(glob([ + "*.h", + ])), + deps = [ + "//mace/port:port_base", + "//mace/port/posix:port_posix", + ], + alwayslink = 1, +) diff --git a/mace/port/darwin/env.cc b/mace/port/darwin/env.cc new file mode 100644 index 0000000000000000000000000000000000000000..f951e64753b9736705b67153a7ef3ba82cb72e73 --- /dev/null +++ b/mace/port/darwin/env.cc @@ -0,0 +1,53 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/port/darwin/env.h" + +#include +#include + +#include +#include +#include + +#include "mace/port/posix/backtrace.h" +#include "mace/port/posix/file_system.h" +#include "mace/port/posix/time.h" + +namespace mace { +namespace port { + +int64_t DarwinEnv::NowMicros() { + return mace::port::posix::NowMicros(); +} + +FileSystem *DarwinEnv::GetFileSystem() { + return &posix_file_system_; +} + +LogWriter *DarwinEnv::GetLogWriter() { + return &log_writer_; +} + +std::vector DarwinEnv::GetBackTraceUnsafe(int max_steps) { + return mace::port::posix::GetBackTraceUnsafe(max_steps); +} + +Env *Env::Default() { + static DarwinEnv darwin_env; + return &darwin_env; +} + +} // namespace port +} // namespace mace diff --git a/mace/port/darwin/env.h b/mace/port/darwin/env.h new file mode 100644 index 0000000000000000000000000000000000000000..667cf9f0a0e2f102c1ddc183605eea1f22dfa0c6 --- /dev/null +++ b/mace/port/darwin/env.h @@ -0,0 +1,43 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_PORT_DARWIN_ENV_H_
+#define MACE_PORT_DARWIN_ENV_H_
+
+#include <string>
+#include <vector>
+
+#include "mace/port/env.h"
+#include "mace/port/logger.h"
+#include "mace/port/posix/file_system.h"
+
+namespace mace {
+namespace port {
+
+class DarwinEnv : public Env {
+ public:
+  int64_t NowMicros() override;
+  FileSystem *GetFileSystem() override;
+  LogWriter *GetLogWriter() override;
+  std::vector<std::string> GetBackTraceUnsafe(int max_steps) override;
+
+ private:
+  PosixFileSystem posix_file_system_;
+  LogWriter log_writer_;
+};
+
+} // namespace port
+} // namespace mace
+
+#endif // MACE_PORT_DARWIN_ENV_H_
diff --git a/mace/port/env.cc b/mace/port/env.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b78e1c82d4d417ccc2d2be9e2dc24cd3867e4cc1
--- /dev/null
+++ b/mace/port/env.cc
@@ -0,0 +1,40 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/port/env.h"
+
+#include <memory>
+
+#include "mace/utils/memory.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace port {
+
+MaceStatus Env::GetCPUMaxFreq(std::vector<float> *max_freqs) {
+  return MaceStatus::MACE_UNSUPPORTED;
+}
+
+MaceStatus Env::SchedSetAffinity(const std::vector<size_t> &cpu_ids) {
+  return MaceStatus::MACE_UNSUPPORTED;
+}
+
+std::unique_ptr<MallocLogger> Env::NewMallocLogger(
+    std::ostringstream *oss,
+    const std::string &name) {
+  return make_unique<MallocLogger>();
+}
+
+} // namespace port
+} // namespace mace
diff --git a/mace/port/env.h b/mace/port/env.h
new file mode 100644
index 0000000000000000000000000000000000000000..af98cc5a5bc61e40fdc52edc04376aac80c2f740
--- /dev/null
+++ b/mace/port/env.h
@@ -0,0 +1,75 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
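+
+// Env is the platform service interface: each supported platform links in
+// exactly one Env::Default() definition (Android, Darwin, Linux). A minimal
+// illustrative use via the inline wrappers declared below:
+//
+//   std::vector<float> freqs;
+//   MaceStatus s = mace::GetCPUMaxFreq(&freqs);  // MACE_UNSUPPORTED if the
+//                                                // platform lacks support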
+
+#ifndef MACE_PORT_ENV_H_
+#define MACE_PORT_ENV_H_
+
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace port {
+
+class MallocLogger {
+ public:
+  MallocLogger() = default;
+  virtual ~MallocLogger() = default;
+};
+
+class FileSystem;
+class LogWriter;
+
+class Env {
+ public:
+  virtual int64_t NowMicros() = 0;
+  virtual MaceStatus GetCPUMaxFreq(std::vector<float> *max_freqs);
+  virtual MaceStatus SchedSetAffinity(const std::vector<size_t> &cpu_ids);
+  virtual FileSystem *GetFileSystem() = 0;
+  virtual LogWriter *GetLogWriter() = 0;
+  // Returns the current backtrace; memory is allocated inside the call,
+  // which may itself fail.
+  virtual std::vector<std::string> GetBackTraceUnsafe(int max_steps) = 0;
+  virtual std::unique_ptr<MallocLogger> NewMallocLogger(
+      std::ostringstream *oss,
+      const std::string &name);
+
+  static Env *Default();
+};
+
+} // namespace port
+
+inline int64_t NowMicros() {
+  return port::Env::Default()->NowMicros();
+}
+
+inline MaceStatus GetCPUMaxFreq(std::vector<float> *max_freqs) {
+  return port::Env::Default()->GetCPUMaxFreq(max_freqs);
+}
+
+inline MaceStatus SchedSetAffinity(const std::vector<size_t> &cpu_ids) {
+  return port::Env::Default()->SchedSetAffinity(cpu_ids);
+}
+
+inline port::FileSystem *GetFileSystem() {
+  return port::Env::Default()->GetFileSystem();
+}
+
+} // namespace mace
+
+#endif // MACE_PORT_ENV_H_
diff --git a/mace/port/env_test.cc b/mace/port/env_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d23b5787a231d09722470efcf42f9ce9eedc2c13
--- /dev/null
+++ b/mace/port/env_test.cc
@@ -0,0 +1,41 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/port/env.h"
+
+#include <gtest/gtest.h>
+
+namespace mace {
+namespace {
+
+class EnvTest : public ::testing::Test {
+};
+
+TEST_F(EnvTest, NowMicros) {
+  EXPECT_GT(NowMicros(), 0);
+}
+
+TEST_F(EnvTest, GetFileSystem) {
+  GetFileSystem();
+}
+
+TEST_F(EnvTest, CPUInfo) {
+  std::vector<float> freq;
+  GetCPUMaxFreq(&freq);
+  std::vector<size_t> cpu_ids;
+  SchedSetAffinity(cpu_ids);
+}
+
+} // namespace
+} // namespace mace
diff --git a/mace/port/file_system.h b/mace/port/file_system.h
new file mode 100644
index 0000000000000000000000000000000000000000..91b6f458d3a021c9163e7cc07c1404805f2aae43
--- /dev/null
+++ b/mace/port/file_system.h
@@ -0,0 +1,45 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
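+
+// ReadOnlyMemoryRegion is a zero-copy, read-only view of a file (mmap-backed
+// in the POSIX implementation). Illustrative use, with a hypothetical file
+// name:
+//
+//   std::unique_ptr<port::ReadOnlyMemoryRegion> region;
+//   GetFileSystem()->NewReadOnlyMemoryRegionFromFile("model.data", &region);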
+ +#ifndef MACE_PORT_FILE_SYSTEM_H_ +#define MACE_PORT_FILE_SYSTEM_H_ + +#include +#include + +#include "mace/public/mace.h" + +namespace mace { +namespace port { + +class ReadOnlyMemoryRegion { + public: + ReadOnlyMemoryRegion() = default; + virtual ~ReadOnlyMemoryRegion() = default; + virtual const void *data() = 0; + virtual uint64_t length() = 0; +}; + +class FileSystem { + public: + FileSystem() = default; + virtual ~FileSystem() = default; + virtual MaceStatus NewReadOnlyMemoryRegionFromFile(const char *fname, + std::unique_ptr* result) = 0; +}; + +} // namespace port +} // namespace mace + +#endif // MACE_PORT_FILE_SYSTEM_H_ diff --git a/mace/port/linux/BUILD.bazel b/mace/port/linux/BUILD.bazel new file mode 100644 index 0000000000000000000000000000000000000000..5d1351baf844c4e90f6259fddb97b6217dd769b2 --- /dev/null +++ b/mace/port/linux/BUILD.bazel @@ -0,0 +1,22 @@ +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) # Apache 2.0 + +load("//mace:mace.bzl", "if_linux") + +cc_library( + name = "port_linux", + srcs = if_linux(glob([ + "*.cc", + ])), + hdrs = if_linux(glob([ + "*.h", + ])), + deps = [ + "//mace/port:port_base", + "//mace/port/posix:port_posix", + ], + alwayslink = 1, +) diff --git a/mace/port/linux/env.cc b/mace/port/linux/env.cc new file mode 100644 index 0000000000000000000000000000000000000000..2a50b4a1198049d5610f3daad5b33f47efb97c4a --- /dev/null +++ b/mace/port/linux/env.cc @@ -0,0 +1,53 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/port/linux/env.h" + +#include +#include + +#include +#include +#include + +#include "mace/port/posix/backtrace.h" +#include "mace/port/posix/file_system.h" +#include "mace/port/posix/time.h" + +namespace mace { +namespace port { + +int64_t LinuxEnv::NowMicros() { + return mace::port::posix::NowMicros(); +} + +FileSystem *LinuxEnv::GetFileSystem() { + return &posix_file_system_; +} + +LogWriter *LinuxEnv::GetLogWriter() { + return &log_writer_; +} + +std::vector LinuxEnv::GetBackTraceUnsafe(int max_steps) { + return mace::port::posix::GetBackTraceUnsafe(max_steps); +} + +Env *Env::Default() { + static LinuxEnv linux_env; + return &linux_env; +} + +} // namespace port +} // namespace mace diff --git a/mace/port/linux/env.h b/mace/port/linux/env.h new file mode 100644 index 0000000000000000000000000000000000000000..5d1d243a1ab616c3bf13d6d9069147e7cced4519 --- /dev/null +++ b/mace/port/linux/env.h @@ -0,0 +1,43 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_PORT_LINUX_ENV_H_ +#define MACE_PORT_LINUX_ENV_H_ + +#include +#include + +#include "mace/port/env.h" +#include "mace/port/logger.h" +#include "mace/port/posix/file_system.h" + +namespace mace { +namespace port { + +class LinuxEnv : public Env { + public: + int64_t NowMicros() override; + FileSystem *GetFileSystem() override; + LogWriter *GetLogWriter() override; + std::vector GetBackTraceUnsafe(int max_steps) override; + + private: + PosixFileSystem posix_file_system_; + LogWriter log_writer_; +}; + +} // namespace port +} // namespace mace + +#endif // MACE_PORT_LINUX_ENV_H_ diff --git a/mace/port/logger.cc b/mace/port/logger.cc new file mode 100644 index 0000000000000000000000000000000000000000..b02f6f4455d92a275470c0c762edf20c257d2b38 --- /dev/null +++ b/mace/port/logger.cc @@ -0,0 +1,115 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/port/logger.h" + +#include +#include +#include +#include + +#include "mace/port/env.h" +#include "mace/utils/string_util.h" + +namespace mace { +namespace port { + +inline bool IsValidLogLevel(const LogLevel level) { + return level > LogLevel::INVALID_MIN && + level < LogLevel::INVALID_MAX; +} + +LogLevel LogLevelFromStr(const char *log_level_str) { + if (log_level_str != nullptr) { + std::string ls = ToUpper(log_level_str); + + if (ls == "I" || ls == "INFO") { + return LogLevel::INFO; + } + if (ls == "W" || ls == "WARNING") { + return LogLevel::WARNING; + } + if (ls == "E" || ls == "ERROR") { + return LogLevel::ERROR; + } + if (ls == "F" || ls == "FATAL") { + return LogLevel::FATAL; + } + } + + return LogLevel::INVALID_MIN; +} + +char LogLevelToShortStr(LogLevel level) { + if (!IsValidLogLevel(level)) { + level = LogLevel::INFO; + } + + return "IWEF"[static_cast(level) - 1]; +} + +int VLogLevelFromStr(const char *vlog_level_str) { + if (vlog_level_str != nullptr) { + return atoi(vlog_level_str); + } + + return 0; +} + + +void LogWriter::WriteLogMessage(const char *fname, + const int line, + const LogLevel severity, + const char *message) { + printf("%c %s:%d] %s\n", LogLevelToShortStr(severity), fname, line, message); +} + +Logger::Logger(const char *fname, int line, LogLevel severity) + : fname_(fname), line_(line), severity_(severity) {} + +void Logger::GenerateLogMessage() { + LogWriter *log_writer = Env::Default()->GetLogWriter(); + log_writer->WriteLogMessage(fname_, line_, severity_, str().c_str()); + + // When there is a fatal log, terminate execution + if (severity_ == LogLevel::FATAL) { + DealWithFatal(); + } +} + +void Logger::DealWithFatal() { + // When there is a fatal log, log the backtrace and abort. + LogWriter *log_writer = Env::Default()->GetLogWriter(); + std::vector bt = Env::Default()->GetBackTraceUnsafe(50); + if (!bt.empty()) { + log_writer->WriteLogMessage(fname_, line_, severity_, "backtrace:"); + for (size_t i = 0; i < bt.size(); ++i) { + std::ostringstream os; + os << " " << bt[i]; + log_writer->WriteLogMessage(fname_, line_, severity_, os.str().c_str()); + } + } + + abort(); +} + +Logger::~Logger() { + static const LogLevel min_log_level = MinLogLevelFromEnv(); + if (LogLevelPassThreashold(severity_, min_log_level)) { + GenerateLogMessage(); + } +} + +} // namespace port +} // namespace mace diff --git a/mace/port/logger.h b/mace/port/logger.h new file mode 100644 index 0000000000000000000000000000000000000000..08bcbbe4a8c3447332d897d602e05d9a38f6659e --- /dev/null +++ b/mace/port/logger.h @@ -0,0 +1,95 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#ifndef MACE_PORT_LOGGER_H_
+#define MACE_PORT_LOGGER_H_
+
+#include <cstdlib>
+#include <sstream>
+#include <string>
+
+namespace mace {
+
+enum LogLevel {
+  INVALID_MIN = 0,
+  INFO = 1,
+  WARNING = 2,
+  ERROR = 3,
+  FATAL = 4,
+  INVALID_MAX,
+};
+
+namespace port {
+
+inline bool LogLevelPassThreashold(const LogLevel level,
+                                   const LogLevel threshold) {
+  return level >= threshold;
+}
+
+LogLevel LogLevelFromStr(const char *log_level_str);
+int VLogLevelFromStr(const char *vlog_level_str);
+
+inline LogLevel MinLogLevelFromEnv() {
+  // Read the min log level from env once during the first call to logging.
+  static LogLevel log_level = LogLevelFromStr(getenv("MACE_CPP_MIN_LOG_LEVEL"));
+  return log_level;
+}
+
+inline int MinVLogLevelFromEnv() {
+  // Read the min vlog level from env once during the first call to logging.
+  static int vlog_level = VLogLevelFromStr(getenv("MACE_CPP_MIN_VLOG_LEVEL"));
+  return vlog_level;
+}
+
+class LogWriter {
+ public:
+  LogWriter() = default;
+  virtual ~LogWriter() = default;
+  virtual void WriteLogMessage(const char *fname,
+                               const int line,
+                               const LogLevel severity,
+                               const char *message);
+};
+
+class Logger : public std::ostringstream {
+ public:
+  Logger(const char *fname, int line, LogLevel severity);
+  ~Logger();
+
+ private:
+  void GenerateLogMessage();
+  void DealWithFatal();
+
+  const char *fname_;
+  int line_;
+  LogLevel severity_;
+};
+
+} // namespace port
+
+// Whether the log level passes the env-configured threshold; can be used
+// for short-circuiting.
+inline bool ShouldGenerateLogMessage(LogLevel severity) {
+  LogLevel threshold = port::MinLogLevelFromEnv();
+  return port::LogLevelPassThreashold(severity, threshold);
+}
+
+inline bool ShouldGenerateVLogMessage(int vlog_level) {
+  int threshold = port::MinVLogLevelFromEnv();
+  return ShouldGenerateLogMessage(INFO) &&
+         vlog_level <= threshold;
+}
+} // namespace mace
+
+#endif // MACE_PORT_LOGGER_H_
diff --git a/mace/port/logger_test.cc b/mace/port/logger_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..93df626ba8a2d61821dd68f07b3a3823fff8a5de
--- /dev/null
+++ b/mace/port/logger_test.cc
@@ -0,0 +1,44 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
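+
+// Sanity checks for the string -> LogLevel parsing used by
+// MACE_CPP_MIN_LOG_LEVEL: the short form ("W") and the full form ("WARNING")
+// must select the same severity.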
+ +#include "mace/port/logger.h" + +#include + +namespace mace { +namespace { + +class LoggerTest : public ::testing::Test { +}; + +TEST_F(LoggerTest, LogLevel) { + EXPECT_EQ(INFO, port::LogLevelFromStr("i")); + EXPECT_EQ(INFO, port::LogLevelFromStr("I")); + EXPECT_EQ(INFO, port::LogLevelFromStr("INFO")); + + EXPECT_EQ(WARNING, port::LogLevelFromStr("w")); + EXPECT_EQ(WARNING, port::LogLevelFromStr("W")); + EXPECT_EQ(WARNING, port::LogLevelFromStr("WARNING")); + + EXPECT_EQ(ERROR, port::LogLevelFromStr("e")); + EXPECT_EQ(ERROR, port::LogLevelFromStr("E")); + EXPECT_EQ(ERROR, port::LogLevelFromStr("ERROR")); + + EXPECT_EQ(FATAL, port::LogLevelFromStr("f")); + EXPECT_EQ(FATAL, port::LogLevelFromStr("F")); + EXPECT_EQ(FATAL, port::LogLevelFromStr("FATAL")); +} + +} // namespace +} // namespace mace diff --git a/mace/port/posix/BUILD.bazel b/mace/port/posix/BUILD.bazel new file mode 100644 index 0000000000000000000000000000000000000000..321a18a516d88749bde3e5cf2677fda941f12480 --- /dev/null +++ b/mace/port/posix/BUILD.bazel @@ -0,0 +1,19 @@ +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) # Apache 2.0 + +cc_library( + name = "port_posix", + srcs = glob([ + "*.cc", + ]), + hdrs = glob([ + "*.h", + ]), + deps = [ + "//mace/port:port_base", + "//mace/utils", + ], +) diff --git a/mace/port/posix/backtrace.h b/mace/port/posix/backtrace.h new file mode 100644 index 0000000000000000000000000000000000000000..d96419319f874b34149a25493ca44ecd22680976 --- /dev/null +++ b/mace/port/posix/backtrace.h @@ -0,0 +1,45 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_PORT_POSIX_BACKTRACE_H_ +#define MACE_PORT_POSIX_BACKTRACE_H_ + +#include + +#include +#include + +namespace mace { +namespace port { +namespace posix { + +inline std::vector GetBackTraceUnsafe(int max_steps) { + std::vector buffer(max_steps, 0); + int steps = backtrace(buffer.data(), max_steps); + + std::vector bt; + char **symbols = backtrace_symbols(buffer.data(), steps); + if (symbols != nullptr) { + for (int i = 0; i < steps; i++) { + bt.push_back(symbols[i]); + } + } + return bt; +} + +} // namespace posix +} // namespace port +} // namespace mace + +#endif // MACE_PORT_POSIX_BACKTRACE_H_ diff --git a/mace/port/posix/file_system.cc b/mace/port/posix/file_system.cc new file mode 100644 index 0000000000000000000000000000000000000000..a7873b9635e9754568df63ccd7a23491e5d49f30 --- /dev/null +++ b/mace/port/posix/file_system.cc @@ -0,0 +1,80 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/port/posix/file_system.h"
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <memory>
+#include <string>
+
+#include "mace/utils/memory.h"
+
+namespace mace {
+namespace port {
+
+namespace {
+class PosixReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
+ public:
+  PosixReadOnlyMemoryRegion() = delete;
+  PosixReadOnlyMemoryRegion(const void* addr, uint64_t length)
+    : addr_(addr), length_(length) {}
+  ~PosixReadOnlyMemoryRegion() override {
+    if (length_ > 0) {
+      munmap(const_cast<void *>(addr_), length_);
+    }
+  };
+  const void *data() override { return addr_; };
+  uint64_t length() override { return length_; };
+
+ private:
+  const void *addr_;
+  const uint64_t length_;
+};
+} // namespace
+
+MaceStatus PosixFileSystem::NewReadOnlyMemoryRegionFromFile(
+    const char *fname,
+    std::unique_ptr<ReadOnlyMemoryRegion>* result) {
+  MaceStatus s = MaceStatus(MaceStatus::MACE_SUCCESS);
+  int fd = open(fname, O_RDONLY);
+  if (fd < 0) {
+    // TODO(heliangliang) check errno
+    s = MaceStatus(MaceStatus::MACE_RUNTIME_ERROR);
+  } else {
+    struct stat st;
+    fstat(fd, &st);
+    if (st.st_size > 0) {
+      const void* address =
+          mmap(nullptr, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+      if (address == MAP_FAILED) {
+        // TODO(heliangliang) check errno
+        s = MaceStatus(MaceStatus::MACE_RUNTIME_ERROR);
+      } else {
+        *result = make_unique<PosixReadOnlyMemoryRegion>(address, st.st_size);
+      }
+    } else {
+      // Empty file: mmap returns EINVAL (since Linux 2.6.12) when length is 0
+      *result = make_unique<PosixReadOnlyMemoryRegion>(nullptr, 0);
+    }
+    // Close the descriptor on both branches; the mapping stays valid.
+    close(fd);
+  }
+  return s;
+}
+
+} // namespace port
+} // namespace mace
diff --git a/mace/port/posix/file_system.h b/mace/port/posix/file_system.h
new file mode 100644
index 0000000000000000000000000000000000000000..8eb370757fcce9a558b993ac7f80e2d0ca1d2024
--- /dev/null
+++ b/mace/port/posix/file_system.h
@@ -0,0 +1,37 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#ifndef MACE_PORT_POSIX_FILE_SYSTEM_H_ +#define MACE_PORT_POSIX_FILE_SYSTEM_H_ + +#include +#include + +#include "mace/port/file_system.h" + +namespace mace { +namespace port { + +class PosixFileSystem : public FileSystem { + public: + PosixFileSystem() = default; + ~PosixFileSystem() override = default; + MaceStatus NewReadOnlyMemoryRegionFromFile(const char *fname, + std::unique_ptr* result) override; +}; + +} // namespace port +} // namespace mace + +#endif // MACE_PORT_POSIX_FILE_SYSTEM_H_ diff --git a/mace/utils/env_time.h b/mace/port/posix/time.h similarity index 72% rename from mace/utils/env_time.h rename to mace/port/posix/time.h index 18d6e5a6ad6229284a2ae2e3e2fbbeb50fc952d7..84ab478a9580ad67618d53517a8f87afc4f2699b 100644 --- a/mace/utils/env_time.h +++ b/mace/port/posix/time.h @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,28 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_UTILS_ENV_TIME_H_ -#define MACE_UTILS_ENV_TIME_H_ +#ifndef MACE_PORT_POSIX_TIME_H_ +#define MACE_PORT_POSIX_TIME_H_ -#include -#ifdef __hexagon__ -#include -#else #include -#endif + +#include namespace mace { +namespace port { +namespace posix { inline int64_t NowMicros() { -#ifdef __hexagon__ - return HAP_perf_get_time_us(); -#else struct timeval tv; gettimeofday(&tv, nullptr); return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; -#endif } +} // namespace posix +} // namespace port } // namespace mace -#endif // MACE_UTILS_ENV_TIME_H_ +#endif // MACE_PORT_POSIX_TIME_H_ diff --git a/mace/proto/BUILD b/mace/proto/BUILD.bazel similarity index 100% rename from mace/proto/BUILD rename to mace/proto/BUILD.bazel diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index 530de3aedfcd6a94d9ee840f8e368a4447d6cd8c..d3b564fc6a9de2b7b79f9c73df53b3fa9e310788 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -86,21 +86,15 @@ message OperatorDef { } // for hexagon mace-nnlib -message InputInfo { - optional string name = 1; - optional int32 node_id = 2; - repeated int32 dims = 3; - optional int32 max_byte_size = 4; // only support 32-bit len - optional DataType data_type = 5 [default = DT_FLOAT]; - optional int32 data_format = 6 [default = 1]; // NHWC -} -message OutputInfo { +message InputOutputInfo { optional string name = 1; optional int32 node_id = 2; repeated int32 dims = 3; optional int32 max_byte_size = 4; // only support 32-bit len optional DataType data_type = 5 [default = DT_FLOAT]; optional int32 data_format = 6 [default = 1]; // NHWC + optional float scale = 7; + optional int32 zero_point = 8; } message NetDef { @@ -109,6 +103,6 @@ message NetDef { repeated ConstTensor tensors = 3; // for hexagon mace-nnlib - repeated InputInfo input_info = 100; - repeated OutputInfo output_info = 101; + repeated InputOutputInfo input_info = 100; + repeated InputOutputInfo output_info = 101; } diff --git a/mace/public/BUILD b/mace/public/BUILD.bazel similarity index 87% rename from mace/public/BUILD rename to mace/public/BUILD.bazel index b434312bcfdd4ec65a78bfc879a2dfcb41cc129c..158bc564dff7c4118ff368d0dfd1cb6a0eb0547f 100644 --- a/mace/public/BUILD +++ b/mace/public/BUILD.bazel @@ -12,5 +12,8 @@ cc_library( hdrs = [ "mace.h", ], + srcs = [ + "status.cc", + ], copts = ["-Werror", "-Wextra", 
"-Wno-missing-field-initializers"], ) diff --git a/mace/public/mace.h b/mace/public/mace.h index 575ca32877374badf249a3b7bcad89f2e740793e..c265401ed3ca3f0eb88a51ed03ab206aa2c7c2b3 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -32,9 +32,12 @@ namespace mace { class NetDef; -enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 }; +enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3, HTA = 4 }; -enum DataFormat { DF_NONE = 0, NHWC = 1, NCHW = 2}; +enum DataFormat { + DF_NONE = 0, NHWC = 1, NCHW = 2, + HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103 +}; enum GPUPerfHint { PERF_DEFAULT = 0, @@ -102,7 +105,7 @@ class RunMetadata { /// Consistent with Android NNAPI struct PerformanceInfo { - // Time of executing some workload. + // Time of executing some workload(millisecond). // negative value for unsupported. float exec_time; }; @@ -144,7 +147,9 @@ class MaceStatus { enum Code { MACE_SUCCESS = 0, MACE_INVALID_ARGS = 1, - MACE_OUT_OF_RESOURCES = 2 + MACE_OUT_OF_RESOURCES = 2, + MACE_UNSUPPORTED = 3, + MACE_RUNTIME_ERROR = 4, }; public: @@ -167,18 +172,6 @@ class MaceStatus { std::unique_ptr impl_; }; - -#define MACE_RETURN_IF_ERROR(stmt) \ - { \ - MaceStatus status = (stmt); \ - if (status != MaceStatus::MACE_SUCCESS) { \ - VLOG(0) << "Mace runtime failure: " \ - << __FILE__ << ":" << __LINE__ << ". " \ - << status.information(); \ - return status; \ - } \ - } - /// \brief GPU context contain the status used for GPU device. /// /// There are some data in common between different MaceEngines using GPU, diff --git a/mace/core/status.cc b/mace/public/status.cc similarity index 86% rename from mace/core/status.cc rename to mace/public/status.cc index 12134f88a73940e26c8eb6c70a65011dcb25d647..c377c9b64112750bd9e46f53bdccf664b1aa8ca3 100644 --- a/mace/core/status.cc +++ b/mace/public/status.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include + #include "mace/public/mace.h" namespace mace { @@ -26,10 +28,16 @@ class MaceStatus::Impl { void SetCode(const Code code) { code_ = code; } Code code() const { return code_; } void SetInformation(const std::string &info) { information_ = info; } - std::string information() const { return Code2Str() + ": " + information_; } + std::string information() const { + if (information_.empty()) { + return CodeToString(); + } else { + return CodeToString() + ": " + information_; + } + } private: - std::string Code2Str() const { + std::string CodeToString() const { switch (code_) { case MaceStatus::MACE_SUCCESS: return "Success"; @@ -37,8 +45,14 @@ class MaceStatus::Impl { return "Invalid Arguments"; case MaceStatus::MACE_OUT_OF_RESOURCES: return "Out of resources"; + case MACE_UNSUPPORTED: + return "Unsupported"; + case MACE_RUNTIME_ERROR: + return "Runtime error"; default: - return ""; + std::ostringstream os; + os << code_; + return os.str(); } } diff --git a/mace/python/tools/BUILD b/mace/python/tools/BUILD.bazel similarity index 100% rename from mace/python/tools/BUILD rename to mace/python/tools/BUILD.bazel diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index 56f2cceca6863672fa168209504187142ad83d05..0de68ce4f6af1c0ae6c995e77738015b998dafba 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -37,11 +37,14 @@ FLAGS = None device_type_map = {'cpu': cvt.DeviceType.CPU.value, 'gpu': cvt.DeviceType.GPU.value, 'dsp': cvt.DeviceType.HEXAGON.value, + 'hta': cvt.DeviceType.HTA.value, 'cpu+gpu': cvt.DeviceType.CPU.value} data_format_map = { 'NONE': cvt.DataFormat.DF_NONE, 'NHWC': cvt.DataFormat.NHWC, + 'NCHW': cvt.DataFormat.NCHW, + 'OIHW': cvt.DataFormat.OIHW, } @@ -52,10 +55,11 @@ def parse_data_type(data_type, device_type): return mace_pb2.DT_FLOAT else: return mace_pb2.DT_HALF - elif device_type == cvt.DeviceType.HEXAGON.value: + elif device_type == cvt.DeviceType.HEXAGON.value or \ + device_type == cvt.DeviceType.HTA.value: return mace_pb2.DT_FLOAT else: - print("Invalid device type: " + device_type) + print("Invalid device type: " + str(device_type)) def file_checksum(fname): @@ -66,12 +70,26 @@ def file_checksum(fname): return hash_func.hexdigest() +def split_shape(shape): + if shape.strip() == "": + return [] + else: + return shape.split(',') + + def parse_int_array_from_str(ints_str): - return [int(int_str) for int_str in ints_str.split(',')] + return [int(i) for i in split_shape(ints_str)] + + +def parse_float_array_from_str(floats_str): + return [float(i) for i in floats_str.split(',')] -def parse_float_array_from_str(ints_str): - return [float(int_str) for int_str in ints_str.split(',')] +def transpose_shape(shape, dst_order): + t_shape = [0] * len(shape) + for i in range(len(shape)): + t_shape[i] = shape[dst_order[i]] + return t_shape def main(unused_args): @@ -106,7 +124,7 @@ def main(unused_args): six.print_("platform %s is not supported." % FLAGS.platform, file=sys.stderr) sys.exit(-1) - if FLAGS.runtime not in ['cpu', 'gpu', 'dsp', 'cpu+gpu']: + if FLAGS.runtime not in ['cpu', 'gpu', 'dsp', 'hta', 'cpu+gpu']: six.print_("runtime %s is not supported." 
% FLAGS.runtime, file=sys.stderr) sys.exit(-1) @@ -139,6 +157,10 @@ def main(unused_args): else: input_node.data_format = data_format_map[input_node_formats[i]] input_node.shape = parse_int_array_from_str(input_node_shapes[i]) + if input_node.data_format == cvt.DataFormat.NCHW and\ + len(input_node.shape) == 4: + input_node.shape = transpose_shape(input_node.shape, [0, 2, 3, 1]) + input_node.data_format = cvt.DataFormat.NHWC if len(input_node_ranges) > i: input_node.range = parse_float_array_from_str(input_node_ranges[i]) option.add_input_node(input_node) @@ -156,6 +178,11 @@ def main(unused_args): else: output_node.data_format = data_format_map[output_node_formats[i]] output_node.shape = parse_int_array_from_str(output_node_shapes[i]) + if output_node.data_format == cvt.DataFormat.NCHW and\ + len(output_node.shape) == 4: + output_node.shape = transpose_shape(output_node.shape, + [0, 2, 3, 1]) + output_node.data_format = cvt.DataFormat.NHWC option.add_output_node(output_node) if FLAGS.check_node != '': @@ -196,7 +223,8 @@ def main(unused_args): option, output_graph_def) output_graph_def, quantize_activation_info = mace_transformer.run() - if FLAGS.runtime == 'dsp': + if option.device in [cvt.DeviceType.HEXAGON.value, + cvt.DeviceType.HTA.value]: from mace.python.tools.converter_tool import hexagon_converter converter = hexagon_converter.HexagonConverter( option, output_graph_def, quantize_activation_info) diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 409c3b321b7b9b9a1b8cc1614647468f5e5c0efc..7fc877d662a90bc4d6030daab3843b27cb801f80 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -22,15 +22,13 @@ class DeviceType(Enum): CPU = 0 GPU = 2 HEXAGON = 3 + HTA = 4 class DataFormat(Enum): DF_NONE = 0 NHWC = 1 NCHW = 2 - - -class FilterFormat(Enum): HWIO = 100 OIHW = 101 HWOI = 102 @@ -104,6 +102,7 @@ class FrameworkType(Enum): MaceSupportedOps = [ 'Activation', 'AddN', + 'Affine', 'ArgMax', 'BatchNorm', 'BatchToSpaceND', @@ -127,9 +126,11 @@ MaceSupportedOps = [ 'InferConv2dShape', 'LocalResponseNorm', 'LSTMCell', + # 'LstmNonlinear', 'MatMul', 'OneHot', 'Pad', + 'PNorm', 'Pooling', 'PriorBox', 'Proposal', @@ -141,6 +142,8 @@ MaceSupportedOps = [ 'ResizeNearestNeighbor', 'Reverse', 'ScalarMath', + 'Slice', + 'Splice', 'Split', 'Shape', 'Squeeze', @@ -151,9 +154,13 @@ MaceSupportedOps = [ 'SpaceToBatchND', 'SpaceToDepth', 'SqrDiffMean', + 'SumGroup', + 'TargetRMSNorm', + 'TimeOffset', 'Transpose', 'WinogradInverseTransform', 'WinogradTransform', + 'Cumsum', ] MaceOp = Enum('MaceOp', [(op, op) for op in MaceSupportedOps], type=str) @@ -166,6 +173,7 @@ class MaceKeyword(object): mace_buffer_type = 'buffer_type' # arg related str mace_padding_str = 'padding' + mace_padding_type_str = 'padding' mace_padding_values_str = 'padding_values' mace_strides_str = 'strides' mace_dilations_str = 'dilations' @@ -173,6 +181,7 @@ class MaceKeyword(object): mace_global_pooling_str = 'global_pooling' mace_kernel_str = 'kernels' mace_data_format_str = 'data_format' + mace_has_data_format_str = 'has_data_format' mace_filter_format_str = 'filter_format' mace_element_type_str = 'type' mace_activation_type_str = 'activation' @@ -228,7 +237,10 @@ class MaceKeyword(object): mace_step_h_str = 'step_h' mace_step_w_str = 'step_w' mace_find_range_every_time = 'find_range_every_time' + mace_non_zero = 'non_zero' mace_pad_type_str = 'pad_type' + mace_exclusive_str = 'exclusive' + 
mace_reverse_str = 'reverse' class TransformerRule(Enum): @@ -271,6 +283,7 @@ class TransformerRule(Enum): FOLD_FC_RESHAPE = 37 TRANSFORM_CHANNEL_SHUFFLE = 38 UPDATE_DATA_FORMAT = 39 + QUANTIZE_SPECIFIC_OPS_ONLY = 40 class ConverterInterface(object): @@ -481,6 +494,7 @@ class ConverterOption(object): # Model data format related transformation TransformerRule.TRANSPOSE_FILTERS, TransformerRule.TRANSPOSE_DATA_FORMAT, + TransformerRule.TRANSPOSE_MATMUL_WEIGHT, # Add winograd argument TransformerRule.ADD_WINOGRAD_ARG, # Mace model structure related transformation @@ -514,6 +528,16 @@ class ConverterUtil(object): return arg return None + @staticmethod + def del_arg(op, arg_name): + found_idx = -1 + for idx in range(len(op.arg)): + if op.arg[idx].name == arg_name: + found_idx = idx + break + if found_idx != -1: + del op.arg[found_idx] + @staticmethod def add_data_format_arg(op, data_format): data_format_arg = op.arg.add() @@ -549,11 +573,11 @@ class ConverterUtil(object): arg = ConverterUtil.get_arg(net, MaceKeyword.mace_filter_format_str) if arg is None: return None - elif arg.i == FilterFormat.HWIO.value: - return FilterFormat.HWIO - elif arg.i == FilterFormat.HWOI.value: - return FilterFormat.HWOI - elif arg.i == FilterFormat.OIHW.value: - return FilterFormat.OIHW + elif arg.i == DataFormat.HWIO.value: + return DataFormat.HWIO + elif arg.i == DataFormat.HWOI.value: + return DataFormat.HWOI + elif arg.i == DataFormat.OIHW.value: + return DataFormat.OIHW else: return None diff --git a/mace/python/tools/converter_tool/caffe_converter.py b/mace/python/tools/converter_tool/caffe_converter.py index 3231ea9fa58b9f6e43470250c2997f37b3ed87c3..c5b6176824d28dcf67a4dd68defdebdfecafcbed 100644 --- a/mace/python/tools/converter_tool/caffe_converter.py +++ b/mace/python/tools/converter_tool/caffe_converter.py @@ -27,7 +27,6 @@ from mace.python.tools.converter_tool.base_converter import ActivationType from mace.python.tools.converter_tool.base_converter import EltwiseType from mace.python.tools.converter_tool.base_converter import FrameworkType from mace.python.tools.converter_tool.base_converter import DataFormat -from mace.python.tools.converter_tool.base_converter import FilterFormat from mace.python.tools.converter_tool.base_converter import MaceOp from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import ConverterUtil @@ -183,6 +182,7 @@ class CaffeConverter(base_converter.ConverterInterface): 'Slice': self.convert_slice, 'Softmax': self.convert_softmax, 'InnerProduct': self.convert_fully_connected, + 'Interp': self.convert_interp, 'BatchNorm': self.convert_folded_batchnorm, 'Crop': self.convert_crop, 'Scale': self.convert_scale, @@ -194,7 +194,7 @@ class CaffeConverter(base_converter.ConverterInterface): } self._option = option self._mace_net_def = mace_pb2.NetDef() - ConverterUtil.set_filter_format(self._mace_net_def, FilterFormat.OIHW) + ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW) self._caffe_net = CaffeNet() self._caffe_layers = caffe_pb2.NetParameter() caffe_weights = caffe_pb2.NetParameter() @@ -552,18 +552,20 @@ class CaffeConverter(base_converter.ConverterInterface): param = caffe_op.layer.crop_param op.type = MaceOp.Crop.name - axis_arg = op.arg.add() - axis_arg.name = MaceKeyword.mace_axis_str - axis_arg.i = 2 - if param.HasField('axis'): - axis_arg.i = param.axis - axis_arg.i = 4 + axis_arg.i if axis_arg.i < 0 else axis_arg.i + axis = param.axis + axis = 4 + axis if axis < 0 else axis + 
offset_value = -1 * np.ones(4, dtype=np.int32) + offset_len = len(param.offset) + if offset_len == 1: + while axis < 4: + offset_value[axis] = param.offset[0] + axis += 1 + else: + offset_value[axis:] = param.offset + offset_arg = op.arg.add() offset_arg.name = MaceKeyword.mace_offset_str - if len(param.offset) > 0: - offset_arg.ints.extend(list(param.offset)) - else: - offset_arg.i = 0 + offset_arg.ints.extend(offset_value) def convert_concat(self, caffe_op): op = self.convert_general_op(caffe_op) @@ -573,7 +575,7 @@ class CaffeConverter(base_converter.ConverterInterface): axis_arg = op.arg.add() axis_arg.name = MaceKeyword.mace_axis_str axis_arg.i = 1 - if param.HasField('axis'): + if param.HasField(MaceKeyword.mace_axis_str): axis_arg.i = param.axis elif param.HasField('concat_dim'): axis_arg.i = param.concat_dim @@ -593,6 +595,18 @@ class CaffeConverter(base_converter.ConverterInterface): axis_arg.name = MaceKeyword.mace_axis_str axis_arg.i = 1 + def convert_interp(self, caffe_op): + op = self.convert_general_op(caffe_op) + param = caffe_op.layer.interp_param + mace_check(param.HasField("height") and param.HasField("width"), + 'Only support bilinear interp with height and width') + op.type = MaceOp.ResizeBilinear.name + + size_arg = op.arg.add() + size_arg.name = MaceKeyword.mace_resize_size_str + size_value = np.array([param.height, param.width], dtype=np.int32) + size_arg.ints.extend(size_value) + def convert_fully_connected(self, caffe_op): op = self.convert_general_op(caffe_op) param = caffe_op.layer.inner_product_param diff --git a/mace/python/tools/converter_tool/hexagon_converter.py b/mace/python/tools/converter_tool/hexagon_converter.py index 60226ef887eca9f800ca650eff13feff5fbe11e6..53598243b247094ce43b5a832b65d1498c796547 100644 --- a/mace/python/tools/converter_tool/hexagon_converter.py +++ b/mace/python/tools/converter_tool/hexagon_converter.py @@ -20,6 +20,7 @@ from operator import mul from mace.proto import mace_pb2 from mace.python.tools.converter_tool import base_converter from mace.python.tools.converter_tool.base_converter import ConverterUtil +from mace.python.tools.converter_tool.base_converter import DeviceType from mace.python.tools.converter_tool.base_converter import EltwiseType from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import MaceOp @@ -29,11 +30,15 @@ from mace.python.tools.converter_tool.base_converter import ReduceType from mace.python.tools.convert_util import mace_check from mace.python.tools import graph_util +from six.moves import reduce + HexagonSupportedOps = [ 'BatchToSpaceND_8', 'DepthwiseSupernode_8x8p32to8', 'DequantizeOUTPUT_8tof', + 'INPUT', + 'OUTPUT', 'QuantizedAdd_8p8to8', 'QuantizedAvgPool_8', 'QuantizedConcat_8', @@ -126,9 +131,9 @@ class HexagonConverter(base_converter.ConverterInterface): self.add_input_output_node() if not self._option.check_nodes: - output_name = self._option.output_nodes.values()[0].name + output_name = list(self._option.output_nodes.values())[0].name else: - output_name = self._option.check_nodes.values()[0].name + output_name = list(self._option.check_nodes.values())[0].name output_name = normalize_name(output_name) self._model = graph_util.sort_mace_graph(self._model, output_name) @@ -330,7 +335,7 @@ class HexagonConverter(base_converter.ConverterInterface): else: op.type = self._hexagon_ops.map_nn_op(op.type) - def add_min_max(self, name, val): + def add_const_node(self, name, val): if name not in self._consts: tensor = 
self._model.tensors.add() self._consts[name] = tensor @@ -362,14 +367,14 @@ class HexagonConverter(base_converter.ConverterInterface): min_tensor_name = op + ':1' else: min_tensor_name = op + '_min:0' - self.add_min_max(min_tensor_name, minval) + self.add_const_node(min_tensor_name, minval) this_op.input.extend([min_tensor_name]) if add_max: if is_activation and diff_port: max_tensor_name = op + ':2' else: max_tensor_name = op + '_max:0' - self.add_min_max(max_tensor_name, maxval) + self.add_const_node(max_tensor_name, maxval) this_op.input.extend([max_tensor_name]) def add_shape_const_node(self, op, values, name): @@ -380,27 +385,48 @@ class HexagonConverter(base_converter.ConverterInterface): tensor.dims.extend(values) return tensor.name - def add_input_output_node(self): - for op in self._model.op: - if op.name.startswith(MaceKeyword.mace_input_node_name): - del op.input[0] - break + def add_constant_min_max_for_first_op(self, op): + minval = self._quantize_activation_info[op.input[0]].minval + maxval = self._quantize_activation_info[op.input[0]].maxval + input_op, _ = get_op_and_port_from_tensor(op.input[0]) + input_min = input_op + '_min:0' + input_max = input_op + '_max:0' + self.add_const_node(input_min, minval) + self.add_const_node(input_max, maxval) + for i in range(len(op.input)): + if op.input[i] == input_op + ':1': + op.input[i] = input_min + elif op.input[i] == input_op + ':2': + op.input[i] = input_max - output_node = None - if not self._option.check_nodes: - output_name = self._option.output_nodes.values()[0].name - else: - output_name = self._option.check_nodes.values()[0].name - output_name = normalize_name(output_name) - for op in self._model.op: - if op.name == output_name: - output_node = op - break - mace_check(output_node is not None, - "mace_output_node_* not found.") - del output_node.output_shape[:] - del output_node.output_type[:] - del output_node.out_max_byte_size[:] + def add_input_output_node(self): + mace_check( + self._model.op[0].type == HexagonOp.QuantizeINPUT_f_to_8.name, + "Not started with Quantize op.") + quantize_input_op = self._model.op[0] + del quantize_input_op.input[:] + + mace_check( + self._model.op[-1].type == HexagonOp.DequantizeOUTPUT_8tof.name, + "Not ended with Dequantize op.") + dequantize_output_op = self._model.op[-1] + del dequantize_output_op.output_shape[:] + del dequantize_output_op.output_type[:] + del dequantize_output_op.out_max_byte_size[:] + + if self._option.device == DeviceType.HTA.value: + # replace QuantizeINPUT_f_to_8 with INPUT + quantize_input_op.type = HexagonOp.INPUT.name + del quantize_input_op.output_shape[1:] + del quantize_input_op.output_type[1:] + del quantize_input_op.out_max_byte_size[1:] + + # replace first op's input min max with constant + self.add_constant_min_max_for_first_op(self._model.op[1]) + + # replace DequantizeOUTPUT_8tof with OUTPUT + dequantize_output_op.type = HexagonOp.OUTPUT.name + del dequantize_output_op.input[1:] def add_node_id(self): node_id_counter = 0 diff --git a/mace/python/tools/converter_tool/onnx_converter.py b/mace/python/tools/converter_tool/onnx_converter.py index 2f3570d59cc0a9ac5cd28ae9c43ab13974f51395..68f781a23dfc4fe5d09163b59422be15fec31f87 100644 --- a/mace/python/tools/converter_tool/onnx_converter.py +++ b/mace/python/tools/converter_tool/onnx_converter.py @@ -27,27 +27,28 @@ from mace.python.tools.converter_tool.base_converter import ReduceType from mace.python.tools.converter_tool.base_converter import FrameworkType from mace.python.tools.converter_tool.base_converter 
import RoundMode from mace.python.tools.converter_tool.base_converter import DataFormat -from mace.python.tools.converter_tool.base_converter import FilterFormat from mace.python.tools.converter_tool.base_converter import MaceOp from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import ConverterUtil from mace.python.tools.convert_util import mace_check +import numpy as np + import onnx import onnx.utils -from onnx import helper, shape_inference, numpy_helper, optimizer -import numpy as np -from onnx import mapping -from onnx import TensorProto +from onnx import mapping, numpy_helper, TensorProto from numbers import Number +IS_PYTHON3 = sys.version_info > (3,) OnnxSupportedOps = [ 'Abs', # 'Acos', # 'Acosh', 'Add', + 'Affine', # 'And', + 'Append', 'ArgMax', 'ArgMin', # 'Asin', @@ -68,6 +69,7 @@ OnnxSupportedOps = [ # 'Cos', # 'Cosh', 'DepthToSpace', + 'DimRange', 'Div', 'Dropout', 'Elu', @@ -88,10 +90,12 @@ OnnxSupportedOps = [ # 'Hardmax', 'Identity', # 'If', + 'IfDefined', 'ImageScaler', # 'InstanceNormalization', # 'LRN', - # 'LSTM', + 'LSTM', + # 'LstmNonlinear', 'LeakyRelu', # 'Less', # 'Log', @@ -109,11 +113,15 @@ OnnxSupportedOps = [ 'Mul', # 'Multinomial', 'Neg', + 'Normalize', # 'Not', + 'Offset', # 'OneHot', # 'Or', 'PRelu', - 'Pad', + # 'Pad', + 'Padding', + 'PNorm', 'Pow', # 'RNN', # 'RandomNormal', @@ -133,6 +141,7 @@ OnnxSupportedOps = [ # 'ReduceSumSquare', 'Relu', 'Reshape', + 'Scale', # 'Scan', # 'Selu', 'Shape', @@ -140,18 +149,21 @@ OnnxSupportedOps = [ # 'Sin', # 'Sinh', # 'Size', - # 'Slice', + 'Slice', 'Softmax', # 'Softplus', # 'Softsign', 'SpaceToDepth', + 'Splice', 'Split', 'Sqrt', 'Squeeze', 'Sub', 'Sum', + 'SumGroup', # 'Tan', 'Tanh', + 'TargetRMSNorm', # 'Tile', # 'TopK', 'Transpose', @@ -188,7 +200,7 @@ def convert_onnx_attribute_proto(attr_proto): return attr_proto.i elif attr_proto.HasField('s'): return str(attr_proto.s, 'utf-8')\ - if sys.version_info.major == 3 else attr_proto.s + if IS_PYTHON3 else attr_proto.s elif attr_proto.HasField('t'): return attr_proto.t # this is a proto! 
elif attr_proto.floats: @@ -217,6 +229,8 @@ def onnx_dtype(dtype): class OnnxNode(object): def __init__(self, node): self.name = str(node.name) + if self.name == '': + self.name = str(node.output) self.op_type = str(node.op_type) self.domain = str(node.domain) self.attrs = dict([(attr.name, @@ -227,14 +241,14 @@ class OnnxNode(object): self.node_proto = node def print_info(self): - print "node: ", self.name - print " type: ", self.op_type - print " domain: ", self.domain - print " inputs: ", self.inputs - print " outputs: ", self.outputs - print " attrs:" + print("node: ", self.name) + print(" type: ", self.op_type) + print(" domain: ", self.domain) + print(" inputs: ", self.inputs) + print(" outputs: ", self.outputs) + print(" attrs:") for arg in self.attrs: - print " %s: %s" % (arg, self.attrs[arg]) + print(" %s: %s" % (arg, self.attrs[arg])) class OnnxTensor(object): @@ -273,6 +287,7 @@ class OnnxConverter(base_converter.ConverterInterface): OnnxOpType.Equal.name: EltwiseType.EQUAL, OnnxOpType.Sqrt.name: EltwiseType.POW, OnnxOpType.Reciprocal.name: EltwiseType.POW, + OnnxOpType.Scale.name: EltwiseType.PROD, } reduce_type = { @@ -296,6 +311,8 @@ class OnnxConverter(base_converter.ConverterInterface): self._op_converters = { OnnxOpType.Abs.name: self.convert_eltwise, OnnxOpType.Add.name: self.convert_eltwise, + OnnxOpType.Affine.name: self.convert_affine, + OnnxOpType.Append.name: self.convert_concat, OnnxOpType.ArgMax.name: self.convert_argmax, OnnxOpType.ArgMin.name: self.convert_argmax, OnnxOpType.AveragePool.name: self.convert_pooling, @@ -306,6 +323,7 @@ class OnnxConverter(base_converter.ConverterInterface): OnnxOpType.ConvTranspose.name: self.convert_deconv, OnnxOpType.DepthToSpace.name: self.convert_depth_space, OnnxOpType.Dropout.name: self.convert_identity, + OnnxOpType.DimRange.name: self.convert_dim_range, OnnxOpType.Div.name: self.convert_eltwise, OnnxOpType.Equal.name: self.convert_eltwise, OnnxOpType.Gather.name: self.convert_gather, @@ -313,53 +331,77 @@ class OnnxConverter(base_converter.ConverterInterface): OnnxOpType.GlobalAveragePool.name: self.convert_reduce, OnnxOpType.GlobalMaxPool.name: self.convert_reduce, OnnxOpType.Identity.name: self.convert_identity, + OnnxOpType.IfDefined.name: self.convert_identity, OnnxOpType.ImageScaler.name: self.convert_imagescaler, OnnxOpType.LeakyRelu.name: self.convert_activation, + # OnnxOpType.LogSoftmax.name: self.convert_softmax, + OnnxOpType.LSTM.name: self.convert_lstm, + # OnnxOpType.LstmNonlinear.name: self.convert_lstm_nonlinear, OnnxOpType.Max.name: self.convert_eltwise, OnnxOpType.MaxPool.name: self.convert_pooling, OnnxOpType.MatMul.name: self.convert_matmul, OnnxOpType.Min.name: self.convert_eltwise, OnnxOpType.Mul.name: self.convert_eltwise, OnnxOpType.Neg.name: self.convert_eltwise, - OnnxOpType.Pad.name: self.convert_pad, + OnnxOpType.Normalize: self.convert_normalize, + OnnxOpType.Offset.name: self.convert_timeoffset, + OnnxOpType.Padding.name: self.convert_identity, + OnnxOpType.PNorm.name: self.convert_pnorm, OnnxOpType.Pow.name: self.convert_eltwise, OnnxOpType.PRelu.name: self.convert_activation, OnnxOpType.Relu.name: self.convert_activation, OnnxOpType.Reshape.name: self.convert_reshape, OnnxOpType.Reciprocal.name: self.convert_eltwise, + OnnxOpType.Scale.name: self.convert_eltwise, OnnxOpType.Sigmoid.name: self.convert_activation, + OnnxOpType.Slice.name: self.convert_slice, OnnxOpType.Softmax.name: self.convert_softmax, OnnxOpType.SpaceToDepth.name: self.convert_depth_space, + OnnxOpType.Splice.name: 
self.convert_splice,
             OnnxOpType.Split.name: self.convert_split,
             OnnxOpType.Sqrt.name: self.convert_eltwise,
             OnnxOpType.Squeeze.name: self.convert_squeeze,
             OnnxOpType.Sub.name: self.convert_eltwise,
             OnnxOpType.Sum.name: self.convert_eltwise,
+            OnnxOpType.SumGroup.name: self.convert_sum_group,
             OnnxOpType.Tanh.name: self.convert_activation,
+            OnnxOpType.TargetRMSNorm: self.convert_target_rms_norm,
             OnnxOpType.Transpose.name: self.convert_transpose,
         }
         self._option = option
         self._mace_net_def = mace_pb2.NetDef()
-        ConverterUtil.set_filter_format(self._mace_net_def, FilterFormat.OIHW)
+        self._data_format = DataFormat.NCHW
+        ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW)
         onnx_model = onnx.load(src_model_file)
 
-        polished_model = onnx.utils.polish_model(onnx_model)
-
-        print "onnx model IR version: ", onnx_model.ir_version
-        print "onnx model opset import: ", onnx_model.opset_import
-
-        self._onnx_model = shape_inference.infer_shapes(polished_model)
+        ir_version = onnx_model.ir_version
+        opset_imp = onnx_model.opset_import
+
+        polish_available = True
+        print("onnx model IR version: ", ir_version)
+        for imp in opset_imp:
+            domain = imp.domain
+            version = imp.version
+            print("contains ops domain: ", domain, "version:", version)
+            if 'kaldi2onnx' in domain:
+                polish_available = False
+                self._data_format = DataFormat.DF_NONE
+        if polish_available:
+            onnx_model = onnx.utils.polish_model(onnx_model)
+
+        self._onnx_model = onnx_model
         self._graph_shapes_dict = {}
         self._consts = {}
         self._replace_tensors = {}
 
-    def print_graph_info(self, graph):
+    @staticmethod
+    def print_graph_info(graph):
         for value_info in graph.value_info:
-            print "value info:", value_info
+            print("value info:", value_info)
         for value_info in graph.input:
-            print "inputs info:", value_info
+            print("inputs info:", value_info)
         for value_info in graph.output:
-            print "outputs info:", value_info
+            print("outputs info:", value_info)
 
     def extract_shape_info(self, graph):
         def extract_value_info(shape_dict, value_info):
@@ -368,12 +410,12 @@ class OnnxConverter(base_converter.ConverterInterface):
             if t:
                 shape_dict[value_info.name] = t
 
-        for value_info in graph.value_info:
-            extract_value_info(self._graph_shapes_dict, value_info)
-        for value_info in graph.input:
-            extract_value_info(self._graph_shapes_dict, value_info)
-        for value_info in graph.output:
-            extract_value_info(self._graph_shapes_dict, value_info)
+        for vi in graph.value_info:
+            extract_value_info(self._graph_shapes_dict, vi)
+        for vi in graph.input:
+            extract_value_info(self._graph_shapes_dict, vi)
+        for vi in graph.output:
+            extract_value_info(self._graph_shapes_dict, vi)
 
     def add_tensor(self, name, shape, data_type, value):
         tensor = self._mace_net_def.tensors.add()
@@ -387,11 +429,6 @@ class OnnxConverter(base_converter.ConverterInterface):
         self.extract_shape_info(graph_def)
         self.convert_tensors(graph_def)
         self.convert_ops(graph_def)
-        # self.print_graph_info(graph_def)
-        # shape_inferer = mace_shape_inference.ShapeInference(
-        #     self._mace_net_def,
-        #     self._option.input_nodes.values())
-        # shape_inferer.run()
         return self._mace_net_def
 
     def add_stride_pad_kernel_arg(self, attrs, op_def):
@@ -435,6 +472,32 @@ class OnnxConverter(base_converter.ConverterInterface):
             padding_arg.name = MaceKeyword.mace_padding_values_str
             padding_arg.ints.extend(pad)
 
+    def remove_node(self, node):
+        input_name = node.inputs[0]
+        output_name = node.outputs[0]
+        self._replace_tensors[output_name] = input_name
+
+    @staticmethod
+    def squeeze_shape(shape, axis):
+        new_shape = []
+        if
len(axis) > 0: + for i in range(len(shape)): + if i not in axis: + new_shape.append(shape[i]) + else: + new_shape = shape + return new_shape + + @staticmethod + def transpose_const(tensor): + shape = tensor.dims + mace_check(len(shape) == 2, "gemm only supports 2-dim input.") + tensor_data = np.array(tensor.float_data).reshape( + shape[0], shape[1]) + tensor_data = tensor_data.transpose(1, 0) + tensor.float_data[:] = tensor_data.flat + tensor.dims[:] = tensor_data.shape + def convert_ops(self, graph_def): for n in graph_def.node: node = OnnxNode(n) @@ -471,7 +534,7 @@ class OnnxConverter(base_converter.ConverterInterface): "Not supported tensor type: %s" % data_type) self._consts[tensor.name] = tensor - def convert_general_op(self, node): + def convert_general_op(self, node, with_shape=True): op = self._mace_net_def.op.add() op.name = node.name @@ -481,9 +544,11 @@ class OnnxConverter(base_converter.ConverterInterface): op.input.append(input) for output in node.outputs: op.output.append(output) - output_shape = op.output_shape.add() - shape_info = self._graph_shapes_dict[output] - output_shape.dims.extend(shape_info) + if with_shape: + if output in self._graph_shapes_dict: + output_shape = op.output_shape.add() + shape_info = self._graph_shapes_dict[output] + output_shape.dims.extend(shape_info) data_type_arg = op.arg.add() data_type_arg.name = 'T' @@ -493,91 +558,9 @@ class OnnxConverter(base_converter.ConverterInterface): framework_type_arg.name = MaceKeyword.mace_framework_type_str framework_type_arg.i = FrameworkType.ONNX.value - ConverterUtil.add_data_format_arg(op, DataFormat.NCHW) + ConverterUtil.add_data_format_arg(op, self._data_format) return op - def convert_fused_batchnorm(self, node): - op = self.convert_general_op(node) - op.type = MaceOp.BatchNorm.name - - if "epsilon" in node.attrs: - epsilon_value = node.attrs["epsilon"] - else: - epsilon_value = 1e-5 - - mace_check(len(node.inputs) == 5, "batch norm should have 5 inputs.") - - gamma_value = np.array(self._consts[node.inputs[1]].float_data) - beta_value = np.array(self._consts[node.inputs[2]].float_data) - mean_value = np.array(self._consts[node.inputs[3]].float_data) - var_value = np.array(self._consts[node.inputs[4]].float_data) - - scale_name = node.name + 'scale' - offset_name = node.name + 'offset' - scale_value = ( - (1.0 / np.sqrt( - var_value + epsilon_value)) * gamma_value) - offset_value = (-mean_value * scale_value) + beta_value - self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT, - scale_value) - self.add_tensor(offset_name, offset_value.shape, mace_pb2.DT_FLOAT, - offset_value) - del op.input[1:] - op.input.extend([scale_name, offset_name]) - del op.output[1:] - del op.output_shape[1:] - - def convert_conv2d(self, node): - op = self.convert_general_op(node) - self.add_stride_pad_kernel_arg(node.attrs, op) - group_arg = op.arg.add() - group_arg.name = MaceKeyword.mace_group_str - if 'group' in node.attrs: - group_val = node.attrs["group"] - else: - group_val = 1 - group_arg.i = group_val - - is_depthwise = False - if group_val > 1: - filter_shape = self._graph_shapes_dict[node.inputs[1]] - mace_check(group_val == filter_shape[0] and - filter_shape[1] == 1, - "Mace does not support group convolution yet") - filter_tensor = self._consts[node.inputs[1]] - new_shape = [filter_shape[1], filter_shape[0], - filter_shape[2], filter_shape[3]] - del filter_tensor.dims[:] - filter_tensor.dims.extend(new_shape) - is_depthwise = True - if is_depthwise: - op.type = MaceOp.DepthwiseConv2d.name - else: - op.type 
= MaceOp.Conv2D.name - - dilation_arg = op.arg.add() - dilation_arg.name = MaceKeyword.mace_dilations_str - if 'dilations' in node.attrs: - dilation_val = node.attrs["dilations"] - else: - dilation_val = [1, 1] - dilation_arg.ints.extend(dilation_val) - - def convert_biasadd(self, node): - self.convert_general_op(node) - - def convert_concat(self, node): - op = self.convert_general_op(node) - op.type = MaceOp.Concat.name - mace_check('axis' in node.attrs, - 'Concat op should have axis attribute.') - axis_arg = op.arg.add() - axis_arg.name = MaceKeyword.mace_axis_str - axis_arg.i = node.attrs['axis'] - axis_arg.i = 4 + axis_arg.i if axis_arg.i < 0 else axis_arg.i - mace_check(axis_arg.i == 1, - "only support concat at channel dimension") - def convert_activation(self, node): op = self.convert_general_op(node) op.type = MaceOp.Activation.name @@ -597,100 +580,12 @@ class OnnxConverter(base_converter.ConverterInterface): alpha_arg.name = MaceKeyword.mace_activation_max_limit_str alpha_arg.f = alpha_value - def convert_pooling(self, node): - op = self.convert_general_op(node) - - op.type = MaceOp.Pooling.name - self.add_stride_pad_kernel_arg(node.attrs, op) - pooling_type_arg = op.arg.add() - pooling_type_arg.name = MaceKeyword.mace_pooling_type_str - pooling_type_arg.i = self.pooling_type_mode[node.op_type].value - - round_mode_arg = op.arg.add() - round_mode_arg.name = MaceKeyword.mace_round_mode_str - round_mode_arg.i = RoundMode.FLOOR.value - - def convert_reshape(self, node): - op = self.convert_general_op(node) - op.type = MaceOp.Reshape.name - - def convert_flatten(self, node): - op = self.convert_general_op(node) - op.type = MaceOp.Reshape.name - - def remove_node(self, node): - input_name = node.inputs[0] - output_name = node.outputs[0] - self._replace_tensors[output_name] = input_name - - def convert_eltwise(self, node): - op = self.convert_general_op(node) - op.type = MaceOp.Eltwise.name - type_arg = op.arg.add() - type_arg.name = MaceKeyword.mace_element_type_str - type_arg.i = self.eltwise_type[node.op_type].value - - if node.op_type == OnnxOpType.Sqrt.name: - value_arg = op.arg.add() - value_arg.name = MaceKeyword.mace_scalar_input_str - value_arg.f = 0.5 - elif node.op_type == OnnxOpType.Reciprocal.name: - value_arg = op.arg.add() - value_arg.name = MaceKeyword.mace_scalar_input_str - value_arg.f = -1 - - def convert_reduce(self, node): - op = self.convert_general_op(node) - op.type = MaceOp.Reduce.name - - reduce_type_arg = op.arg.add() - reduce_type_arg.name = MaceKeyword.mace_reduce_type_str - reduce_type_arg.i = self.reduce_type[node.op_type].value - - if node.op_type in [OnnxOpType.GlobalAveragePool.name, - OnnxOpType.GlobalMaxPool.name]: - reduce_dims = [2, 3] - keep_dims = 1 - else: - if 'axes' in node.attrs: - reduce_dims = node.attrs['axes'] - else: - reduce_dims = [] - if 'keepdims' in node.attrs: - keep_dims = node.attrs['keepdims'] - else: - keep_dims = 1 - axis_arg = op.arg.add() - axis_arg.name = MaceKeyword.mace_axis_str - axis_arg.ints.extend(reduce_dims) - - keep_dims_arg = op.arg.add() - keep_dims_arg.name = MaceKeyword.mace_keepdims_str - keep_dims_arg.i = keep_dims - - def convert_imagescaler(self, node): - op = self.convert_general_op(node) - op.type = MaceOp.BatchNorm.name - - scale = node.attrs['scale'] - bias_value = np.array(node.attrs['bias']) - scale_value = scale * np.ones_like(bias_value) - - scale_name = node.name + "_scale" - bias_name = node.name + "_bias" - self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT, - scale_value) - 
self.add_tensor(bias_name, bias_value.shape, mace_pb2.DT_FLOAT,
-                        bias_value)
-        op.input.extend([scale_name, bias_name])
-
-    def convert_matmul(self, node):
+    def convert_affine(self, node):
         op = self.convert_general_op(node)
         op.type = MaceOp.MatMul.name
-
-    def convert_softmax(self, node):
-        op = self.convert_general_op(node)
-        op.type = MaceOp.Softmax.name
+        transpose_b_arg = op.arg.add()
+        transpose_b_arg.name = MaceKeyword.mace_transpose_b_str
+        transpose_b_arg.i = 1
 
     def convert_argmax(self, node):
         op = self.convert_general_op(node)
@@ -717,6 +612,10 @@ class OnnxConverter(base_converter.ConverterInterface):
             min_arg.name = MaceKeyword.mace_argmin_str
             min_arg.i = 1
 
+    def convert_biasadd(self, node):
+        op = self.convert_general_op(node)
+        op.type = MaceOp.BiasAdd.name
+
     def convert_cast(self, node):
         op = self.convert_general_op(node)
         op.type = MaceOp.Cast.name
@@ -732,41 +631,51 @@ class OnnxConverter(base_converter.ConverterInterface):
         else:
             op.output_type.extend([self._option.data_type])
 
-    def convert_depth_space(self, node):
+    def convert_concat(self, node):
         op = self.convert_general_op(node)
-        if op.type == OnnxOpType.DepthToSpace.name:
-            op.type = MaceOp.DepthToSpace.name
-        else:
-            op.type = MaceOp.SpaceToDepth.name
-        mace_check(('block_size' in node.attrs),
-                   "depth to space op should have block size attribute.")
-        block_size = node.attrs['block_size']
-        size_arg = op.arg.add()
-        size_arg.name = MaceKeyword.mace_space_depth_block_size_str
-        size_arg.i = block_size
+        op.type = MaceOp.Concat.name
+        axis_value = 1
+        if node.op_type == OnnxOpType.Concat.name:
+            mace_check('axis' in node.attrs,
+                       'Concat op should have axis attribute.')
+            axis_value = node.attrs['axis']
+            mace_check(axis_value == 1 or axis_value == -3,
+                       "only support concat at channel dimension")
+        elif node.op_type == OnnxOpType.Append.name:
+            axis_value = 2
+        axis_arg = op.arg.add()
+        axis_arg.name = MaceKeyword.mace_axis_str
+        axis_arg.i = 4 + axis_value if axis_value < 0 else axis_value
 
-    def convert_deconv(self, node):
+    def convert_conv2d(self, node):
         op = self.convert_general_op(node)
-        self.add_stride_pad_kernel_arg(node.attrs, op)
-
+        group_arg = op.arg.add()
+        group_arg.name = MaceKeyword.mace_group_str
         if 'group' in node.attrs:
             group_val = node.attrs["group"]
         else:
             group_val = 1
+        group_arg.i = group_val
+
+        is_depthwise = False
         if group_val > 1:
-            op.type = MaceOp.DepthwiseDeconv2d.name
             filter_shape = self._graph_shapes_dict[node.inputs[1]]
+            mace_check(group_val == filter_shape[0] and
+                       filter_shape[1] == 1,
+                       "Mace does not support group convolution yet")
             filter_tensor = self._consts[node.inputs[1]]
             new_shape = [filter_shape[1], filter_shape[0],
                          filter_shape[2], filter_shape[3]]
             del filter_tensor.dims[:]
             filter_tensor.dims.extend(new_shape)
+            is_depthwise = True
+        if is_depthwise:
+            op.type = MaceOp.DepthwiseConv2d.name
         else:
-            op.type = MaceOp.Deconv2D.name
-            group_arg = op.arg.add()
-            group_arg.name = MaceKeyword.mace_group_str
-            group_arg.i = group_val
+            op.type = MaceOp.Conv2D.name
+        mace_check(op.input[1] in self._consts,
+                   "Mace does not support non-const filter convolution.")
 
         dilation_arg = op.arg.add()
         dilation_arg.name = MaceKeyword.mace_dilations_str
@@ -775,16 +684,47 @@ class OnnxConverter(base_converter.ConverterInterface):
         else:
             dilation_val = [1, 1]
         dilation_arg.ints.extend(dilation_val)
-        mace_check(dilation_val == [1, 1],
-                   "not support convtranspose with dilation != 1 yet.")
-        mace_check('output_padding' not in node.attrs,
-                   "not support convtranspose with output_padding yet.")
-
mace_check('output_shape' not in node.attrs, - "not support convtranspose with output_shape yet.") - # TODO: if output shape specified, calculate padding value - # if 'output_padding' in node.attrs: - # output_padding = node.attrs['output_padding'] + def convert_deconv(self, node): + op = self.convert_general_op(node) + + self.add_stride_pad_kernel_arg(node.attrs, op) + + if 'group' in node.attrs: + group_val = node.attrs["group"] + else: + group_val = 1 + if group_val > 1: + op.type = MaceOp.DepthwiseDeconv2d.name + filter_shape = self._graph_shapes_dict[node.inputs[1]] + filter_tensor = self._consts[node.inputs[1]] + new_shape = [filter_shape[1], filter_shape[0], + filter_shape[2], filter_shape[3]] + del filter_tensor.dims[:] + filter_tensor.dims.extend(new_shape) + else: + op.type = MaceOp.Deconv2D.name + group_arg = op.arg.add() + group_arg.name = MaceKeyword.mace_group_str + group_arg.i = group_val + + dilation_arg = op.arg.add() + dilation_arg.name = MaceKeyword.mace_dilations_str + if 'dilations' in node.attrs: + dilation_val = node.attrs["dilations"] + else: + dilation_val = [1, 1] + dilation_arg.ints.extend(dilation_val) + mace_check(dilation_val == [1, 1], + "not support convtranspose with dilation != 1 yet.") + + mace_check('output_padding' not in node.attrs, + "not support convtranspose with output_padding yet.") + mace_check('output_shape' not in node.attrs, + "not support convtranspose with output_shape yet.") + # TODO: if output shape specified, calculate padding value + # if 'output_padding' in node.attrs: + # output_padding = node.attrs['output_padding'] # output_padding_arg = op.arg.add() # output_padding_arg.name = MaceKeyword.mace_output_padding_str # output_padding_arg.ints.extend(output_padding) @@ -794,43 +734,98 @@ class OnnxConverter(base_converter.ConverterInterface): # output_shape_arg.name = MaceKeyword.mace_output_shape_str # output_shape_arg.ints.extend(output_shape) - def convert_nop(self, node): - pass + def convert_depth_space(self, node): + op = self.convert_general_op(node) + if op.type == OnnxOpType.DepthToSpace.name: + op.type = MaceOp.DepthToSpace.name + else: + op.type = MaceOp.SpaceToDepth.name + mace_check(('block_size' in node.attrs), + "depth to space op should have block size attribute.") + block_size = node.attrs['block_size'] + size_arg = op.arg.add() + size_arg.name = MaceKeyword.mace_space_depth_block_size_str + size_arg.i = block_size - def convert_identity(self, node): + def convert_dim_range(self, node): op = self.convert_general_op(node) - op.type = MaceOp.Identity.name + op.type = MaceOp.Slice.name + + mace_check('offset' in node.attrs, + "Attribute dim required!") + mace_check('output_dim' in node.attrs, + "Attribute output_dim required!") + offset = node.attrs['offset'] + starts_arg = op.arg.add() + starts_arg.name = 'starts' + starts_arg.ints.append(offset) + output_dim = node.attrs['output_dim'] + ends_arg = op.arg.add() + ends_arg.name = 'output_dim' + ends_arg.ints.append(output_dim) + axes_arg = op.arg.add() + axes_arg.name = 'axes' + axes_arg.ints.append(-1) - def convert_pad(self, node): + def convert_eltwise(self, node): op = self.convert_general_op(node) - op.type = MaceOp.Pad.name + op.type = MaceOp.Eltwise.name + type_arg = op.arg.add() + type_arg.name = MaceKeyword.mace_element_type_str + type_arg.i = self.eltwise_type[node.op_type].value - if 'pads' in node.attrs: - paddings_arg = op.arg.add() - paddings_arg.name = MaceKeyword.mace_paddings_str - paddings_value = node.attrs['pads'] - 
paddings_arg.ints.extend(paddings_value)
+        if node.op_type == OnnxOpType.Sqrt.name:
+            value_arg = op.arg.add()
+            value_arg.name = MaceKeyword.mace_scalar_input_str
+            value_arg.f = 0.5
+        elif node.op_type == OnnxOpType.Reciprocal.name:
+            value_arg = op.arg.add()
+            value_arg.name = MaceKeyword.mace_scalar_input_str
+            value_arg.f = -1
+        elif node.op_type == OnnxOpType.Scale.name and 'scale' in node.attrs:
+            value = node.attrs['scale']
+            value_arg = op.arg.add()
+            value_arg.name = MaceKeyword.mace_scalar_input_str
+            value_arg.f = value
 
-        if 'value' in node.attrs:
-            constant_value_arg = op.arg.add()
-            constant_value_arg.name = MaceKeyword.mace_constant_value_str
-            constant_value_arg.i = node.attrs['value']
+    def convert_flatten(self, node):
+        op = self.convert_general_op(node)
+        op.type = MaceOp.Reshape.name
 
-    def convert_gather(self, node):
+    def convert_fused_batchnorm(self, node):
         op = self.convert_general_op(node)
-        op.type = MaceOp.Gather.name
+        op.type = MaceOp.BatchNorm.name
 
-        if 'axis' in node.attrs:
-            value = node.attrs['axis']
+        if "epsilon" in node.attrs:
+            epsilon_value = node.attrs["epsilon"]
         else:
-            value = 0
-        axis_arg = op.arg.add()
-        axis_arg.name = MaceKeyword.mace_axis_str
-        axis_arg.i = value
+            epsilon_value = 1e-5
 
-    def convert_split(self, node):
+        mace_check(len(node.inputs) == 5, "batch norm should have 5 inputs.")
+
+        gamma_value = np.array(self._consts[node.inputs[1]].float_data)
+        beta_value = np.array(self._consts[node.inputs[2]].float_data)
+        mean_value = np.array(self._consts[node.inputs[3]].float_data)
+        var_value = np.array(self._consts[node.inputs[4]].float_data)
+
+        scale_name = node.name + 'scale'
+        offset_name = node.name + 'offset'
+        scale_value = (
+                (1.0 / np.sqrt(
+                    var_value + epsilon_value)) * gamma_value)
+        offset_value = (-mean_value * scale_value) + beta_value
+        self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT,
+                        scale_value)
+        self.add_tensor(offset_name, offset_value.shape, mace_pb2.DT_FLOAT,
+                        offset_value)
+        del op.input[1:]
+        op.input.extend([scale_name, offset_name])
+        del op.output[1:]
+        del op.output_shape[1:]
+
+    def convert_gather(self, node):
         op = self.convert_general_op(node)
-        op.type = MaceOp.Split.name
+        op.type = MaceOp.Gather.name
 
         if 'axis' in node.attrs:
             value = node.attrs['axis']
@@ -840,64 +835,6 @@ class OnnxConverter(base_converter.ConverterInterface):
         axis_arg.name = MaceKeyword.mace_axis_str
         axis_arg.i = value
 
-    def convert_transpose(self, node):
-        op = self.convert_general_op(node)
-        op.type = MaceOp.Transpose.name
-
-        if 'perm' in node.attrs:
-            perm = node.attrs['perm']
-            ordered_perm = np.sort(perm)
-            if np.array_equal(perm, ordered_perm):
-                op.type = MaceOp.Identity.name
-                del op.input[1:]
-            else:
-                dims_arg = op.arg.add()
-                dims_arg.name = MaceKeyword.mace_dims_str
-                dims_arg.ints.extend(perm)
-
-    @staticmethod
-    def squeeze_shape(shape, axis):
-        new_shape = []
-        if len(axis) > 0:
-            for i in range(len(shape)):
-                if i not in axis:
-                    new_shape.append(shape[i])
-        else:
-            new_shape = shape
-        return new_shape
-
-    def convert_squeeze(self, node):
-        axis_value = node.attrs['axes']
-        if node.inputs[0] in self._consts:
-            tensor = self._consts[node.inputs[0]]
-            shape = tensor.dims
-            new_shape = self.squeeze_shape(shape, axis_value)
-            del tensor.dims[:]
-            tensor.dims.extend(new_shape)
-            self.remove_node(node)
-        else:
-            op = self.convert_general_op(node)
-            op.type = MaceOp.Squeeze.name
-            axis_arg = op.arg.add()
-            axis_arg.name =
MaceKeyword.mace_axis_str - if 'axis' in node.attrs: - axis_value = node.attrs['axis'] - else: - axis_value = [] - axis_arg.ints.extend(axis_value) - - @staticmethod - def transpose_const(tensor): - shape = tensor.dims - mace_check(len(shape) == 2, "gemm only supports 2-dim input.") - tensor_data = np.array(tensor.float_data).reshape( - shape[0], shape[1]) - tensor_data = tensor_data.transpose(1, 0) - tensor.float_data[:] = tensor_data.flat - tensor.dims[:] = tensor_data.shape - def convert_gemm(self, node): # only supports FullyConnected Style Gemm for now. trans_a = node.attrs['transA'] if 'transA' in node.attrs else 0 @@ -915,7 +852,7 @@ class OnnxConverter(base_converter.ConverterInterface): elif len(shape_b) == 2: tensor_b = self._consts[node.inputs[1]] tensor_data = np.array(tensor_b.float_data).reshape( - shape_b[0], shape_b[1], 1, 1) + shape_b[0], shape_b[1], 1, 1) tensor_b.float_data[:] = tensor_data.flat tensor_b.dims[:] = tensor_data.shape else: @@ -949,4 +886,224 @@ class OnnxConverter(base_converter.ConverterInterface): shape_info = [shape_info[0], shape_info[1], 1, 1] output_shape.dims.extend(shape_info) - return op + def convert_identity(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Identity.name + + def convert_imagescaler(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.BatchNorm.name + + scale = node.attrs['scale'] + bias_value = np.array(node.attrs['bias']) + scale_value = scale * np.ones_like(bias_value) + + scale_name = node.name + "_scale" + bias_name = node.name + "_bias" + self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT, + scale_value) + self.add_tensor(bias_name, bias_value.shape, mace_pb2.DT_FLOAT, + bias_value) + op.input.extend([scale_name, bias_name]) + + def convert_lstm(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.LSTMCell.name + + def convert_lstm_nonlinear(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.LstmNonlinear.name + + def convert_matmul(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.MatMul.name + + def convert_nop(self, node): + pass + + def convert_normalize(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.BatchNorm.name + + def convert_pnorm(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.PNorm.name + if 'output_dim' in node.attrs: + output_dim_arg = op.arg.add() + output_dim_arg.name = 'output_dim' + output_dim_arg.i = node.attrs['output_dim'] + if 'p' in node.attrs: + p_value = node.attrs['p'] + mace_check((p_value >= 0) and (p_value <= 2), + "PNorm only supports p = 0, 1, 2") + p_arg = op.arg.add() + p_arg.name = 'p' + p_arg.i = p_value + + def convert_pooling(self, node): + op = self.convert_general_op(node) + + op.type = MaceOp.Pooling.name + self.add_stride_pad_kernel_arg(node.attrs, op) + pooling_type_arg = op.arg.add() + pooling_type_arg.name = MaceKeyword.mace_pooling_type_str + pooling_type_arg.i = self.pooling_type_mode[node.op_type].value + + round_mode_arg = op.arg.add() + round_mode_arg.name = MaceKeyword.mace_round_mode_str + round_mode_arg.i = RoundMode.FLOOR.value + + def convert_reduce(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Reduce.name + + reduce_type_arg = op.arg.add() + reduce_type_arg.name = MaceKeyword.mace_reduce_type_str + reduce_type_arg.i = self.reduce_type[node.op_type].value + + if node.op_type in [OnnxOpType.GlobalAveragePool.name, + OnnxOpType.GlobalMaxPool.name]: + reduce_dims = [2, 3] + keep_dims = 1 + else: + if 'axes' 
in node.attrs: + reduce_dims = node.attrs['axes'] + else: + reduce_dims = [] + if 'keepdims' in node.attrs: + keep_dims = node.attrs['keepdims'] + else: + keep_dims = 1 + axis_arg = op.arg.add() + axis_arg.name = MaceKeyword.mace_axis_str + axis_arg.ints.extend(reduce_dims) + + keep_dims_arg = op.arg.add() + keep_dims_arg.name = MaceKeyword.mace_keepdims_str + keep_dims_arg.i = keep_dims + + def convert_reshape(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Reshape.name + + def convert_slice(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Slice.name + + mace_check('starts' in node.attrs, "Attribute starts required!") + mace_check('ends' in node.attrs, "Attribute ends required!") + starts = node.attrs['starts'] + starts_arg = op.arg.add() + starts_arg.name = 'starts' + starts_arg.ints.extend(starts) + ends = node.attrs['ends'] + ends_arg = op.arg.add() + ends_arg.name = 'ends' + ends_arg.ints.extend(ends) + if 'axes' in node.attrs: + axes = node.attrs['axes'] + axes_arg = op.arg.add() + axes_arg.name = 'axes' + axes_arg.ints.extend(axes) + + def convert_softmax(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Softmax.name + # TODO: add logsoftmax in softmax op + # if node.op_type == OnnxOpType.LogSoftmax.name: + # use_log_arg = op.arg.add() + # use_log_arg.name = 'use_log' + # use_log_arg.i = 1 + + def convert_splice(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Splice.name + if 'context' in node.attrs: + context = node.attrs['context'] + else: + context = [0] + context_arg = op.arg.add() + context_arg.name = 'context' + context_arg.ints.extend(context) + if 'const_component_dim' in node.attrs: + const_dim = node.attrs['const_component_dim'] + else: + const_dim = 0 + const_dim_arg = op.arg.add() + const_dim_arg.name = 'const_component_dim' + const_dim_arg.i = const_dim + + def convert_split(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Split.name + + if 'axis' in node.attrs: + value = node.attrs['axis'] + else: + value = 0 + axis_arg = op.arg.add() + axis_arg.name = MaceKeyword.mace_axis_str + axis_arg.i = value + + def convert_squeeze(self, node): + axis_value = node.attrs['axes'] + if node.inputs[0] in self._consts: + tensor = self._consts[node.inputs[0]] + shape = tensor.dims + new_shape = self.squeeze_shape(shape, axis_value) + del tensor.dims[:] + tensor.dims.extend(new_shape) + self.remove_node(node) + else: + op = self.convert_general_op(node) + op.type = MaceOp.Squeeze.name + axis_arg = op.arg.add() + axis_arg.name = MaceKeyword.mace_axis_str + if 'axis' in node.attrs: + axis_value = node.attrs['axis'] + else: + axis_value = [] + axis_arg.ints.extend(axis_value) + + def convert_sum_group(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.SumGroup.name + + def convert_target_rms_norm(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.TargetRMSNorm.name + + if 'target_rms' in node.attrs: + value = node.attrs['target_rms'] + target_rms_arg = op.arg.add() + target_rms_arg.name = 'target_rms' + target_rms_arg.f = value + + def convert_transpose(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.Transpose.name + + if 'perm' in node.attrs: + perm = node.attrs['perm'] + ordered_perm = np.sort(perm) + if np.array_equal(perm, ordered_perm): + op.type = MaceOp.Identity.name + del op.input[1:] + else: + dims_arg = op.arg.add() + dims_arg.name = MaceKeyword.mace_dims_str + dims_arg.ints.extend(perm) + + def convert_timeoffset(self, 
node): + op = self.convert_general_op(node) + mace_check('offset' in node.attrs, + 'Offset attribute required in Offset Node.') + offset = node.attrs['offset'] + if offset == 0: + op.type = MaceOp.Identity.name + else: + op.type = MaceOp.TimeOffset.name + + offset_arg = op.arg.add() + offset_arg.name = 'offset' + offset_arg.i = offset diff --git a/mace/python/tools/converter_tool/shape_inference.py b/mace/python/tools/converter_tool/shape_inference.py index 3e472216efa3651a663a32ee2db729497d059ff2..45254333915250c9366add9de94f626a3f6f5e65 100644 --- a/mace/python/tools/converter_tool/shape_inference.py +++ b/mace/python/tools/converter_tool/shape_inference.py @@ -20,7 +20,6 @@ import six from mace.python.tools.converter_tool.transformer import Transformer from mace.python.tools.converter_tool.base_converter import DataFormat -from mace.python.tools.converter_tool.base_converter import FilterFormat from mace.python.tools.converter_tool.base_converter import MaceOp from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import ConverterUtil @@ -52,6 +51,7 @@ class ShapeInference(object): MaceOp.Transpose.name: self.infer_shape_permute, MaceOp.PriorBox.name: self.infer_shape_prior_box, MaceOp.Reshape.name: self.infer_shape_reshape, + MaceOp.ResizeBilinear.name: self.infer_shape_resize_bilinear, } self._net = net @@ -129,7 +129,7 @@ class ShapeInference(object): output_shape[0] = input_shape[0] if ConverterUtil.data_format(op) == DataFormat.NCHW \ - and ConverterUtil.filter_format(self._net) == FilterFormat.OIHW: # noqa + and ConverterUtil.filter_format(self._net) == DataFormat.OIHW: # noqa # filter format: OIHW if op.type == MaceOp.DepthwiseConv2d.name: output_shape[1] = filter_shape[0] * filter_shape[1] @@ -170,7 +170,7 @@ class ShapeInference(object): MaceKeyword.mace_group_str) output_shape[0] = input_shape[0] if ConverterUtil.data_format(op) == DataFormat.NCHW \ - and ConverterUtil.filter_format(self._net) == FilterFormat.OIHW: # noqa + and ConverterUtil.filter_format(self._net) == DataFormat.OIHW: # noqa # filter format: IOHW output_shape[1] = filter_shape[1] if group_arg is not None and group_arg.i > 1: @@ -224,7 +224,12 @@ class ShapeInference(object): def infer_shape_crop(self, op): mace_check(len(op.input) == 2, "crop layer needs two inputs") - output_shape = self._output_shape_cache[op.input[1]] + output_shape = self._output_shape_cache[op.input[0]] + input1_shape = self._output_shape_cache[op.input[1]] + offsets = ConverterUtil.get_arg(op, MaceKeyword.mace_offset_str).ints + for i in range(len(offsets)): + if offsets[i] >= 0: + output_shape[i] = input1_shape[i] self.add_output_shape(op, [output_shape]) def infer_shape_channel_shuffle(self, op): @@ -289,3 +294,17 @@ class ShapeInference(object): output_shape.append(self._output_shape_cache[op.input[0]][i]) output_shape[axis] = dim self.add_output_shape(op, [output_shape]) + + def infer_shape_resize_bilinear(self, op): + input_shape = self._output_shape_cache[op.input[0]] + size = ConverterUtil.get_arg( + op, MaceKeyword.mace_resize_size_str).ints + if ConverterUtil.data_format(op) == DataFormat.NCHW: + output_shape = [input_shape[0], input_shape[1], size[0], size[1]] + elif ConverterUtil.data_format(op) == DataFormat.NHWC: + output_shape = [input_shape[0], size[0], size[1], input_shape[3]] + else: + output_shape = [] + mace_check(False, "format %s is not supported" + % ConverterUtil.data_format(op)) + self.add_output_shape(op, [output_shape]) diff --git 
a/mace/python/tools/converter_tool/tensorflow_converter.py b/mace/python/tools/converter_tool/tensorflow_converter.py index 3a7bf380e132c303edfa3a98c75f4fdab54d82e2..ec255e3a90296a04d8538c1ff464edb097fe5193 100644 --- a/mace/python/tools/converter_tool/tensorflow_converter.py +++ b/mace/python/tools/converter_tool/tensorflow_converter.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import os import math import numpy as np import six @@ -29,7 +29,6 @@ from mace.python.tools.converter_tool.base_converter import PadType from mace.python.tools.converter_tool.base_converter import FrameworkType from mace.python.tools.converter_tool.base_converter import ReduceType from mace.python.tools.converter_tool.base_converter import DataFormat -from mace.python.tools.converter_tool.base_converter import FilterFormat from mace.python.tools.converter_tool.base_converter import MaceOp from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import ConverterUtil @@ -117,6 +116,7 @@ TFSupportedOps = [ 'FloorDiv', 'Sqrt', 'MirrorPad', + 'Cumsum', 'OneHot', ] @@ -124,39 +124,16 @@ TFOpType = Enum('TFOpType', [(op, op) for op in TFSupportedOps], type=str) TFSupportedOps = [six.b(op) for op in TFSupportedOps] -TFTransformGraphOptions = { - base_converter.DeviceType.CPU.value: [ - 'strip_unused_nodes', - 'remove_nodes(op=Identity, op=CheckNumerics)', - 'fold_constants(ignore_errors=true)', - 'fold_batch_norms', - 'fold_old_batch_norms', - 'remove_control_dependencies', - 'strip_unused_nodes', - 'sort_by_execution_order' - ], - base_converter.DeviceType.GPU.value: [ - 'strip_unused_nodes', - 'remove_nodes(op=Identity, op=CheckNumerics)', - 'fold_constants(ignore_errors=true)', - 'flatten_atrous_conv', - 'fold_batch_norms', - 'fold_old_batch_norms', - 'remove_control_dependencies', - 'strip_unused_nodes', - 'sort_by_execution_order' - ], - base_converter.DeviceType.HEXAGON.value: [ - 'strip_unused_nodes', - 'remove_nodes(op=Identity, op=CheckNumerics)', - 'fold_constants(ignore_errors=true)', - 'fold_batch_norms', - 'fold_old_batch_norms', - 'remove_control_dependencies', - 'strip_unused_nodes', - 'sort_by_execution_order' - ] -} +TFTransformGraphOptions = [ + 'strip_unused_nodes', + 'remove_nodes(op=Identity, op=CheckNumerics)', + 'fold_constants(ignore_errors=true)', + 'fold_batch_norms', + 'fold_old_batch_norms', + 'remove_control_dependencies', + 'strip_unused_nodes', + 'sort_by_execution_order' +] class TensorflowConverter(base_converter.ConverterInterface): @@ -278,11 +255,12 @@ class TensorflowConverter(base_converter.ConverterInterface): TFOpType.FloorDiv.name: self.convert_elementwise, TFOpType.Sqrt.name: self.convert_elementwise, TFOpType.MirrorPad.name: self.convert_pad, + TFOpType.Cumsum.name: self.convert_cumsum, TFOpType.OneHot.name: self.convert_one_hot, } self._option = option self._mace_net_def = mace_pb2.NetDef() - ConverterUtil.set_filter_format(self._mace_net_def, FilterFormat.HWIO) + ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.HWIO) # import tensorflow graph tf_graph_def = tf.GraphDef() @@ -290,29 +268,44 @@ class TensorflowConverter(base_converter.ConverterInterface): tf_graph_def.ParseFromString(f.read()) self._placeholders = {} - self.add_shape_info(tf_graph_def) + self._skip_tensor = set() + self._output_shape = {} - print("Run transform_graph: %s" % TFTransformGraphOptions[ - option.device]) + print("Run transform_graph: %s" % 
TFTransformGraphOptions) try: - print ("output keys: ", option.output_nodes.keys()) + print("output keys: ", option.output_nodes.keys()) transformed_graph_def = TransformGraph(tf_graph_def, option.input_nodes.keys(), option.output_nodes.keys(), - TFTransformGraphOptions[ - option.device]) + TFTransformGraphOptions) except Exception as ex: print("Failed to transform graph using tf tool: %s" % ex) transformed_graph_def = tf_graph_def + # To check optimized model, uncomment following code. + # tf.io.write_graph( + # transformed_graph_def, + # ".", + # os.path.basename(src_model_file)[:-3] + "_opt.pb", + # as_text=False + # ) + + self.add_shape_info(transformed_graph_def) + with tf.Session() as session: with session.graph.as_default() as graph: tf.import_graph_def(transformed_graph_def, name='') self._tf_graph = graph + self.update_output_shapes(session) - self._skip_tensor = set() - self._output_shape_list = [] - self._output_shape_op_list = [] + # we have polluted graph with 'shape' ops, so reset it and reload it + # again + tf.reset_default_graph() + + with tf.Session() as session: + with session.graph.as_default() as graph: + tf.import_graph_def(transformed_graph_def, name='') + self._tf_graph = graph def run(self): with tf.Session() as session: @@ -340,13 +333,19 @@ class TensorflowConverter(base_converter.ConverterInterface): for input_node in self._option.input_nodes.values(): if node.name == input_node.name \ or node.name + ':0' == input_node.name: + input_shape = input_node.shape + if input_node.data_format == DataFormat.OIHW \ + and len(input_shape) == 4: + # OIHW -> HWIO + input_shape = [input_shape[2], input_shape[3], + input_shape[1], input_shape[0]] del node.attr['shape'].shape.dim[:] node.attr['shape'].shape.dim.extend([ tensor_shape_pb2.TensorShapeProto.Dim(size=i) for i in - input_node.shape + input_shape ]) self._placeholders[node.name + ':0'] = \ - np.zeros(shape=input_node.shape, dtype=float) + np.zeros(shape=input_shape, dtype=float) @staticmethod def get_scope(tensor_name): @@ -357,10 +356,17 @@ class TensorflowConverter(base_converter.ConverterInterface): return tensor_name[:idx] def update_output_shapes(self, sess): - output_shapes = sess.run(self._output_shape_op_list, + tensors = [] + shape_tensors = [] + for tf_op in self._tf_graph.get_operations(): + for output in tf_op.outputs: + tensors.append(output.name) + shape_tensors.append(tf.shape(output)) + + tensor_shapes = sess.run(shape_tensors, feed_dict=self._placeholders) - for i in range(len(self._output_shape_list)): - self._output_shape_list[i].dims.extend(output_shapes[i]) + for i in range(len(tensors)): + self._output_shape[tensors[i]] = tensor_shapes[i] def convert_ops(self, sess): for tf_op in self._tf_graph.get_operations(): @@ -368,7 +374,7 @@ class TensorflowConverter(base_converter.ConverterInterface): "Mace does not support tensorflow op type %s yet" % tf_op.type) self._op_converters[tf_op.type](tf_op) - self.update_output_shapes(sess) + self.convert_tensors() def convert_tensors(self): @@ -402,18 +408,17 @@ class TensorflowConverter(base_converter.ConverterInterface): # this function tries to infer tensor shape, but some dimension shape # may be undefined due to variance of input length - def infer_tensor_shape(self, output_shape, tensor): - inferred_tensor_shape = tensor.shape.as_list() - inferred_success = True - for _, dim in enumerate(inferred_tensor_shape): - if dim is None: - inferred_success = False - break - if inferred_success: - output_shape.dims.extend(inferred_tensor_shape) + def 
+    def infer_tensor_shape(self, tensor, output_shape=None):
+        shape = None
+        if tensor.name in self._output_shape:
+            shape = self._output_shape[tensor.name]
         else:
-            self._output_shape_list.append(output_shape)
-            self._output_shape_op_list.append(tf.shape(tensor))
+            shape = tensor.shape.as_list()
+
+        if output_shape:
+            output_shape.dims.extend(shape)
+
+        return shape
 
     def convert_nop(self, tf_op):
         pass
@@ -426,7 +431,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
         op.output.extend([tf_output.name for tf_output in tf_op.outputs])
         for tf_output in tf_op.outputs:
             output_shape = op.output_shape.add()
-            self.infer_tensor_shape(output_shape, tf_output)
+            self.infer_tensor_shape(tf_output, output_shape)
 
         data_type_arg = op.arg.add()
         data_type_arg.name = 'T'
@@ -509,10 +514,10 @@ class TensorflowConverter(base_converter.ConverterInterface):
 
         def check_is_scalar(tf_op):
             if len(tf_op.inputs) == 1:
-                return len(tf_op.inputs[0].shape) == 0
+                return len(self.infer_tensor_shape(tf_op.inputs[0])) == 0
             elif len(tf_op.inputs) == 2:
-                return len(tf_op.inputs[0].shape) == 0 and \
-                    len(tf_op.inputs[1].shape) == 0
+                return len(self.infer_tensor_shape(tf_op.inputs[0])) == 0 and \
+                    len(self.infer_tensor_shape(tf_op.inputs[1])) == 0
 
         if check_is_scalar(tf_op):
             op.type = MaceOp.ScalarMath.name
@@ -539,9 +544,9 @@ class TensorflowConverter(base_converter.ConverterInterface):
                                  EltwiseType.SUM, EltwiseType.PROD,
                                  EltwiseType.MAX, EltwiseType.MIN]
 
-            if len(tf_op.inputs) > 1 and \
-                    len(tf_op.inputs[1].shape) == 0 and \
-                    tf_op.inputs[1].op.type == TFOpType.Const.name:
+            if (len(tf_op.inputs) > 1 and
+                    len(self.infer_tensor_shape(tf_op.inputs[1])) == 0 and
+                    tf_op.inputs[1].op.type == TFOpType.Const.name):
                 scalar = tf_op.inputs[1].eval().astype(np.float32)
                 value_arg = op.arg.add()
                 value_arg.name = MaceKeyword.mace_scalar_input_str
@@ -553,7 +558,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
                 value_index_arg.i = 1
                 self._skip_tensor.add(tf_op.inputs[1].name)
                 del op.input[1]
-            elif len(tf_op.inputs[0].shape) == 0 and \
+            elif len(self.infer_tensor_shape(tf_op.inputs[0])) == 0 and \
                     tf_op.inputs[0].op.type == TFOpType.Const.name and \
                     is_commutative(type_arg.i):
                 scalar = tf_op.inputs[0].eval().astype(np.float32)
@@ -1034,3 +1039,23 @@ class TensorflowConverter(base_converter.ConverterInterface):
 
         self._skip_tensor.add(tf_op.inputs[1].name)
         self._skip_tensor.add(tf_op.inputs[2].name)
+
+    def convert_cumsum(self, tf_op):
+        op = self.convert_general_op(tf_op)
+        op.type = MaceOp.Cumsum.name
+
+        axis = tf_op.inputs[1].eval().astype(np.int32)
+        axis_arg = op.arg.add()
+        axis_arg.name = MaceKeyword.mace_axis_str
+        axis_arg.i = axis
+        del op.input[1]
+
+        exclusive = tf_op.get_attr('exclusive')
+        exclusive_arg = op.arg.add()
+        exclusive_arg.name = MaceKeyword.mace_exclusive_str
+        exclusive_arg.i = int(exclusive)
+
+        reverse = tf_op.get_attr('reverse')
+        reverse_arg = op.arg.add()
+        reverse_arg.name = MaceKeyword.mace_reverse_str
+        reverse_arg.i = int(reverse)
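
For reference, the `exclusive` and `reverse` attributes that convert_cumsum forwards as MACE op arguments change the accumulation as follows; a minimal numpy sketch, for illustration only (not converter code):

    import numpy as np

    x = np.array([1., 2., 3., 4.])

    inclusive = np.cumsum(x)                            # [ 1.  3.  6. 10.]
    exclusive = np.concatenate(([0.], inclusive[:-1]))  # [ 0.  1.  3.  6.]  exclusive=True
    reverse = np.cumsum(x[::-1])[::-1]                  # [10.  9.  7.  4.]  reverse=True
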
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 33d4633635528b94a3d8d0ed108398368572a36c..6cae50dc2d9aa3b4e72b826371d38538b5061844 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -25,7 +25,6 @@ from mace.python.tools.converter_tool.base_converter import DataFormat
 from mace.python.tools.converter_tool.base_converter import DeviceType
 from mace.python.tools.converter_tool.base_converter import EltwiseType
 from mace.python.tools.converter_tool.base_converter import FrameworkType
-from mace.python.tools.converter_tool.base_converter import FilterFormat
 from mace.python.tools.converter_tool.base_converter import MaceKeyword
 from mace.python.tools.converter_tool.base_converter import MaceOp
 from mace.python.tools.converter_tool.base_converter import PaddingMode
@@ -103,6 +102,8 @@ class Transformer(base_converter.ConverterInterface):
             self.transform_caffe_reshape_and_flatten,
             TransformerRule.TRANSFORM_CHANNEL_SHUFFLE:
             self.transform_channel_shuffle,
+            TransformerRule.QUANTIZE_SPECIFIC_OPS_ONLY:
+            self.quantize_specific_ops_only,
         }
 
         self._option = option
@@ -127,7 +128,7 @@ class Transformer(base_converter.ConverterInterface):
                 self.construct_ops_and_consumers(key)
                 changed = transformer()
                 if not changed:
-                   break
+                    break
 
         self.delete_after_check_nodes()
         return self._model, self._quantize_activation_info
@@ -147,12 +148,12 @@ class Transformer(base_converter.ConverterInterface):
         filter_format_value = ConverterUtil.get_arg(self._model,
                                                     MaceKeyword.mace_filter_format_str).i  # noqa
         filter_format = None
-        if filter_format_value == FilterFormat.HWIO.value:
-            filter_format = FilterFormat.HWIO
-        elif filter_format_value == FilterFormat.OIHW.value:
-            filter_format = FilterFormat.OIHW
-        elif filter_format_value == FilterFormat.HWOI.value:
-            filter_format = FilterFormat.HWOI
+        if filter_format_value == DataFormat.HWIO.value:
+            filter_format = DataFormat.HWIO
+        elif filter_format_value == DataFormat.OIHW.value:
+            filter_format = DataFormat.OIHW
+        elif filter_format_value == DataFormat.HWOI.value:
+            filter_format = DataFormat.HWOI
         else:
             mace_check(False, "filter format %d not supported" %
                        filter_format_value)
@@ -191,16 +192,23 @@ class Transformer(base_converter.ConverterInterface):
             op = mace_pb2.OperatorDef()
             op.name = self.normalize_op_name(input_node.name)
             op.type = "Input"
+            data_type_arg = op.arg.add()
+            data_type_arg.name = MaceKeyword.mace_op_data_type_str
+            data_type_arg.i = mace_pb2.DT_FLOAT
             op.output.extend([input_node.name])
             output_shape = op.output_shape.add()
             output_shape.dims.extend(input_node.shape)
-            if ConverterUtil.data_format(
-                    self._consumers[input_node.name][0]) \
-                    == DataFormat.NCHW:
-                self.transpose_shape(output_shape.dims, [0, 3, 1, 2])
-                ConverterUtil.add_data_format_arg(op, DataFormat.NCHW)
-            else:
-                ConverterUtil.add_data_format_arg(op, DataFormat.NHWC)
+            if input_node.name in self._consumers:
+                if ConverterUtil.data_format(
+                        self._consumers[input_node.name][0]) \
+                        == DataFormat.NCHW:
+                    self.transpose_shape(output_shape.dims,
+                                         [0, 3, 1, 2])
+                    ConverterUtil.add_data_format_arg(op,
+                                                      DataFormat.NCHW)
+                else:
+                    ConverterUtil.add_data_format_arg(op,
+                                                      DataFormat.NHWC)
             self._producer[op.output[0]] = op
 
     @staticmethod
@@ -221,10 +229,32 @@ class Transformer(base_converter.ConverterInterface):
         return name.replace(':', '_')
 
     def get_tensor_shape(self, tensor):
-        producer = self._producer[tensor]
-        for i in six.moves.range(len(producer.output)):
-            if producer.output[i] == tensor:
-                return list(producer.output_shape[i].dims)
+        if tensor in self._consts:
+            return list(self._consts[tensor].dims)
+        elif tensor in self._producer:
+            producer = self._producer[tensor]
+            for i in six.moves.range(len(producer.output)):
+                if producer.output[i] == tensor:
+                    return list(producer.output_shape[i].dims)
+        else:
+            return None
+
+    def get_tensor_data_type(self, tensor):
+        if tensor in self._consts:
+            return self._consts[tensor].data_type
+        elif tensor in self._producer:
+            producer = self._producer[tensor]
+            for i in six.moves.range(len(producer.output)):
+                if producer.output[i] == tensor:
+                    if i < len(producer.output_type):
+                        return producer.output_type[i]
+                    elif ConverterUtil.get_arg(producer, "T") is not None:
+                        return ConverterUtil.get_arg(producer, "T").i
+                    else:
+                        print("No data type filled: ", producer)
+                        return None
+        else:
+            return None
 
     def consumer_count(self, tensor_name):
         return len(self._consumers.get(tensor_name, []))
@@ -583,14 +613,14 @@ class Transformer(base_converter.ConverterInterface):
                     offset = self._consts[consumer_op.input[2]]
                     idx = 0
                     filter_format = self.filter_format()
-                    if filter_format == FilterFormat.HWIO:
+                    if filter_format == DataFormat.HWIO:
                         for hwi in six.moves.range(filter.dims[0]
                                                    * filter.dims[1]
                                                    * filter.dims[2]):
                             for o in six.moves.range(filter.dims[3]):
                                 filter.float_data[idx] *= scale.float_data[o]
                                 idx += 1
-                    elif filter_format == FilterFormat.OIHW:
+                    elif filter_format == DataFormat.OIHW:
                         for o in six.moves.range(filter.dims[0]):
                             for hwi in six.moves.range(filter.dims[1]
                                                        * filter.dims[2]
@@ -642,7 +672,7 @@ class Transformer(base_converter.ConverterInterface):
                     idx = 0
                     filter_format = self.filter_format()
                     # in deconv op O and I channel is switched
-                    if filter_format == FilterFormat.HWIO:
+                    if filter_format == DataFormat.HWIO:
                         for hw in six.moves.range(filter.dims[0]
                                                   * filter.dims[1]):
                             for o in six.moves.range(filter.dims[2]):
@@ -650,7 +680,7 @@ class Transformer(base_converter.ConverterInterface):
                                     filter.float_data[idx] *=\
                                         scale.float_data[o]
                                     idx += 1
-                    elif filter_format == FilterFormat.OIHW:
+                    elif filter_format == DataFormat.OIHW:
                         for i in six.moves.range(filter.dims[0]):
                             for o in six.moves.range(filter.dims[1]):
                                 for hw in six.moves.range(filter.dims[2]
@@ -705,7 +735,7 @@ class Transformer(base_converter.ConverterInterface):
                     idx = 0
                     filter_format = self.filter_format()
-                    if filter_format == FilterFormat.HWIO:
+                    if filter_format == DataFormat.HWIO:
                         for hw in six.moves.range(filter.dims[0]
                                                   * filter.dims[1]):
                             for i in six.moves.range(filter.dims[2]):
@@ -713,7 +743,7 @@ class Transformer(base_converter.ConverterInterface):
                                     filter.float_data[idx] *= scale.float_data[
                                         i * filter.dims[3] + o]
                                     idx += 1
-                    elif filter_format == FilterFormat.OIHW:
+                    elif filter_format == DataFormat.OIHW:
                         for o in six.moves.range(filter.dims[0]):
                             for i in six.moves.range(filter.dims[1]):
                                 for hw in six.moves.range(filter.dims[2]
@@ -760,17 +790,17 @@ class Transformer(base_converter.ConverterInterface):
     @staticmethod
     def sort_filter_shape(filter_shape, filter_format):
         """Return filter shape in HWIO order"""
-        if filter_format == FilterFormat.HWIO:
+        if filter_format == DataFormat.HWIO:
             filter_height = filter_shape[0]
             filter_width = filter_shape[1]
             in_channels = filter_shape[2]
             out_channels = filter_shape[3]
-        elif filter_format == FilterFormat.OIHW:
+        elif filter_format == DataFormat.OIHW:
             filter_height = filter_shape[2]
             filter_width = filter_shape[3]
             in_channels = filter_shape[1]
             out_channels = filter_shape[0]
-        elif filter_format == FilterFormat.HWOI:
+        elif filter_format == DataFormat.HWOI:
             filter_height = filter_shape[0]
             filter_width = filter_shape[1]
             in_channels = filter_shape[3]
@@ -933,7 +963,9 @@ class Transformer(base_converter.ConverterInterface):
         net = self._model
         for op in net.op:
-            if op.type == MaceOp.Conv2D.name:
+            if op.type == MaceOp.Conv2D.name \
+                    and len(op.input) >= 2 \
+                    and op.input[1] in self._consts:
                 producer = self._producer[op.input[0]]
                 input_shape = producer.output_shape[0].dims
                 batch, height, width, channels = self.sort_feature_map_shape(
@@ -975,12 +1007,13 @@ class Transformer(base_converter.ConverterInterface):
                 input_shape = list(input_op.output_shape[0].dims)
                 weight.dims[:] = [weight.dims[0]] + input_shape[1:]
                 if len(input_shape) == 2:
-                    if filter_format == FilterFormat.HWIO:
+                    if filter_format == DataFormat.HWIO:
                         weight.dims[:] = [1, 1] + weight.dims[:]
-                    elif filter_format == FilterFormat.OIHW:
+                    elif filter_format == DataFormat.OIHW:
                         weight.dims[:] = weight.dims[:] + [1, 1]
                     else:
-                        mace_check("FC does not support filter format %s",
+                        mace_check(False,
+                                   "FC does not support filter format %s" %
                                    filter_format.name)
         return False
@@ -1052,6 +1085,16 @@ class Transformer(base_converter.ConverterInterface):
                     new_axises.sort()
                     arg.ints[:] = []
                     arg.ints.extend(new_axises)
+                elif op.type == MaceOp.Crop.name:
+                    offset_arg = ConverterUtil.get_arg(op,
+                                                       MaceKeyword.mace_offset_str)
+                    mace_check(offset_arg and
+                               ConverterUtil.data_format(op) == DataFormat.NCHW and
+                               len(op.output_shape[0].dims) == 4,
+                               "MACE only supports crop with the NCHW format")
+                    print("Transpose crop args: %s(%s)"
+                          % (op.name, op.type))
+                    self.transpose_shape(offset_arg.ints, [0, 2, 3, 1])
 
             # transpose op output shape
             data_format = ConverterUtil.data_format(op)
@@ -1087,7 +1130,7 @@ class Transformer(base_converter.ConverterInterface):
             rhs = op.input[1]
             if rhs in self._consts and len(self._consts[rhs].dims) == 2:
                 arg = ConverterUtil.get_arg(op, MaceKeyword.mace_transpose_b_str)  # noqa
-                six.print_('transpose matmul weight')
+                six.print_("Transpose matmul weight %s" % rhs)
                 if arg is None:
                     arg = op.arg.add()
                     arg.name = MaceKeyword.mace_transpose_b_str
@@ -1110,12 +1153,12 @@ class Transformer(base_converter.ConverterInterface):
         if self._option.quantize and \
                 self._option.device == DeviceType.CPU.value:
             print("Transpose filters to OHWI")
-            if filter_format == FilterFormat.HWIO:
+            if filter_format == DataFormat.HWIO:
                 transpose_order = [3, 0, 1, 2]
-            elif filter_format == FilterFormat.OIHW:
+            elif filter_format == DataFormat.OIHW:
                 transpose_order = [0, 2, 3, 1]
             else:
-                mace_check("Quantize model does not support conv "
+                mace_check(False, "Quantize model does not support conv "
                            "filter format: %s" % filter_format.name)
 
         for op in net.op:
@@ -1141,20 +1184,22 @@ class Transformer(base_converter.ConverterInterface):
                     filter.dims[:] = filter_data.shape
                     transposed_deconv_filter.add(op.input[1])
 
-            self.set_filter_format(FilterFormat.OHWI)
+            self.set_filter_format(DataFormat.OHWI)
         elif self._option.quantize and \
-                self._option.device == DeviceType.HEXAGON.value:
+                (self._option.device == DeviceType.HEXAGON.value or
+                 self._option.device == DeviceType.HTA.value):
             print("Transpose filters to HWIO/HWIM")
-            mace_check(filter_format == FilterFormat.HWIO,
+            mace_check(filter_format == DataFormat.HWIO,
                        "HEXAGON only support HWIO/HWIM filter format.")
         else:
             print("Transpose filters to OIHW/MIHW")
             # transpose filter to OIHW/MIHW for tensorflow (HWIO/HWIM)
-            if filter_format == FilterFormat.HWIO:
+            if filter_format == DataFormat.HWIO:
                 for op in net.op:
                     if (op.type == MaceOp.Conv2D.name
                             or op.type == MaceOp.Deconv2D.name
                             or op.type == MaceOp.DepthwiseConv2d.name) \
+                            and op.input[1] in self._consts \
                             and op.input[1] not in transposed_filter:
                         filter = self._consts[op.input[1]]
                         filter_data = np.array(filter.float_data).reshape(
@@ -1184,7 +1229,7 @@ class Transformer(base_converter.ConverterInterface):
                         weight.dims[:] = weight_data.shape
                         transposed_filter.add(op.input[1])
 
-            self.set_filter_format(FilterFormat.OIHW)
+            self.set_filter_format(DataFormat.OIHW)
 
         # deconv's filter's output channel and input channel is reversed
         for op in net.op:
             if op.type in [MaceOp.Deconv2D.name,
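
The transpose orders chosen above ([3, 0, 1, 2] for HWIO and [0, 2, 3, 1] for OIHW, both targeting OHWI) can be sanity-checked with a quick numpy sketch; the filter sizes here are made up for illustration:

    import numpy as np

    hwio = np.zeros((3, 3, 16, 32))  # H, W, I, O
    oihw = np.zeros((32, 16, 3, 3))  # O, I, H, W

    assert hwio.transpose(3, 0, 1, 2).shape == (32, 3, 3, 16)  # O, H, W, I
    assert oihw.transpose(0, 2, 3, 1).shape == (32, 3, 3, 16)  # O, H, W, I
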
@@ -1265,7 +1310,7 @@ class Transformer(base_converter.ConverterInterface):
                     len(op.input) == 2 and \
                     op.input[1] in self._consts and \
                     len(op.output_shape[0].dims) == 2 and \
-                    filter_format == FilterFormat.HWIO and \
+                    filter_format == DataFormat.HWIO and \
                     op.input[0] in self._producer:
                 input_op = self._producer[op.input[0]]
                 input_shape = input_op.output_shape[0].dims
@@ -1298,7 +1343,8 @@ class Transformer(base_converter.ConverterInterface):
 
             # transform `fc1(2D) -> matmul` to `fc1(2D) -> fc1(2D)`
             if op.type == MaceOp.MatMul.name and \
-                    filter_format == FilterFormat.HWIO:
+                    filter_format == DataFormat.HWIO and \
+                    op.input[1] in self._consts:
                 producer = self._producer[op.input[0]]
                 weight = self._consts[op.input[1]]
                 if len(weight.dims) == 2 and self.is_after_fc(op) and \
@@ -1373,21 +1419,18 @@ class Transformer(base_converter.ConverterInterface):
         return False
 
     def update_data_format(self):
-        data_format_flag = DataFormat.NHWC.value
+        print("update data format")
+        data_format_flag = 1
         for input_node in self._option.input_nodes.values():
             if input_node.data_format.value == DataFormat.DF_NONE.value:
-                data_format_flag = DataFormat.DF_NONE.value
-
+                data_format_flag = 0
         net = self._model
         for op in net.op:
-            data_format_arg = ConverterUtil.get_arg(
+            ConverterUtil.del_arg(
                 op, MaceKeyword.mace_data_format_str)
-            if not data_format_arg:
-                data_format_arg = op.arg.add()
-                data_format_arg.name = MaceKeyword.mace_data_format_str
-                data_format_arg.i = data_format_flag
-            elif data_format_arg.i != data_format_flag:
-                data_format_arg.i = data_format_flag
+            has_data_format_arg = op.arg.add()
+            has_data_format_arg.name = MaceKeyword.mace_has_data_format_str
+            has_data_format_arg.i = data_format_flag
         return False
 
     def quantize_nodes(self):
@@ -1423,10 +1466,11 @@ class Transformer(base_converter.ConverterInterface):
                 else:
                     mace_check(op.type == MaceOp.Quantize.name,
                                "Quantization only support float ops, "
-                               "but get %s(%s)"
-                               % (op.name, op.type))
+                               "but get %s(%s, %s)"
+                               % (op.name, op.type,
+                                  mace_pb2.DataType.Name(data_type_arg.i)))
 
-        for input_node in self._option.input_nodes.values():
+        for i, input_node in enumerate(self._option.input_nodes.values()):
             new_input_name = self.input_name_map[input_node.name]
             op_def = self._model.op.add()
             op_def.name = self.normalize_op_name(new_input_name)
@@ -1435,8 +1479,10 @@ class Transformer(base_converter.ConverterInterface):
             op_def.output.extend([new_input_name])
             output_shape = op_def.output_shape.add()
             output_shape.dims.extend(input_node.shape)
-            self.copy_quantize_info(
-                op_def, self._quantize_activation_info[new_input_name])
+            quantize_info = self._quantize_activation_info[new_input_name]
+            self.copy_quantize_info(op_def, quantize_info)
+            self._model.input_info[i].scale = quantize_info.scale
+            self._model.input_info[i].zero_point = quantize_info.zero_point
 
             ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8)
             ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC)
@@ -1447,16 +1493,19 @@ class Transformer(base_converter.ConverterInterface):
             find_range_every_time_arg.i = 1
 
         output_nodes = self._option.check_nodes.values()
-        for output_node in output_nodes:
+        for i, output_node in enumerate(output_nodes):
             op_def = self._model.op.add()
             op_def.name = self.normalize_op_name(output_node.name)
             op_def.type = MaceOp.Dequantize.name
             op_def.input.extend([self.output_name_map[output_node.name]])
             op_def.output.extend([output_node.name])
             output_shape = op_def.output_shape.add()
-            output_shape.dims.extend(
-                self._producer[output_node.name].output_shape[0].dims)
+            producer_op = self._producer[output_node.name]
+            output_shape.dims.extend(producer_op.output_shape[0].dims)
             op_def.output_type.extend([mace_pb2.DT_FLOAT])
+            quantize_info = producer_op.quantize_info[0]
+            self._model.output_info[i].scale = quantize_info.scale
+            self._model.output_info[i].zero_point = quantize_info.zero_point
 
             ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8)
@@ -1503,7 +1552,8 @@ class Transformer(base_converter.ConverterInterface):
                     quantized_tensor = \
                         quantize_util.quantize_with_scale_and_zero(
                             tensor.float_data, scale, 0)
-                elif self._option.device == DeviceType.HEXAGON.value:
+                elif self._option.device == DeviceType.HEXAGON.value or \
+                        self._option.device == DeviceType.HTA.value:
                     quantized_tensor = \
                         quantize_util.quantize_bias_for_hexagon(
                             tensor.float_data)
@@ -1661,7 +1711,7 @@ class Transformer(base_converter.ConverterInterface):
             return False
 
         print("Add default quantize info for input")
-        for input_node in self._option.input_nodes.values():
+        for i, input_node in enumerate(self._option.input_nodes.values()):
             if input_node.name not in self._quantize_activation_info:
                 print("Input range %s: %s" % (input_node.name,
                                               str(input_node.range)))
@@ -1670,7 +1720,8 @@ class Transformer(base_converter.ConverterInterface):
                     quantize_util.adjust_range(input_node.range[0],
                                                input_node.range[1],
                                                non_zero=False)
-                quantize_info = mace_pb2.QuantizeActivationInfo()
+                quantize_info = \
+                    mace_pb2.QuantizeActivationInfo()
                 quantize_info.minval = minval
                 quantize_info.maxval = maxval
                 quantize_info.scale = scale
@@ -1725,18 +1776,29 @@ class Transformer(base_converter.ConverterInterface):
                 self.add_quantize_info(op, 0.0, 1.0)
                 self._quantize_activation_info[op.output[0]] = quantize_info
             elif (op.type == MaceOp.Eltwise.name
-                  and ConverterUtil.get_arg(op, MaceKeyword.mace_element_type_str).i == EltwiseType.SUM.value  # noqa
                   and not op.quantize_info
                   and len(op.input) == 2
                   and len(op.input[0]) not in self._consts
                   and len(op.input[1]) not in self._consts):
-                del op.quantize_info[:]
                 producer_op0 = self._producer[op.input[0]]
                 producer_op1 = self._producer[op.input[1]]
-                minval = producer_op0.quantize_info[0].minval \
-                    + producer_op1.quantize_info[0].minval
-                maxval = producer_op0.quantize_info[0].maxval \
-                    + producer_op1.quantize_info[0].maxval
+                if ConverterUtil.get_arg(
+                        op, MaceKeyword.mace_element_type_str).i \
+                        == EltwiseType.SUM.value:
+                    minval = producer_op0.quantize_info[0].minval \
+                        + producer_op1.quantize_info[0].minval
+                    maxval = producer_op0.quantize_info[0].maxval \
+                        + producer_op1.quantize_info[0].maxval
+                elif ConverterUtil.get_arg(
+                        op, MaceKeyword.mace_element_type_str).i \
+                        == EltwiseType.SUB.value:
+                    minval = producer_op0.quantize_info[0].minval \
+                        - producer_op1.quantize_info[0].maxval
+                    maxval = producer_op0.quantize_info[0].maxval \
+                        - producer_op1.quantize_info[0].minval
+                else:
+                    mace_check(False, "Quantized Elementwise only supports "
+                               "SUM and SUB now.")
                 quantize_info = \
                     self.add_quantize_info(op, minval, maxval)
                 self._quantize_activation_info[op.output[0]] = quantize_info
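
The min/max propagation above is plain interval arithmetic: for a SUM the bounds add, while for a SUB the lower bound subtracts the other operand's upper bound and vice versa. A worked example with made-up ranges:

    # x in [-1, 2], y in [0, 3]
    min_x, max_x = -1.0, 2.0
    min_y, max_y = 0.0, 3.0

    assert (min_x + min_y, max_x + max_y) == (-1.0, 5.0)  # x + y range
    assert (min_x - max_y, max_x - min_y) == (-4.0, 2.0)  # x - y range
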
@@ -1880,3 +1942,131 @@ class Transformer(base_converter.ConverterInterface):
                 producer_op.output_shape[0].dims[:] = output_shape
 
         return True
+
+    def quantize_specific_ops_only(self):
+        """
+        This transform rule is only used internally; it is not exposed to
+        users, to keep the public options simple.
+        """
+        to_quantize_ops_output_type = {
+            MaceOp.MatMul.name: mace_pb2.DT_INT32,
+            MaceOp.Gather.name: mace_pb2.DT_UINT8,
+        }
+
+        for op in self._model.op:
+            if (op.type not in to_quantize_ops_output_type
+                    or len(op.output) > 1
+                    or ConverterUtil.get_arg(op,
+                                             MaceKeyword.mace_op_data_type_str).i != mace_pb2.DT_FLOAT):  # noqa
+                # only support single output
+                continue
+
+            quantized_inputs_names = []
+
+            should_quantize = False
+            has_const = False
+            for idx, input_tensor in enumerate(op.input):
+                if input_tensor in self._consts:
+                    has_const = True
+                    break
+            if not has_const:
+                continue
+
+            for idx, input_tensor in enumerate(op.input):
+                if self.get_tensor_data_type(input_tensor) \
+                        == mace_pb2.DT_FLOAT:
+                    should_quantize = True
+                    break
+            if not should_quantize:
+                continue
+            else:
+                print("Quantize op %s (%s)" % (op.name, op.type))
+
+            non_zero = self._option.device == DeviceType.CPU.value \
+                and op.type == MaceOp.MatMul.name
+
+            for idx, input_tensor in enumerate(op.input):
+                quantized_inputs_names.append(input_tensor)
+
+                if self.get_tensor_data_type(input_tensor) \
+                        != mace_pb2.DT_FLOAT:
+                    continue
+
+                if input_tensor in self._consts:
+                    const_tensor = self._consts[input_tensor]
+                    quantized_tensor = quantize_util.quantize(
+                        const_tensor.float_data, non_zero)
+                    del const_tensor.float_data[:]
+                    const_tensor.int32_data.extend(quantized_tensor.data)
+                    const_tensor.data_type = mace_pb2.DT_UINT8
+                    const_tensor.scale = quantized_tensor.scale
+                    const_tensor.zero_point = quantized_tensor.zero
+                    const_tensor.minval = quantized_tensor.minval
+                    const_tensor.maxval = quantized_tensor.maxval
+                    const_tensor.quantized = True
+                else:
+                    input_shape = self.get_tensor_shape(input_tensor)
+                    quantize_op = self._model.op.add()
+                    quantize_op.name = self.normalize_op_name(
+                        input_tensor) + "_quant"
+                    quantize_op.type = MaceOp.Quantize.name
+                    quantize_op.input.extend([input_tensor])
+                    quantize_output_name = quantize_op.name + '_0'
+                    quantize_op.output.extend([quantize_output_name])
+                    output_shape = quantize_op.output_shape.add()
+                    output_shape.dims.extend(input_shape)
+                    quantize_op.output_type.extend([mace_pb2.DT_UINT8])
+                    data_type_arg = quantize_op.arg.add()
+                    data_type_arg.name = MaceKeyword.mace_op_data_type_str
+                    data_type_arg.i = mace_pb2.DT_UINT8
+
+                    data_type_arg = quantize_op.arg.add()
+                    data_type_arg.name = MaceKeyword.mace_non_zero
+                    if non_zero:
+                        data_type_arg.i = 1
+                    else:
+                        data_type_arg.i = 0
+
+                    find_range_arg = quantize_op.arg.add()
+                    find_range_arg.name = \
+                        MaceKeyword.mace_find_range_every_time
+                    find_range_arg.i = 1
+
+                    quantized_inputs_names[-1] = quantize_output_name
+
+                non_zero = False
+
+            del op.input[:]
+            op.input.extend(quantized_inputs_names)
+
+            original_output_name = op.output[0]
+            op.output[0] = original_output_name + "_quant"
+            op.output_type.extend([to_quantize_ops_output_type[op.type]])
+            data_type_arg = ConverterUtil.get_arg(op,
+                                                  MaceKeyword.mace_op_data_type_str)  # noqa
+            if data_type_arg is None:
+                data_type_arg = op.arg.add()
+                data_type_arg.name = MaceKeyword.mace_op_data_type_str
+            data_type_arg.i = mace_pb2.DT_UINT8
+
+            dequantize_op = self._model.op.add()
+            dequantize_op.name = op.name + "_dequant"
+            dequantize_op.type = MaceOp.Dequantize.name
+            dequantize_op.input.extend([op.output[0]])
+            dequantize_op.output.extend([original_output_name])
+            dequantize_op.output_shape.extend(op.output_shape)
+            dequantize_op.output_type.extend([mace_pb2.DT_FLOAT])
+            data_type_arg = dequantize_op.arg.add()
+            data_type_arg.name = MaceKeyword.mace_op_data_type_str
+            data_type_arg.i = to_quantize_ops_output_type[op.type]
+
+            quantize_flag_arg = ConverterUtil.get_arg(self._model,
+                                                      MaceKeyword.mace_quantize_flag_arg_str)  # noqa
+            if quantize_flag_arg is None:
+                quantize_flag_arg = self._model.arg.add()
+                quantize_flag_arg.name = MaceKeyword.mace_quantize_flag_arg_str
+                quantize_flag_arg.i = 1
+
+            return True
+
+        return False
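
quantize_specific_ops_only wraps a float MatMul or Gather into a Quantize -> op -> Dequantize sandwich, with MatMul accumulating into DT_INT32. A rough numpy sketch of the underlying idea (simplified per-tensor affine quantization; names and formulas here are illustrative, not the MACE implementation):

    import numpy as np

    def quantize(x):
        scale = (x.max() - x.min()) / 255.0
        zero = int(round(-x.min() / scale))
        q = np.clip(np.round(x / scale).astype(np.int32) + zero, 0, 255)
        return q.astype(np.uint8), scale, zero

    a = np.random.rand(4, 8).astype(np.float32)
    b = np.random.rand(8, 4).astype(np.float32)
    qa, sa, za = quantize(a)
    qb, sb, zb = quantize(b)

    # int32 accumulation, like the DT_INT32 output type chosen for MatMul
    acc = (qa.astype(np.int32) - za) @ (qb.astype(np.int32) - zb)
    dequant = acc * sa * sb  # the trailing Dequantize op

    print(np.abs(dequant - a @ b).max())  # small quantization error
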
diff --git a/mace/python/tools/model.jinja2 b/mace/python/tools/model.jinja2
index 1beae21ab41ca1fc583fbd016b4e6d8430ad2ff9..89bee8d8f9dba8ce27ff97ff016381eb7b9da5e7 100644
--- a/mace/python/tools/model.jinja2
+++ b/mace/python/tools/model.jinja2
@@ -16,10 +16,10 @@
 
 #include
 
-#include "mace/core/macros.h"
+#include "mace/utils/macros.h"
 #include "mace/proto/mace.pb.h"
 #include "mace/public/mace.h"
-#include "mace/utils/env_time.h"
+#include "mace/port/env.h"
 #include "mace/utils/logging.h"
 
 namespace mace {
@@ -75,7 +75,7 @@ void CreateNetArg(NetDef *net_def) {
 {% if net.input_info | length > 0 %}
 void CreateInputInfo(NetDef *net_def) {
   net_def->mutable_input_info()->Reserve({{ net.input_info | length }});
-  InputInfo *input_info = nullptr;
+  InputOutputInfo *input_info = nullptr;
   {% for idx in range(net.input_info|length) %}
   input_info = net_def->add_input_info();
   input_info->set_name({{ net.input_info[idx].name|tojson }});
@@ -92,7 +92,7 @@ void CreateInputInfo(NetDef *net_def) {
 {% if net.output_info | length > 0 %}
 void CreateOutputInfo(NetDef *net_def) {
   net_def->mutable_output_info()->Reserve({{ net.output_info | length }});
-  OutputInfo *output_info = nullptr;
+  InputOutputInfo *output_info = nullptr;
   {% for idx in range(net.output_info|length) %}
   output_info = net_def->add_output_info();
   output_info->set_name({{ net.output_info[idx].name|tojson }});
diff --git a/mace/python/tools/operator.jinja2 b/mace/python/tools/operator.jinja2
index 8992da31ef7c9468b723d362ac04ab98511593f5..b184b54a3d98f034147866d04a6b48c1af0703f9 100644
--- a/mace/python/tools/operator.jinja2
+++ b/mace/python/tools/operator.jinja2
@@ -19,7 +19,7 @@
 
 #include "mace/proto/mace.pb.h"
 #include "mace/public/mace.h"
-#include "mace/utils/env_time.h"
+#include "mace/port/env.h"
 #include "mace/utils/logging.h"
 
 namespace mace {
diff --git a/mace/python/tools/quantization/quantize_util.py b/mace/python/tools/quantization/quantize_util.py
index 349393870e24e39073761bd10faffc4277a7335d..666b94bdf58e6311e50d5351df8b233a60f50922 100644
--- a/mace/python/tools/quantization/quantize_util.py
+++ b/mace/python/tools/quantization/quantize_util.py
@@ -100,7 +100,7 @@ def cal_multiplier_and_shift(scale):
 
 
 def quantize_with_scale_and_zero(data, scale, zero):
-    output = np.round(zero + data / scale).astype(int)
+    output = np.round(zero + data / scale).astype(np.int32)
     quantized_data = QuantizedData()
     quantized_data.data = output
     quantized_data.scale = scale
@@ -114,7 +114,7 @@ def quantize(data, non_zero):
     in_max = np_data.max()
     scale, zero, out_min, out_max = adjust_range(in_min, in_max,
                                                  non_zero=non_zero)
-    output = np.clip((np.round(zero + data / scale).astype(int)), 0, 255)
+    output = np.clip((np.round(zero + data / scale).astype(np.int32)), 0, 255)
 
     quantized_data = QuantizedData()
     quantized_data.data = output
@@ -132,7 +132,7 @@ def quantize_bias_for_hexagon(data):
     in_max = max_val
     scale = (in_max - in_min) / 2**32
     zero = 0
-    output = np.clip((np.round(zero + data / scale).astype(long)),
+    output = np.clip((np.round(zero + data / scale).astype(np.int64)),
                      -2**31, 2**31 - 1)
 
     quantized_data = QuantizedData()
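
The affine mapping used throughout quantize_util is q = round(zero + x / scale), with the inverse x ~= (q - zero) * scale; the casts above now use explicit fixed-width numpy types instead of the platform-dependent int/long. A small round-trip with assumed scale and zero point:

    import numpy as np

    data = np.array([-0.5, 0.0, 0.7], dtype=np.float32)
    scale, zero = 0.01, 50

    q = np.round(zero + data / scale).astype(np.int32)  # [  0  50 120]
    recovered = (q - zero) * scale

    print(q, np.abs(recovered - data).max())  # error bounded by scale / 2
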
diff --git a/mace/python/tools/tensor_source.jinja2 b/mace/python/tools/tensor_source.jinja2
index 77d91eab6aff431549b8e848369503944d52d5d3..d459d9bc806d23f7cb49ad90ba72f2a753dfd886 100644
--- a/mace/python/tools/tensor_source.jinja2
+++ b/mace/python/tools/tensor_source.jinja2
@@ -16,7 +16,7 @@
 
 #include "mace/proto/mace.pb.h"
 #include "mace/public/mace.h"
-#include "mace/utils/env_time.h"
+#include "mace/port/env.h"
 #include "mace/utils/logging.h"
 
 namespace mace {
diff --git a/mace/python/tools/visualization/BUILD b/mace/python/tools/visualization/BUILD.bazel
similarity index 100%
rename from mace/python/tools/visualization/BUILD
rename to mace/python/tools/visualization/BUILD.bazel
diff --git a/mace/python/tools/visualization/index.html b/mace/python/tools/visualization/index.html
index f36ea26066f44b6d1d999386d8e9035618b55fa6..658897fa646c159b06ece05aeca16b90e4c82fdf 100644
--- a/mace/python/tools/visualization/index.html
+++ b/mace/python/tools/visualization/index.html
@@ -111,7 +111,11 @@ Click node to see details at bottom of this page.
         var output_shapes = [];
         if (typeof node["outputShape"] !== "undefined") {
           for (var j = 0; j < node["outputShape"].length; j++) {
-            var output_shape = node["outputShape"][j].dims.join(",");
+            var output_shape = "";
+            if (typeof node["outputShape"][j].dims !== "undefined") {
+              console.log(node["outputShape"][j].dims);
+              output_shape = node["outputShape"][j].dims.join(",");
+            }
             output_shapes.push(output_shape);
           }
         }
@@ -140,11 +144,15 @@ Click node to see details at bottom of this page.
             " min=" + node["minval"] +
             " max=" + node["maxval"];
         }
+        var dims = "";
+        if (typeof node["dims"] !== "undefined") {
+          dims = node["dims"].join(",");
+        }
         tensor_data.push({
           "idx": tensor_data.length,
           "name": node["name"],
           "data_type": node["dataType"],
-          "dims": node["dims"].join(","),
+          "dims": dims,
           "quantize_info": quantize_info
         })
diff --git a/mace/test/BUILD b/mace/test/BUILD.bazel
similarity index 93%
rename from mace/test/BUILD
rename to mace/test/BUILD.bazel
index 36a2b6472d46db4360b1840b6031f32f94212e40..a5c5f974552dd13b35faff26f7e14266e042b3fc 100644
--- a/mace/test/BUILD
+++ b/mace/test/BUILD.bazel
@@ -11,6 +11,7 @@ load(
     "if_openmp_enabled",
     "if_android_armv7",
     "if_hexagon_enabled",
+    "if_hta_enabled",
     "if_opencl_enabled",
     "if_quantize_enabled",
 )
@@ -45,6 +46,8 @@ cc_test(
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
+    ]) + if_hta_enabled([
+        "-DMACE_ENABLE_HTA",
     ]),
     linkopts = ["-fopenmp"],
     linkstatic = 1,
@@ -78,6 +81,8 @@ cc_test(
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
+    ]) + if_hta_enabled([
+        "-DMACE_ENABLE_HTA",
    ]),
     linkopts = ["-fopenmp"],
     linkstatic = 1,
@@ -111,6 +116,8 @@ cc_test(
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
+    ]) + if_hta_enabled([
+        "-DMACE_ENABLE_HTA",
     ]),
     linkopts = ["-fopenmp"],
     linkstatic = 1,
@@ -143,6 +150,8 @@ cc_test(
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
+    ]) + if_hta_enabled([
+        "-DMACE_ENABLE_HTA",
     ]),
     linkopts = ["-fopenmp"],
     linkstatic = 1,
diff --git a/mace/test/mace_api_exception_test.cc b/mace/test/mace_api_exception_test.cc
index 075b04b40c7467d2d6a6dff10b6cb245521b68f5..232023dace17584f49c15a499b196c538f6598eb 100644
--- a/mace/test/mace_api_exception_test.cc
+++ b/mace/test/mace_api_exception_test.cc
@@ -29,7 +29,7 @@ TEST(MaceAPIExceptionTest, WrongInputTest) {
   std::shared_ptr<NetDef> net_def(new NetDef());
 
   for (size_t i = 0; i < input_names.size(); ++i) {
-    InputInfo *info = net_def->add_input_info();
+    InputOutputInfo *info = net_def->add_input_info();
     info->set_name(input_names[i]);
   }
diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc
index 6124792c5f1e395777b3874860e570173cad51c8..ee14129a05dd23d7d2fa6b3bcc491da375c12096 100644
--- a/mace/test/mace_api_mt_test.cc
+++ b/mace/test/mace_api_mt_test.cc
@@ -45,14 +45,15 @@ void MaceRunFunc(const int in_out_size) {
             filter_tensor_name, filter_shape, 0, data.size(), net_def.get());
 
   for (size_t i = 0; i < input_names.size(); ++i) {
-    InputInfo *info = net_def->add_input_info();
+    InputOutputInfo *info = net_def->add_input_info();
+    info->set_data_format(DataFormat::NHWC);
     info->set_name(input_names[i]);
     for (auto d : input_shapes[0]) {
       info->add_dims(static_cast<int64_t>(d));
     }
   }
   for (size_t i = 0; i < output_names.size(); ++i) {
-    OutputInfo *info = net_def->add_output_info();
+    InputOutputInfo *info = net_def->add_output_info();
     info->set_name(output_names[i]);
   }
   for (size_t i = 0; i < output_names.size(); ++i) {
diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc
index 438683fec2f694b73ac0d5b132bb73f1bf6377db..0a852a17a9a9cfd6a7d331556b1ad1b1a85e397a 100644
--- a/mace/test/mace_api_test.cc
+++ b/mace/test/mace_api_test.cc
@@ -44,14 +44,15 @@ void MaceRun(const int in_out_size,
   AddTensor(filter_tensor_name, filter_shape, 0, data.size(), net_def.get());
 
   for (size_t i = 0; i < input_names.size(); ++i) {
-    InputInfo *info = net_def->add_input_info();
+    InputOutputInfo *info = net_def->add_input_info();
+    info->set_data_format(DataFormat::NHWC);
     info->set_name(input_names[i]);
     for (auto d : max_shape) {
       info->add_dims(static_cast<int64_t>(d));
     }
   }
   for (size_t i = 0; i < output_names.size(); ++i) {
-    OutputInfo *info = net_def->add_output_info();
+    InputOutputInfo *info = net_def->add_output_info();
     info->set_name(output_names[i]);
   }
   for (size_t i = 0; i < output_names.size(); ++i) {
@@ -123,12 +124,11 @@ TEST_F(MaceAPITest, MultipleInputOutput) {
 }
 
 TEST_F(MaceAPITest, VariableInputShape) {
-  // TODO(liyin): there is a bug of cpu convolution
-//  MaceRun<float>(1,
-//                 {1, 32, 64, 16},
-//                 {{1, 16, 32, 16}, {1, 32, 64, 16}},
-//                 {{1, 16, 32, 16}, {1, 32, 64, 16}},
-//                 {16, 16, 3, 3});
+  MaceRun<float>(1,
+                 {1, 32, 64, 16},
+                 {{1, 16, 32, 16}, {1, 32, 64, 16}},
+                 {{1, 16, 32, 16}, {1, 32, 64, 16}},
+                 {16, 16, 3, 3});
   MaceRun<float>(1,
                  {1, 32, 64, 16},
                  {{1, 16, 32, 16}, {1, 32, 64, 16}},
diff --git a/mace/test/mace_api_test.h b/mace/test/mace_api_test.h
index 2c2ed7d177fb2b1d834f427a5ecfaa956fe7e648..2257b2162ca6d53e81fd29367594bf860ff115ec 100644
--- a/mace/test/mace_api_test.h
+++ b/mace/test/mace_api_test.h
@@ -76,6 +76,7 @@ void Conv3x3(const std::string &input_name,
       .AddIntArg("padding", Padding::SAME)
       .AddIntsArg("dilations", {1, 1})
       .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddIntArg("has_data_format", 1)
       .Finalize(&operator_def);
 
   OutputShape *shape = operator_def.add_output_shape();
@@ -98,6 +99,7 @@ void Relu(const std::string &input_name,
       .AddStringArg("activation", "RELU")
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
       .AddIntArg("device", static_cast<int>(device_type))
+      .AddIntArg("has_data_format", 1)
       .Finalize(&operator_def);
 
   net_def->add_op()->CopyFrom(operator_def);
diff --git a/mace/tools/validation/BUILD b/mace/tools/validation/BUILD.bazel
similarity index 86%
rename from mace/tools/validation/BUILD
rename to mace/tools/validation/BUILD.bazel
index 7e238c00730ba3f3ad87259aa857be61f8e72653..d85283acbc9b1e407e3c7a0bf69ebf5182804897 100644
--- a/mace/tools/validation/BUILD
+++ b/mace/tools/validation/BUILD.bazel
@@ -29,14 +29,8 @@ cc_binary(
     ] + if_opencl_enabled([
         "-DMACE_ENABLE_OPENCL",
     ]),
-    linkopts = [
-        "-lm",
-    ] + if_openmp_enabled([
-        "-fopenmp"
-    ]) + if_android([
-        "-ldl",
-        "-pie",
-        "-llog",
+    linkopts = if_openmp_enabled([
+        "-fopenmp",
     ]),
     linkstatic = 0,
     deps = [
"-ldl", - "-pie", - "-llog", + linkopts = if_openmp_enabled([ + "-fopenmp", ]), linkstatic = 0, deps = [ diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index 7ea06f089eca63a189edac2306641bac81e39c7f..0653304fde80b275217eba9332ab4a121c169a9a 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -34,9 +34,10 @@ #include "gflags/gflags.h" #include "mace/public/mace.h" -#include "mace/utils/env_time.h" +#include "mace/port/env.h" +#include "mace/port/file_system.h" #include "mace/utils/logging.h" -#include "mace/utils/utils.h" +#include "mace/utils/string_util.h" #ifdef MODEL_GRAPH_FORMAT_CODE #include "mace/codegen/engine/mace_engine_factory.h" @@ -46,29 +47,6 @@ namespace mace { namespace tools { namespace validation { -namespace str_util { - -std::vector Split(const std::string &str, char delims) { - std::vector result; - if (str.empty()) { - result.push_back(""); - return result; - } - std::string tmp = str; - while (!tmp.empty()) { - size_t next_offset = tmp.find(delims); - result.push_back(tmp.substr(0, next_offset)); - if (next_offset == std::string::npos) { - break; - } else { - tmp = tmp.substr(next_offset + 1); - } - } - return result; -} - -} // namespace str_util - void ParseShape(const std::string &str, std::vector *shape) { std::string tmp = str; while (!tmp.empty()) { @@ -98,11 +76,25 @@ DeviceType ParseDeviceType(const std::string &device_str) { return DeviceType::GPU; } else if (device_str.compare("HEXAGON") == 0) { return DeviceType::HEXAGON; + } else if (device_str.compare("HTA") == 0) { + return DeviceType::HTA; } else { return DeviceType::CPU; } } +DataFormat ParseDataFormat(const std::string &data_format_str) { + if (data_format_str == "NHWC") { + return DataFormat::NHWC; + } else if (data_format_str == "NCHW") { + return DataFormat::NCHW; + } else if (data_format_str == "OIHW") { + return DataFormat::OIHW; + } else { + return DataFormat::DF_NONE; + } +} + struct mallinfo LogMallinfoChange(struct mallinfo prev) { struct mallinfo curr = mallinfo(); if (prev.arena != curr.arena) { @@ -168,6 +160,12 @@ DEFINE_string(output_node, DEFINE_string(output_shape, "1,224,224,2:1,1,1,10", "output shapes, separated by colon and comma"); +DEFINE_string(input_data_format, + "NHWC", + "input data formats, NONE|NHWC|NCHW"); +DEFINE_string(output_data_format, + "NHWC", + "output data formats, NONE|NHWC|NCHW"); DEFINE_string(input_file, "", "input file name | input file prefix for multiple inputs."); @@ -206,8 +204,11 @@ DEFINE_int32(cpu_affinity_policy, 1, bool RunModel(const std::string &model_name, const std::vector &input_names, const std::vector> &input_shapes, + const std::vector &input_data_formats, const std::vector &output_names, - const std::vector> &output_shapes) { + const std::vector> &output_shapes, + const std::vector &output_data_formats, + float cpu_capability) { DeviceType device_type = ParseDeviceType(FLAGS_device); int64_t t0 = NowMicros(); @@ -243,20 +244,24 @@ bool RunModel(const std::string &model_name, } #endif // MACE_ENABLE_OPENCL - std::vector model_graph_data; + std::unique_ptr model_graph_data; if (FLAGS_model_file != "") { - if (!mace::ReadBinaryFile(&model_graph_data, FLAGS_model_file)) { + auto fs = GetFileSystem(); + status = fs->NewReadOnlyMemoryRegionFromFile(FLAGS_model_file.c_str(), + &model_graph_data); + if (status != MaceStatus::MACE_SUCCESS) { LOG(FATAL) << "Failed to read file: " << FLAGS_model_file; } } - const unsigned char *model_weights_data = nullptr; - size_t 
@@ -268,8 +273,9 @@ bool RunModel(const std::string &model_name,
 #ifdef MODEL_GRAPH_FORMAT_CODE
   create_engine_status =
       CreateMaceEngineFromCode(model_name,
-                               model_weights_data,
-                               model_weights_data_size,
+                               reinterpret_cast<const unsigned char *>(
+                                   model_weights_data->data()),
+                               model_weights_data->length(),
                                input_names,
                                output_names,
                                config,
@@ -277,10 +283,12 @@ bool RunModel(const std::string &model_name,
 #else
   (void)(model_name);
   create_engine_status =
-      CreateMaceEngineFromProto(model_graph_data.data(),
-                                model_graph_data.size(),
-                                model_weights_data,
-                                model_weights_data_size,
+      CreateMaceEngineFromProto(reinterpret_cast<const unsigned char *>(
+                                    model_graph_data->data()),
+                                model_graph_data->length(),
+                                reinterpret_cast<const unsigned char *>(
+                                    model_weights_data->data()),
+                                model_weights_data->length(),
                                 input_names,
                                 output_names,
                                 config,
@@ -325,7 +333,8 @@ bool RunModel(const std::string &model_name,
       LOG(INFO) << "Open input file failed";
       return -1;
     }
-    inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in);
+    inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in,
+                                              input_data_formats[i]);
   }
 
   for (size_t i = 0; i < output_count; ++i) {
@@ -334,7 +343,8 @@ bool RunModel(const std::string &model_name,
                                       std::multiplies<int64_t>());
     auto buffer_out = std::shared_ptr<float>(new float[output_size],
                                              std::default_delete<float[]>());
-    outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out);
+    outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out,
+                                                output_data_formats[i]);
   }
 
   LOG(INFO) << "Warm up run";
@@ -349,18 +359,21 @@ bool RunModel(const std::string &model_name,
 #ifdef MODEL_GRAPH_FORMAT_CODE
       create_engine_status =
           CreateMaceEngineFromCode(model_name,
-                                   model_weights_data,
-                                   model_weights_data_size,
+                                   reinterpret_cast<const unsigned char *>(
+                                       model_weights_data->data()),
+                                   model_weights_data->length(),
                                    input_names,
                                    output_names,
                                    config,
                                    &engine);
 #else
       create_engine_status =
-          CreateMaceEngineFromProto(model_graph_data.data(),
-                                    model_graph_data.size(),
-                                    model_weights_data,
-                                    model_weights_data_size,
+          CreateMaceEngineFromProto(reinterpret_cast<const unsigned char *>(
+                                        model_graph_data->data()),
+                                    model_graph_data->length(),
+                                    reinterpret_cast<const unsigned char *>(
+                                        model_weights_data->data()),
+                                    model_weights_data->length(),
                                     input_names,
                                     output_names,
                                     config,
@@ -392,22 +405,26 @@ bool RunModel(const std::string &model_name,
 #ifdef MODEL_GRAPH_FORMAT_CODE
       create_engine_status =
           CreateMaceEngineFromCode(model_name,
-                                   model_weights_data,
-                                   model_weights_data_size,
+                                   reinterpret_cast<const unsigned char *>(
+                                       model_weights_data->data()),
+                                   model_weights_data->length(),
                                    input_names,
                                    output_names,
                                    config,
                                    &engine);
 #else
       create_engine_status =
-          CreateMaceEngineFromProto(model_graph_data.data(),
-                                    model_graph_data.size(),
-                                    model_weights_data,
-                                    model_weights_data_size,
-                                    input_names,
-                                    output_names,
-                                    config,
-                                    &engine);
+          CreateMaceEngineFromProto(
+              reinterpret_cast<const unsigned char *>(
+                  model_graph_data->data()),
+              model_graph_data->length(),
+              reinterpret_cast<const unsigned char *>(
+                  model_weights_data->data()),
+              model_weights_data->length(),
+              input_names,
+              output_names,
+              config,
+              &engine);
 #endif
     } while (create_engine_status != MaceStatus::MACE_SUCCESS);
   } else {
@@ -426,11 +443,11 @@ bool RunModel(const std::string &model_name,
   }
 
   // Metrics reporting tools depends on the format, keep in consistent
-  printf("========================================\n");
-  printf("            init      warmup     run_avg\n");
-  printf("========================================\n");
-  printf("time %11.3f %11.3f %11.3f\n",
-         init_millis, warmup_millis, model_run_millis);
+  printf("========================================================\n");
+  printf("     capability(CPU)        init      warmup     run_avg\n");
+  printf("========================================================\n");
+  printf("time %15.3f %11.3f %11.3f %11.3f\n",
+         cpu_capability, init_millis, warmup_millis, model_run_millis);
 
   for (size_t i = 0; i < output_count; ++i) {
@@ -449,10 +466,6 @@ bool RunModel(const std::string &model_name,
               << output_size << " done.";
   }
 
-  if (model_weights_data != nullptr) {
-    MemoryUnMap(model_weights_data, model_weights_data_size);
-  }
-
   return true;
 }
@@ -480,13 +493,10 @@ int Main(int argc, char **argv) {
   LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads;
   LOG(INFO) << "cpu_affinity_policy: " << FLAGS_cpu_affinity_policy;
 
-  std::vector<std::string> input_names = str_util::Split(FLAGS_input_node, ',');
-  std::vector<std::string> output_names =
-      str_util::Split(FLAGS_output_node, ',');
-  std::vector<std::string> input_shapes =
-      str_util::Split(FLAGS_input_shape, ':');
-  std::vector<std::string> output_shapes =
-      str_util::Split(FLAGS_output_shape, ':');
+  std::vector<std::string> input_names = Split(FLAGS_input_node, ',');
+  std::vector<std::string> output_names = Split(FLAGS_output_node, ',');
+  std::vector<std::string> input_shapes = Split(FLAGS_input_shape, ':');
+  std::vector<std::string> output_shapes = Split(FLAGS_output_shape, ':');
 
   const size_t input_count = input_shapes.size();
   const size_t output_count = output_shapes.size();
@@ -498,13 +508,30 @@ int Main(int argc, char **argv) {
   for (size_t i = 0; i < output_count; ++i) {
     ParseShape(output_shapes[i], &output_shape_vec[i]);
   }
+  std::vector<std::string> raw_input_data_formats =
+      Split(FLAGS_input_data_format, ',');
+  std::vector<std::string> raw_output_data_formats =
+      Split(FLAGS_output_data_format, ',');
+  std::vector<DataFormat> input_data_formats(input_count);
+  std::vector<DataFormat> output_data_formats(output_count);
+  for (size_t i = 0; i < input_count; ++i) {
+    input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]);
+  }
+  for (size_t i = 0; i < output_count; ++i) {
+    output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]);
+  }
+
+  // get cpu capability
+  Capability cpu_capability = GetCapability(DeviceType::CPU);
 
   bool ret = false;
   for (int i = 0; i < FLAGS_restart_round; ++i) {
     VLOG(0) << "restart round " << i;
-    ret =
-        RunModel(FLAGS_model_name, input_names, input_shape_vec,
-                 output_names, output_shape_vec);
+    ret = RunModel(FLAGS_model_name,
+                   input_names, input_shape_vec, input_data_formats,
+                   output_names, output_shape_vec, output_data_formats,
+                   cpu_capability.float32_performance.exec_time);
   }
   if (ret) {
     return 0;
diff --git a/mace/utils/BUILD b/mace/utils/BUILD.bazel
similarity index 61%
rename from mace/utils/BUILD
rename to mace/utils/BUILD.bazel
index 4388e1a6628de7f738cb2a971d9a9c8f29022bd3..378210a3905e68188a8de35d2b1a8b1dacdefd39 100644
--- a/mace/utils/BUILD
+++ b/mace/utils/BUILD.bazel
@@ -10,9 +10,27 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//mace:mace.bzl",
     "if_android",
+    "if_android_armv7",
+    "if_neon_enabled",
     "if_openmp_enabled",
 )
 
+cc_library(
+    name = "utils_hdrs",
+    hdrs = glob([
+        "*.h",
+    ]),
+    copts = [
+        "-Werror",
+        "-Wextra",
+        "-Wno-missing-field-initializers",
+    ],
+    deps = [
+        "//mace/port:port_api",
+        "//mace/public",
+    ],
+)
"-Wno-missing-field-initializers", + ], + deps = [ + "//mace/port:port_api", + "//mace/public", + ], +) + cc_library( name = "utils", srcs = glob( @@ -20,46 +38,47 @@ cc_library( "*.cc", ], exclude = [ - "tuner_test.cc", + "*_test.cc", ], ), - hdrs = glob([ - "*.h", - ]), copts = [ "-Werror", "-Wextra", "-Wno-missing-field-initializers", ] + if_openmp_enabled([ "-fopenmp", + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + "-mfloat-abi=softfp", ]), linkopts = if_android([ "-llog", ]), deps = [ - "//mace/public", + ":utils_hdrs", ], + alwayslink = 1, ) cc_test( - name = "tuner_test", + name = "utils_test", testonly = 1, - srcs = [ - "tuner_test.cc", - ], + srcs = glob( + [ + "*_test.cc", + ], + ), copts = [ "-Werror", "-Wextra", "-Wno-missing-field-initializers", ], - linkopts = ["-ldl"] + if_android([ - "-pie", - "-lm", # Required by unordered_map - ]), linkstatic = 1, deps = [ ":utils", - "//mace/core", + "//mace/port", "@gtest//:gtest", "@gtest//:gtest_main", ], diff --git a/mace/utils/conf_util.h b/mace/utils/conf_util.h new file mode 100644 index 0000000000000000000000000000000000000000..4800b15e24d47c3690531a94f52214616c710624 --- /dev/null +++ b/mace/utils/conf_util.h @@ -0,0 +1,33 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_UTILS_CONF_UTIL_H_ +#define MACE_UTILS_CONF_UTIL_H_ + +#include +#include +#include +#include +#include + +namespace mace { + +inline bool EnvConfEnabled(std::string env_name) { + char *env = getenv(env_name.c_str()); + return !(!env || env[0] == 0 || env[0] == '0'); +} + +} // namespace mace + +#endif // MACE_UTILS_CONF_UTIL_H_ diff --git a/mace/utils/detection_output.cc b/mace/utils/detection_output.cc deleted file mode 100644 index 10a4f4f0903e65d3d11ff53051b61f4a15dc1756..0000000000000000000000000000000000000000 --- a/mace/utils/detection_output.cc +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "mace/utils/logging.h"
-
-namespace mace {
-
-struct BBox {
-  float xmin;
-  float ymin;
-  float xmax;
-  float ymax;
-  int label;
-  float confidence;
-};
-
-namespace {
-inline float overlap(const BBox &a, const BBox &b) {
-  if (a.xmin > b.xmax || a.xmax < b.xmin ||
-      a.ymin > b.ymax || a.ymax < b.ymin) {
-    return 0.f;
-  }
-  float overlap_w = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin);
-  float overlap_h = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin);
-  return overlap_w * overlap_h;
-}
-
-void NmsSortedBboxes(const std::vector<BBox> &bboxes,
-                     const float nms_threshold,
-                     const int top_k,
-                     std::vector<BBox> *sorted_boxes) {
-  const int n = std::min(top_k, static_cast<int>(bboxes.size()));
-  std::vector<int> picked;
-
-  std::vector<float> areas(n);
-#pragma omp parallel for schedule(runtime)
-  for (int i = 0; i < n; ++i) {
-    const BBox &r = bboxes[i];
-    float width = std::max(0.f, r.xmax - r.xmin);
-    float height = std::max(0.f, r.ymax - r.ymin);
-    areas[i] = width * height;
-  }
-
-  for (int i = 0; i < n; ++i) {
-    const BBox &a = bboxes[i];
-    int keep = 1;
-    for (size_t j = 0; j < picked.size(); ++j) {
-      const BBox &b = bboxes[picked[j]];
-
-      float inter_area = overlap(a, b);
-      float union_area = areas[i] + areas[picked[j]] - inter_area;
-      MACE_CHECK(union_area > 0, "union_area should be greater than 0");
-      if (inter_area / union_area > nms_threshold) {
-        keep = 0;
-        break;
-      }
-    }
-
-    if (keep) {
-      picked.push_back(i);
-      sorted_boxes->push_back(bboxes[i]);
-    }
-  }
-}
-
-inline bool cmp(const BBox &a, const BBox &b) {
-  return a.confidence > b.confidence;
-}
-}  // namespace
-
-int DetectionOutput(const float *loc_ptr,
-                    const float *conf_ptr,
-                    const float *pbox_ptr,
-                    const int num_prior,
-                    const int num_classes,
-                    const float nms_threshold,
-                    const int top_k,
-                    const int keep_top_k,
-                    const float confidence_threshold,
-                    std::vector<BBox> *bbox_rects) {
-  MACE_CHECK(keep_top_k > 0, "keep_top_k should be greater than 0");
-  std::vector<float> bboxes(4 * num_prior);
-#pragma omp parallel for schedule(runtime)
-  for (int i = 0; i < num_prior; ++i) {
-    int index = i * 4;
-    const float *lc = loc_ptr + index;
-    const float *pb = pbox_ptr + index;
-    const float *var = pb + num_prior * 4;
-
-    float pb_w = pb[2] - pb[0];
-    float pb_h = pb[3] - pb[1];
-    float pb_cx = (pb[0] + pb[2]) * 0.5f;
-    float pb_cy = (pb[1] + pb[3]) * 0.5f;
-
-    float bbox_cx = var[0] * lc[0] * pb_w + pb_cx;
-    float bbox_cy = var[1] * lc[1] * pb_h + pb_cy;
-    float bbox_w = std::exp(var[2] * lc[2]) * pb_w;
-    float bbox_h = std::exp(var[3] * lc[3]) * pb_h;
-
-    bboxes[0 + index] = bbox_cx - bbox_w * 0.5f;
-    bboxes[1 + index] = bbox_cy - bbox_h * 0.5f;
-    bboxes[2 + index] = bbox_cx + bbox_w * 0.5f;
-    bboxes[3 + index] = bbox_cy + bbox_h * 0.5f;
-  }
-  // start from 1 to ignore background class
-
-  for (int i = 1; i < num_classes; ++i) {
-    // filter by confidence threshold
-    std::vector<BBox> class_bbox_rects;
-    for (int j = 0; j < num_prior; ++j) {
-      float confidence = conf_ptr[j * num_classes + i];
-      if (confidence > confidence_threshold) {
-        BBox c = {bboxes[0 + j * 4], bboxes[1 + j * 4], bboxes[2 + j * 4],
-                  bboxes[3 + j * 4], i, confidence};
-        class_bbox_rects.push_back(c);
-      }
-    }
-    std::sort(class_bbox_rects.begin(), class_bbox_rects.end(), cmp);
-
-    // apply nms
-    std::vector<BBox> sorted_boxes;
-    NmsSortedBboxes(class_bbox_rects,
-                    nms_threshold,
-                    std::min(top_k,
-                             static_cast<int>(class_bbox_rects.size())),
-                    &sorted_boxes);
-    // gather
-    bbox_rects->insert(bbox_rects->end(), sorted_boxes.begin(),
-                       sorted_boxes.end());
-  }
-
-  std::sort(bbox_rects->begin(), bbox_rects->end(), cmp);
-
-  // output
-  int num_detected = keep_top_k < static_cast<int>(bbox_rects->size()) ?
-                     keep_top_k : static_cast<int>(bbox_rects->size());
-  bbox_rects->resize(num_detected);
-
-  return num_detected;
-}
-}  // namespace mace
diff --git a/mace/utils/logging.cc b/mace/utils/logging.cc
deleted file mode 100644
index 8091f0a0148e8f1d68f7c88858585d51f232dd8a..0000000000000000000000000000000000000000
--- a/mace/utils/logging.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "mace/utils/logging.h"
-
-#include
-#include
-#include
-#if defined(ANDROID) || defined(__ANDROID__)
-#include
-#include
-#endif
-
-namespace mace {
-namespace logging {
-
-LogMessage::LogMessage(const char *fname, int line, int severity)
-    : fname_(fname), line_(line), severity_(severity) {}
-
-void LogMessage::DealWithFatal() {
-  // When there is a fatal log, now we simply abort.
-  abort();
-}
-
-void LogMessage::GenerateLogMessage() {
-#if defined(ANDROID) || defined(__ANDROID__)
-  int android_log_level;
-  switch (severity_) {
-    case INFO:
-      android_log_level = ANDROID_LOG_INFO;
-      break;
-    case WARNING:
-      android_log_level = ANDROID_LOG_WARN;
-      break;
-    case ERROR:
-      android_log_level = ANDROID_LOG_ERROR;
-      break;
-    case FATAL:
-      android_log_level = ANDROID_LOG_FATAL;
-      break;
-    default:
-      if (severity_ < INFO) {
-        android_log_level = ANDROID_LOG_VERBOSE;
-      } else {
-        android_log_level = ANDROID_LOG_ERROR;
-      }
-      break;
-  }
-
-  std::stringstream ss;
-  const char *const partial_name = strrchr(fname_, '/');
-  ss << (partial_name != nullptr ? partial_name + 1 : fname_) << ":" << line_
-     << " " << str();
-  __android_log_write(android_log_level, "MACE", ss.str().c_str());
-
-  // Also log to stderr (for standalone Android apps).
-  std::cerr << "IWEF"[severity_] << " " << ss.str() << std::endl;
-#else
-  fprintf(stderr, "%c %s:%d] %s\n", "IWEF"[severity_], fname_, line_,
-          str().c_str());
-#endif
-
-  // When there is a fatal log, terminate execution
-  if (severity_ == FATAL) {
-    DealWithFatal();
-  }
-}
-
-namespace {
-
-int LogLevelStrToInt(const char *mace_env_var_val) {
-  if (mace_env_var_val == nullptr) {
-    return 0;
-  }
-  // Simply use atoi here. Return 0 if convert unsuccessfully.
-  return atoi(mace_env_var_val);
-}
-
-int MinLogLevelFromEnv() {
-  // Read the min log level from env once during the first call to logging.
-  static int log_level = LogLevelStrToInt(getenv("MACE_CPP_MIN_LOG_LEVEL"));
-  return log_level;
-}
-
-int MinVLogLevelFromEnv() {
-  // Read the min vlog level from env once during the first call to logging.
-  static int vlog_level = LogLevelStrToInt(getenv("MACE_CPP_MIN_VLOG_LEVEL"));
-  return vlog_level;
-}
-
-}  // namespace
-
-LogMessage::~LogMessage() {
-  int min_log_level = MinLogLevelFromEnv();
-  if (severity_ >= min_log_level) GenerateLogMessage();
-}
-
-int LogMessage::MinVLogLevel() {
-  return MinVLogLevelFromEnv();
-}
-
-}  // namespace logging
-}  // namespace mace
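
The deleted implementation read both thresholds from the environment, and a VLOG(vll) message was produced only when vll <= MACE_CPP_MIN_VLOG_LEVEL; the replacement port::Logger keeps this rule (see VLOG_IS_ON below). A Python mirror of the rule, for illustration only:

    import os

    def vlog_is_on(vll, env=os.environ):
        try:
            threshold = int(env.get("MACE_CPP_MIN_VLOG_LEVEL", "0"))
        except ValueError:
            threshold = 0  # atoi-style fallback, matching the C++ code
        return vll <= threshold

    assert vlog_is_on(2, {"MACE_CPP_MIN_VLOG_LEVEL": "2"})
    assert not vlog_is_on(3, {"MACE_CPP_MIN_VLOG_LEVEL": "2"})
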
diff --git a/mace/utils/logging.h b/mace/utils/logging.h
index 63d372d88b5c1241e34f8c92c7ff9b7c41d6a33e..8a5f2f8e025f1ad350a9503243dd66ad9628691f 100644
--- a/mace/utils/logging.h
+++ b/mace/utils/logging.h
@@ -1,4 +1,4 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -21,45 +21,19 @@
 #include
 #include
 
-#include "mace/public/mace.h"
-#include "mace/utils/env_time.h"
+#include "mace/port/env.h"
+#include "mace/port/logger.h"
+#include "mace/utils/macros.h"
 #include "mace/utils/string_util.h"
-#include "mace/utils/utils.h"
 
-#undef ERROR
 
 namespace mace {
-
-// Log severity level constants.
-const int INFO = 0;
-const int WARNING = 1;
-const int ERROR = 2;
-const int FATAL = 3;
-
-namespace logging {
-
-class LogMessage : public std::ostringstream {
- public:
-  LogMessage(const char *fname, int line, int severity);
-  ~LogMessage();
-
-  static int MinVLogLevel();
-
- private:
-  void GenerateLogMessage();
-  void DealWithFatal();
-
-  const char *fname_;
-  int line_;
-  int severity_;
-};
+namespace logging_internal {
 
 #define LOG(severity) \
-  ::mace::logging::LogMessage(__FILE__, __LINE__, mace::severity)
+  ::mace::port::Logger(__FILE__, __LINE__, mace::severity)
 
-// Set MACE_CPP_MIN_VLOG_LEVEL environment to update minimum log level of VLOG.
-// Only when vlog_level <= MinVLogLevel(), it will produce output.
-#define VLOG_IS_ON(vll) ((vll) <= ::mace::logging::LogMessage::MinVLogLevel())
+#define VLOG_IS_ON(vll) (mace::ShouldGenerateVLogMessage(vll))
 #define VLOG(vll) if (VLOG_IS_ON(vll)) LOG(INFO)
 
 // MACE_CHECK/MACE_ASSERT dies with a fatal error if condition is not true.
@@ -85,17 +59,27 @@ class LogMessage : public std::ostringstream {
 template <typename T>
 T &&CheckNotNull(const char *file, int line, const char *exprtext, T &&t) {
   if (t == nullptr) {
-    ::mace::logging::LogMessage(file, line, FATAL) << std::string(exprtext);
+    ::mace::port::Logger(file, line, FATAL) << std::string(exprtext);
   }
   return std::forward<T>(t);
 }
 
 #define MACE_CHECK_NOTNULL(val) \
-  ::mace::logging::CheckNotNull(__FILE__, __LINE__, \
-                                "'" #val "' Must not be NULL", (val))
+  ::mace::logging_internal::CheckNotNull(__FILE__, __LINE__, \
+                                         "'" #val "' Must not be NULL", (val))
 
 #define MACE_NOT_IMPLEMENTED MACE_CHECK(false, "not implemented")
 
+#define MACE_RETURN_IF_ERROR(stmt)                        \
+  {                                                       \
+    MaceStatus status = (stmt);                           \
+    if (status != MaceStatus::MACE_SUCCESS) {             \
+      VLOG(0) << #stmt << " failed with error: "          \
+              << status.information();                    \
+      return status;                                      \
+    }                                                     \
+  }
+
 class LatencyLogger {
  public:
   LatencyLogger(int vlog_level, const std::string &message)
@@ -121,11 +105,21 @@ class LatencyLogger {
   MACE_DISABLE_COPY_AND_ASSIGN(LatencyLogger);
 };
 
-#define MACE_LATENCY_LOGGER(vlog_level, ...) \
-  mace::logging::LatencyLogger latency_logger_##__line__( \
+#define MACE_LATENCY_LOGGER(vlog_level, ...) \
+  mace::logging_internal::LatencyLogger latency_logger_##__line__( \
       vlog_level, VLOG_IS_ON(vlog_level) ? mace::MakeString(__VA_ARGS__) : "")
mace::MakeString(__VA_ARGS__) : "") -} // namespace logging + +#ifdef MACE_ENABLE_MALLOC_LOGGING +#define MACE_MEMORY_LOGGING_GUARD() \ + auto malloc_logger_##__line__ = port::Env::Default()->NewMallocLogger( \ + ::mace::port::Logger(__FILE__, __LINE__, mace::INFO), \ + std::string(__FILE__) + ":" + std::string(__func__)); +#else +#define MACE_MEMORY_LOGGING_GUARD() +#endif + +} // namespace logging_internal } // namespace mace #endif // MACE_UTILS_LOGGING_H_ diff --git a/mace/utils/logging_test.cc b/mace/utils/logging_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3a33cc96161c09f2daddac05411e3b6c269d2d5b --- /dev/null +++ b/mace/utils/logging_test.cc @@ -0,0 +1,41 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/utils/logging.h" + +#include + +namespace mace { +namespace { + +class LoggingTest : public ::testing::Test { +}; + +TEST_F(LoggingTest, Basic) { + LOG(INFO) << "info logging"; + LOG(WARNING) << "warning logging"; + LOG(ERROR) << "error logging"; + + VLOG(1) << "vlog 1 logging"; + VLOG(2) << "vlog 2 logging"; +} + +TEST_F(LoggingTest, LogFatal) { +#ifdef GTEST_HAS_DEATH_TEST + EXPECT_DEATH(do { LOG(FATAL) << "fatal logging"; } while (false), ""); +#endif +} + +} // namespace +} // namespace mace diff --git a/mace/core/macros.h b/mace/utils/macros.h similarity index 61% rename from mace/core/macros.h rename to mace/utils/macros.h index e90049f4764ea07654ed810e8086230dc2fc9b5b..1ce38183018b5ddb9b64a3756126cfd6426c4f68 100644 --- a/mace/core/macros.h +++ b/mace/utils/macros.h @@ -1,4 +1,4 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. +// Copyright 2019 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_CORE_MACROS_H_ -#define MACE_CORE_MACROS_H_ +#ifndef MACE_UTILS_MACROS_H_ +#define MACE_UTILS_MACROS_H_ + +namespace mace { + +// Disable the copy and assignment operator for a class. +#ifndef MACE_DISABLE_COPY_AND_ASSIGN +#define MACE_DISABLE_COPY_AND_ASSIGN(CLASSNAME) \ + CLASSNAME(const CLASSNAME &) = delete; \ + CLASSNAME &operator=(const CLASSNAME &) = delete; +#endif + +#ifndef MACE_EMPTY_VIRTUAL_DESTRUCTOR +#define MACE_EMPTY_VIRTUAL_DESTRUCTOR(CLASSNAME) \ + public: \ + virtual ~CLASSNAME() {} +#endif + +#define MACE_UNUSED(var) (void)(var) + +#define MACE_COMPUTE_KERNEL_SOURCE(...) #__VA_ARGS__ // GCC can be told that a certain branch is not likely to be taken (for // instance, a CHECK failure), and use that information in static analysis. 
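// Editor's aside (illustrative sketch, not part of the patch): the new
// MACE_RETURN_IF_ERROR macro added to mace/utils/logging.h above is meant to
// be used as below; InitDevice() is a hypothetical function returning
// MaceStatus:
//
//   MaceStatus Setup() {
//     MACE_RETURN_IF_ERROR(InitDevice());  // logs "InitDevice()" via VLOG(0)
//                                          // and returns early on failure
//     return MaceStatus::MACE_SUCCESS;
//   }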
@@ -27,6 +46,6 @@
 #define MACE_PREDICT_TRUE(x) (x)
 #endif

-#define MACE_UNUSED(var) (void)(var)
+}  // namespace mace

-#endif  // MACE_CORE_MACROS_H_
+#endif  // MACE_UTILS_MACROS_H_
diff --git a/mace/utils/utils.h b/mace/utils/math.h
similarity index 50%
rename from mace/utils/utils.h
rename to mace/utils/math.h
index 0b1a6992c0d6240e62516379b34eda2a313cf74f..0293806c66667d55439b6802e1a8ec3943c1635e 100644
--- a/mace/utils/utils.h
+++ b/mace/utils/math.h
@@ -12,29 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_UTILS_UTILS_H_
-#define MACE_UTILS_UTILS_H_
+#ifndef MACE_UTILS_MATH_H_
+#define MACE_UTILS_MATH_H_

-#include <cstdlib>
-#include <map>
-#include <string>
-#include <vector>
+#include <cmath>

-namespace mace {
+#include <algorithm>
+#include <vector>

-// Disable the copy and assignment operator for a class.
-#ifndef MACE_DISABLE_COPY_AND_ASSIGN
-#define MACE_DISABLE_COPY_AND_ASSIGN(CLASSNAME) \
-  private:                                      \
-  CLASSNAME(const CLASSNAME &) = delete;        \
-  CLASSNAME &operator=(const CLASSNAME &) = delete
-#endif
+#include "mace/utils/logging.h"

-#ifndef MACE_EMPTY_VIRTUAL_DESTRUCTOR
-#define MACE_EMPTY_VIRTUAL_DESTRUCTOR(CLASSNAME) \
-  public:                                        \
-  virtual ~CLASSNAME() {}
-#endif
+namespace mace {

 template <typename Integer>
 Integer RoundUp(Integer i, Integer factor) {
@@ -67,51 +55,38 @@ Integer CeilQuotient(Integer a, Integer b) {
   return (a + b - 1) / b;
 }

-std::string ObfuscateString(const std::string &src,
-                            const std::string &lookup_table);
-
-std::string ObfuscateString(const std::string &src);
-
-std::string ObfuscateSymbol(const std::string &src);
-
-#ifdef MACE_OBFUSCATE_LITERALS
-#define MACE_OBFUSCATE_STRING(str) ObfuscateString(str)
-#define MACE_OBFUSCATE_SYMBOL(str) ObfuscateSymbol(str)
-#else
-#define MACE_OBFUSCATE_STRING(str) (str)
-#define MACE_OBFUSCATE_SYMBOL(str) (str)
-#endif
-
-std::vector<std::string> Split(const std::string &str, char delims);
-
-bool ReadBinaryFile(std::vector<unsigned char> *data,
-                    const std::string &filename);
-
-void MemoryMap(const std::string &file,
-               const unsigned char **data,
-               size_t *size);
-
-void MemoryUnMap(const unsigned char *data,
-                 const size_t &size);
+template <typename Integer>
+inline Integer Clamp(Integer in, Integer low, Integer high) {
+  return std::max(low, std::min(in, high));
+}

 template <typename T>
-std::vector<std::string> MapKeys(const std::map<std::string, T> &data) {
-  std::vector<std::string> keys;
-  for (auto &kv : data) {
-    keys.push_back(kv.first);
+inline T ScalarSigmoid(T in) {
+  if (in > static_cast<T>(0)) {
+    return static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
+  } else {
+    T x = std::exp(in);
+    return x / (x + static_cast<T>(1));
   }
-  return keys;
 }

-inline bool EnvEnabled(std::string env_name) {
-  char *env = getenv(env_name.c_str());
-  return !(!env || env[0] == 0 || env[0] == '0');
+template <typename T>
+inline T ScalarTanh(T in) {
+  if (in > static_cast<T>(0)) {
+    T inv_expa = std::exp(-in);
+    return -static_cast<T>(1) +
+        static_cast<T>(2) / (static_cast<T>(1) + inv_expa * inv_expa);
+  } else {
+    // tanh(x) = (e^{2x} - 1) / (e^{2x} + 1); with x = e^{in} this is
+    // (x * x - 1) / (x * x + 1), which is stable for in <= 0.
+    T x = std::exp(in);
+    return (x * x - static_cast<T>(1)) / (x * x + static_cast<T>(1));
+  }
 }

 template <typename SrcType, typename DstType>
 std::vector<DstType> TransposeShape(const std::vector<SrcType> &shape,
                                     const std::vector<int> &dst_dims) {
   size_t shape_dims = shape.size();
+  MACE_CHECK(shape_dims == dst_dims.size());
   std::vector<DstType> output_shape(shape_dims);
   for (size_t i = 0; i < shape_dims; ++i) {
     output_shape[i] = static_cast<DstType>(shape[dst_dims[i]]);
@@ -120,4 +95,5 @@ std::vector<DstType> TransposeShape(const std::vector<SrcType> &shape,
 }

 }  // namespace mace
-#endif  // MACE_UTILS_UTILS_H_
+
+#endif  // MACE_UTILS_MATH_H_
diff --git a/mace/utils/memory.h b/mace/utils/memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..41a898ef48fd712ce65191f967565531a4afdd89
--- /dev/null
+++ b/mace/utils/memory.h
@@ -0,0 +1,74 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_UTILS_MEMORY_H_
+#define MACE_UTILS_MEMORY_H_
+
+#include <memory>
+#include <utility>
+
+namespace mace {
+
+namespace memory_internal {
+
+// Traits to select proper overload and return type for `make_unique<>`.
+template <typename T>
+struct MakeUniqueResult {
+  using scalar = std::unique_ptr<T>;
+};
+template <typename T>
+struct MakeUniqueResult<T[]> {
+  using array = std::unique_ptr<T[]>;
+};
+template <typename T, size_t N>
+struct MakeUniqueResult<T[N]> {
+  using invalid = void;
+};
+
+}  // namespace memory_internal
+
+// gcc 4.8 has __cplusplus at 201301 but doesn't define make_unique.  Other
+// supported compilers either just define __cplusplus as 201103 but have
+// make_unique (msvc), or have make_unique whenever __cplusplus > 201103 (clang)
+#if (__cplusplus > 201103L || defined(_MSC_VER)) && \
+    !(defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 8)
+using std::make_unique;
+#else
+
+// `make_unique` overload for non-array types.
+template <typename T, typename... Args>
+typename memory_internal::MakeUniqueResult<T>::scalar make_unique(
+    Args&&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+// `make_unique` overload for an array T[] of unknown bounds.
+// The array allocation needs to use the `new T[size]` form and cannot take
+// element constructor arguments. The `std::unique_ptr` will manage destructing
+// these array elements.
+template <typename T>
+typename memory_internal::MakeUniqueResult<T>::array make_unique(size_t n) {
+  return std::unique_ptr<T>(new typename std::remove_extent<T>::type[n]());
+}
+
+// `make_unique` overload for an array T[N] of known bounds.
+// This construction will be rejected.
+template <typename T, typename... Args>
+typename memory_internal::MakeUniqueResult<T>::invalid make_unique(
+    Args&&... /* args */) = delete;
+#endif
+
+}  // namespace mace
+
+#endif  // MACE_UTILS_MEMORY_H_
diff --git a/mace/utils/memory_logging.h b/mace/utils/memory_logging.h
deleted file mode 100644
index 4e3cd5883b749b8f1d49d5f8d6ec886d8f65a78b..0000000000000000000000000000000000000000
--- a/mace/utils/memory_logging.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
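// Editor's aside (illustrative sketch, not part of the patch): usage of the
// make_unique shim from the new mace/utils/memory.h above; Foo is a
// hypothetical type:
//
//   std::unique_ptr<Foo> foo = mace::make_unique<Foo>(1, 2);
//   std::unique_ptr<int[]> buf = mace::make_unique<int[]>(64);  // value-initialized
//   // mace::make_unique<int[8]>() is deleted and fails to compile.
//
// On toolchains that already ship std::make_unique, the shim simply
// re-exports the standard version.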
- -#ifndef MACE_UTILS_MEMORY_LOGGING_H_ -#define MACE_UTILS_MEMORY_LOGGING_H_ - -#ifndef __hexagon__ -#include -#endif -#include - -#include "mace/utils/logging.h" - -namespace mace { - -#ifdef MACE_ENABLE_MEMORY_LOGGING -class MallinfoChangeLogger { - public: - explicit MallinfoChangeLogger(const std::string &name) : name_(name) { - prev_ = mallinfo(); - } - ~MallinfoChangeLogger() { - struct mallinfo curr = mallinfo(); - LogMallinfoChange(name_, curr, prev_); - } - - private: - const std::string name_; - struct mallinfo prev_; - - struct mallinfo LogMallinfoChange(const std::string &name, - const struct mallinfo curr, - const struct mallinfo prev) { - if (prev.arena != curr.arena) { - LOG(INFO) << "[" << name << "] " - << "Non-mmapped space allocated (bytes): " << curr.arena - << ", diff: " << ((int64_t)curr.arena - (int64_t)prev.arena); - } - if (prev.ordblks != curr.ordblks) { - LOG(INFO) << "[" << name << "] " - << "Number of free chunks: " << curr.ordblks << ", diff: " - << ((int64_t)curr.ordblks - (int64_t)prev.ordblks); - } - if (prev.smblks != curr.smblks) { - LOG(INFO) << "[" << name << "] " - << "Number of free fastbin blocks: " << curr.smblks - << ", diff: " << ((int64_t)curr.smblks - (int64_t)prev.smblks); - } - if (prev.hblks != curr.hblks) { - LOG(INFO) << "[" << name << "] " - << "Number of mmapped regions: " << curr.hblks - << ", diff: " << ((int64_t)curr.hblks - (int64_t)prev.hblks); - } - if (prev.hblkhd != curr.hblkhd) { - LOG(INFO) << "[" << name << "] " - << "Space allocated in mmapped regions (bytes): " << curr.hblkhd - << ", diff: " << ((int64_t)curr.hblkhd - (int64_t)prev.hblkhd); - } - if (prev.usmblks != curr.usmblks) { - LOG(INFO) << "[" << name << "] " - << "Maximum total allocated space (bytes): " << curr.usmblks - << ", diff: " - << ((int64_t)curr.usmblks - (int64_t)prev.usmblks); - } - if (prev.fsmblks != curr.fsmblks) { - LOG(INFO) << "[" << name << "] " - << "Space in freed fastbin blocks (bytes): " << curr.fsmblks - << ", diff: " - << ((int64_t)curr.fsmblks - (int64_t)prev.fsmblks); - } - if (prev.uordblks != curr.uordblks) { - LOG(INFO) << "[" << name << "] " - << "Total allocated space (bytes): " << curr.uordblks - << ", diff: " - << ((int64_t)curr.uordblks - (int64_t)prev.uordblks); - } - if (prev.fordblks != curr.fordblks) { - LOG(INFO) << "[" << name << "] " - << "Total free space (bytes): " << curr.fordblks << ", diff: " - << ((int64_t)curr.fordblks - (int64_t)prev.fordblks); - } - if (prev.keepcost != curr.keepcost) { - LOG(INFO) << "[" << name << "] " - << "Top-most, releasable space (bytes): " << curr.keepcost - << ", diff: " - << ((int64_t)curr.keepcost - (int64_t)prev.keepcost); - } - return curr; - } -}; - -#define MACE_MEMORY_LOGGING_GUARD() \ - MallinfoChangeLogger mem_logger_##__line__(std::string(__FILE__) + ":" + \ - std::string(__func__)); -#else -#define MACE_MEMORY_LOGGING_GUARD() -#endif - -} // namespace mace - -#endif // MACE_UTILS_MEMORY_LOGGING_H_ diff --git a/mace/utils/quantize.h b/mace/utils/quantize.h index 81d820cbfc39b2fe9edb729071d351c1993b1b01..7634833cc1e75763d79901f68b47f46705fa97db 100644 --- a/mace/utils/quantize.h +++ b/mace/utils/quantize.h @@ -19,6 +19,12 @@ #include #include +#if defined(MACE_ENABLE_NEON) +#include +#endif // MACE_ENABLE_NEON + +#include "mace/utils/logging.h" + namespace mace { template @@ -123,6 +129,25 @@ inline void Quantize(const float *input, QuantizeWithScaleAndZeropoint(input, size, *scale, *zero_point, output); } +template +inline void Quantize(const Tensor &input, + Tensor *output, + 
float *min_out, + float *max_out) { + MACE_CHECK(input.size() != 0); + Tensor::MappingGuard input_guard(&input); + Tensor::MappingGuard output_guard(output); + auto *input_data = input.data(); + auto *output_data = output->mutable_data(); + float scale; + int32_t zero_point; + + Quantize(input_data, input.size(), false, output_data, &scale, &zero_point); + + *min_out = scale * (std::numeric_limits::lowest() - zero_point); + *max_out = scale * (std::numeric_limits::max() - zero_point); +} + template inline void Dequantize(const T *input, const index_t size, @@ -135,14 +160,127 @@ inline void Dequantize(const T *input, } } -inline void QuantizeMultiplier(double multiplier, - int32_t* output_multiplier, - int32_t* shift) { - if (multiplier == 0.f) { - *output_multiplier = 0; - *shift = 0; - return; +#if defined(MACE_ENABLE_NEON) +template<> +inline void QuantizeWithScaleAndZeropoint(const float *input, + const index_t size, + float scale, + int32_t zero_point, + uint8_t *output) { + const float32x4_t vround = vdupq_n_f32(0.5); + const float32x4_t + vzero = vaddq_f32(vround, vcvtq_f32_s32(vdupq_n_s32(zero_point))); + const float recip_scale = 1.f / scale; + const float32x4_t vrecip_scale = vdupq_n_f32(recip_scale); + const index_t block_count = size / 16; + +#pragma omp parallel for schedule(runtime) + for (index_t i = 0; i < block_count; ++i) { + float32x4_t vi0 = vld1q_f32(input + i * 16); + float32x4_t vi1 = vld1q_f32(input + i * 16 + 4); + float32x4_t vi2 = vld1q_f32(input + i * 16 + 8); + float32x4_t vi3 = vld1q_f32(input + i * 16 + 12); + + int32x4_t vo0_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi0, vrecip_scale)); + int32x4_t vo1_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi1, vrecip_scale)); + int32x4_t vo2_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi2, vrecip_scale)); + int32x4_t vo3_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi3, vrecip_scale)); + + uint8x8_t vo0_u8 = + vqmovun_s16(vcombine_s16(vqmovn_s32(vo0_s32), vqmovn_s32(vo1_s32))); + uint8x8_t vo1_u8 = + vqmovun_s16(vcombine_s16(vqmovn_s32(vo2_s32), vqmovn_s32(vo3_s32))); + uint8x16_t vo = vcombine_u8(vo0_u8, vo1_u8); + + vst1q_u8(output + i * 16, vo); + } + +#pragma omp parallel for schedule(runtime) + for (index_t i = block_count * 16; i < size; ++i) { + output[i] = Saturate(roundf(zero_point + recip_scale * input[i])); + } +} + +template<> +inline void Dequantize(const int32_t *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output) { + const index_t block_count = size / 4; + const int32x4_t vzero = vdupq_n_s32(zero_point); + const float32x4_t vscale = vdupq_n_f32(scale); + +#pragma omp parallel for schedule(runtime) + for (index_t i = 0; i < block_count; ++i) { + int32x4_t vi = vld1q_s32(input + i * 4); + float32x4_t vo = vmulq_f32(vscale, vcvtq_f32_s32(vsubq_s32(vi, vzero))); + vst1q_f32(output + i * 4, vo); + } + for (index_t i = block_count * 4; i < size; ++i) { + output[i] = scale * (input[i] - zero_point); + } +} + +template<> +inline void Dequantize(const uint8_t *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output) { + const index_t block_count = size / 16; + const int32x4_t vzero = vdupq_n_s32(zero_point); + const float32x4_t vscale = vdupq_n_f32(scale); + +#pragma omp parallel for schedule(runtime) + for (index_t i = 0; i < block_count; ++i) { + uint8x16_t vi = vld1q_u8(input + i * 16); + float32x4x4_t vo = { + vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_low_u16(vmovl_u8(vget_low_u8(vi))))), vzero))), + 
vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_high_u16(vmovl_u8(vget_low_u8(vi))))), vzero))), + vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_low_u16(vmovl_u8(vget_high_u8(vi))))), vzero))), + vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_high_u16(vmovl_u8(vget_high_u8(vi))))), vzero))), + }; + vst1q_f32(output + i * 16, vo.val[0]); + vst1q_f32(output + i * 16 + 4, vo.val[1]); + vst1q_f32(output + i * 16 + 8, vo.val[2]); + vst1q_f32(output + i * 16 + 12, vo.val[3]); } + for (index_t i = block_count * 16; i < size; ++i) { + output[i] = scale * (input[i] - zero_point); + } +} +#endif // MACE_ENABLE_NEON + +template +inline void DeQuantize(const Tensor &input, + const float min_in, + const float max_in, + Tensor *output) { + MACE_CHECK(input.size() != 0); + Tensor::MappingGuard input_guard(&input); + Tensor::MappingGuard output_guard(output); + auto *input_data = input.data(); + auto *output_data = output->mutable_data(); + float scale; + int32_t zero_point; + + AdjustRange(min_in, max_in, false, &scale, &zero_point); + + Dequantize(input_data, input.size(), scale, zero_point, output_data); +} + +inline void QuantizeMultiplier(double multiplier, + int32_t *output_multiplier, + int32_t *shift) { const double q = std::frexp(multiplier, shift); auto qint = static_cast(roundl(q * (1ll << 31))); if (qint == (1ll << 31)) { diff --git a/mace/utils/rwlock.h b/mace/utils/rwlock.h index c15fa5ad7a605ce1dc0d7b2fabb083a4aba53e7f..b4d6392ce3772fac468b46f450faa89839c8e5f6 100644 --- a/mace/utils/rwlock.h +++ b/mace/utils/rwlock.h @@ -17,7 +17,9 @@ #include // NOLINT(build/c++11) #include // NOLINT(build/c++11) + #include "mace/utils/logging.h" +#include "mace/utils/macros.h" namespace mace { namespace utils { @@ -26,10 +28,6 @@ class RWMutex { public: RWMutex() : counter_(0), waiting_readers_(0), waiting_writers_(0) {} ~RWMutex() = default; - RWMutex(const RWMutex &) = delete; - RWMutex(RWMutex &&) = delete; - RWMutex& operator=(const RWMutex &) = delete; - RWMutex& operator=(RWMutex &&) = delete; int counter_; // -1 for writer, 0 for nobody, 1~n for reader int waiting_readers_; @@ -37,6 +35,8 @@ class RWMutex { std::mutex mutex_; std::condition_variable reader_cv_; std::condition_variable writer_cv_; + + MACE_DISABLE_COPY_AND_ASSIGN(RWMutex); }; // Writer first @@ -61,13 +61,11 @@ class ReadLock { } } } - ReadLock(const ReadLock &) = delete; - ReadLock(ReadLock &&) = delete; - ReadLock& operator=(const ReadLock &) = delete; - ReadLock& operator=(ReadLock &&) = delete; private: RWMutex *rw_mutex_; + + MACE_DISABLE_COPY_AND_ASSIGN(ReadLock); }; class WriteLock { @@ -91,13 +89,11 @@ class WriteLock { rw_mutex_->reader_cv_.notify_all(); } } - WriteLock(const WriteLock &) = delete; - WriteLock(WriteLock &&) = delete; - WriteLock& operator=(const WriteLock &) = delete; - WriteLock& operator=(WriteLock &&) = delete; private: RWMutex *rw_mutex_; + + MACE_DISABLE_COPY_AND_ASSIGN(WriteLock); }; } // namespace utils diff --git a/mace/utils/stl_util.h b/mace/utils/stl_util.h new file mode 100644 index 0000000000000000000000000000000000000000..44dd1d8e384b7bfa260e12b9f33183a5ec5b7157 --- /dev/null +++ b/mace/utils/stl_util.h @@ -0,0 +1,37 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_UTILS_STL_UTIL_H_
+#define MACE_UTILS_STL_UTIL_H_
+
+#include
+#include
+#include
+#include
+#include
+
+namespace mace {
+
+template <typename T>
+std::vector<std::string> MapKeys(const std::map<std::string, T> &data) {
+  std::vector<std::string> keys;
+  for (auto &kv : data) {
+    keys.push_back(kv.first);
+  }
+  return keys;
+}
+
+}  // namespace mace
+
+#endif  // MACE_UTILS_STL_UTIL_H_
diff --git a/mace/utils/string_util.cc b/mace/utils/string_util.cc
index 3492706fe068f3caef8ce9443b505f887fb97ab6..8114e3aad7364c20a2d14b75912d1d798df24263 100644
--- a/mace/utils/string_util.cc
+++ b/mace/utils/string_util.cc
@@ -83,4 +83,65 @@ std::string StringFormatter::Table(
 }

 }  // namespace string_util
+
+std::string ObfuscateString(const std::string &src,
+                            const std::string &lookup_table) {
+  std::string dest;
+  dest.resize(src.size());
+  for (size_t i = 0; i < src.size(); i++) {
+    dest[i] = src[i] ^ lookup_table[i % lookup_table.size()];
+  }
+  return dest;
+}
+
+// ObfuscateString(ObfuscateString(str)) ==> str
+std::string ObfuscateString(const std::string &src) {
+  // Keep consistent with obfuscation in python tools
+  return ObfuscateString(src, "Mobile-AI-Compute-Engine");
+}
+
+// Obfuscate symbol or path string
+std::string ObfuscateSymbol(const std::string &src) {
+  std::string dest = src;
+  if (dest.empty()) {
+    return dest;
+  }
+  dest[0] = src[0];  // avoid invalid symbol which starts from 0-9
+  const std::string encode_dict =
+      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
+  for (size_t i = 1; i < src.size(); i++) {
+    char ch = src[i];
+    int idx;
+    if (ch >= '0' && ch <= '9') {
+      idx = ch - '0';
+    } else if (ch >= 'a' && ch <= 'z') {
+      idx = 10 + ch - 'a';
+    } else if (ch >= 'A' && ch <= 'Z') {
+      idx = 10 + 26 + ch - 'a';
+    } else if (ch == '_') {
+      idx = 10 + 26 + 26;
+    } else {
+      dest[i] = ch;
+      continue;
+    }
+    // There is no collision if it's true for every char at every position
+    dest[i] = encode_dict[(idx + i + 31) % encode_dict.size()];
+  }
+  return dest;
+}
+
+std::vector<std::string> Split(const std::string &str, char delims) {
+  std::vector<std::string> result;
+  std::string tmp = str;
+  while (!tmp.empty()) {
+    size_t next_offset = tmp.find(delims);
+    result.push_back(tmp.substr(0, next_offset));
+    if (next_offset == std::string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+  return result;
+}
 }  // namespace mace
diff --git a/mace/utils/string_util.h b/mace/utils/string_util.h
index c41aaaa12fdb682bc7aea2d08076b867ad8615f0..c9df13566335b4041aca30d6a6f4e911434bb0d4 100644
--- a/mace/utils/string_util.h
+++ b/mace/utils/string_util.h
@@ -15,6 +15,7 @@
 #ifndef MACE_UTILS_STRING_UTIL_H_
 #define MACE_UTILS_STRING_UTIL_H_

+#include <algorithm>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -80,6 +81,35 @@ inline std::string MakeString(const std::string &str) {
 inline std::string MakeString(const char *c_str) { return std::string(c_str); }

+inline std::string ToLower(const std::string &src) {
+  std::string dest(src);
+  std::transform(src.begin(), src.end(), dest.begin(), ::tolower);
+  return dest;
+}
+
+inline std::string ToUpper(const std::string &src) {
+  std::string dest(src);
+  std::transform(src.begin(), src.end(), dest.begin(), ::toupper);
+  return dest;
+}
+
+std::string ObfuscateString(const std::string &src,
+                            const std::string &lookup_table);
+
+std::string ObfuscateString(const std::string &src);
+
+std::string ObfuscateSymbol(const std::string &src);
+
+#ifdef MACE_OBFUSCATE_LITERALS
+#define MACE_OBFUSCATE_STRING(str) ObfuscateString(str)
+#define MACE_OBFUSCATE_SYMBOL(str) ObfuscateSymbol(str)
+#else
+#define MACE_OBFUSCATE_STRING(str) (str)
+#define MACE_OBFUSCATE_SYMBOL(str) (str)
+#endif
+
+std::vector<std::string> Split(const std::string &str, char delims);
+
 }  // namespace mace

 #endif  // MACE_UTILS_STRING_UTIL_H_
diff --git a/mace/utils/string_util_test.cc b/mace/utils/string_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..84b9b0a671a5655d3f9660e859d5ff0ed56b9f3a
--- /dev/null
+++ b/mace/utils/string_util_test.cc
@@ -0,0 +1,40 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/utils/string_util.h"
+
+#include <gtest/gtest.h>
+
+namespace mace {
+namespace {
+
+class StringUtilTest : public ::testing::Test {
+};
+
+TEST_F(StringUtilTest, MakeString) {
+  EXPECT_EQ("Hello 2019", MakeString("Hello", " ", 2019));
+}
+
+TEST_F(StringUtilTest, ToLower) {
+  EXPECT_EQ("", ToLower(""));
+  EXPECT_EQ("hello world!", ToLower("Hello World!"));
+}
+
+TEST_F(StringUtilTest, ToUpper) {
+  EXPECT_EQ("", ToUpper(""));
+  EXPECT_EQ("HELLO WORLD!", ToUpper("Hello World!"));
+}
+
+}  // namespace
+}  // namespace mace
diff --git a/mace/utils/timer.h b/mace/utils/timer.h
index 3f0e96f4c37045ecd7c9b9a274a6fbf7dc0a0380..0955af7ba5ce5db65e4493b486e028683f5d1e66 100644
--- a/mace/utils/timer.h
+++ b/mace/utils/timer.h
@@ -15,7 +15,7 @@
 #ifndef MACE_UTILS_TIMER_H_
 #define MACE_UTILS_TIMER_H_

-#include "mace/utils/env_time.h"
+#include "mace/port/env.h"
 #include "mace/utils/logging.h"

 namespace mace {
diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h
index 7ac8467bce6cf4df2c3c6c4741cf6b630497074d..5d381b048a68ee9c728b656e8efdcd72d6971d5a 100644
--- a/mace/utils/tuner.h
+++ b/mace/utils/tuner.h
@@ -14,12 +14,14 @@
 #ifndef MACE_UTILS_TUNER_H_
 #define MACE_UTILS_TUNER_H_
+
+// TODO(heliangliang) Fix portability
 #include
-#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -30,8 +32,8 @@
 #include

 #include "mace/utils/logging.h"
+#include "mace/utils/string_util.h"
 #include "mace/utils/timer.h"
-#include "mace/utils/utils.h"

 namespace mace {

@@ -76,16 +78,14 @@ class Tuner {
       std::vector<param_type> opt_param = default_param;
       RetType res = Tune(param_generator, func, timer, &opt_param);
       VLOG(3) << "Tuning " << param_key
-              << " retult: " << (VLOG_IS_ON(3) ? MakeString(opt_param) : "");
+              << " result: " << MakeString(opt_param);
       param_table_[obfucated_param_key] = opt_param;
       return res;
     } else {
       // run
       if (param_table_.find(obfucated_param_key) != param_table_.end()) {
         VLOG(3) << param_key << ": "
-                << (VLOG_IS_ON(3)
-                    ? MakeString(param_table_[obfucated_param_key])
-                    : "");
+                << MakeString(param_table_[obfucated_param_key]);
         return func(param_table_[obfucated_param_key], nullptr, nullptr);
       } else {
         return func(default_param, nullptr, nullptr);
@@ -112,7 +112,7 @@
                 sizeof(params_size));
       VLOG(3) << "Write tuning param: " << kp.first.c_str() << ": "
-              << (VLOG_IS_ON(3) ? MakeString(params) : "");
+              << MakeString(params);
       for (auto &param : params) {
         ofs.write(reinterpret_cast<char *>(&param), sizeof(params_size));
       }
@@ -293,4 +293,5 @@
 };

 }  // namespace mace
+
 #endif  // MACE_UTILS_TUNER_H_
diff --git a/mace/utils/utils.cc b/mace/utils/utils.cc
deleted file mode 100644
index a422988d1689353a720e19ca544859dd5f952a68..0000000000000000000000000000000000000000
--- a/mace/utils/utils.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "mace/utils/utils.h"
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "mace/utils/logging.h"
-
-namespace mace {
-
-std::string ObfuscateString(const std::string &src,
-                            const std::string &lookup_table) {
-  std::string dest;
-  dest.resize(src.size());
-  for (size_t i = 0; i < src.size(); i++) {
-    dest[i] = src[i] ^ lookup_table[i % lookup_table.size()];
-  }
-  return dest;
-}
-
-// ObfuscateString(ObfuscateString(str)) ==> str
-std::string ObfuscateString(const std::string &src) {
-  // Keep consistent with obfuscation in python tools
-  return ObfuscateString(src, "Mobile-AI-Compute-Engine");
-}
-
-// Obfuscate synbol or path string
-std::string ObfuscateSymbol(const std::string &src) {
-  std::string dest = src;
-  if (dest.empty()) {
-    return dest;
-  }
-  dest[0] = src[0];  // avoid invalid symbol which starts from 0-9
-  const std::string encode_dict =
-      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
-  for (size_t i = 1; i < src.size(); i++) {
-    char ch = src[i];
-    int idx;
-    if (ch >= '0' && ch <= '9') {
-      idx = ch - '0';
-    } else if (ch >= 'a' && ch <= 'z') {
-      idx = 10 + ch - 'a';
-    } else if (ch >= 'A' && ch <= 'Z') {
-      idx = 10 + 26 + ch - 'a';
-    } else if (ch == '_') {
-      idx = 10 + 26 + 26;
-    } else {
-      dest[i] = ch;
-      continue;
-    }
-    // There is no collision if it's true for every char at every position
-    dest[i] = encode_dict[(idx + i + 31) % encode_dict.size()];
-  }
-  return dest;
-}
-
-std::vector<std::string> Split(const std::string &str, char delims) {
-  std::vector<std::string> result;
-  std::string tmp = str;
-  while (!tmp.empty()) {
-    size_t next_offset = tmp.find(delims);
-    result.push_back(tmp.substr(0, next_offset));
-    if (next_offset == std::string::npos) {
-      break;
-    } else {
-      tmp = tmp.substr(next_offset + 1);
-    }
-  }
-  return result;
-}
-
-bool ReadBinaryFile(std::vector<unsigned char> *data,
-                    const std::string &filename) {
-  std::ifstream ifs(filename, std::ios::in | std::ios::binary);
-  if (!ifs.is_open()) {
-    return false;
-  }
-  ifs.seekg(0, ifs.end);
-  size_t length =
      ifs.tellg();
-  ifs.seekg(0, ifs.beg);
-
-  data->resize(length);
-  ifs.read(reinterpret_cast<char *>(data->data()), length);
-
-  if (ifs.fail()) {
-    return false;
-  }
-  ifs.close();
-
-  return true;
-}
-
-void MemoryMap(const std::string &file,
-               const unsigned char **data,
-               size_t *size) {
-  int fd = open(file.c_str(), O_RDONLY);
-  MACE_CHECK(fd >= 0,
-             "Failed to open file ", file, ", error code: ", strerror(errno));
-  struct stat st;
-  fstat(fd, &st);
-  *size = static_cast<size_t>(st.st_size);
-
-  *data = static_cast<const unsigned char *>(
-      mmap(nullptr, *size, PROT_READ, MAP_PRIVATE, fd, 0));
-  MACE_CHECK(*data != static_cast<const unsigned char *>(MAP_FAILED),
-             "Failed to map file ", file, ", error code: ", strerror(errno));
-
-  int ret = close(fd);
-  MACE_CHECK(ret == 0,
-             "Failed to close file ", file, ", error code: ", strerror(errno));
-}
-
-void MemoryUnMap(const unsigned char *data,
-                 const size_t &size) {
-  MACE_CHECK(data != nullptr && size > 0, "data is null or size is 0");
-
-  int ret = munmap(const_cast<unsigned char *>(data), size);
-
-  MACE_CHECK(ret == 0,
-             "Failed to unmap file, error code: ", strerror(errno));
-}
-
-}  // namespace mace
diff --git a/repository/git/BUILD b/repository/git/BUILD.bazel
similarity index 100%
rename from repository/git/BUILD
rename to repository/git/BUILD.bazel
diff --git a/repository/git/BUILD.tpl b/repository/git/BUILD.bazel.tpl
similarity index 100%
rename from repository/git/BUILD.tpl
rename to repository/git/BUILD.bazel.tpl
diff --git a/repository/git/git_configure.bzl b/repository/git/git_configure.bzl
index ca2b8b2d5d9d158554bb32933a2b9a825081a3bd..aa1ea598970b60b4f3a0b8d79d6e35cf282565e9 100644
--- a/repository/git/git_configure.bzl
+++ b/repository/git/git_configure.bzl
@@ -2,10 +2,10 @@
 """
 def _git_version_conf_impl(repository_ctx):
     repository_ctx.template(
-        "BUILD",
-        Label("//repository/git:BUILD.tpl"))
+        "BUILD.bazel",
+        Label("//repository/git:BUILD.bazel.tpl"))

-    mace_root_path = str(repository_ctx.path(Label("@mace//:BUILD")))[:-len("BUILD")]
+    mace_root_path = str(repository_ctx.path(Label("@mace//:BUILD.bazel")))[:-len("BUILD.bazel")]

     generated_files_path = repository_ctx.path("gen")
diff --git a/repository/opencl-kernel/BUILD b/repository/opencl-kernel/BUILD.bazel
similarity index 100%
rename from repository/opencl-kernel/BUILD
rename to repository/opencl-kernel/BUILD.bazel
diff --git a/repository/opencl-kernel/BUILD.tpl b/repository/opencl-kernel/BUILD.bazel.tpl
similarity index 100%
rename from repository/opencl-kernel/BUILD.tpl
rename to repository/opencl-kernel/BUILD.bazel.tpl
diff --git a/repository/opencl-kernel/opencl_kernel_configure.bzl b/repository/opencl-kernel/opencl_kernel_configure.bzl
index 88c0880e1bc68a9cd6b4308eb3426e14166d0769..572219b161bf496b68c0949da53c6820554f13c9 100644
--- a/repository/opencl-kernel/opencl_kernel_configure.bzl
+++ b/repository/opencl-kernel/opencl_kernel_configure.bzl
@@ -3,11 +3,11 @@
 def _opencl_encrypt_kernel_impl(repository_ctx):
     repository_ctx.template(
-        "BUILD",
-        Label("//repository/opencl-kernel:BUILD.tpl"),
+        "BUILD.bazel",
+        Label("//repository/opencl-kernel:BUILD.bazel.tpl"),
     )

-    mace_root_path = str(repository_ctx.path(Label("@mace//:BUILD")))[:-len("BUILD")]
+    mace_root_path = str(repository_ctx.path(Label("@mace//:BUILD.bazel")))[:-len("BUILD.bazel")]

     generated_files_path = repository_ctx.path("gen")

     ret = repository_ctx.execute(
diff --git a/third_party/caffe/BUILD b/third_party/caffe/BUILD.bazel
similarity index 100%
rename from third_party/caffe/BUILD
rename to third_party/caffe/BUILD.bazel
diff --git a/third_party/caffe/caffe.proto
b/third_party/caffe/caffe.proto
index b2d56b9898fbcfd0bbd31d7d1356aea12ce87445..c972c9f66bd27c8145b919da2778d40668cf50ff 100644
--- a/third_party/caffe/caffe.proto
+++ b/third_party/caffe/caffe.proto
@@ -515,6 +515,7 @@ message LayerParameter {
   optional InfogainLossParameter infogain_loss_param = 116;
   optional InnerProductParameter inner_product_param = 117;
   optional InputParameter input_param = 143;
+  optional InterpParameter interp_param = 147;
   optional LogParameter log_param = 134;
   optional LRNParameter lrn_param = 118;
   optional MemoryDataParameter memory_data_param = 119;
@@ -1207,6 +1208,15 @@ message InputParameter {
   repeated BlobShape shape = 1;
 }

+message InterpParameter {
+  optional int32 height = 1 [default = 0];  // Height of output
+  optional int32 width = 2 [default = 0];  // Width of output
+  optional int32 zoom_factor = 3 [default = 1];  // zoom factor
+  optional int32 shrink_factor = 4 [default = 1];  // shrink factor
+  optional int32 pad_beg = 5 [default = 0];  // padding at begin of input
+  optional int32 pad_end = 6 [default = 0];  // padding at end of input
+}
+
 // Message that stores parameters used by LogLayer
 message LogParameter {
   // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0.
diff --git a/third_party/eigen3/BUILD b/third_party/eigen3/BUILD.bazel
similarity index 100%
rename from third_party/eigen3/BUILD
rename to third_party/eigen3/BUILD.bazel
diff --git a/third_party/hta/BUILD b/third_party/hta/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7385472755eab0a1fb75df4bb089a63aa01e110e
--- /dev/null
+++ b/third_party/hta/BUILD
@@ -0,0 +1,31 @@
+# These files are generated from the nnlib project
+
+licenses(["notice"])
+
+exports_files(["license.txt"])
+
+load(
+    "//mace:mace.bzl",
+    "if_android_armv7",
+    "if_android_arm64",
+)
+
+cc_library(
+    name = "hta",
+    srcs = if_android_armv7([
+        "armeabi-v7a/libhta_controller.so",
+        "armeabi-v7a/libhta_hexagon_runtime.so",
+        "armeabi-v7a/libnpu.so",
+    ]) + if_android_arm64([
+        "arm64-v8a/libcdsprpc.so",
+        "arm64-v8a/libhta_controller.so",
+        "arm64-v8a/libhta_hexagon_runtime.so",
+        "arm64-v8a/libnpu.so",
+    ]),
+    hdrs = [
+        "hta_hexagon_api.h",
+        "hta_hexagon_nn_ops.h",
+        "hta_ops.h",
+    ],
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/hta/arm64-v8a/libcdsprpc.so b/third_party/hta/arm64-v8a/libcdsprpc.so
new file mode 100755
index 0000000000000000000000000000000000000000..57de01f4887197b0b510f395f828289d74597069
Binary files /dev/null and b/third_party/hta/arm64-v8a/libcdsprpc.so differ
diff --git a/third_party/hta/arm64-v8a/libhta_controller.so b/third_party/hta/arm64-v8a/libhta_controller.so
new file mode 100644
index 0000000000000000000000000000000000000000..3cb5ea31a24d319779521454720c3b587120d2e0
Binary files /dev/null and b/third_party/hta/arm64-v8a/libhta_controller.so differ
diff --git a/third_party/hta/arm64-v8a/libhta_hexagon_runtime.so b/third_party/hta/arm64-v8a/libhta_hexagon_runtime.so
new file mode 100644
index 0000000000000000000000000000000000000000..32b5d784a19a6390ffe25f4c4e4853172b4d5074
Binary files /dev/null and b/third_party/hta/arm64-v8a/libhta_hexagon_runtime.so differ
diff --git a/third_party/hta/arm64-v8a/libnpu.so b/third_party/hta/arm64-v8a/libnpu.so
new file mode 100644
index 0000000000000000000000000000000000000000..9b6633769db106f516ac7cfebea0b40b491996e1
Binary files /dev/null and b/third_party/hta/arm64-v8a/libnpu.so differ
diff --git a/third_party/hta/armeabi-v7a/libhta_controller.so
b/third_party/hta/armeabi-v7a/libhta_controller.so new file mode 100644 index 0000000000000000000000000000000000000000..03b267889d96e74b965fd485313d35ce59b8bc97 Binary files /dev/null and b/third_party/hta/armeabi-v7a/libhta_controller.so differ diff --git a/third_party/hta/armeabi-v7a/libhta_hexagon_runtime.so b/third_party/hta/armeabi-v7a/libhta_hexagon_runtime.so new file mode 100644 index 0000000000000000000000000000000000000000..9136f520d74901ca068c5377eccb578978ca9fa6 Binary files /dev/null and b/third_party/hta/armeabi-v7a/libhta_hexagon_runtime.so differ diff --git a/third_party/hta/armeabi-v7a/libnpu.so b/third_party/hta/armeabi-v7a/libnpu.so new file mode 100644 index 0000000000000000000000000000000000000000..a88605929cfdca12ecd720749064d880a6d48ab4 Binary files /dev/null and b/third_party/hta/armeabi-v7a/libnpu.so differ diff --git a/third_party/hta/hta_hexagon_api.h b/third_party/hta/hta_hexagon_api.h new file mode 100644 index 0000000000000000000000000000000000000000..cb13fe62bcd8bbdcb8f50f4dfb725df292aa87fd --- /dev/null +++ b/third_party/hta/hta_hexagon_api.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2016-2018, The Linux Foundation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted (subject to the limitations in the + * disclaimer below) provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of The Linux Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE + * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT + * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifndef THIRD_PARTY_HTA_HEXAGON_API_H_ +#define THIRD_PARTY_HTA_HEXAGON_API_H_ + +#include "hta_hexagon_nn_ops.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int hexagon_hta_nn_nn_id; + +struct input { + uint32_t src_id; + uint32_t output_idx; +}; + +#define NODE_ID_RESERVED_CONSTANT 0 + +#define MAX_DIMENSIONS 8 +struct output { + uint32_t rank; // dimensions in the tensor + uint32_t max_sizes[MAX_DIMENSIONS]; // max num elements in each dimension + uint32_t elementsize; // size of each element + int32_t zero_offset; // 0 for float / integer values + float stepsize; // 0 for float/integer values +}; + +struct perfinfo { + uint32_t node_id; + uint32_t executions; + union { + uint64_t counter; + struct { + uint32_t counter_lo; + uint32_t counter_hi; + }; + }; +}; + +typedef struct input hexagon_hta_nn_input; +typedef struct output hexagon_hta_nn_output; +typedef struct perfinfo hexagon_hta_nn_perfinfo; +typedef int32_t hexagon_hta_nn_padding_type; + +typedef enum padding_type_enum { + HTA_NN_PAD_NA = 0, + HTA_NN_PAD_SAME, + HTA_NN_PAD_VALID, + HTA_NN_PAD_MIRROR_REFLECT, + HTA_NN_PAD_MIRROR_SYMMETRIC, + HTA_NN_PAD_SAME_CAFFE, +} hta_padding_type; + +typedef struct { + unsigned int batches; + unsigned int height; + unsigned int width; + unsigned int depth; + unsigned char *data; + int dataLen; /* For input and output */ + unsigned int data_valid_len; /* for output only */ + unsigned int unused; +} hexagon_hta_nn_tensordef; + +typedef struct hexagon_nn_op_node hexagon_nn_op_node; +struct hexagon_nn_op_node { + unsigned int node_id; + hta_op_type operation; + hta_padding_type padding; + hexagon_hta_nn_input* inputs; + int inputsLen; + hexagon_hta_nn_output* outputs; + int outputsLen; +}; +typedef struct hexagon_nn_const_node hexagon_nn_const_node; +struct hexagon_nn_const_node { + unsigned int node_id; + hexagon_hta_nn_tensordef tensor; +}; + +/* Actual functions in the interface */ +/* Returns 0 on success, nonzero on error unless otherwise noted */ +/* Configure the hardware and software environment. Should be called once before doing anything */ +int hexagon_hta_nn_config( void ); + +/* Initialize a new graph, returns a new nn_id or -1 on error */ +int hexagon_hta_nn_init(hexagon_hta_nn_nn_id *g); + +/* Set debug verbosity. Default is 0, higher values are more verbose */ +int hexagon_hta_nn_set_debug_level(hexagon_hta_nn_nn_id id, int level); + +/* Append a node to the graph. Nodes are executed in the appended order. */ +int hexagon_hta_nn_append_node( + hexagon_hta_nn_nn_id id, + uint32_t node_id, + hta_op_type operation, + hta_padding_type padding, + const struct input *inputs, + uint32_t num_inputs, + const struct output *outputs, + uint32_t num_outputs); + +/* + * Append a const node into the graph. The data is copied locally during this + * call, the caller does not need it to persist. + */ +int hexagon_hta_nn_append_const_node( + hexagon_hta_nn_nn_id id, + uint32_t node_id, + uint32_t batches, + uint32_t height, + uint32_t width, + uint32_t depth, + const uint8_t *data, + uint32_t data_len); + +/* + * Prepare a graph for execution. Must be done before attempting to execute the graph. + */ +int hexagon_hta_nn_prepare(hexagon_hta_nn_nn_id id); + +/* Execute the graph with a single input and a single output. 
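+ * (Editor's illustration, not part of the vendor header: per the comments
+ * in this file, the expected call order is
+ *   hexagon_hta_nn_config();                 // once, before anything else
+ *   hexagon_hta_nn_nn_id id;
+ *   hexagon_hta_nn_init(&id);
+ *   // ... hexagon_hta_nn_append_const_node / hexagon_hta_nn_append_node ...
+ *   hexagon_hta_nn_prepare(id);
+ *   // ... hexagon_hta_nn_execute or hexagon_hta_nn_execute_new ...
+ *   hexagon_hta_nn_teardown(id);
+ * with each call returning 0 on success and nonzero on error.)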
*/ +int hexagon_hta_nn_execute( + hexagon_hta_nn_nn_id id, + uint32_t batches_in, + uint32_t height_in, + uint32_t width_in, + uint32_t depth_in, + const uint8_t *data_in, + uint32_t data_len_in, + uint32_t *batches_out, + uint32_t *height_out, + uint32_t *width_out, + uint32_t *depth_out, + uint8_t *data_out, + uint32_t data_out_max, + uint32_t *data_out_size); + +/* Tear down a graph, destroying it and freeing resources. */ +int hexagon_hta_nn_teardown(hexagon_hta_nn_nn_id id); + +/* Get the version of the library */ +int hexagon_hta_nn_version(int *ver); + +/* Execute the graph with a multiple input and a multiple output. */ +int hexagon_hta_nn_execute_new( + hexagon_hta_nn_nn_id id, + const hexagon_hta_nn_tensordef *inputs, + uint32_t n_inputs, + hexagon_hta_nn_tensordef *outputs, + uint32_t n_outputs); + +int hexagon_hta_nn_serialize_size(hexagon_hta_nn_nn_id id, unsigned int *serialized_obj_size_out); +int hexagon_hta_nn_serialize(hexagon_hta_nn_nn_id id, void *buf, unsigned int buf_len); +int hexagon_hta_nn_deserialize(void *buf, unsigned len, hexagon_hta_nn_nn_id *g); + +#ifdef __cplusplus +} +#endif + +#endif //THIRD_PARTY_HTA_HEXAGON_API_H_ diff --git a/third_party/hta/hta_hexagon_nn_ops.h b/third_party/hta/hta_hexagon_nn_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..e2aaa5881c842d12892d21dead102efad08df270 --- /dev/null +++ b/third_party/hta/hta_hexagon_nn_ops.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016-2018, The Linux Foundation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted (subject to the limitations in the + * disclaimer below) provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of The Linux Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE + * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT + * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef THIRD_PARTY_HTA_HEXAGON_NN_OPS_H_ +#define THIRD_PARTY_HTA_HEXAGON_NN_OPS_H_ + +typedef enum hta_op_type_enum { +#define HTA_DEF_OP(NAME, ...) 
HTA_OP_##NAME, + +#include "hta_ops.h" + HTA_NN_OPS_MAX + +#undef HTA_DEF_OP +} hta_op_type; + +#endif // THIRD_PARTY_HTA_HEXAGON_NN_OPS_H_ diff --git a/third_party/hta/hta_ops.h b/third_party/hta/hta_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..3becf1d3a79534131a8cfb3c9508bada52752623 --- /dev/null +++ b/third_party/hta/hta_ops.h @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2016-2018, The Linux Foundation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted (subject to the limitations in the + * disclaimer below) provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of The Linux Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE + * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT + * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * You probably want to + * + * ## ##### ##### + * # # # # # # + * # # # # # # + * ###### # # # # + * # # # # # # + * # # ##### ##### + * + * + * # # #### ##### ###### #### + * ## # # # # # # # + * # # # # # # # ##### #### + * # # # # # # # # # + * # ## # # # # # # # + * # # #### ##### ###### #### + * + * + * ## ##### + * # # # + * # # # + * ###### # + * # # # + * # # # + * + * + * ##### # # ###### + * # # # # + * # ###### ##### + * # # # # + * # # # # + * # # # ###### + * + * + * ###### # # ##### + * # ## # # # + * ##### # # # # # + * # # # # # # + * # # ## # # + * ###### # # ##### + * + * otherwise the interface becomes incompatible. 
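+ *
+ * (Editor's note, illustrative only: this file is an X-macro list. Each
+ * consumer defines HTA_DEF_OP before including it; hta_hexagon_nn_ops.h
+ * above expands every HTA_DEF_OP(NAME) into an HTA_OP_##NAME enumerator.
+ * A hypothetical name table could be generated the same way:
+ *
+ *   #define HTA_DEF_OP(NAME, ...) #NAME,
+ *   static const char *hta_op_names[] = {
+ *   #include "hta_ops.h"
+ *   };
+ *   #undef HTA_DEF_OP
+ *
+ * Because consumers rely on positional expansion, appending anywhere but
+ * the end renumbers existing enumerators, which is why the warning above
+ * exists.)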
+ */ +HTA_DEF_OP(INPUT) +HTA_DEF_OP(OUTPUT) +HTA_DEF_OP(Nop) +HTA_DEF_OP(Const) +HTA_DEF_OP(Check) +HTA_DEF_OP(Close_f) +HTA_DEF_OP(Close_quint8) +HTA_DEF_OP(Close_q_quint8) +HTA_DEF_OP(Close_int32) +HTA_DEF_OP(Close_qint32) +HTA_DEF_OP(PPrint_8) +HTA_DEF_OP(PPrint_32) +HTA_DEF_OP(PPrint_f) +HTA_DEF_OP(PreFree) +HTA_DEF_OP(Flatten) + +#ifndef HTA_DEF_OP_WREF +#define HTA_DEF_OP_WREF(NAME) HTA_DEF_OP(NAME) HTA_DEF_OP(NAME##_ref) +#define __SELF_HTA_DEF_OP_WREF +#endif + +HTA_DEF_OP_WREF(QuantizedConv2d_8x8to32) +HTA_DEF_OP_WREF(QuantizedMatMul_8x8to32) +HTA_DEF_OP_WREF(QuantizeDownAndShrinkRange_32to8) +HTA_DEF_OP_WREF(QuantizedRelu_8) +HTA_DEF_OP_WREF(QuantizedReluX_8) +HTA_DEF_OP_WREF(QuantizedMaxPool_8) +HTA_DEF_OP_WREF(QuantizedAvgPool_8) +HTA_DEF_OP_WREF(QuantizedL2Pool_8) +HTA_DEF_OP_WREF(QuantizedConcat_8) +HTA_DEF_OP_WREF(QuantizedBiasAdd_8p8to32) +HTA_DEF_OP_WREF(Min_f) +HTA_DEF_OP_WREF(Max_f) +HTA_DEF_OP_WREF(Quantize) +HTA_DEF_OP_WREF(Dequantize) +HTA_DEF_OP_WREF(Supernode_8x8p8to8) + +HTA_DEF_OP(QuantizedFlatten) +HTA_DEF_OP(Softmax_f) +HTA_DEF_OP(Conv2d_f) +HTA_DEF_OP(MatMul_f) +HTA_DEF_OP(Relu_f) +HTA_DEF_OP(ReluX_f) +HTA_DEF_OP(AvgPool_f) +HTA_DEF_OP(L2Pool_f) +HTA_DEF_OP(MaxPool_f) +HTA_DEF_OP(Concat_f) +HTA_DEF_OP(BiasAdd_f) +HTA_DEF_OP(LRN_f) + +HTA_DEF_OP(Variable) +HTA_DEF_OP(Assign) +HTA_DEF_OP(Reshape) +HTA_DEF_OP(QuantizedReshape) +HTA_DEF_OP(Tanh_f) +HTA_DEF_OP(Sigmoid_f) +HTA_DEF_OP(Slice_8) +HTA_DEF_OP(Slice_f) +HTA_DEF_OP(QuantizedSlice_8) +HTA_DEF_OP(Add_f) +HTA_DEF_OP(Mul_f) +HTA_DEF_OP(Minimum_f) +HTA_DEF_OP(Maximum_f) + +HTA_DEF_OP_WREF(Requantize_32to8) +HTA_DEF_OP_WREF(RequantizationRange_32) + +HTA_DEF_OP(Neg_f) +HTA_DEF_OP(Sub_f) +HTA_DEF_OP(AddN_f) +HTA_DEF_OP(Range_int32) +HTA_DEF_OP(Rank_int32) +HTA_DEF_OP(Transpose_int32) +HTA_DEF_OP(Transpose_f) +HTA_DEF_OP(InstanceNorm_f) +HTA_DEF_OP_WREF(QuantizedInstanceNorm_8) +HTA_DEF_OP(Sub_int32) +HTA_DEF_OP(Add_int32) +HTA_DEF_OP(Split_f) +HTA_DEF_OP(Dequantize_qint32_f) +HTA_DEF_OP(PRelu_f) +HTA_DEF_OP_WREF(QuantizedPRelu_8) +HTA_DEF_OP(Sum_f) +HTA_DEF_OP(Prod_f) +HTA_DEF_OP(Mul_int32) +HTA_DEF_OP(LogicalAnd_int32) +HTA_DEF_OP(LogicalOr_int32) +HTA_DEF_OP(LogicalXor_int32) +HTA_DEF_OP(Shape_int32) +HTA_DEF_OP(Pack_int32) +HTA_DEF_OP(MirrorPad_f) +HTA_DEF_OP(ResizeNearestNeighbor_f) +HTA_DEF_OP(StridedSlice_int32) +HTA_DEF_OP(StridedSlice_f) +HTA_DEF_OP(ExpandDims_int32) +HTA_DEF_OP(ExpandDims_f) + +HTA_DEF_OP(LogSoftmax_f) +HTA_DEF_OP(Split_int32) +HTA_DEF_OP(QuantizedSplit_8) + +HTA_DEF_OP(Deconv_f) +HTA_DEF_OP_WREF(QuantizedDeconv_8x8to32) + +HTA_DEF_OP_WREF(QuantizedMul_8x8to32) +HTA_DEF_OP_WREF(QuantizedAdd_8p8to32) +HTA_DEF_OP_WREF(QuantizedSigmoid_8) +HTA_DEF_OP_WREF(QuantizedTanh_8) +HTA_DEF_OP_WREF(QuantizedSoftmax_8) +HTA_DEF_OP_WREF(QuantizedLRN_8) +HTA_DEF_OP_WREF(Quantizedpad2d_frame_8p) +HTA_DEF_OP_WREF(QuantizedSub_8p8to32) +HTA_DEF_OP_WREF(QuantizedMaximum_8) +HTA_DEF_OP_WREF(QuantizedMinimum_8) + +HTA_DEF_OP(Pad_f) +HTA_DEF_OP(SpaceToBatchND_f) +HTA_DEF_OP(BatchToSpaceND_f) +HTA_DEF_OP(QuantizedPad_8) +HTA_DEF_OP(ResizeBilinear_f) +HTA_DEF_OP(ConcatV2_f) +HTA_DEF_OP(ConcatV2_int32) +HTA_DEF_OP(Prod_int32) +HTA_DEF_OP(Slice_int32) + +HTA_DEF_OP(QuantizedAdd_8p8to8) +HTA_DEF_OP(QuantizedResizeBilinear_8) +HTA_DEF_OP(Supernode_8x8p8to8_d32) +HTA_DEF_OP(Convert_to_d32) +HTA_DEF_OP(Convert_from_d32) +HTA_DEF_OP_WREF(QuantizedMaxPool_8_d32) +HTA_DEF_OP_WREF(QuantizedConcat_8_d32) +HTA_DEF_OP_WREF(QuantizedAvgPool_8_d32) + +HTA_DEF_OP(Sink) + +HTA_DEF_OP_WREF(QuantizedPRelu_8_d32) 
+HTA_DEF_OP_WREF(AutoQuantize) +HTA_DEF_OP_WREF(QuantizedDepthwiseConv2d_8x8to32) +HTA_DEF_OP_WREF(DepthwiseConv2d_f) +HTA_DEF_OP(DepthwiseSupernode_8x8p8to8) +HTA_DEF_OP(DepthwiseSupernode_8x8p8to8_d32) + +HTA_DEF_OP_WREF(QuantizedMul_8x8to8_d32) + +HTA_DEF_OP(FullyConnected_u8) +#if 0 +HTA_DEF_OP_WREF(QuantizedFC_8x8p8to8) +#endif + +HTA_DEF_OP_WREF(QuantizedAdd_8p8to8_d32) + +HTA_DEF_OP_WREF(QuantizedClamp_8) +HTA_DEF_OP(Clamp_f) +HTA_DEF_OP(QuantizeForTest_d32) +HTA_DEF_OP(Close_d32) +HTA_DEF_OP_WREF(QuantizedSub_8p8to8_d32) + +HTA_DEF_OP(InputSupernode_8x8p8to8_outd32) +HTA_DEF_OP(QuantizedLRN_8_d32) +HTA_DEF_OP_WREF(QuantizedBiasAdd_32p32to32) +HTA_DEF_OP_WREF(Quantize_int32) + +HTA_DEF_OP(Supernode_8x8p32to8) +HTA_DEF_OP(DepthwiseSupernode_8x8p32to8) +HTA_DEF_OP(Supernode_8x8p32to8_d32) +HTA_DEF_OP(DepthwiseSupernode_8x8p32to8_d32) +HTA_DEF_OP(InputSupernode_8x8p32to8_outd32) + +HTA_DEF_OP(PPrint_8_d32) +HTA_DEF_OP(PPrintWithPadding_8_d32) +HTA_DEF_OP_WREF(AutoQuantize_d32) + +HTA_DEF_OP_WREF(QuantizedTanh_8_d32) +HTA_DEF_OP_WREF(QuantizedSigmoid_8_d32) +HTA_DEF_OP_WREF(QuantizedSoftmax_8_d32) + + +HTA_DEF_OP_WREF(QuantizedL2Pool_8_d32) + +HTA_DEF_OP(Gather_f) +HTA_DEF_OP(Gather_int32) +HTA_DEF_OP(Gather_8) +HTA_DEF_OP(Table_f) +HTA_DEF_OP(Table_int32) +HTA_DEF_OP(Table_8) + +HTA_DEF_OP(FillPadding_8_d32) +HTA_DEF_OP(QuantizedResizeBilinear_8_d32) + +HTA_DEF_OP(QuantizeINPUT_f_to_8) +HTA_DEF_OP_WREF(DeconvBias_8x8to32) + +HTA_DEF_OP(SpaceToBatchND_8) +HTA_DEF_OP(BatchToSpaceND_8) + + +HTA_DEF_OP(SpaceToDepth_f) +HTA_DEF_OP(DepthToSpace_f) +HTA_DEF_OP(SpaceToDepth_8) +HTA_DEF_OP(DepthToSpace_8) + +HTA_DEF_OP(DequantizeOUTPUT_8tof) +HTA_DEF_OP(QuantizedBatchNorm_8x8p8to8) +HTA_DEF_OP(QuantizedBatchNorm_8x8p32to8) +HTA_DEF_OP(QuantizedBatchNorm_8x8p8to8_d32) +HTA_DEF_OP(QuantizedBatchNorm_8x8p32to8_d32) + +HTA_DEF_OP_WREF(QuantizedInstanceNorm_8_d32) +HTA_DEF_OP_WREF(QuantizedInstanceNormBG_8) +HTA_DEF_OP_WREF(QuantizedInstanceNormBG_8_d32) + +HTA_DEF_OP(SuperFC_8x8p32to8) +HTA_DEF_OP(SuperFC_8x8p32to8_ref) +HTA_DEF_OP(SuperFC_8x8p32to8_d32) + +HTA_DEF_OP(ChannelShuffle_f) +HTA_DEF_OP(ChannelShuffle_int32) +HTA_DEF_OP_WREF(QuantizedChannelShuffle_8) +HTA_DEF_OP(QuantizedChannelShuffle_8_d32) +/* this is in op_chanshuffle_d32.c*/ +HTA_DEF_OP(QuantizedSplit_8_d32) + +HTA_DEF_OP(QuantizedCrop_8) +HTA_DEF_OP(ResizeUnitSquare_f) +HTA_DEF_OP_WREF(ResizeUnitSquare_8) +HTA_DEF_OP_WREF(Nv21ToRgb_8) +HTA_DEF_OP_WREF(RgbaToRgb_8) +HTA_DEF_OP_WREF(Argb32ToRgb_8) +HTA_DEF_OP(Permute_f) +HTA_DEF_OP(QuantizedPermute_8) +HTA_DEF_OP_WREF(QuantizedRoiPool_8) +HTA_DEF_OP(Proposal_f) +HTA_DEF_OP(RoiAlign_f) +HTA_DEF_OP_WREF(QuantizedRoiAlign_8) +HTA_DEF_OP_WREF(Implode_8) +HTA_DEF_OP(QuantizedConcat_8_nond32) + +HTA_DEF_OP(Close_16tof) +HTA_DEF_OP(QuantizedLstmInput_16x16to16) +HTA_DEF_OP(QuantizedLstmOutput_16x16to8) + +HTA_DEF_OP(Quantize_16) +HTA_DEF_OP(Dequantize_16) +HTA_DEF_OP(Convert_8_16) +HTA_DEF_OP(QuantizedTanh_16) +HTA_DEF_OP(QuantizedSigmoid_16) + +HTA_DEF_OP_WREF(QuantizeDownAndShrinkRange_32to16) +HTA_DEF_OP_WREF(Requantize_32to16) +HTA_DEF_OP_WREF(QuantizedMatMul_8x8p32to16) + +HTA_DEF_OP(QuantizedStridedSlice_8) +HTA_DEF_OP(Bbox_Transform_f) +HTA_DEF_OP(Softmax_uint8) + +HTA_DEF_OP(QuantizedFakeConcat_8_d32) + +HTA_DEF_OP(DepthToSpace_8_d32) +HTA_DEF_OP(OemNode) + +HTA_DEF_OP(QuantizedPad_8_d32) +// Add new operations above this line +#ifdef __SELF_HTA_DEF_OP_WREF +#undef __SELF_HTA_DEF_OP_WREF +#undef HTA_DEF_OP_WREF +#endif + diff --git a/third_party/hta/libhta_dsp_skel.so 
diff --git a/third_party/hta/libhta_dsp_skel.so b/third_party/hta/libhta_dsp_skel.so
new file mode 100644
index 0000000000000000000000000000000000000000..6a371cfef8f47e6541be0f6bc307d9ed72aa5c7a
Binary files /dev/null and b/third_party/hta/libhta_dsp_skel.so differ
diff --git a/third_party/hta/license.txt b/third_party/hta/license.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1fc186df55d1d4b6d43eaea9f7e77be6bc470459
--- /dev/null
+++ b/third_party/hta/license.txt
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2016-2018, The Linux Foundation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted (subject to the limitations in the
+ * disclaimer below) provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * * Neither the name of The Linux Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
+ * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
diff --git a/third_party/nnlib/BUILD b/third_party/nnlib/BUILD.bazel
similarity index 100%
rename from third_party/nnlib/BUILD
rename to third_party/nnlib/BUILD.bazel
diff --git a/tools/aarch64_compiler/BUILD b/tools/aarch64_compiler/BUILD.bazel
similarity index 100%
rename from tools/aarch64_compiler/BUILD
rename to tools/aarch64_compiler/BUILD.bazel
diff --git a/tools/aarch64_compiler/linaro_linux_gcc/BUILD b/tools/aarch64_compiler/linaro_linux_gcc/BUILD.bazel
similarity index 100%
rename from tools/aarch64_compiler/linaro_linux_gcc/BUILD
rename to tools/aarch64_compiler/linaro_linux_gcc/BUILD.bazel
diff --git a/tools/arm_compiler/BUILD b/tools/arm_compiler/BUILD.bazel
similarity index 93%
rename from tools/arm_compiler/BUILD
rename to tools/arm_compiler/BUILD.bazel
index 30f83eb0a4f55815009c4d262cff634ec1516e48..140ed0d38c6a0f0a3be22daf73b959f78bb3e755 100644
--- a/tools/arm_compiler/BUILD
+++ b/tools/arm_compiler/BUILD.bazel
@@ -11,7 +11,7 @@
 filegroup(
     name = "toolchain_fg",
     srcs = [
-        ":cc-compiler-armeabi-v7a",
+        ":cc-compiler-armhf",
         ":linaro_linux_all_files",
         "@gcc_linaro_7_3_1_arm_linux_gnueabihf//:compiler_components",
     ],
@@ -29,7 +29,7 @@ cc_toolchain_suite(
     name = "toolchain",
     # target_cpu | compiler
     toolchains = {
-        "armeabi-v7a|gcc": "cc-compiler-armeabi-v7a",
+        "armhf|gcc": "cc-compiler-armhf",
     },
 )
@@ -66,10 +66,10 @@ filegroup(
 )
 
 cc_toolchain(
-    name = "cc-compiler-armeabi-v7a",
+    name = "cc-compiler-armhf",
     all_files = ":linaro_linux_all_files",
     compiler_files = ":linaro_linux_compiler_files",
-    cpu = "armeabi-v7a",
+    cpu = "armhf",
     dwp_files = ":empty",
     dynamic_runtime_libs = [":empty"],
     linker_files = ":linaro_linux_linker_files",
diff --git a/tools/arm_compiler/CROSSTOOL b/tools/arm_compiler/CROSSTOOL
index ce7f6d15ccc177a5fcbc0e94a56438d5ef5278cb..58edd2f976cc37f2d27fc2763647d6cab881c080 100644
--- a/tools/arm_compiler/CROSSTOOL
+++ b/tools/arm_compiler/CROSSTOOL
@@ -1,9 +1,9 @@
 major_version: "local"
 minor_version: ""
-default_target_cpu: "armeabi-v7a"
+default_target_cpu: "armhf"
 
 default_toolchain {
-  cpu: "armeabi-v7a"
+  cpu: "armhf"
   toolchain_identifier: "arm-linux-gnueabihf"
 }
 
@@ -12,7 +12,7 @@ toolchain {
   abi_libc_version: ""
   builtin_sysroot: ""
   compiler: "gcc"
-  host_system_name: "armeabi-v7a"
+  host_system_name: "armhf"
   needsPic: true
   supports_gold_linker: true
   supports_incremental_linker: false
@@ -22,7 +22,7 @@ toolchain {
   supports_start_end_lib: false
   supports_thin_archives: true
   target_libc: ""
-  target_cpu: "armeabi-v7a"
+  target_cpu: "armhf"
   target_system_name: ""
   toolchain_identifier: "arm-linux-gnueabihf"
diff --git a/tools/arm_compiler/linaro_linux_gcc/BUILD b/tools/arm_compiler/linaro_linux_gcc/BUILD.bazel
similarity index 100%
rename from tools/arm_compiler/linaro_linux_gcc/BUILD
rename to tools/arm_compiler/linaro_linux_gcc/BUILD.bazel
diff --git a/tools/bazel.rc b/tools/bazel.rc
index 77978ea3930d3df56ab216f50c3687a05037e16d..067cb7e1772cca81dc8bd9fb9713c7ef1aa151c4 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -14,20 +14,37 @@ build --copt=-DGEMMLOWP_USE_OPENMP
 build:symbol_hidden --copt=-fvisibility=hidden
 
 # Usage example: bazel build --config android
+build:android --linkopt=-pie
+build:android --linkopt=-ldl
+build:android --linkopt=-llog
+build:android --linkopt=-lm
 build:android --distinct_host_configuration=true
 build:android --crosstool_top=//external:android/crosstool
 build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
 
+# Linux host build, --config linux
+build:linux --define linux=true
+
+# MacOS host build, --config darwin
+build:darwin --define darwin=true
+
+# iOS and other darwin platforms, --config ios
+build:ios --define darwin=true
+build:ios --distinct_host_configuration=true
+build:ios --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
+build:ios --cpu=ios_arm64
+
 # Usage example: bazel build --config arm_linux_gnueabihf
 # Used to fix library not find linking issue, see also:
 # https://github.com/bazelbuild/bazel/issues/6653,
 # https://github.com/bazelbuild/bazel/issues/6189
+build:arm_linux_gnueabihf --define linux=true
 build:arm_linux_gnueabihf --spawn_strategy=standalone
 build:arm_linux_gnueabihf --distinct_host_configuration=true
 build:arm_linux_gnueabihf --crosstool_top=//tools/arm_compiler:toolchain
 build:arm_linux_gnueabihf --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
-build:arm_linux_gnueabihf --cpu=armeabi-v7a
+build:arm_linux_gnueabihf --cpu=armhf
 build:arm_linux_gnueabihf --copt -mfloat-abi=hard
 build:arm_linux_gnueabihf --copt -mfpu=neon
 build:arm_linux_gnueabihf --copt -Wno-ignored-attributes
@@ -36,6 +65,7 @@ build:arm_linux_gnueabihf --copt -Wno-sequence-point
 build:arm_linux_gnueabihf --copt -Wno-implicit-fallthrough
 
 # Usage example: bazel build --config aarch64_linux_gnu
+build:aarch64_linux_gnu --define linux=true
 build:aarch64_linux_gnu --spawn_strategy=standalone
 build:aarch64_linux_gnu --distinct_host_configuration=true
 build:aarch64_linux_gnu --crosstool_top=//tools/aarch64_compiler:toolchain
diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py
index 37d02b78f993dc4fcb38e9359ba404b4ed89eed5..328620c1f179869e36b4340199a6aefbe85f4466 100644
--- a/tools/bazel_adb_run.py
+++ b/tools/bazel_adb_run.py
@@ -86,7 +86,7 @@ def parse_args():
         type=str,
         default="all",
         help="SoCs (ro.board.platform from getprop) to build, "
-        "comma seperated list or all/random")
+             "comma separated list or all/random")
     parser.add_argument(
         "--target", type=str, default="//...", help="Bazel target to build")
     parser.add_argument(
@@ -118,8 +118,8 @@ def parse_args():
         '--device_yml',
         type=str,
         default='',
-        help='embedded linux device config yml file'
-    )
+        help='embedded linux device config yml file')
+    parser.add_argument('--vlog_level', type=int, default=0, help='vlog level')
     return parser.parse_known_args()
 
@@ -130,10 +130,12 @@ def main(unused_args):
     for target_abi in target_abis:
         toolchain = infer_toolchain(target_abi)
-        sh_commands.bazel_build(target, abi=target_abi,
-                                toolchain=toolchain,
-                                enable_neon=FLAGS.enable_neon,
-                                address_sanitizer=FLAGS.address_sanitizer)
+        sh_commands.bazel_build(
+            target,
+            abi=target_abi,
+            toolchain=toolchain,
+            enable_neon=FLAGS.enable_neon,
+            address_sanitizer=FLAGS.address_sanitizer)
         if FLAGS.run_target:
             target_devices = DeviceManager.list_devices(FLAGS.device_yml)
             if FLAGS.target_socs != TargetSOCTag.all and\
@@ -158,12 +160,11 @@ def main(unused_args):
                         bin_name,
                         args=FLAGS.args,
                         opencl_profiling=True,
-                        vlog_level=0,
+                        vlog_level=FLAGS.vlog_level,
                         out_of_range_check=True,
                         address_sanitizer=FLAGS.address_sanitizer,
                         simpleperf=FLAGS.simpleperf)
-                    globals()[FLAGS.stdout_processor](stdouts, dev,
-                                                      target_abi)
+                    globals()[FLAGS.stdout_processor](stdouts, dev, target_abi)
 
 if __name__ == "__main__":
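The new --vlog_level flag replaces the hard-coded vlog_level=0, so unit-test runs can raise the runtime's VLOG verbosity from the command line; as the tools/device.py hunks further down show, the value ends up exported as MACE_CPP_MIN_VLOG_LEVEL in front of the launched binary. A minimal sketch of that plumbing (the launch helper below is illustrative, not part of this patch):

    import subprocess

    def launch(binary, vlog_level=0):
        # Illustrative: prefix the command with "env MACE_CPP_MIN_VLOG_LEVEL=<n>"
        # the same way tools/device.py does before running a test binary.
        return subprocess.call(
            ["env", "MACE_CPP_MIN_VLOG_LEVEL=%d" % vlog_level, binary])

Typical invocation then looks like: python tools/bazel_adb_run.py --target="//mace/utils:utils_test" --vlog_level=2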
diff --git a/tools/common.py b/tools/common.py
index 8e69ed8ed20cb8d20b83d2492afe3f377a65c8c3..82a25e5d5e6c04c1db474f93cf7dd21c3d1d48d3 100644
--- a/tools/common.py
+++ b/tools/common.py
@@ -129,6 +129,14 @@ class DeviceType(object):
     CPU = 'CPU'
     GPU = 'GPU'
     HEXAGON = 'HEXAGON'
+    HTA = 'HTA'
+
+
+class DataFormat(object):
+    NONE = "NONE"
+    NHWC = "NHWC"
+    NCHW = "NCHW"
+    OIHW = "OIHW"
 
 
 ################################
@@ -193,6 +201,8 @@ def parse_device_type(runtime):
     if runtime == RuntimeType.dsp:
         device_type = DeviceType.HEXAGON
+    elif runtime == RuntimeType.hta:
+        device_type = DeviceType.HTA
     elif runtime == RuntimeType.gpu:
         device_type = DeviceType.GPU
     elif runtime == RuntimeType.cpu:
@@ -401,6 +411,7 @@ class YAMLKeyword(object):
     graph_optimize_options = 'graph_optimize_options'  # internal use for now
     cl_mem_type = 'cl_mem_type'
     backend = 'backend'
+    validation_outputs_data = 'validation_outputs_data'
     docker_image_tag = 'docker_image_tag'
     dockerfile_path = 'dockerfile_path'
     dockerfile_sha256_checksum = 'dockerfile_sha256_checksum'
@@ -506,6 +517,7 @@ class RuntimeType(object):
     cpu = 'cpu'
     gpu = 'gpu'
     dsp = 'dsp'
+    hta = 'hta'
     cpu_gpu = 'cpu+gpu'
 
@@ -524,3 +536,10 @@ class ToolchainType:
 class TargetSOCTag:
     all = 'all'
     random = 'random'
+
+
+def split_shape(shape):
+    if shape.strip() == "":
+        return []
+    else:
+        return shape.split(',')
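The split_shape helper added above exists so that an empty shape string (a rank-0, scalar tensor) maps to an empty dimension list rather than [''], which would crash the int(x) conversions in its callers (see tools/generate_data.py and tools/validate.py below). A quick sketch of the difference, assuming tools/common.py is importable as common:

    import common

    common.split_shape("1,224,224,3")  # -> ['1', '224', '224', '3']
    common.split_shape("")             # -> [] , a valid scalar shape
    "".split(',')                      # -> [''] , and int('') raises ValueError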
diff --git a/tools/converter.py b/tools/converter.py
index b3a6569638137b52d25d3ac40b246eb8aba3bf8c..99a24b877e44c650cac5dbd7aea4a5a213df492f 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -14,12 +14,9 @@
 import argparse
 import glob
-import hashlib
-import os
-import re
 import sh
 import sys
-import urllib
+import time
 import yaml
 
 from enum import Enum
@@ -64,6 +61,7 @@ RuntimeTypeStrs = [
     "cpu",
     "gpu",
     "dsp",
+    "hta",
     "cpu+gpu"
 ]
@@ -96,14 +94,11 @@ WinogradParameters = [0, 2, 4]
 DataFormatStrs = [
     "NONE",
     "NHWC",
+    "NCHW",
+    "OIHW",
 ]
 
-class DataFormat(object):
-    NONE = "NONE"
-    NHWC = "NHWC"
-
-
 class DefaultValues(object):
     mace_lib_type = MACELibType.static
     omp_num_threads = -1,
@@ -149,6 +144,8 @@ def parse_device_type(runtime):
     if runtime == RuntimeType.dsp:
         device_type = DeviceType.HEXAGON
+    elif runtime == RuntimeType.hta:
+        device_type = DeviceType.HTA
     elif runtime == RuntimeType.gpu:
         device_type = DeviceType.GPU
     elif runtime == RuntimeType.cpu:
@@ -170,6 +167,19 @@ def get_hexagon_mode(configs):
     return False
 
+def get_hta_mode(configs):
+    runtime_list = []
+    for model_name in configs[YAMLKeyword.models]:
+        model_runtime = \
+            configs[YAMLKeyword.models][model_name].get(
+                YAMLKeyword.runtime, "")
+        runtime_list.append(model_runtime.lower())
+
+    if RuntimeType.hta in runtime_list:
+        return True
+    return False
+
+
 def get_opencl_mode(configs):
     runtime_list = []
     for model_name in configs[YAMLKeyword.models]:
@@ -371,6 +381,15 @@ def format_model_config(flags):
                 if not isinstance(value, list):
                     subgraph[key] = [value]
                 subgraph[key] = [str(v) for v in subgraph[key]]
+            input_size = len(subgraph[YAMLKeyword.input_tensors])
+            output_size = len(subgraph[YAMLKeyword.output_tensors])
+
+            mace_check(len(subgraph[YAMLKeyword.input_shapes]) == input_size,
+                       ModuleName.YAML_CONFIG,
+                       "input shapes' size not equal inputs' size.")
+            mace_check(len(subgraph[YAMLKeyword.output_shapes]) == output_size,
+                       ModuleName.YAML_CONFIG,
+                       "output shapes' size not equal outputs' size.")
 
             for key in [YAMLKeyword.check_tensors,
                         YAMLKeyword.check_shapes]:
@@ -399,13 +418,13 @@ def format_model_config(flags):
             if input_data_formats:
                 if not isinstance(input_data_formats, list):
                     subgraph[YAMLKeyword.input_data_formats] =\
-                        [input_data_formats]
+                        [input_data_formats] * input_size
                 else:
                     mace_check(len(input_data_formats)
-                               == len(subgraph[YAMLKeyword.input_tensors]),
+                               == input_size,
                                ModuleName.YAML_CONFIG,
                                "input_data_formats should match"
-                               " the size of input")
+                               " the size of input.")
                 for input_data_format in\
                         subgraph[YAMLKeyword.input_data_formats]:
                     mace_check(input_data_format in DataFormatStrs,
@@ -414,17 +433,18 @@ def format_model_config(flags):
                                + str(DataFormatStrs) + ", but got "
                                + input_data_format)
             else:
-                subgraph[YAMLKeyword.input_data_formats] = [DataFormat.NHWC]
+                subgraph[YAMLKeyword.input_data_formats] = \
+                    [DataFormat.NHWC] * input_size
 
             output_data_formats = subgraph.get(YAMLKeyword.output_data_formats,
                                                [])
             if output_data_formats:
                 if not isinstance(output_data_formats, list):
                     subgraph[YAMLKeyword.output_data_formats] = \
-                        [output_data_formats]
+                        [output_data_formats] * output_size
                 else:
                     mace_check(len(output_data_formats)
-                               == len(subgraph[YAMLKeyword.output_tensors]),
+                               == output_size,
                                ModuleName.YAML_CONFIG,
                                "output_data_formats should match"
                                " the size of output")
@@ -435,7 +455,8 @@ def format_model_config(flags):
                                "'output_data_formats' must be in "
                                + str(DataFormatStrs))
             else:
-                subgraph[YAMLKeyword.output_data_formats] = [DataFormat.NHWC]
+                subgraph[YAMLKeyword.output_data_formats] =\
+                    [DataFormat.NHWC] * output_size
 
             validation_threshold = subgraph.get(
                 YAMLKeyword.validation_threshold, {})
@@ -448,6 +469,8 @@ def format_model_config(flags):
                 DeviceType.GPU: ValidationThreshold.gpu_threshold,
                 DeviceType.HEXAGON + "_QUANTIZE":
                     ValidationThreshold.hexagon_threshold,
+                DeviceType.HTA + "_QUANTIZE":
+                    ValidationThreshold.hexagon_threshold,
                 DeviceType.CPU + "_QUANTIZE":
                     ValidationThreshold.cpu_quantize_threshold,
             }
@@ -457,6 +480,7 @@ def format_model_config(flags):
                 if k.upper() not in (DeviceType.CPU,
                                      DeviceType.GPU,
                                      DeviceType.HEXAGON,
+                                     DeviceType.HTA,
                                      DeviceType.CPU + "_QUANTIZE"):
                     raise argparse.ArgumentTypeError(
                         'Unsupported validation threshold runtime: %s' % k)
@@ -476,6 +500,14 @@ def format_model_config(flags):
             onnx_backend = subgraph.get(
                 YAMLKeyword.backend, "tensorflow")
             subgraph[YAMLKeyword.backend] = onnx_backend
+            validation_outputs_data = subgraph.get(
+                YAMLKeyword.validation_outputs_data, [])
+            if not isinstance(validation_outputs_data, list):
+                subgraph[YAMLKeyword.validation_outputs_data] = [
+                    validation_outputs_data]
+            else:
+                subgraph[YAMLKeyword.validation_outputs_data] = \
+                    validation_outputs_data
             input_ranges = subgraph.get(
                 YAMLKeyword.input_ranges, [])
             if not isinstance(input_ranges, list):
@@ -728,7 +760,6 @@ def build_model_lib(configs, address_sanitizer):
     # create model library dir
     library_name = configs[YAMLKeyword.library_name]
     for target_abi in configs[YAMLKeyword.target_abis]:
-        hexagon_mode = get_hexagon_mode(configs)
         model_lib_output_path = get_model_lib_output_path(library_name,
                                                           target_abi)
         library_out_dir = os.path.dirname(model_lib_output_path)
@@ -739,7 +770,8 @@ def build_model_lib(configs, address_sanitizer):
             MODEL_LIB_TARGET,
             abi=target_abi,
             toolchain=toolchain,
-            hexagon_mode=hexagon_mode,
+            enable_hexagon=get_hexagon_mode(configs),
+            enable_hta=get_hta_mode(configs),
             enable_opencl=get_opencl_mode(configs),
             enable_quantize=get_quantize_mode(configs),
             address_sanitizer=address_sanitizer,
@@ -830,7 +862,6 @@ def report_run_statistics(stdout,
 
 def build_mace_run(configs, target_abi, toolchain, enable_openmp,
                    address_sanitizer, mace_lib_type):
     library_name = configs[YAMLKeyword.library_name]
-    hexagon_mode = get_hexagon_mode(configs)
 
     build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi)
     if os.path.exists(build_tmp_binary_dir):
@@ -853,7 +884,8 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp,
         mace_run_target,
         abi=target_abi,
         toolchain=toolchain,
-        hexagon_mode=hexagon_mode,
+        enable_hexagon=get_hexagon_mode(configs),
+        enable_hta=get_hta_mode(configs),
         enable_openmp=enable_openmp,
         enable_opencl=get_opencl_mode(configs),
         enable_quantize=get_quantize_mode(configs),
@@ -868,7 +900,6 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp,
 def build_example(configs, target_abi, toolchain, enable_openmp,
                   mace_lib_type, cl_binary_to_code, device):
     library_name = configs[YAMLKeyword.library_name]
-    hexagon_mode = get_hexagon_mode(configs)
 
     build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi)
     if os.path.exists(build_tmp_binary_dir):
@@ -902,7 +933,8 @@ def build_example(configs, target_abi, toolchain,
             enable_openmp=enable_openmp,
             enable_opencl=get_opencl_mode(configs),
             enable_quantize=get_quantize_mode(configs),
-            hexagon_mode=hexagon_mode,
+            enable_hexagon=get_hexagon_mode(configs),
+            enable_hta=get_hta_mode(configs),
             address_sanitizer=flags.address_sanitizer,
             symbol_hidden=symbol_hidden)
 
@@ -933,7 +965,8 @@ def build_example(configs, target_abi, toolchain,
         enable_openmp=enable_openmp,
         enable_opencl=get_opencl_mode(configs),
         enable_quantize=get_quantize_mode(configs),
-        hexagon_mode=hexagon_mode,
+        enable_hexagon=get_hexagon_mode(configs),
+        enable_hta=get_hta_mode(configs),
         address_sanitizer=flags.address_sanitizer,
         extra_args=build_arg)
 
@@ -991,8 +1024,11 @@ def run_mace(flags):
                                    flags.address_sanitizer,
                                    flags.mace_lib_type)
                 # run
+                start_time = time.time()
                 with device.lock():
                     device.run_specify_abi(flags, configs, target_abi)
+                elapse_minutes = (time.time() - start_time) / 60
+                print("Elapsed time: %f minutes." % elapse_minutes)
             elif dev[YAMLKeyword.device_name] != SystemType.host:
                 six.print_('The device with soc %s do not support abi %s' %
                            (dev[YAMLKeyword.target_socs], target_abi),
@@ -1013,7 +1049,6 @@ def build_benchmark_model(configs,
                           enable_openmp,
                           mace_lib_type):
     library_name = configs[YAMLKeyword.library_name]
-    hexagon_mode = get_hexagon_mode(configs)
 
     link_dynamic = mace_lib_type == MACELibType.dynamic
     if link_dynamic:
@@ -1036,7 +1071,8 @@ def build_benchmark_model(configs,
         enable_openmp=enable_openmp,
         enable_opencl=get_opencl_mode(configs),
         enable_quantize=get_quantize_mode(configs),
-        hexagon_mode=hexagon_mode,
+        enable_hexagon=get_hexagon_mode(configs),
+        enable_hta=get_hta_mode(configs),
        symbol_hidden=symbol_hidden,
         extra_args=build_arg)
     # clear tmp binary dir
@@ -1075,8 +1111,11 @@ def benchmark_model(flags):
                                           not flags.disable_openmp,
                                           flags.mace_lib_type)
                 device = DeviceWrapper(dev)
+                start_time = time.time()
                 with device.lock():
                     device.bm_specific_target(flags, configs, target_abi)
+                elapse_minutes = (time.time() - start_time) / 60
+                print("Elapsed time: %f minutes." % elapse_minutes)
             else:
                 six.print_('There is no abi %s with soc %s' %
                            (target_abi, dev[YAMLKeyword.target_socs]),
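The format_model_config changes above apply one normalization rule to input_data_formats and output_data_formats alike: a missing or scalar YAML value is broadcast to one entry per tensor, and an explicit list must match the tensor count, so downstream code can always index the list positionally. The rule in isolation (standalone sketch; input_size and the sample value are assumptions):

    input_size = 2
    value = "NCHW"                       # scalar entry from the deployment YAML
    if not isinstance(value, list):
        formats = [value] * input_size   # "NCHW" -> ["NCHW", "NCHW"]
    else:
        assert len(value) == input_size  # explicit lists must match
        formats = value
    default_formats = ["NHWC"] * input_size  # used when the key is absent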
diff --git a/tools/device.py b/tools/device.py
index 07e92878db3b0e0effc6814aa5d68df10dc16983..0ff868f482cf0a9d6d7e69cd854a10ee863d51ae 100644
--- a/tools/device.py
+++ b/tools/device.py
@@ -154,7 +154,9 @@ class DeviceWrapper:
                 input_nodes,
                 output_nodes,
                 input_shapes,
+                input_data_formats,
                 output_shapes,
+                output_data_formats,
                 mace_model_dir,
                 model_tag,
                 device_type,
@@ -206,6 +208,7 @@ class DeviceWrapper:
             p = subprocess.Popen(
                 [
                     "env",
+                    "ASAN_OPTIONS=detect_leaks=1",
                     "LD_LIBRARY_PATH=%s" % libmace_dynamic_lib_path,
                     "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
                     "MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio,
@@ -216,6 +219,8 @@ class DeviceWrapper:
                     "--output_node=%s" % ",".join(output_nodes),
                     "--input_shape=%s" % ":".join(input_shapes),
                     "--output_shape=%s" % ":".join(output_shapes),
+                    "--input_data_format=%s" % ",".join(input_data_formats),
+                    "--output_data_format=%s" % ",".join(output_data_formats),
                     "--input_file=%s/%s" % (model_output_dir,
                                             input_file_name),
                     "--output_file=%s/%s" % (model_output_dir,
@@ -307,6 +312,8 @@ class DeviceWrapper:
                 "--output_node=%s" % ",".join(output_nodes),
                 "--input_shape=%s" % ":".join(input_shapes),
                 "--output_shape=%s" % ":".join(output_shapes),
+                "--input_data_format=%s" % ",".join(input_data_formats),
+                "--output_data_format=%s" % ",".join(output_data_formats),
                 "--input_file=%s/%s" % (self.data_dir, input_file_name),
                 "--output_file=%s/%s" % (self.data_dir, output_file_name),
                 "--input_dir=%s" % input_dir,
@@ -394,6 +401,8 @@ class DeviceWrapper:
             output_nodes=subgraphs[0][YAMLKeyword.output_tensors],
             input_shapes=subgraphs[0][YAMLKeyword.input_shapes],
             output_shapes=subgraphs[0][YAMLKeyword.output_shapes],
+            input_data_formats=subgraphs[0][YAMLKeyword.input_data_formats],
+            output_data_formats=subgraphs[0][YAMLKeyword.output_data_formats],
             mace_model_dir=mace_model_dir,
             model_tag=model_name,
             device_type=DeviceType.GPU,
@@ -587,6 +596,10 @@ class DeviceWrapper:
                             YAMLKeyword.output_tensors],
                         input_shapes=subgraphs[0][YAMLKeyword.input_shapes],
                         output_shapes=output_config[YAMLKeyword.output_shapes],
+                        input_data_formats=subgraphs[0][
+                            YAMLKeyword.input_data_formats],
+                        output_data_formats=subgraphs[0][
+                            YAMLKeyword.output_data_formats],
                         mace_model_dir=mace_model_dir,
                         model_tag=model_name,
                         device_type=device_type,
@@ -652,6 +665,10 @@ class DeviceWrapper:
                                 YAMLKeyword.input_shapes],
                             output_shapes=output_config[
                                 YAMLKeyword.output_shapes],
+                            input_data_formats=subgraphs[0][
+                                YAMLKeyword.input_data_formats],
+                            output_data_formats=subgraphs[0][
+                                YAMLKeyword.output_data_formats],
                             model_output_dir=model_output_dir,
                             input_data_types=subgraphs[0][
                                 YAMLKeyword.input_data_types],
@@ -660,6 +677,8 @@ class DeviceWrapper:
                                 YAMLKeyword.validation_threshold][
                                 validate_type],
                             backend=subgraphs[0][YAMLKeyword.backend],
+                            validation_outputs_data=subgraphs[0][
+                                YAMLKeyword.validation_outputs_data],
                             log_file=log_file,
                         )
                     if flags.report and flags.round > 0:
@@ -748,6 +767,8 @@ class DeviceWrapper:
                 output_nodes,
                 input_shapes,
                 output_shapes,
+                input_data_formats,
+                output_data_formats,
                 max_num_runs,
                 max_seconds,
                 model_tag,
@@ -788,6 +809,8 @@ class DeviceWrapper:
                 '--output_node=%s' % ','.join(output_nodes),
                 '--input_shape=%s' % ':'.join(input_shapes),
                 '--output_shape=%s' % ':'.join(output_shapes),
+                "--input_data_format=%s" % ",".join(input_data_formats),
+                "--output_data_format=%s" % ",".join(output_data_formats),
                 '--input_file=%s/%s' % (model_output_dir,
                                         input_file_name),
                 "--model_data_file=%s" % model_data_file,
                 '--max_num_runs=%d' % max_num_runs,
@@ -843,6 +866,8 @@ class DeviceWrapper:
                 '--output_node=%s' % ','.join(output_nodes),
                 '--input_shape=%s' % ':'.join(input_shapes),
                 '--output_shape=%s' % ':'.join(output_shapes),
+                "--input_data_format=%s" % ",".join(input_data_formats),
+                "--output_data_format=%s" % ",".join(output_data_formats),
                 '--input_file=%s/%s' % (self.data_dir, input_file_name),
                 "--model_data_file=%s" % model_data_file,
                 '--max_num_runs=%d' % max_num_runs,
@@ -959,6 +984,10 @@ class DeviceWrapper:
                 output_nodes=output_nodes,
                 input_shapes=subgraphs[0][YAMLKeyword.input_shapes],
                 output_shapes=output_shapes,
+                input_data_formats=subgraphs[0][
+                    YAMLKeyword.input_data_formats],
+                output_data_formats=subgraphs[0][
+                    YAMLKeyword.output_data_formats],
                 max_num_runs=flags.max_num_runs,
                 max_seconds=flags.max_seconds,
                 mace_model_dir=mace_model_dir,
@@ -972,8 +1001,7 @@ class DeviceWrapper:
                 opencl_binary_file=opencl_output_bin_path,
                 opencl_parameter_file=opencl_parameter_path,
                 libmace_dynamic_library_path=LIBMACE_DYNAMIC_PATH,
-                link_dynamic=link_dynamic
-            )
+                link_dynamic=link_dynamic)
 
     def run(self,
             abi,
diff --git a/tools/generate_data.py b/tools/generate_data.py
index 5ad0340e456df423fd36d37a3b565eb500fad39b..b80f0c20c9964b7e02c04b6d313f55e87a00cc20 100644
--- a/tools/generate_data.py
+++ b/tools/generate_data.py
@@ -59,7 +59,7 @@ def generate_input_data(input_file, input_node, input_shape, input_ranges,
     assert len(input_names) == len(input_shapes) == len(input_ranges) == len(input_data_types)  # noqa
 
     for i in range(len(input_names)):
-        shape = [int(x) for x in input_shapes[i].split(',')]
+        shape = [int(x) for x in common.split_shape(input_shapes[i])]
         input_range = [float(x) for x in input_ranges[i].split(',')]
         generate_data(input_names[i], shape, input_file,
                       input_range, input_data_types[i])
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 035348ff2baae673ae798ce6cf4e40c771b877b8..6a8746b1e593154d580d7143d5c98a75977ee07c 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -69,7 +69,7 @@ def device_lock_path(serialno):
     return "/tmp/device-lock-%s" % serialno
 
-def device_lock(serialno, timeout=3600):
+def device_lock(serialno, timeout=7200):
     import filelock
     return filelock.FileLock(device_lock_path(serialno), timeout=timeout)
@@ -263,7 +263,8 @@ def find_simpleperf_library(abi, simpleperf_path=''):
 def bazel_build(target,
                 abi="armeabi-v7a",
                 toolchain='android',
-                hexagon_mode=False,
+                enable_hexagon=False,
+                enable_hta=False,
                 enable_openmp=True,
                 enable_neon=True,
                 enable_opencl=True,
@@ -275,6 +276,8 @@ def bazel_build(target,
     if abi == "host":
         bazel_args = (
             "build",
+            "--config",
+            platform.system().lower(),
             "--define",
             "openmp=%s" % str(enable_openmp).lower(),
             "--define",
@@ -297,13 +300,15 @@ def bazel_build(target,
             "--define",
             "quantize=%s" % str(enable_quantize).lower(),
             "--define",
-            "hexagon=%s" % str(hexagon_mode).lower())
+            "hexagon=%s" % str(enable_hexagon).lower(),
+            "--define",
+            "hta=%s" % str(enable_hta).lower())
         if address_sanitizer:
             bazel_args += ("--config", "asan")
         else:
             bazel_args += ("--config", "optimization")
-    if symbol_hidden:
-        bazel_args += ("--config", "symbol_hidden")
+            if symbol_hidden:
+                bazel_args += ("--config", "symbol_hidden")
     if extra_args:
         bazel_args += (extra_args,)
     six.print_(bazel_args)
@@ -649,6 +654,8 @@ def validate_model(abi,
                    output_nodes,
                    input_shapes,
                    output_shapes,
+                   input_data_formats,
+                   output_data_formats,
                    model_output_dir,
                    input_data_types,
                    caffe_env,
@@ -656,9 +663,12 @@ def validate_model(abi,
                    output_file_name="model_out",
                    validation_threshold=0.9,
                    backend="tensorflow",
-                   log_file="",
-                   ):
-    six.print_("* Validate with %s" % platform)
+                   validation_outputs_data=[],
+                   log_file=""):
+    if not validation_outputs_data:
+        six.print_("* Validate with %s" % platform)
+    else:
+        six.print_("* Validate with file: %s" % validation_outputs_data)
     if abi != "host":
         for output_name in output_nodes:
             formatted_name = common.formatted_file_name(
@@ -668,21 +678,15 @@ def validate_model(abi,
                 sh.rm("-rf", "%s/%s" % (model_output_dir, formatted_name))
             device.pull_from_data_dir(formatted_name, model_output_dir)
 
-    if platform == "tensorflow":
-        validate(platform, model_file_path, "",
-                 "%s/%s" % (model_output_dir, input_file_name),
-                 "%s/%s" % (model_output_dir, output_file_name), device_type,
-                 ":".join(input_shapes), ":".join(output_shapes),
-                 ",".join(input_nodes), ",".join(output_nodes),
-                 validation_threshold, ",".join(input_data_types), backend,
-                 log_file)
-    elif platform == "onnx":
+    if platform == "tensorflow" or platform == "onnx":
         validate(platform, model_file_path, "",
                  "%s/%s" % (model_output_dir, input_file_name),
                  "%s/%s" % (model_output_dir, output_file_name), device_type,
                  ":".join(input_shapes), ":".join(output_shapes),
+                 ",".join(input_data_formats), ",".join(output_data_formats),
                  ",".join(input_nodes), ",".join(output_nodes),
                  validation_threshold, ",".join(input_data_types), backend,
+                 validation_outputs_data,
                  log_file)
     elif platform == "caffe":
         image_name = "mace-caffe:" + docker_image_tag
@@ -698,8 +702,11 @@ def validate_model(abi,
                      "%s/%s" % (model_output_dir, output_file_name),
                      device_type,
                      ":".join(input_shapes), ":".join(output_shapes),
+                     ",".join(input_data_formats),
+                     ",".join(output_data_formats),
                      ",".join(input_nodes), ",".join(output_nodes),
                      validation_threshold, ",".join(input_data_types), backend,
+                     validation_outputs_data,
                      log_file)
         elif caffe_env == common.CaffeEnvType.DOCKER:
             docker_image_id = sh.docker("images", "-q", image_name)
@@ -764,9 +771,13 @@ def validate_model(abi,
                 "--output_node=%s" % ",".join(output_nodes),
                 "--input_shape=%s" % ":".join(input_shapes),
                 "--output_shape=%s" % ":".join(output_shapes),
+                "--input_data_format=%s" % ",".join(input_data_formats),
+                "--output_data_format=%s" % ",".join(output_data_formats),
                 "--validation_threshold=%f" % validation_threshold,
                 "--input_data_type=%s" % ",".join(input_data_types),
                 "--backend=%s" % ",".join(backend),
+                "--validation_outputs_data=%s" % ",".join(
+                    validation_outputs_data),
                 "--log_file=%s" % log_file,
                 _fg=True)
diff --git a/tools/validate.py b/tools/validate.py
index 2ea8fed2786b37ef2950deb706482d284cace6fd..47dc6c019ee790b264eb27a85732a635941db0b9 100644
--- a/tools/validate.py
+++ b/tools/validate.py
@@ -18,6 +18,7 @@ import os
 import os.path
 import numpy as np
 import re
+import six
 
 import common
@@ -67,6 +68,8 @@ def calculate_similarity(u, v, data_type=np.float64):
 
 def calculate_pixel_accuracy(out_value, mace_out_value):
+    if len(out_value.shape) < 2:
+        return 1.0
     out_value = out_value.reshape((-1, out_value.shape[-1]))
     batches = out_value.shape[0]
     classes = out_value.shape[1]
@@ -121,10 +124,37 @@ def normalize_tf_tensor_name(name):
     return name
 
-def validate_tf_model(platform, device_type, model_file, input_file,
-                      mace_out_file, input_names, input_shapes,
-                      output_names, validation_threshold, input_data_types,
-                      log_file):
+def validate_with_file(platform, device_type,
+                       output_names, output_shapes,
+                       mace_out_file, validation_outputs_data,
+                       validation_threshold, log_file):
+    for i in range(len(output_names)):
+        if validation_outputs_data[i].startswith("http://") or \
+                validation_outputs_data[i].startswith("https://"):
+            validation_file_name = common.formatted_file_name(
+                mace_out_file, output_names[i] + '_validation')
+            six.moves.urllib.request.urlretrieve(validation_outputs_data[i],
+                                                 validation_file_name)
+        else:
+            validation_file_name = validation_outputs_data[i]
+        value = load_data(validation_file_name)
+        out_shape = output_shapes[i]
+        if len(out_shape) == 4:
+            out_shape[1], out_shape[2], out_shape[3] = \
+                out_shape[3], out_shape[1], out_shape[2]
+            value = value.reshape(out_shape).transpose((0, 2, 3, 1))
+        output_file_name = common.formatted_file_name(
+            mace_out_file, output_names[i])
+        mace_out_value = load_data(output_file_name)
+        compare_output(platform, device_type, output_names[i], mace_out_value,
+                       value, validation_threshold, log_file)
+
+
+def validate_tf_model(platform, device_type, model_file,
+                      input_file, mace_out_file,
+                      input_names, input_shapes, input_data_formats,
+                      output_names, output_shapes, output_data_formats,
+                      validation_threshold, input_data_types, log_file):
     import tensorflow as tf
     if not os.path.isfile(model_file):
         common.MaceLogger.error(
@@ -147,6 +177,13 @@ def validate_tf_model(platform, device_type, model_file, input_file,
                     common.formatted_file_name(input_file, input_names[i]),
                     input_data_types[i])
                 input_value = input_value.reshape(input_shapes[i])
+                if input_data_formats[i] == common.DataFormat.NCHW and\
+                        len(input_shapes[i]) == 4:
+                    input_value = input_value.transpose((0, 2, 3, 1))
+                elif input_data_formats[i] == common.DataFormat.OIHW and \
+                        len(input_shapes[i]) == 4:
+                    # OIHW -> HWIO
+                    input_value = input_value.transpose((2, 3, 1, 0))
                 input_node = graph.get_tensor_by_name(
                     normalize_tf_tensor_name(input_names[i]))
                 input_dict[input_node] = input_value
@@ -161,15 +198,20 @@ def validate_tf_model(platform, device_type, model_file, input_file,
                 output_file_name = common.formatted_file_name(
                     mace_out_file, output_names[i])
                 mace_out_value = load_data(output_file_name)
+                if output_data_formats[i] == common.DataFormat.NCHW and\
+                        len(output_shapes[i]) == 4:
+                    mace_out_value = mace_out_value.\
+                        reshape(output_shapes[i]).transpose((0, 2, 3, 1))
                 compare_output(platform, device_type, output_names[i],
                                mace_out_value, output_values[i],
                                validation_threshold, log_file)
 
-def validate_caffe_model(platform, device_type, model_file, input_file,
-                         mace_out_file, weight_file, input_names, input_shapes,
-                         output_names, output_shapes, validation_threshold,
-                         log_file):
+def validate_caffe_model(platform, device_type, model_file, input_file,
+                         mace_out_file, weight_file,
+                         input_names, input_shapes, input_data_formats,
+                         output_names, output_shapes, output_data_formats,
+                         validation_threshold, log_file):
     os.environ['GLOG_minloglevel'] = '1'  # suprress Caffe verbose prints
     import caffe
     if not os.path.isfile(model_file):
@@ -188,8 +230,10 @@ def validate_caffe_model(platform, device_type, model_file, input_file,
     for i in range(len(input_names)):
         input_value = load_data(
             common.formatted_file_name(input_file, input_names[i]))
-        input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1,
-                                                                      2))
+        input_value = input_value.reshape(input_shapes[i])
+        if input_data_formats[i] == common.DataFormat.NHWC and \
+                len(input_shapes[i]) == 4:
+            input_value = input_value.transpose((0, 3, 1, 2))
         input_blob_name = input_names[i]
         try:
             if input_names[i] in net.top_names:
@@ -205,22 +249,23 @@ def validate_caffe_model(platform, device_type, model_file, input_file,
     for i in range(len(output_names)):
         value = net.blobs[output_names[i]].data
-        out_shape = output_shapes[i]
-        if len(out_shape) == 4:
-            out_shape[1], out_shape[2], out_shape[3] = \
-                out_shape[3], out_shape[1], out_shape[2]
-            value = value.reshape(out_shape).transpose((0, 2, 3, 1))
         output_file_name = common.formatted_file_name(
             mace_out_file, output_names[i])
         mace_out_value = load_data(output_file_name)
+        if output_data_formats[i] == common.DataFormat.NHWC and \
+                len(output_shapes[i]) == 4:
+            mace_out_value = mace_out_value.reshape(output_shapes[i])\
+                .transpose((0, 3, 1, 2))
         compare_output(platform, device_type, output_names[i],
                        mace_out_value, value, validation_threshold, log_file)
 
-def validate_onnx_model(platform, device_type, model_file, input_file,
-                        mace_out_file, input_names, input_shapes,
-                        output_names, output_shapes, validation_threshold,
-                        input_data_types, backend, log_file):
+def validate_onnx_model(platform, device_type, model_file,
+                        input_file, mace_out_file,
+                        input_names, input_shapes, input_data_formats,
+                        output_names, output_shapes, output_data_formats,
+                        validation_threshold, input_data_types,
+                        backend, log_file):
     import onnx
     if backend == "tensorflow":
         from onnx_tf.backend import prepare
@@ -242,13 +287,16 @@ def validate_onnx_model(platform, device_type, model_file, input_file,
         input_value = load_data(common.formatted_file_name(input_file,
                                                            input_names[i]),
                                 input_data_types[i])
-        input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1,
-                                                                      2))
+        input_value = input_value.reshape(input_shapes[i])
+        if input_data_formats[i] == common.DataFormat.NHWC and \
+                len(input_shapes[i]) == 4:
+            input_value = input_value.transpose((0, 3, 1, 2))
         input_dict[input_names[i]] = input_value
     onnx_outputs = []
     for i in range(len(output_names)):
         out_shape = output_shapes[i]
-        if len(out_shape) == 4:
+        if output_data_formats[i] == common.DataFormat.NHWC and\
+                len(out_shape) == 4:
             out_shape[1], out_shape[2], out_shape[3] = \
                 out_shape[3], out_shape[1], out_shape[2]
         onnx_outputs.append(
@@ -262,24 +310,32 @@ def validate_onnx_model(platform, device_type, model_file, input_file,
     for i in range(len(output_names)):
         out_name = output_names[i]
         value = output_values[out_name].flatten()
-        out_shape = output_shapes[i]
-        if len(out_shape) == 4:
-            value = value.reshape(out_shape).transpose((0, 2, 3, 1))
         output_file_name = common.formatted_file_name(mace_out_file,
                                                       output_names[i])
         mace_out_value = load_data(output_file_name)
+        if output_data_formats[i] == common.DataFormat.NHWC and \
+                len(output_shapes[i]) == 4:
+            mace_out_value = mace_out_value.reshape(output_shapes[i]) \
+                .transpose((0, 3, 1, 2))
         compare_output(platform, device_type, output_names[i],
                        mace_out_value, value, validation_threshold, log_file)
 
-def validate(platform, model_file, weight_file, input_file, mace_out_file,
-             device_type, input_shape, output_shape, input_node, output_node,
-             validation_threshold, input_data_type, backend, log_file):
+def validate(platform, model_file, weight_file, input_file, mace_out_file,
+             device_type, input_shape, output_shape, input_data_format_str,
+             output_data_format_str, input_node, output_node,
+             validation_threshold, input_data_type, backend,
+             validation_outputs_data, log_file):
     input_names = [name for name in input_node.split(',')]
     input_shape_strs = [shape for shape in input_shape.split(':')]
-    input_shapes = [[int(x) for x in shape.split(',')]
+    input_shapes = [[int(x) for x in common.split_shape(shape)]
                     for shape in input_shape_strs]
+    output_shape_strs = [shape for shape in output_shape.split(':')]
+    output_shapes = [[int(x) for x in common.split_shape(shape)]
+                     for shape in output_shape_strs]
+    input_data_formats = [df for df in input_data_format_str.split(',')]
+    output_data_formats = [df for df in output_data_format_str.split(',')]
     if input_data_type:
         input_data_types = [data_type
                             for data_type in input_data_type.split(',')]
@@ -287,27 +343,35 @@ def validate(platform, model_file, weight_file, input_file, mace_out_file,
         input_data_types = ['float32'] * len(input_names)
     output_names = [name for name in output_node.split(',')]
     assert len(input_names) == len(input_shapes)
-
-    if platform == 'tensorflow':
-        validate_tf_model(platform, device_type, model_file, input_file,
-                          mace_out_file, input_names, input_shapes,
-                          output_names, validation_threshold, input_data_types,
+    if not isinstance(validation_outputs_data, list):
+        if os.path.isfile(validation_outputs_data):
+            validation_outputs = [validation_outputs_data]
+        else:
+            validation_outputs = []
+    else:
+        validation_outputs = validation_outputs_data
+    if validation_outputs:
+        validate_with_file(platform, device_type, output_names, output_shapes,
+                           mace_out_file, validation_outputs,
+                           validation_threshold, log_file)
+    elif platform == 'tensorflow':
+        validate_tf_model(platform, device_type,
+                          model_file, input_file, mace_out_file,
+                          input_names, input_shapes, input_data_formats,
+                          output_names, output_shapes, output_data_formats,
+                          validation_threshold, input_data_types,
                           log_file)
     elif platform == 'caffe':
-        output_shape_strs = [shape for shape in output_shape.split(':')]
-        output_shapes = [[int(x) for x in shape.split(',')]
-                         for shape in output_shape_strs]
-        validate_caffe_model(platform, device_type, model_file, input_file,
-                             mace_out_file, weight_file, input_names,
-                             input_shapes, output_names, output_shapes,
+        validate_caffe_model(platform, device_type, model_file,
+                             input_file, mace_out_file, weight_file,
+                             input_names, input_shapes, input_data_formats,
+                             output_names, output_shapes, output_data_formats,
                              validation_threshold, log_file)
     elif platform == 'onnx':
-        output_shape_strs = [shape for shape in output_shape.split(':')]
-        output_shapes = [[int(x) for x in shape.split(',')]
-                         for shape in output_shape_strs]
-        validate_onnx_model(platform, device_type, model_file, input_file,
-                            mace_out_file, input_names, input_shapes,
-                            output_names, output_shapes,
+        validate_onnx_model(platform, device_type, model_file,
+                            input_file, mace_out_file,
+                            input_names, input_shapes, input_data_formats,
+                            output_names, output_shapes, output_data_formats,
                             validation_threshold, input_data_types,
                             backend, log_file)
 
@@ -338,8 +402,14 @@ def parse_args():
         "--device_type", type=str, default="", help="mace runtime device.")
     parser.add_argument(
         "--input_shape", type=str, default="1,64,64,3", help="input shape.")
+    parser.add_argument(
+        "--input_data_format", type=str, default="NHWC",
+        help="input data format.")
     parser.add_argument(
         "--output_shape", type=str, default="1,64,64,2", help="output shape.")
+    parser.add_argument(
+        "--output_data_format", type=str, default="NHWC",
+        help="output data format.")
     parser.add_argument(
         "--input_node", type=str, default="input_node", help="input node")
     parser.add_argument(
@@ -358,10 +428,10 @@ def parse_args():
         default="tensorflow",
         help="onnx backend framwork")
     parser.add_argument(
-        "--log_file",
-        type=str,
-        default="",
-        help="log file")
+        "--validation_outputs_data", type=str,
+        default="", help="validation outputs data file path.")
+    parser.add_argument(
+        "--log_file", type=str, default="", help="log file.")
     return parser.parse_known_args()
 
@@ -376,9 +446,12 @@ if __name__ == '__main__':
         FLAGS.device_type,
         FLAGS.input_shape,
         FLAGS.output_shape,
+        FLAGS.input_data_format,
+        FLAGS.output_data_format,
         FLAGS.input_node,
         FLAGS.output_node,
         FLAGS.validation_threshold,
         FLAGS.input_data_type,
         FLAGS.backend,
+        FLAGS.validation_outputs_data,
         FLAGS.log_file)
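All of the transposes introduced in tools/validate.py convert 4-D tensors between layouts: NHWC to NCHW is transpose((0, 3, 1, 2)), the inverse NCHW to NHWC is transpose((0, 2, 3, 1)), and OIHW to HWIO (the TensorFlow filter layout) is transpose((2, 3, 1, 0)). A self-contained numpy check of those axis orders, with example shapes assumed:

    import numpy as np

    nhwc = np.random.rand(1, 64, 64, 3)              # N, H, W, C
    nchw = nhwc.transpose((0, 3, 1, 2))              # NHWC -> NCHW
    assert nchw.shape == (1, 3, 64, 64)
    assert np.array_equal(nchw.transpose((0, 2, 3, 1)), nhwc)  # round trip

    oihw = np.random.rand(8, 3, 5, 5)                # O, I, kH, kW filters
    hwio = oihw.transpose((2, 3, 1, 0))              # OIHW -> HWIO
    assert hwio.shape == (5, 5, 3, 8)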