diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 51806a6863612decfd5b3fb30ba2ef1918e63f40..91d057a49e2a0a71a3e160f0f772bfeaf8891dca 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -25,7 +25,8 @@ docs: - cd docs - make html - CI_LATEST_OUTPUT_PATH=/mace-build-output/$CI_PROJECT_NAME/latest - - CI_JOB_OUTPUT_PATH=/mace-build-output/$CI_PROJECT_NAME/$CI_BUILD_ID + - CI_JOB_OUTPUT_PATH=/mace-build-output/$CI_PROJECT_NAME/$CI_PIPELINE_ID + - rm -rf $CI_JOB_OUTPUT_PATH - mkdir -p $CI_JOB_OUTPUT_PATH - cp -r _build/html $CI_JOB_OUTPUT_PATH/docs - rm -rf $CI_LATEST_OUTPUT_PATH diff --git a/docker/Dockerfile b/docker/Dockerfile index 32d0cef583f71ff0a19593f3795b441c9103df99..8a8c089c493baa0982fbd68e8fa815ed68ea3e45 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -110,7 +110,8 @@ RUN apt-get install -y --no-install-recommends \ # Install tools RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com setuptools -RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com tensorflow==1.6.0 \ +RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com tensorflow==1.7.0 \ + numpy>=1.14.0 \ scipy \ jinja2 \ pyyaml \ diff --git a/docs/faq.md b/docs/faq.md index 01d1cd9acf12d39526802c714b39c9a922cbcdbf..c8ee4a9095f050e12d45c9a5ea6cab00a57aecd2 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -1,6 +1,17 @@ Frequently asked questions ========================== +Does the tensor data consume extra memory when compiled into C++ code? +---------------------------------------------------------------------- +When compiled into C++ code, the data will be mmaped by the system loader. +For CPU runtime, the tensor data are used without memory copy. +For GPU and DSP runtime, the tensor data is used once during model +initialization. The operating system is free to swap the pages out, however, +it still consumes virtual memory space. So generally speaking, it takes +no extra physical memory. If you are short of virtual memory space (this +should be very rare), you can choose load the tensor data from a file, which +can be unmapped after initialization. + Why is the generated static library file size so huge? ------------------------------------------------------- The static library is simply an archive of a set of object files which are diff --git a/docs/getting_started/docker.md b/docs/getting_started/docker.md deleted file mode 100644 index f58f4a64f7320552849a543fca23ab20ea71b5e1..0000000000000000000000000000000000000000 --- a/docs/getting_started/docker.md +++ /dev/null @@ -1,27 +0,0 @@ -Docker Images -============= - -* Login in [Xiaomi Docker Registry](http://docs.api.xiaomi.net/docker-registry/) - -``` -docker login cr.d.xiaomi.net -``` - -* Build with `Dockerfile` - -``` -docker build -t cr.d.xiaomi.net/mace/mace-dev -``` - -* Pull image from docker registry - -``` -docker pull cr.d.xiaomi.net/mace/mace-dev -``` - -* Create container - -``` -# Set 'host' network to use ADB -docker run -it --rm -v /local/path:/container/path --net=host cr.d.xiaomi.net/mace/mace-dev /bin/bash -``` diff --git a/docs/getting_started/how_to_build.rst b/docs/getting_started/how_to_build.rst index 47781280c28220262be074192bd241071f04602a..43178cf946ac54bc81689bc6e90d993bcc02fc91 100644 --- a/docs/getting_started/how_to_build.rst +++ b/docs/getting_started/how_to_build.rst @@ -33,11 +33,13 @@ How to build +=====================+=================+===================================================================================================+ | bazel | >= 0.5.4 | - | +---------------------+-----------------+---------------------------------------------------------------------------------------------------+ -| android-ndk | r12c | - | +| android-ndk | r15c,r16b | - | +---------------------+-----------------+---------------------------------------------------------------------------------------------------+ | adb | >= 1.0.32 | apt install -y android-tools-adb | +---------------------+-----------------+---------------------------------------------------------------------------------------------------+ -| tensorflow | 1.4.0 | pip install tensorflow==1.4.0 | +| tensorflow | 1.7.0 | pip install tensorflow==1.7.0 | ++---------------------+-----------------+---------------------------------------------------------------------------------------------------+ +| numpy | >= 1.14.0 | pip install numpy | +---------------------+-----------------+---------------------------------------------------------------------------------------------------+ | scipy | >= 1.0.0 | pip install scipy | +---------------------+-----------------+---------------------------------------------------------------------------------------------------+ @@ -45,9 +47,43 @@ How to build +---------------------+-----------------+---------------------------------------------------------------------------------------------------+ | PyYaml | >= 3.12 | pip install pyyaml | +---------------------+-----------------+---------------------------------------------------------------------------------------------------+ +| sh | >= 1.12.14 | pip install sh | ++---------------------+-----------------+---------------------------------------------------------------------------------------------------+ +| filelock | >= 3.0.0 | pip install filelock | ++---------------------+-----------------+---------------------------------------------------------------------------------------------------+ | docker(for caffe) | >= 17.09.0-ce | `install doc `__ | +---------------------+-----------------+---------------------------------------------------------------------------------------------------+ +Docker Images +---------------- + +* Login in `Xiaomi Docker Registry `__ + +.. code:: sh + + docker login cr.d.xiaomi.net + +* Build with Dockerfile + +.. code:: sh + + docker build -t cr.d.xiaomi.net/mace/mace-dev + + +* Pull image from docker registry + +.. code:: sh + + docker pull cr.d.xiaomi.net/mace/mace-dev + +* Create container + +.. code:: sh + + # Set 'host' network to use ADB + docker run -it --rm -v /local/path:/container/path --net=host cr.d.xiaomi.net/mace/mace-dev /bin/bash + + 使用简介 -------- @@ -199,29 +235,47 @@ Caffe目前只支持最新版本,旧版本请使用Caffe的工具进行升级 // 引入头文件 #include "mace/public/mace.h" - #include "mace/public/{MODEL_TAG}.h" + #include "mace/public/mace_engine_factory.h" - // 0. 设置内部存储 + // 0. 设置内部存储(设置一次即可) const std::string file_path ="/path/to/store/internel/files"; std::shared_ptr storage_factory( new FileStorageFactory(file_path)); ConfigKVStorageFactory(storage_factory); - //1. 从文件或代码中Load模型数据,也可通过自定义的方式来Load (例如可自己实现压缩加密等) - // 如果使用的是数据嵌入的方式,将参数设为nullptr。 - unsigned char *model_data = mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str()); - - //2. 创建net对象 - NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data); + //1. 声明设备类型(必须与build时指定的runtime一致) + DeviceType device_type = DeviceType::GPU; - //3. 声明设备类型(必须与build时指定的runtime一致) - DeviceType device_type = DeviceType::OPENCL; - - //4. 定义输入输出名称数组 + //2. 定义输入输出名称数组 std::vector input_names = {...}; std::vector output_names = {...}; - //5. 创建输入输出对象 + //3. 创建MaceEngine对象 + std::shared_ptr engine; + MaceStatus create_engine_status; + // Create Engine + if (model_data_file.empty()) { + create_engine_status = + CreateMaceEngine(model_name.c_str(), + nullptr, + input_names, + output_names, + device_type, + &engine); + } else { + create_engine_status = + CreateMaceEngine(model_name.c_str(), + model_data_file.c_str(), + input_names, + output_names, + device_type, + &engine); + } + if (create_engine_status != MaceStatus::MACE_SUCCESS) { + // do something + } + + //4. 创建输入输出对象 std::map inputs; std::map outputs; for (size_t i = 0; i < input_count; ++i) { @@ -246,14 +300,6 @@ Caffe目前只支持最新版本,旧版本请使用Caffe的工具进行升级 outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out); } - //6. 创建MaceEngine对象 - mace::MaceEngine engine(&net_def, device_type, input_names, output_names); - - //7. 如果设备类型是OPENCL或HEXAGON,可以在此释放model_data - if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { - mace::MACE_MODEL_TAG::UnloadModelData(model_data); - } - - //8. 执行模型,得到结果 + //5. 执行模型,得到结果 engine.Run(inputs, &outputs); diff --git a/docs/getting_started/introduction.md b/docs/getting_started/introduction.md deleted file mode 100644 index 8cfc6b72537f996fa6f7b220167caf532effac3a..0000000000000000000000000000000000000000 --- a/docs/getting_started/introduction.md +++ /dev/null @@ -1,8 +0,0 @@ -Introduction -============ - -TODO: describe the conceptions and workflow with diagram. -![alt text](workflow.jpg "MiAI workflow") - -TODO: describe the runtime. - diff --git a/docs/getting_started/introduction.rst b/docs/getting_started/introduction.rst new file mode 100644 index 0000000000000000000000000000000000000000..4f9d4c8ae38c085bb0d5e54c587a65390950a5dc --- /dev/null +++ b/docs/getting_started/introduction.rst @@ -0,0 +1,46 @@ +Introduction +============ + +MiAI Compute Engine is a deep learning inference framework optimized for +mobile heterogeneous computing platforms. The following figure shows the +overall architecture. + +.. image:: mace-arch.png + :scale: 40 % + :align: center + +Model format +------------ + +MiAI Compute Engine defines a customized model format which is similar to +Caffe2. The MiAI model can be converted from exported models by TensorFlow +and Caffe. We define a YAML schema to describe the model deployment. In the +next chapter, there is a detailed guide showing how to create this YAML file. + +Model conversion +---------------- + +Currently, we provide model converters for TensorFlow and Caffe. And +more frameworks will be supported in the future. + +Model loading +------------- + +The MiAI model format contains two parts: the model graph definition and +the model parameter tensors. The graph part utilizes Protocol Buffers +for serialization. All the model parameter tensors are concatenated +together into a continuous array, and we call this array tensor data in +the following paragraphs. In the model graph, the tensor data offsets +and lengths are recorded. + +The models can be loaded in 3 ways: + +1. Both model graph and tensor data are dynamically loaded externally + (by default, from file system, but the users are free to choose their own + implementations, for example, with compression or encryption). This + approach provides the most flexibility but the weakest model protection. +2. Both model graph and tensor data are converted into C++ code and loaded + by executing the compiled code. This approach provides the strongest + model protection and simplest deployment. +3. The model graph is converted into C++ code and constructed as the second + approach, and the tensor data is loaded externally as the first approach. diff --git a/docs/getting_started/mace-arch.png b/docs/getting_started/mace-arch.png new file mode 100644 index 0000000000000000000000000000000000000000..bfa4928b83df9604273a3bca4633bad40c0299ad Binary files /dev/null and b/docs/getting_started/mace-arch.png differ diff --git a/docs/getting_started/op_lists.rst b/docs/getting_started/op_lists.rst index 803d32170fb6eaa6a2b742a376158b00bab2ea7c..9a1d5425b1a768a0759a06f18e2f4a2d27f4e1b2 100644 --- a/docs/getting_started/op_lists.rst +++ b/docs/getting_started/op_lists.rst @@ -6,20 +6,22 @@ Operator lists :widths: auto :header: "Operator","Android NN","Supported","Remark" - "ADD","Y","Y","" "AVERAGE_POOL_2D","Y","Y","" "BATCH_NORM","","Y","Fusion with activation is supported" + "BATCH_TO_SPACE_ND","Y","Y","" "BIAS_ADD","","Y","" "CHANNEL_SHUFFLE","","Y","" - "CONCATENATION","Y","Y","" + "CONCATENATION","Y","Y","Only support channel axis concatenation" "CONV_2D","Y","Y","Fusion with BN and activation layer is supported" + "DECONV_2D","N","Y","Only tensorflow model is supported" "DEPTHWISE_CONV_2D","Y","Y","Only multiplier = 1 is supported; Fusion is supported" "DEPTH_TO_SPACE","Y","Y","" - "DEQUANTIZE","Y","","" + "DEQUANTIZE","Y","Y","Model quantization will be supported later" + "ELEMENT_WISE","Y","Y","ADD/MUL/DIV/MIN/MAX/NEG/ABS/SQR_DIFF/POW" "EMBEDDING_LOOKUP","Y","","" "FLOOR","Y","","" "FULLY_CONNECTED","Y","Y","" - "GROUP_CONV_2D","","","" + "GROUP_CONV_2D","","","Caffe model with group count = channel count is supported" "HASHTABLE_LOOKUP","Y","","" "L2_NORMALIZATION","Y","","" "L2_POOL_2D","Y","","" @@ -29,18 +31,20 @@ Operator lists "LSTM","Y","","" "MATMUL","","Y","" "MAX_POOL_2D","Y","Y","" - "MUL","Y","","" + "PAD", "N","Y","" "PSROI_ALIGN","","Y","" - "PRELU","","Y","" + "PRELU","","Y","Only caffe model is supported" "RELU","Y","Y","" "RELU1","Y","Y","" "RELU6","Y","Y","" "RELUX","","Y","" - "RESHAPE","Y","Y","Limited support" + "RESHAPE","Y","Y","Limited support: only internal use of reshape in composed operations is supported" "RESIZE_BILINEAR","Y","Y","" "RNN","Y","","" "RPN_PROPOSAL_LAYER","","Y","" + "SLICE","N","Y","Only support channel axis slice" "SOFTMAX","Y","Y","" + "SPACE_TO_BATCH_ND","Y", "Y","" "SPACE_TO_DEPTH","Y","Y","" "SVDF","Y","","" "TANH","Y","Y","" diff --git a/docs/getting_started/workflow.jpg b/docs/getting_started/workflow.jpg deleted file mode 100644 index e00ebea13ac5e261feb38306938f97fa995d47c7..0000000000000000000000000000000000000000 Binary files a/docs/getting_started/workflow.jpg and /dev/null differ diff --git a/docs/index.rst b/docs/index.rst index 70cfab0fdd9f1c69a62108dd0773f9dbadb21d37..a42b1655448dad6c622882f7359918b851a41576 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,7 +11,6 @@ The main documentation is organized into the following sections: getting_started/introduction getting_started/create_a_model_deployment - getting_started/docker getting_started/how_to_build getting_started/op_lists diff --git a/mace/core/BUILD b/mace/core/BUILD index abc5094e2caa5d8243958f39d6051ff920716446..9917dfe3254c11df195187511deb1f1afacd7df2 100644 --- a/mace/core/BUILD +++ b/mace/core/BUILD @@ -15,6 +15,7 @@ load( "if_production_mode", "if_not_production_mode", "if_openmp_enabled", + "if_neon_enabled", ) cc_library( @@ -51,7 +52,11 @@ cc_library( "-DMACE_ENABLE_OPENMP", ]) + if_android([ "-DMACE_ENABLE_OPENCL", - ]) + if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), + ]) + if_hexagon_enabled([ + "-DMACE_ENABLE_HEXAGON", + ]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]), linkopts = ["-ldl"] + if_android([ "-pie", "-lm", diff --git a/mace/core/allocator.h b/mace/core/allocator.h index 8c73025b4923cd860f3a47f3109cc4325728b259..7ab701ddd21b15b0bb88b258d9b2f85801b8dda2 100644 --- a/mace/core/allocator.h +++ b/mace/core/allocator.h @@ -16,6 +16,7 @@ #define MACE_CORE_ALLOCATOR_H_ #include +#include #include #include #include @@ -42,9 +43,10 @@ class Allocator { public: Allocator() {} virtual ~Allocator() noexcept {} - virtual void *New(size_t nbytes) const = 0; - virtual void *NewImage(const std::vector &image_shape, - const DataType dt) const = 0; + virtual MaceStatus New(size_t nbytes, void **result) const = 0; + virtual MaceStatus NewImage(const std::vector &image_shape, + const DataType dt, + void **result) const = 0; virtual void Delete(void *data) const = 0; virtual void DeleteImage(void *data) const = 0; virtual void *Map(void *buffer, size_t offset, size_t nbytes) const = 0; @@ -53,44 +55,54 @@ class Allocator { std::vector *mapped_image_pitch) const = 0; virtual void Unmap(void *buffer, void *mapper_ptr) const = 0; virtual bool OnHost() const = 0; - - template - T *New(size_t num_elements) { - if (num_elements > (std::numeric_limits::max() / sizeof(T))) { - return nullptr; - } - void *p = New(sizeof(T) * num_elements); - T *typed_p = reinterpret_cast(p); - return typed_p; - } }; class CPUAllocator : public Allocator { public: ~CPUAllocator() override {} - void *New(size_t nbytes) const override { + MaceStatus New(size_t nbytes, void **result) const override { VLOG(3) << "Allocate CPU buffer: " << nbytes; + if (nbytes == 0) { + return MaceStatus::MACE_SUCCESS; + } void *data = nullptr; #if defined(__ANDROID__) || defined(__hexagon__) data = memalign(kMaceAlignment, nbytes); + if (data == NULL) { + LOG(WARNING) << "Allocate CPU Buffer with " + << nbytes << " bytes failed because of" + << strerror(errno); + *result = nullptr; + return MaceStatus::MACE_OUT_OF_RESOURCES; + } #else - MACE_CHECK(posix_memalign(&data, kMaceAlignment, nbytes) == 0); + int ret = posix_memalign(&data, kMaceAlignment, nbytes); + if (ret != 0) { + LOG(WARNING) << "Allocate CPU Buffer with " + << nbytes << " bytes failed because of" + << strerror(errno); + *result = nullptr; + return MaceStatus::MACE_OUT_OF_RESOURCES; + } #endif - MACE_CHECK_NOTNULL(data); // TODO(heliangliang) This should be avoided sometimes memset(data, 0, nbytes); - return data; + *result = data; + return MaceStatus::MACE_SUCCESS; } - void *NewImage(const std::vector &shape, - const DataType dt) const override { + MaceStatus NewImage(const std::vector &shape, + const DataType dt, + void **result) const override { MACE_UNUSED(shape); MACE_UNUSED(dt); + MACE_UNUSED(result); LOG(FATAL) << "Allocate CPU image"; - return nullptr; + return MaceStatus::MACE_SUCCESS; } void Delete(void *data) const override { + MACE_CHECK_NOTNULL(data); VLOG(3) << "Free CPU buffer"; free(data); } diff --git a/mace/core/arg_helper.cc b/mace/core/arg_helper.cc index 8db23cae206a46a949c63de4a794ab4eac784362..9cf40d93a0d682c8a6dd04e90e869dd8514a127b 100644 --- a/mace/core/arg_helper.cc +++ b/mace/core/arg_helper.cc @@ -23,7 +23,8 @@ namespace mace { ArgumentHelper::ArgumentHelper(const OperatorDef &def) { for (auto &arg : def.arg()) { if (arg_map_.find(arg.name()) != arg_map_.end()) { - LOG(WARNING) << "Duplicated argument name found in operator def."; + LOG(WARNING) << "Duplicated argument name found in operator def: " + << def.name() << " " << arg.name(); } arg_map_[arg.name()] = arg; diff --git a/mace/core/buffer.h b/mace/core/buffer.h index f4b252a776296b1e065816c3a9b6288d13d03837..8ef5015c1c02c6f1e5e3feac761436cbff0cce48 100644 --- a/mace/core/buffer.h +++ b/mace/core/buffer.h @@ -38,6 +38,11 @@ class BufferBase { virtual void *raw_mutable_data() = 0; + virtual MaceStatus Allocate(index_t nbytes) = 0; + + virtual MaceStatus Allocate(const std::vector &shape, + DataType data_type) = 0; + virtual void *Map(index_t offset, index_t length, std::vector *pitch) const = 0; @@ -48,7 +53,7 @@ class BufferBase { virtual void UnMap() = 0; - virtual void Resize(index_t size) = 0; + virtual MaceStatus Resize(index_t nbytes) = 0; virtual void Copy(void *src, index_t offset, index_t length) = 0; @@ -56,6 +61,8 @@ class BufferBase { virtual void Clear() = 0; + virtual void Clear(index_t size) = 0; + virtual index_t offset() const { return 0; } template @@ -83,14 +90,6 @@ class Buffer : public BufferBase { mapped_buf_(nullptr), is_data_owner_(true) {} - Buffer(Allocator *allocator, index_t size) - : BufferBase(size), - allocator_(allocator), - mapped_buf_(nullptr), - is_data_owner_(true) { - buf_ = allocator->New(size); - } - Buffer(Allocator *allocator, void *data, index_t size) : BufferBase(size), allocator_(allocator), @@ -132,6 +131,31 @@ class Buffer : public BufferBase { } } + MaceStatus Allocate(index_t nbytes) { + if (nbytes <= 0) { + return MaceStatus::MACE_SUCCESS; + } + MACE_CHECK(is_data_owner_, + "data is not owned by this buffer, cannot reallocate"); + if (mapped_buf_ != nullptr) { + UnMap(); + } + if (buf_ != nullptr) { + allocator_->Delete(buf_); + } + size_ = nbytes; + return allocator_->New(nbytes, &buf_); + } + + MaceStatus Allocate(const std::vector &shape, + DataType data_type) { + if (shape.empty()) return MaceStatus::MACE_SUCCESS; + index_t nbytes = std::accumulate(shape.begin(), shape.end(), + 1, std::multiplies()) + * GetEnumTypeSize(data_type); + return this->Allocate(nbytes); + } + void *Map(index_t offset, index_t length, std::vector *pitch) const { MACE_CHECK_NOTNULL(buf_); MACE_UNUSED(pitch); @@ -154,16 +178,17 @@ class Buffer : public BufferBase { mapped_buf_ = nullptr; } - void Resize(index_t size) { + MaceStatus Resize(index_t nbytes) { MACE_CHECK(is_data_owner_, "data is not owned by this buffer, cannot resize"); - if (size != size_) { + if (nbytes != size_) { if (buf_ != nullptr) { allocator_->Delete(buf_); } - size_ = size; - buf_ = allocator_->New(size); + size_ = nbytes; + return allocator_->New(nbytes, &buf_); } + return MaceStatus::MACE_SUCCESS; } void Copy(void *src, index_t offset, index_t length) { @@ -175,7 +200,11 @@ class Buffer : public BufferBase { bool OnHost() const { return allocator_->OnHost(); } void Clear() { - memset(reinterpret_cast(raw_mutable_data()), 0, size_); + Clear(size_); + } + + void Clear(index_t size) { + memset(reinterpret_cast(raw_mutable_data()), 0, size); } protected: @@ -195,18 +224,6 @@ class Image : public BufferBase { buf_(nullptr), mapped_buf_(nullptr) {} - Image(std::vector shape, DataType data_type) - : BufferBase( - std::accumulate( - shape.begin(), shape.end(), 1, std::multiplies()) * - GetEnumTypeSize(data_type)), - allocator_(GetDeviceAllocator(GPU)), - mapped_buf_(nullptr) { - shape_ = shape; - data_type_ = data_type; - buf_ = allocator_->NewImage(shape, data_type); - } - virtual ~Image() { if (mapped_buf_ != nullptr) { UnMap(); @@ -233,6 +250,29 @@ class Image : public BufferBase { std::vector image_shape() const { return shape_; } + MaceStatus Allocate(index_t nbytes) { + MACE_UNUSED(nbytes); + LOG(FATAL) << "Image should not call this allocate function"; + return MaceStatus::MACE_SUCCESS; + } + + MaceStatus Allocate(const std::vector &shape, + DataType data_type) { + index_t size = std::accumulate( + shape.begin(), shape.end(), 1, std::multiplies()) * + GetEnumTypeSize(data_type); + if (mapped_buf_ != nullptr) { + UnMap(); + } + if (buf_ != nullptr) { + allocator_->DeleteImage(buf_); + } + size_ = size; + shape_ = shape; + data_type_ = data_type; + return allocator_->NewImage(shape, data_type, &buf_); + } + void *Map(index_t offset, index_t length, std::vector *pitch) const { MACE_UNUSED(offset); MACE_UNUSED(length); @@ -259,9 +299,10 @@ class Image : public BufferBase { mapped_buf_ = nullptr; } - void Resize(index_t size) { + MaceStatus Resize(index_t size) { MACE_UNUSED(size); MACE_NOT_IMPLEMENTED; + return MaceStatus::MACE_SUCCESS; } void Copy(void *src, index_t offset, index_t length) { @@ -277,6 +318,11 @@ class Image : public BufferBase { MACE_NOT_IMPLEMENTED; } + void Clear(index_t size) { + MACE_UNUSED(size); + MACE_NOT_IMPLEMENTED; + } + private: Allocator *allocator_; std::vector shape_; @@ -339,6 +385,20 @@ class BufferSlice : public BufferBase { } } + MaceStatus Allocate(index_t size) { + MACE_UNUSED(size); + LOG(FATAL) << "BufferSlice should not call allocate function"; + return MaceStatus::MACE_SUCCESS; + } + + MaceStatus Allocate(const std::vector &shape, + DataType data_type) { + MACE_UNUSED(shape); + MACE_UNUSED(data_type); + LOG(FATAL) << "BufferSlice should not call allocate function"; + return MaceStatus::MACE_SUCCESS; + } + void *Map(index_t offset, index_t length, std::vector *pitch) const { MACE_UNUSED(offset); MACE_UNUSED(length); @@ -364,9 +424,10 @@ class BufferSlice : public BufferBase { mapped_buf_ = nullptr; } - void Resize(index_t size) { + MaceStatus Resize(index_t size) { MACE_CHECK(size == size_, "resize buffer slice from ", size_, " to ", size, " is illegal"); + return MaceStatus::MACE_SUCCESS; } void Copy(void *src, index_t offset, index_t length) { @@ -381,7 +442,11 @@ class BufferSlice : public BufferBase { bool OnHost() const { return buffer_->OnHost(); } void Clear() { - memset(raw_mutable_data(), 0, size_); + Clear(size_); + } + + void Clear(index_t size) { + memset(raw_mutable_data(), 0, size); } private: @@ -396,20 +461,17 @@ class ScratchBuffer: public Buffer { : Buffer(allocator), offset_(0) {} - ScratchBuffer(Allocator *allocator, index_t size) - : Buffer(allocator, size), - offset_(0) {} - ScratchBuffer(Allocator *allocator, void *data, index_t size) : Buffer(allocator, data, size), offset_(0) {} virtual ~ScratchBuffer() {} - void GrowSize(index_t size) { + MaceStatus GrowSize(index_t size) { if (size > size_) { - Resize(size); + return Resize(size); } + return MaceStatus::MACE_SUCCESS; } BufferSlice Scratch(index_t size) { diff --git a/mace/core/mace.cc b/mace/core/mace.cc index 4bbcad2a135ea25b65abd55322797eba7e842e0c..bdf1fa894ebc3bc57db6bd4677dd07415a2c4b5b 100644 --- a/mace/core/mace.cc +++ b/mace/core/mace.cc @@ -90,13 +90,15 @@ std::shared_ptr MaceTensor::data() { return impl_->data; } // Mace Engine class MaceEngine::Impl { public: - explicit Impl(const NetDef *net_def, - DeviceType device_type, - const std::vector &input_nodes, - const std::vector &output_nodes, - const unsigned char *model_data); + explicit Impl(DeviceType device_type); + ~Impl(); + MaceStatus Init(const NetDef *net_def, + const std::vector &input_nodes, + const std::vector &output_nodes, + const unsigned char *model_data); + MaceStatus Run(const std::map &inputs, std::map *outputs, RunMetadata *run_metadata); @@ -113,11 +115,7 @@ class MaceEngine::Impl { DISABLE_COPY_AND_ASSIGN(Impl); }; -MaceEngine::Impl::Impl(const NetDef *net_def, - DeviceType device_type, - const std::vector &input_nodes, - const std::vector &output_nodes, - const unsigned char *model_data) +MaceEngine::Impl::Impl(DeviceType device_type) : op_registry_(new OperatorRegistry()), device_type_(device_type), ws_(new Workspace()), @@ -125,7 +123,13 @@ MaceEngine::Impl::Impl(const NetDef *net_def, #ifdef MACE_ENABLE_HEXAGON , hexagon_controller_(nullptr) #endif -{ +{} + +MaceStatus MaceEngine::Impl::Init( + const NetDef *net_def, + const std::vector &input_nodes, + const std::vector &output_nodes, + const unsigned char *model_data) { LOG(INFO) << "MACE version: " << MaceVersion(); // Set storage path for internal usage for (auto input_name : input_nodes) { @@ -137,7 +141,7 @@ MaceEngine::Impl::Impl(const NetDef *net_def, GetDeviceAllocator(device_type_), DT_FLOAT); } #ifdef MACE_ENABLE_HEXAGON - if (device_type == HEXAGON) { + if (device_type_ == HEXAGON) { hexagon_controller_.reset(new HexagonControlWrapper()); MACE_CHECK(hexagon_controller_->Config(), "hexagon config error"); MACE_CHECK(hexagon_controller_->Init(), "hexagon init error"); @@ -153,18 +157,23 @@ MaceEngine::Impl::Impl(const NetDef *net_def, } } else { #endif - ws_->LoadModelTensor(*net_def, device_type, model_data); + MaceStatus status = + ws_->LoadModelTensor(*net_def, device_type_, model_data); + if (status != MaceStatus::MACE_SUCCESS) { + return status; + } - // Init model - auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type, + // Init model + auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_, NetMode::INIT); if (!net->Run()) { LOG(FATAL) << "Net init run failed"; } - net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type); + net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type_); #ifdef MACE_ENABLE_HEXAGON } #endif + return MaceStatus::MACE_SUCCESS; } MaceEngine::Impl::~Impl() { @@ -254,18 +263,18 @@ MaceStatus MaceEngine::Impl::Run( return MACE_SUCCESS; } -MaceEngine::MaceEngine(const NetDef *net_def, - DeviceType device_type, - const std::vector &input_nodes, - const std::vector &output_nodes, - const unsigned char *model_data) { - impl_ = std::unique_ptr( - new MaceEngine::Impl(net_def, device_type, input_nodes, output_nodes, - model_data)); -} +MaceEngine::MaceEngine(DeviceType device_type): + impl_(new MaceEngine::Impl(device_type)) {} MaceEngine::~MaceEngine() = default; +MaceStatus MaceEngine::Init(const NetDef *net_def, + const std::vector &input_nodes, + const std::vector &output_nodes, + const unsigned char *model_data) { + return impl_->Init(net_def, input_nodes, output_nodes, model_data); +} + MaceStatus MaceEngine::Run(const std::map &inputs, std::map *outputs, RunMetadata *run_metadata) { @@ -325,17 +334,18 @@ MaceStatus CreateMaceEngineFromPB(const std::string &model_data_file, const_tensor.data_size() * GetEnumTypeSize(const_tensor.data_type()))); } + + MaceStatus status; const unsigned char *model_data = nullptr; model_data = LoadModelData(model_data_file, model_data_size); - engine->reset( - new mace::MaceEngine(&net_def, device_type, input_nodes, output_nodes, - model_data)); + engine->reset(new mace::MaceEngine(device_type)); + status = (*engine)->Init(&net_def, input_nodes, output_nodes, model_data); if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) { UnloadModelData(model_data, model_data_size); } - return MACE_SUCCESS; + return status; } } // namespace mace diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index 09891e8c4cfaab9afe9757dce75b13793e884e3f..23de679509bc1ea72ac7fab721bf76d8105c53ec 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -18,9 +18,11 @@ #include #endif +#include #include #include #include +#include #include #include #include @@ -44,7 +46,7 @@ int GetCPUCount() { result = access(path, F_OK); if (result != 0) { if (errno != ENOENT) { - LOG(ERROR) << "Access " << path << " failed, errno: " << errno; + LOG(ERROR) << "Access " << path << " failed: " << strerror(errno); } return cpu_count; } @@ -81,7 +83,7 @@ void SetThreadAffinity(cpu_set_t mask) { pid_t pid = syscall(SYS_gettid); #endif int err = sched_setaffinity(pid, sizeof(mask), &mask); - MACE_CHECK(err == 0, "set affinity error: ", errno); + MACE_CHECK(err == 0, "set affinity error: ", strerror(errno)); } } // namespace @@ -101,7 +103,7 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector *big_core_ids, for (int i = 0; i < cpu_count; ++i) { cpu_max_freq[i] = GetCPUMaxFreq(i); if (cpu_max_freq[i] == 0) { - LOG(WARNING) << "Cannot get cpu" << i + LOG(WARNING) << "Cannot get CPU" << i << "'s max frequency info, maybe it is offline."; return MACE_INVALID_ARGS; } @@ -128,13 +130,12 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector *big_core_ids, void SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, const std::vector &cpu_ids) { +#ifdef MACE_ENABLE_OPENMP VLOG(1) << "Set OpenMP threads number: " << omp_num_threads << ", CPU core IDs: " << MakeString(cpu_ids); - -#ifdef MACE_ENABLE_OPENMP omp_set_num_threads(omp_num_threads); #else - LOG(WARNING) << "OpenMP not enabled. Set OpenMP threads number failed."; + LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled."; #endif // compute mask @@ -147,11 +148,13 @@ void SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, #ifdef MACE_ENABLE_OPENMP #pragma omp parallel for for (int i = 0; i < omp_num_threads; ++i) { + VLOG(1) << "Set affinity for OpenMP thread " << omp_get_thread_num() + << "/" << omp_get_num_threads(); SetThreadAffinity(mask); } #else SetThreadAffinity(mask); - LOG(INFO) << "SetThreadAffinity: " << mask.__bits[0]; + VLOG(1) << "Set affinity without OpenMP: " << mask.__bits[0]; #endif } @@ -163,7 +166,7 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint, omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs())); } #else - LOG(WARNING) << "OpenMP not enabled. Set OpenMP threads number failed."; + LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled."; #endif return MACE_SUCCESS; } @@ -192,7 +195,7 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint, MaceStatus SetOpenMPThreadPolicy(int num_threads_hint, CPUAffinityPolicy policy) { - VLOG(1) << "Set CPU openmp num_threads_hint: " << num_threads_hint + VLOG(1) << "Set OpenMP threads number hint: " << num_threads_hint << ", affinity policy: " << policy; return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint, policy); } diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc index 9b94eb92ec15eb187d76fdf429e14712995baf09..96b083cfcbd0fe9b5fea8cbd2862a69e0700fec5 100644 --- a/mace/core/runtime/opencl/opencl_allocator.cc +++ b/mace/core/runtime/opencl/opencl_allocator.cc @@ -44,18 +44,30 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) { OpenCLAllocator::OpenCLAllocator() {} OpenCLAllocator::~OpenCLAllocator() {} -void *OpenCLAllocator::New(size_t nbytes) const { +MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const { + if (nbytes == 0) { + return MaceStatus::MACE_SUCCESS; + } VLOG(3) << "Allocate OpenCL buffer: " << nbytes; cl_int error; cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, nbytes, nullptr, &error); - MACE_CHECK_CL_SUCCESS(error); - return static_cast(buffer); + if (error != CL_SUCCESS) { + LOG(WARNING) << "Allocate OpenCL Buffer with " + << nbytes << " bytes failed because of" + << OpenCLErrorToString(error); + *result = nullptr; + return MaceStatus::MACE_OUT_OF_RESOURCES; + } else { + *result = buffer; + return MaceStatus::MACE_SUCCESS; + } } -void *OpenCLAllocator::NewImage(const std::vector &image_shape, - const DataType dt) const { +MaceStatus OpenCLAllocator::NewImage(const std::vector &image_shape, + const DataType dt, + void **result) const { MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2"; VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1]; @@ -67,11 +79,17 @@ void *OpenCLAllocator::NewImage(const std::vector &image_shape, new cl::Image2D(OpenCLRuntime::Global()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format, image_shape[0], image_shape[1], 0, nullptr, &error); - MACE_CHECK_CL_SUCCESS(error) << " with image shape: [" - << image_shape[0] << ", " << image_shape[1] - << "]"; - - return cl_image; + if (error != CL_SUCCESS) { + LOG(WARNING) << "Allocate OpenCL image with shape: [" + << image_shape[0] << ", " << image_shape[1] + << "] failed because of" + << OpenCLErrorToString(error); + *result = nullptr; + return MaceStatus::MACE_OUT_OF_RESOURCES; + } else { + *result = cl_image; + return MaceStatus::MACE_SUCCESS; + } } void OpenCLAllocator::Delete(void *buffer) const { diff --git a/mace/core/runtime/opencl/opencl_allocator.h b/mace/core/runtime/opencl/opencl_allocator.h index 0ec50f61062aad81e2e8b0fc010e2a57b457a833..6304add8583f7b2e47c58cd6e6b186ea43b7f092 100644 --- a/mace/core/runtime/opencl/opencl_allocator.h +++ b/mace/core/runtime/opencl/opencl_allocator.h @@ -27,15 +27,16 @@ class OpenCLAllocator : public Allocator { ~OpenCLAllocator() override; - void *New(size_t nbytes) const override; + MaceStatus New(size_t nbytes, void **result) const override; /* * Use Image2D with RGBA (128-bit) format to represent the image. * * @ shape : [depth, ..., height, width ]. */ - void *NewImage(const std::vector &image_shape, - const DataType dt) const override; + MaceStatus NewImage(const std::vector &image_shape, + const DataType dt, + void **result) const override; void Delete(void *buffer) const override; diff --git a/mace/core/tensor.h b/mace/core/tensor.h index 87a6cb3c1c9ff9f3712c50e07c8c6e0d69f5cf61..1404bf023b6d35351cec21c2e5385cae1366bd74 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -208,7 +208,7 @@ class Tensor { inline void Clear() { MACE_CHECK_NOTNULL(buffer_); - buffer_->Clear(); + buffer_->Clear(raw_size()); } inline void Reshape(const std::vector &shape) { @@ -216,16 +216,21 @@ class Tensor { MACE_CHECK(raw_size() <= buffer_->size()); } - inline void Resize(const std::vector &shape) { + inline MaceStatus Resize(const std::vector &shape) { shape_ = shape; image_shape_.clear(); if (buffer_ != nullptr) { MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage."); - if (raw_size() + EXTRA_BUFFER_PAD_SIZE > buffer_->size()) - buffer_->Resize(raw_size() + EXTRA_BUFFER_PAD_SIZE); + if (raw_size() + EXTRA_BUFFER_PAD_SIZE > buffer_->size()) { + LOG(WARNING) << "Resize buffer from size " << buffer_->size() << " to " + << raw_size() + EXTRA_BUFFER_PAD_SIZE; + return buffer_->Resize(raw_size() + EXTRA_BUFFER_PAD_SIZE); + } + return MaceStatus::MACE_SUCCESS; } else { MACE_CHECK(is_buffer_owner_); - buffer_ = new Buffer(allocator_, raw_size() + EXTRA_BUFFER_PAD_SIZE); + buffer_ = new Buffer(allocator_); + return buffer_->Allocate(raw_size() + EXTRA_BUFFER_PAD_SIZE); } } @@ -241,13 +246,14 @@ class Tensor { is_buffer_owner_ = false; } - inline void ResizeImage(const std::vector &shape, - const std::vector &image_shape) { + inline MaceStatus ResizeImage(const std::vector &shape, + const std::vector &image_shape) { shape_ = shape; image_shape_ = image_shape; if (buffer_ == nullptr) { MACE_CHECK(is_buffer_owner_); - buffer_ = new Image(image_shape, dtype_); + buffer_ = new Image(); + return buffer_->Allocate(image_shape, dtype_); } else { MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize."); Image *image = dynamic_cast(buffer_); @@ -257,24 +263,27 @@ class Tensor { "): current physical image shape: ", image->image_shape()[0], ", ", image->image_shape()[1], " < logical image shape: ", image_shape[0], ", ", image_shape[1]); + return MaceStatus::MACE_SUCCESS; } } - inline void ResizeLike(const Tensor &other) { ResizeLike(&other); } + inline MaceStatus ResizeLike(const Tensor &other) { + return ResizeLike(&other); + } - inline void ResizeLike(const Tensor *other) { + inline MaceStatus ResizeLike(const Tensor *other) { if (other->has_opencl_image()) { if (is_buffer_owner_ && buffer_ != nullptr && !has_opencl_image()) { delete buffer_; buffer_ = nullptr; } - ResizeImage(other->shape(), other->image_shape_); + return ResizeImage(other->shape(), other->image_shape_); } else { if (is_buffer_owner_ && buffer_ != nullptr && has_opencl_image()) { delete buffer_; buffer_ = nullptr; } - Resize(other->shape()); + return Resize(other->shape()); } } diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index bb778f8f3d986a88786697772a9246a30049d25e..545ace635da255256d189b709c5fd9b68545b33b 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -60,9 +60,9 @@ std::vector Workspace::Tensors() const { return names; } -void Workspace::LoadModelTensor(const NetDef &net_def, - DeviceType type, - const unsigned char *model_data) { +MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, + DeviceType type, + const unsigned char *model_data) { MACE_LATENCY_LOGGER(1, "Load model tensors"); index_t model_data_size = 0; for (auto &const_tensor : net_def.tensors()) { @@ -82,7 +82,11 @@ void Workspace::LoadModelTensor(const NetDef &net_def, model_data_size)); } else { tensor_buffer_ = std::unique_ptr( - new Buffer(GetDeviceAllocator(type), model_data_size)); + new Buffer(GetDeviceAllocator(type))); + MaceStatus status = tensor_buffer_->Allocate(model_data_size); + if (status != MaceStatus::MACE_SUCCESS) { + return status; + } tensor_buffer_->Map(nullptr); tensor_buffer_->Copy(const_cast(model_data), 0, model_data_size); @@ -112,14 +116,16 @@ void Workspace::LoadModelTensor(const NetDef &net_def, } if (type == DeviceType::CPU || type == DeviceType::GPU) { - CreateOutputTensorBuffer(net_def, type); + MaceStatus status = CreateOutputTensorBuffer(net_def, type); + if (status != MaceStatus::MACE_SUCCESS) return status; } + return MaceStatus::MACE_SUCCESS; } -void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, - DeviceType device_type) { +MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, + DeviceType device_type) { if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) { - return; + return MaceStatus::MACE_SUCCESS; } DataType dtype = DataType::DT_INVALID; @@ -149,14 +155,24 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, // TODO(liuqi): refactor based on PB if (mem_block.mem_id() >= 20000) { std::unique_ptr image_buf( - new Image({mem_block.x(), mem_block.y()}, dtype)); + new Image()); + MaceStatus status = image_buf->Allocate( + {mem_block.x(), mem_block.y()}, dtype); + if (status != MaceStatus::MACE_SUCCESS) { + return status; + } preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf)); } } else { if (mem_block.mem_id() < 20000) { std::unique_ptr tensor_buf( - new Buffer(GetDeviceAllocator(device_type), mem_block.x())); + new Buffer(GetDeviceAllocator(device_type))); + MaceStatus status = tensor_buf->Allocate( + mem_block.x() * GetEnumTypeSize(dtype) + EXTRA_BUFFER_PAD_SIZE); + if (status != MaceStatus::MACE_SUCCESS) { + return status; + } preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(tensor_buf)); } @@ -193,6 +209,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, } } } + return MaceStatus::MACE_SUCCESS; } ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) { diff --git a/mace/core/workspace.h b/mace/core/workspace.h index 35bdc790c73d74e6fe37bec5368ce61798e9cbac..7399562dd63ad86b7b5b835c3b1eb2f9cda36d71 100644 --- a/mace/core/workspace.h +++ b/mace/core/workspace.h @@ -47,14 +47,15 @@ class Workspace { std::vector Tensors() const; - void LoadModelTensor(const NetDef &net_def, - DeviceType type, - const unsigned char *model_data); + MaceStatus LoadModelTensor(const NetDef &net_def, + DeviceType type, + const unsigned char *model_data); ScratchBuffer *GetScratchBuffer(DeviceType device_type); private: - void CreateOutputTensorBuffer(const NetDef &net_def, DeviceType device_type); + MaceStatus CreateOutputTensorBuffer(const NetDef &net_def, + DeviceType device_type); TensorMap tensor_map_; diff --git a/mace/kernels/arm/conv_2d_neon.h b/mace/kernels/arm/conv_2d_neon.h index 3d3c907e97221eb4970a47f5caba9b69b0e13070..5d2d5f9adb96571ee0a7def3a527cdb23c192d5d 100644 --- a/mace/kernels/arm/conv_2d_neon.h +++ b/mace/kernels/arm/conv_2d_neon.h @@ -31,68 +31,38 @@ extern void Conv2dNeonK1x1S1(const float *input, extern void Conv2dNeonK3x3S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK3x3S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK5x5S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK7x7S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK7x7S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK7x7S3(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); } // namespace kernels diff --git a/mace/kernels/arm/conv_2d_neon_3x3.cc b/mace/kernels/arm/conv_2d_neon_3x3.cc index 0e4ac0eb8c1dcc4ac0c3686d45d80c1f7f3ea266..58b28ddc48a5aa5b880c80e6bbdde8ca32f46e38 100644 --- a/mace/kernels/arm/conv_2d_neon_3x3.cc +++ b/mace/kernels/arm/conv_2d_neon_3x3.cc @@ -24,22 +24,22 @@ namespace kernels { // Ho = 2, Wo = 4, Co = 2 void Conv2dNeonK3x3S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 2) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 2) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 1 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) @@ -522,23 +522,22 @@ void Conv2dNeonK3x3S1(const float *input, void Conv2dNeonK3x3S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { - for (index_t c = 0; c < in_channels; ++c) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t c = 0; c < in_shape[1]; ++c) { + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; const float *in_base = input + b * in_batch_size + c * in_image_size; const float *filter_ptr = filter + m * in_channels * 9 + c * 9; diff --git a/mace/kernels/arm/conv_2d_neon_5x5.cc b/mace/kernels/arm/conv_2d_neon_5x5.cc index f4fe7ce7ccbaa3ca69de88da6234a17b66097470..3d77d8f6b5535a386dd2d6ba1a18367e1c189bf1 100644 --- a/mace/kernels/arm/conv_2d_neon_5x5.cc +++ b/mace/kernels/arm/conv_2d_neon_5x5.cc @@ -103,22 +103,22 @@ inline void Conv2dCPUK5x5Calc(const float *in_ptr_base, // Ho = 1, Wo = 4, Co = 4 void Conv2dNeonK5x5S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) diff --git a/mace/kernels/arm/conv_2d_neon_7x7.cc b/mace/kernels/arm/conv_2d_neon_7x7.cc index 057b93138cebb226fab6008798ccc273a29e574d..4432f2a05b848fdc9978b5a10fed7985ff3d4cff 100644 --- a/mace/kernels/arm/conv_2d_neon_7x7.cc +++ b/mace/kernels/arm/conv_2d_neon_7x7.cc @@ -180,22 +180,22 @@ inline void Conv2dCPUK7x7Calc(const float *in_ptr_base, // Ho = 1, Wo = 4, Co = 4 void Conv2dNeonK7x7S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) @@ -336,22 +336,22 @@ void Conv2dNeonK7x7S1(const float *input, // Ho = 1, Wo = 4, Co = 4 void Conv2dNeonK7x7S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) @@ -502,22 +502,22 @@ void Conv2dNeonK7x7S2(const float *input, // Ho = 1, Wo = 4, Co = 4 void Conv2dNeonK7x7S3(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) diff --git a/mace/kernels/arm/depthwise_conv2d_neon.h b/mace/kernels/arm/depthwise_conv2d_neon.h index 130cd360c434ac7283ad83f93587d4ce9c02e3f9..119867bfe3efdc3791aa267566b59bafc9edf8a4 100644 --- a/mace/kernels/arm/depthwise_conv2d_neon.h +++ b/mace/kernels/arm/depthwise_conv2d_neon.h @@ -22,15 +22,9 @@ namespace kernels { void DepthwiseConv2dNeonK3x3S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const int *pad_hw, const index_t valid_h_start, const index_t valid_h_stop, const index_t valid_w_start, @@ -39,15 +33,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, void DepthwiseConv2dNeonK3x3S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const int *pad_hw, const index_t valid_h_start, const index_t valid_h_stop, const index_t valid_w_start, diff --git a/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc b/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc index fb36bdaded33d5217f1ccb9ae1d9427204433cc5..443e57b7ec5a3d7bfe473ea42e79252eab7c4305 100644 --- a/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc +++ b/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc @@ -52,15 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base, // Ho = 2, Wo = 4, Co = 1 void DepthwiseConv2dNeonK3x3S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int pad_top, - const int pad_left, + const index_t* in_shape, + const index_t* out_shape, + const int* pad_hw, const index_t valid_h_start, const index_t valid_h_stop, const index_t valid_w_start, @@ -70,25 +64,30 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, MACE_UNUSED(valid_w_start); MACE_UNUSED(valid_w_stop); #endif - const index_t multiplier = out_channels / in_channels; - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t multiplier = out_shape[1] / in_shape[1]; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { + for (index_t b = 0; b < in_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { index_t c = m / multiplier; index_t multi_index = m % multiplier; const float *in_base = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9; + const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9; float *out_base = output + b * out_batch_size + m * out_image_size; index_t h, w; + const index_t pad_top = pad_hw[0]; + const index_t pad_left = pad_hw[1]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; // top for (h = 0; h < valid_h_start; ++h) { - for (w = 0; w < out_width; ++w) { + for (w = 0; w < out_shape[3]; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, @@ -256,7 +255,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, } // h #else for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) { - for (index_t iw = 0; iw < out_width; ++iw) { + for (index_t iw = 0; iw < out_shape[3]; ++iw) { DepthwiseConv2dPixel(in_base, filter_ptr, ih, @@ -274,8 +273,8 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, #endif // bottom - for (; h < out_height; ++h) { - for (w = 0; w < out_width; ++w) { + for (; h < out_shape[2]; ++h) { + for (w = 0; w < out_shape[3]; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, @@ -296,15 +295,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, void DepthwiseConv2dNeonK3x3S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int pad_top, - const int pad_left, + const index_t* in_shape, + const index_t* out_shape, + const int* pad_hw, const index_t valid_h_start, const index_t valid_h_stop, const index_t valid_w_start, @@ -314,22 +307,26 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, MACE_UNUSED(valid_w_start); MACE_UNUSED(valid_w_stop); #endif - const index_t multiplier = out_channels / in_channels; - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t multiplier = out_shape[1] / in_shape[1]; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { + for (index_t b = 0; b < in_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { index_t c = m / multiplier; index_t multi_index = m % multiplier; const float *in_base = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9; + const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9; float *out_base = output + b * out_batch_size + m * out_image_size; index_t h, w; - + const index_t pad_top = pad_hw[0]; + const index_t pad_left = pad_hw[1]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; // top for (h = 0; h < valid_h_start; ++h) { for (w = 0; w < out_width; ++w) { @@ -472,8 +469,8 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, #endif // bottom - for (; h < out_height; ++h) { - for (w = 0; w < out_width; ++w) { + for (; h < out_shape[2]; ++h) { + for (w = 0; w < out_shape[3]; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index dfe5540d16e60bf250685e6a725cede177a2ff97..eb374960cab4eb5d14a29316c4d78ab6fb2c324b 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -84,49 +84,45 @@ struct Conv2dFunctor : Conv2dFunctorBase { void Conv2dGeneral(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int filter_height, - const int filter_width, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, + const index_t *in_shape, + const index_t *out_shape, + const index_t *filter_shape, + const int *stride_hw, + const int *dilation_hw, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - const index_t filter_size = filter_height * filter_width; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = filter_shape[1] * in_image_size; + const index_t out_batch_size = filter_shape[0] * out_image_size; + const index_t filter_size = filter_shape[2] * filter_shape[3]; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < in_shape[0]; b++) { + for (index_t m = 0; m < filter_shape[0]; m += 4) { + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t out_channels = filter_shape[0]; + const index_t in_channels = filter_shape[1]; + + const int stride_h = stride_hw[0]; + const int stride_w = stride_hw[1]; + const int dilation_h = dilation_hw[0]; + const int dilation_w = dilation_hw[1]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; + float *out_ptr1_base = out_ptr0_base + out_image_size; + float *out_ptr2_base = out_ptr1_base + out_image_size; + float *out_ptr3_base = out_ptr2_base + out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input + b * in_batch_size + c * in_image_size; const float *filter_ptr0 = filter + m * in_channels * filter_size + c * filter_size; - const float *filter_ptr1 = - filter + (m + 1) * in_channels * filter_size + c * filter_size; - const float *filter_ptr2 = - filter + (m + 2) * in_channels * filter_size + c * filter_size; - const float *filter_ptr3 = - filter + (m + 3) * in_channels * filter_size + c * filter_size; + const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; + const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; + const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -144,8 +140,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { vo3[ow] = out_ptr3_base[out_offset + ow]; } // calc by row - for (index_t kh = 0; kh < filter_height; ++kh) { - for (index_t kw = 0; kw < filter_width; ++kw) { + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { // outch 0 vo0[0] += in_ptr_base[in_offset + kw * dilation_w] * filter_ptr0[kw]; @@ -185,10 +181,10 @@ struct Conv2dFunctor : Conv2dFunctorBase { } // kw in_offset += dilation_h * in_width; - filter_ptr0 += filter_width; - filter_ptr1 += filter_width; - filter_ptr2 += filter_width; - filter_ptr3 += filter_width; + filter_ptr0 += filter_shape[3]; + filter_ptr1 += filter_shape[3]; + filter_ptr2 += filter_shape[3]; + filter_ptr3 += filter_shape[3]; } // kh for (index_t ow = 0; ow < 4; ++ow) { @@ -230,8 +226,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { } // calc by row - for (index_t kh = 0; kh < filter_height; ++kh) { - for (index_t kw = 0; kw < filter_width; ++kw) { + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { // outch 0 vo0[0] += in_ptr_base[in_offset + kw * dilation_w] * filter_ptr0[kw]; @@ -244,7 +240,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { } // kw in_offset += dilation_h * in_width; - filter_ptr0 += filter_width; + filter_ptr0 += filter_shape[3]; } // kh for (index_t ow = 0; ow < 4; ++ow) { @@ -301,7 +297,6 @@ struct Conv2dFunctor : Conv2dFunctorBase { output_shape.data()); } output->Resize(output_shape); - output->Clear(); index_t batch = output->dim(0); index_t channels = output->dim(1); @@ -419,7 +414,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { if (extra_input_width != padded_input_width) { pad_right += (extra_input_width - padded_input_width); } - } else { + } else if (!use_neon_1x1_s1) { extra_output_height = height; extra_input_height = std::max(padded_input_height, (extra_output_height - 1) * stride_h @@ -478,6 +473,10 @@ struct Conv2dFunctor : Conv2dFunctorBase { transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT); Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT); Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT); + const index_t extra_input_shape[4] = + {batch, input_channels, extra_input_height, extra_input_width}; + const index_t extra_output_shape[4] = + {batch, channels, extra_output_height, extra_output_width}; // decide which convolution function to call if (use_winograd) { @@ -512,6 +511,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { float *transformed_input_data = transformed_input.mutable_data(); float *transformed_output_data = transformed_output.mutable_data(); + conv_func = [=](const float *pad_input, float *pad_output) { WinoGradConv3x3s1(pad_input, transformed_filter_ptr, @@ -529,26 +529,16 @@ struct Conv2dFunctor : Conv2dFunctorBase { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK3x3S1(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_3x3_s2) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK3x3S2(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_1x1_s1) { @@ -566,71 +556,43 @@ struct Conv2dFunctor : Conv2dFunctorBase { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK5x5S1(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_7x7_s1) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK7x7S1(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_7x7_s2) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK7x7S2(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_7x7_s3) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK7x7S3(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dGeneral(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, - filter_h, - filter_w, - stride_h, - stride_w, - dilation_h, - dilation_w, + extra_input_shape, + extra_output_shape, + filter_shape.data(), + strides_, + dilations_, pad_output); }; } @@ -639,7 +601,6 @@ struct Conv2dFunctor : Conv2dFunctorBase { const Tensor *pad_input_ptr = input; if (extra_input_height != input_height || extra_input_width != input_width) { - padded_input.Clear(); ConstructNCHWInputWithSpecificPadding(input, pad_top, pad_bottom, @@ -649,13 +610,17 @@ struct Conv2dFunctor : Conv2dFunctorBase { pad_input_ptr = &padded_input; } + // TODO(libin): don't need clear after bias is integrated in each conv Tensor *pad_output_ptr = output; if (extra_output_height != height || extra_output_width != width) { padded_output.Reshape({batch, channels, extra_output_height, extra_output_width}); padded_output.Clear(); pad_output_ptr = &padded_output; + } else if (!use_neon_1x1_s1) { + output->Clear(); } + const float *pad_input_data = pad_input_ptr->data(); float *pad_output_data = pad_output_ptr->mutable_data(); diff --git a/mace/kernels/conv_pool_2d_util.cc b/mace/kernels/conv_pool_2d_util.cc index b0f5229f94de2be770f68eb42d538170da8fa7ca..07c72cb35eece69e9b2cefae8b82841dac397524 100644 --- a/mace/kernels/conv_pool_2d_util.cc +++ b/mace/kernels/conv_pool_2d_util.cc @@ -377,6 +377,7 @@ void ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor, std::vector output_shape( {batch, channels, height + pad_height, width + pad_width}); output_tensor->Resize(output_shape); + output_tensor->Clear(); Tensor::MappingGuard padded_output_mapper(output_tensor); float *output_data = output_tensor->mutable_data(); diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h index 14d78a45fee4b6bd62914b46e72a8e804fa83d0f..7c20adddaa91f7fecff80008afcaa9f2680e323f 100644 --- a/mace/kernels/deconv_2d.h +++ b/mace/kernels/deconv_2d.h @@ -41,48 +41,40 @@ template void Deconv2dNCHW(const T *input, const T *filter, const T *bias, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const index_t filter_height, - const index_t filter_width, - const index_t stride_h, - const index_t stride_w, - const int padding_top, - const int padding_left, + const index_t *in_shape, + const index_t *out_shape, + const index_t *kernel_hw, + const int *strides, + const int *padding, float *output) { #pragma omp parallel for collapse(4) - for (index_t b = 0; b < batch; ++b) { - for (index_t oc = 0; oc < out_channels; ++oc) { - for (index_t oh = 0; oh < out_height; ++oh) { - for (index_t ow = 0; ow < out_width; ++ow) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t oc = 0; oc < out_shape[1]; ++oc) { + for (index_t oh = 0; oh < out_shape[2]; ++oh) { + for (index_t ow = 0; ow < out_shape[3]; ++ow) { index_t filter_start_y, filter_start_x; - index_t start_x = std::max(0, ow + stride_w -1 - padding_left); - index_t start_y = std::max(0, oh + stride_h -1 - padding_top); - start_x /= stride_w; - start_y /= stride_h; - filter_start_x = padding_left + stride_w * start_x - ow; - filter_start_y = padding_top + stride_h * start_y - oh; - filter_start_x = filter_width - 1 - filter_start_x; - filter_start_y = filter_height - 1 - filter_start_y; + index_t start_x = std::max(0, ow + strides[1] -1 - padding[1]); + index_t start_y = std::max(0, oh + strides[0] -1 - padding[0]); + start_x /= strides[1]; + start_y /= strides[0]; + filter_start_x = padding[1] + strides[1] * start_x - ow; + filter_start_y = padding[0] + strides[0] * start_y - oh; + filter_start_x = kernel_hw[1] - 1 - filter_start_x; + filter_start_y = kernel_hw[0] - 1 - filter_start_y; T out_value = 0; index_t out_pos = - ((b * out_channels + oc) * out_height + oh) * out_width + ow; - for (index_t ic = 0; ic < in_channels; ++ic) { + ((b * out_shape[1] + oc) * out_shape[2] + oh) * out_shape[3] + ow; + for (index_t ic = 0; ic < in_shape[1]; ++ic) { for (index_t f_y = filter_start_y, ih = start_y; - f_y >= 0 && ih < in_height; f_y -= stride_h, ++ih) { + f_y >= 0 && ih < in_shape[2]; f_y -= strides[0], ++ih) { for (index_t f_x = filter_start_x, iw = start_x; - f_x >= 0 && iw < in_width; f_x -= stride_w, ++iw) { + f_x >= 0 && iw < in_shape[3]; f_x -= strides[1], ++iw) { index_t weight_pos = - ((oc * in_channels + ic) * filter_height + f_y) - * filter_width + f_x; + ((oc * in_shape[1] + ic) * kernel_hw[0] + f_y) + * kernel_hw[1] + f_x; index_t in_pos = - ((b * in_channels + ic) * in_height + ih) - * in_width + iw; + ((b * in_shape[1] + ic) * in_shape[2] + ih) + * in_shape[3] + iw; out_value += input[in_pos] * filter[weight_pos]; } } @@ -269,26 +261,17 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { paddings_.data(), true); output->Resize(output_shape_); } - index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t height = output->dim(2); - index_t width = output->dim(3); - - index_t input_batch = input->dim(0); - index_t input_channels = input->dim(1); - index_t input_height = input->dim(2); - index_t input_width = input->dim(3); - index_t kernel_h = filter->dim(2); index_t kernel_w = filter->dim(3); - MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels); - MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", - input_channels); - - index_t stride_h = strides_[0]; - index_t stride_w = strides_[1]; - - MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); + const index_t *in_shape = input->shape().data(); + const index_t *out_shape = output->shape().data(); + const index_t kernel_hw[2] = {kernel_h, kernel_w}; + + MACE_CHECK(filter->dim(0) == out_shape[1], filter->dim(0), " != ", + output_shape[1]); + MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ", + in_shape[1]); + MACE_CHECK(in_shape[0] == out_shape[0], "Input/Output batch size mismatch"); Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard bias_mapper(bias); @@ -297,17 +280,23 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { auto filter_data = filter->data(); auto bias_data = bias == nullptr ? nullptr : bias->data(); auto output_data = output->mutable_data(); - int padding_top = (paddings_[0] + 1) >> 1; - int padding_left = (paddings_[1] + 1) >> 1; - - deconv::Deconv2dNCHW(input_data, filter_data, bias_data, - batch, input_height, input_width, input_channels, - height, width, channels, - kernel_h, kernel_w, - stride_h, stride_w, padding_top, padding_left, + int padding[2]; + padding[0] = (paddings_[0] + 1) >> 1; + padding[1] = (paddings_[1] + 1) >> 1; + deconv::Deconv2dNCHW(input_data, + filter_data, + bias_data, + in_shape, + out_shape, + kernel_hw, + strides_, + padding, output_data); - DoActivation(output_data, output_data, output->size(), activation_, + DoActivation(output_data, + output_data, + output->size(), + activation_, relux_max_limit_); } }; diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index 733591a5ef8fd906945736848ef3bed6f75c1b10..2afd905b97b77180a61de92f5caea403c4e5ea79 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -34,10 +34,10 @@ struct DepthToSpaceOpFunctor { : block_size_(block_size), d2s_(d2s) {} void operator()(const Tensor *input, Tensor *output, StatsFuture *future) { MACE_UNUSED(future); - const int batch_size = input->dim(0); - const int input_depth = input->dim(1); - const int input_height = input->dim(2); - const int input_width = input->dim(3); + const index_t batch_size = input->dim(0); + const index_t input_depth = input->dim(1); + const index_t input_height = input->dim(2); + const index_t input_width = input->dim(3); index_t output_depth, output_width, output_height; @@ -62,11 +62,11 @@ struct DepthToSpaceOpFunctor { if (d2s_) { #pragma omp parallel for - for (int b = 0; b < batch_size; ++b) { - for (int d = 0; d < output_depth; ++d) { - for (int h = 0; h < output_height; ++h) { - const int in_h = h / block_size_; - const int offset_h = (h % block_size_); + for (index_t b = 0; b < batch_size; ++b) { + for (index_t d = 0; d < output_depth; ++d) { + for (index_t h = 0; h < output_height; ++h) { + const index_t in_h = h / block_size_; + const index_t offset_h = (h % block_size_); for (int w = 0; w < output_width; ++w) { const index_t in_w = w / block_size_; const index_t offset_w = w % block_size_; @@ -86,18 +86,18 @@ struct DepthToSpaceOpFunctor { } } else { #pragma omp parallel for - for (int b = 0; b < batch_size; ++b) { - for (int d = 0; d < input_depth; ++d) { - for (int h = 0; h < input_height; ++h) { - const int out_h = h / block_size_; - const int offset_h = (h % block_size_); - for (int w = 0; w < input_width; ++w) { - const int out_w = w / block_size_; - const int offset_w = (w % block_size_); - const int offset_d = + for (index_t b = 0; b < batch_size; ++b) { + for (index_t d = 0; d < input_depth; ++d) { + for (index_t h = 0; h < input_height; ++h) { + const index_t out_h = h / block_size_; + const index_t offset_h = (h % block_size_); + for (index_t w = 0; w < input_width; ++w) { + const index_t out_w = w / block_size_; + const index_t offset_w = (w % block_size_); + const index_t offset_d = (offset_h * block_size_ + offset_w) * input_depth; - const int out_d = d + offset_d; + const index_t out_d = d + offset_d; const index_t o_index = ((b * output_depth + out_d) * output_height + out_h) * output_width + out_w; diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index ce3c1e48551a66915fcf1888dd21d5ced19edeea..a276b504e0c83cd0606b14965936af796be3f06a 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -78,28 +78,27 @@ struct DepthwiseConv2dFunctor void DepthwiseConv2dGeneral(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int filter_height, - const int filter_width, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const index_t *filter_shape, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, float *output) { - const index_t multiplier = out_channels / in_channels; + const index_t multiplier = filter_shape[0] / filter_shape[1]; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { + for (index_t b = 0; b < in_shape[0]; ++b) { + for (index_t m = 0; m < filter_shape[0]; ++m) { + for (index_t h = 0; h < out_shape[2]; ++h) { + for (index_t w = 0; w < out_shape[3]; ++w) { + const index_t out_channels = filter_shape[0]; + const index_t in_channels = filter_shape[1]; + const index_t filter_height = filter_shape[2]; + const index_t filter_width = filter_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; index_t out_offset = ((b * out_channels + m) * out_height + h) * out_width + w; index_t c = m / multiplier; @@ -107,8 +106,8 @@ struct DepthwiseConv2dFunctor float sum = 0; for (index_t kh = 0; kh < filter_height; ++kh) { for (index_t kw = 0; kw < filter_width; ++kw) { - index_t ih = h * stride_h + kh * dilation_h - pad_top; - index_t iw = w * stride_w + kw * dilation_w - pad_left; + index_t ih = h * stride_hw[0] + kh * dilation_hw[0] - pad_hw[0]; + index_t iw = w * stride_hw[1] + kw * dilation_hw[1] - pad_hw[1]; if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { index_t in_offset = ((b * in_channels + c) * in_height + ih) * in_width + iw; @@ -214,20 +213,18 @@ struct DepthwiseConv2dFunctor auto bias_data = bias == nullptr ? nullptr : bias->data(); auto output_data = output->mutable_data(); + const int pad_hw[2] = {pad_top, pad_left}; + const index_t input_shape[4] = + {batch, input_channels, input_height, input_width}; + if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { conv_func = [=](const float *input, float *output) { DepthwiseConv2dNeonK3x3S1(input, filter_data, - batch, - input_height, - input_width, - input_channels, - height, - width, - channels, - pad_top, - pad_left, + input_shape, + output_shape.data(), + pad_hw, valid_h_start, valid_h_stop, valid_w_start, @@ -239,15 +236,9 @@ struct DepthwiseConv2dFunctor conv_func = [=](const float *input, float *output) { DepthwiseConv2dNeonK3x3S2(input, filter_data, - batch, - input_height, - input_width, - input_channels, - height, - width, - channels, - pad_top, - pad_left, + input_shape, + output_shape.data(), + pad_hw, valid_h_start, valid_h_stop, valid_w_start, @@ -258,21 +249,12 @@ struct DepthwiseConv2dFunctor conv_func = [=](const float *input, float *output) { DepthwiseConv2dGeneral(input, filter_data, - batch, - input_height, - input_width, - input_channels, - height, - width, - channels, - filter_h, - filter_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - pad_top, - pad_left, + input_shape, + output_shape.data(), + filter_shape.data(), + strides_, + dilations_, + pad_hw, output); }; } diff --git a/mace/kernels/gemm.cc b/mace/kernels/gemm.cc index 0fae44de1fa7c1c77195cfd6c93140c6e60c1d05..178e0720a13555d00387ccc57a87f065cd746b42 100644 --- a/mace/kernels/gemm.cc +++ b/mace/kernels/gemm.cc @@ -140,8 +140,8 @@ inline void GemmTile(const float *A, #endif #if defined(MACE_ENABLE_NEON) && defined(__aarch64__) - for (h = 0; h + 7 < height; h += 8) { - for (k = 0; k + 7 < K; k += 8) { + for (h = 0; h < height - 7; h += 8) { + for (k = 0; k < K - 7; k += 8) { const float *a_ptr = A + (h * stride_k + k); #ifdef __clang__ int nw = width >> 2; @@ -185,156 +185,150 @@ inline void GemmTile(const float *A, float *c_ptr7 = C + (h + 7) * stride_w; asm volatile( - "0: \n" - - "prfm pldl1keep, [%1, #128] \n" - "ld1 {v24.4s}, [%1] \n" - - // load b: 0-7 - "prfm pldl1keep, [%9, #128] \n" - "ld1 {v16.4s}, [%9], #16 \n" - - "prfm pldl1keep, [%10, #128] \n" - "ld1 {v17.4s}, [%10], #16 \n" - - "prfm pldl1keep, [%11, #128] \n" - "ld1 {v18.4s}, [%11], #16 \n" - - "prfm pldl1keep, [%12, #128] \n" - "ld1 {v19.4s}, [%12], #16 \n" - - "prfm pldl1keep, [%2, #128] \n" - "ld1 {v25.4s}, [%2] \n" - - "prfm pldl1keep, [%13, #128] \n" - "ld1 {v20.4s}, [%13], #16 \n" - - "prfm pldl1keep, [%14, #128] \n" - "ld1 {v21.4s}, [%14], #16 \n" - - "prfm pldl1keep, [%15, #128] \n" - "ld1 {v22.4s}, [%15], #16 \n" - - "prfm pldl1keep, [%16, #128] \n" - "ld1 {v23.4s}, [%16], #16 \n" - - "prfm pldl1keep, [%3, #128] \n" - "ld1 {v26.4s}, [%3] \n" - - "fmla v24.4s, v16.4s, %34.s[0] \n" - "fmla v24.4s, v17.4s, %34.s[1] \n" - "fmla v24.4s, v18.4s, %34.s[2] \n" - "fmla v24.4s, v19.4s, %34.s[3] \n" - - "fmla v24.4s, v20.4s, %35.s[0] \n" - "fmla v24.4s, v21.4s, %35.s[1] \n" - "fmla v24.4s, v22.4s, %35.s[2] \n" - "fmla v24.4s, v23.4s, %35.s[3] \n" - - "st1 {v24.4s}, [%1], #16 \n" - - "fmla v25.4s, v16.4s, %36.s[0] \n" - "fmla v25.4s, v17.4s, %36.s[1] \n" - "fmla v25.4s, v18.4s, %36.s[2] \n" - "fmla v25.4s, v19.4s, %36.s[3] \n" - - "fmla v25.4s, v20.4s, %37.s[0] \n" - "fmla v25.4s, v21.4s, %37.s[1] \n" - "fmla v25.4s, v22.4s, %37.s[2] \n" - "fmla v25.4s, v23.4s, %37.s[3] \n" - - "prfm pldl1keep, [%4, #128] \n" - "ld1 {v24.4s}, [%4] \n" - - "st1 {v25.4s}, [%2], #16 \n" - - "fmla v26.4s, v16.4s, %38.s[0] \n" - "fmla v26.4s, v17.4s, %38.s[1] \n" - "fmla v26.4s, v18.4s, %38.s[2] \n" - "fmla v26.4s, v19.4s, %38.s[3] \n" + "prfm pldl1keep, [%9, #128] \n" + "ld1 {v16.4s}, [%9], #16 \n" - "fmla v26.4s, v20.4s, %39.s[0] \n" - "fmla v26.4s, v21.4s, %39.s[1] \n" - "fmla v26.4s, v22.4s, %39.s[2] \n" - "fmla v26.4s, v23.4s, %39.s[3] \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v18.4s}, [%1] \n" - "prfm pldl1keep, [%5, #128] \n" - "ld1 {v25.4s}, [%5] \n" + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v19.4s}, [%2] \n" - "st1 {v26.4s}, [%3], #16 \n" - - "fmla v24.4s, v16.4s, %40.s[0] \n" - "fmla v24.4s, v17.4s, %40.s[1] \n" - "fmla v24.4s, v18.4s, %40.s[2] \n" - "fmla v24.4s, v19.4s, %40.s[3] \n" - - "fmla v24.4s, v20.4s, %41.s[0] \n" - "fmla v24.4s, v21.4s, %41.s[1] \n" - "fmla v24.4s, v22.4s, %41.s[2] \n" - "fmla v24.4s, v23.4s, %41.s[3] \n" - - "prfm pldl1keep, [%6, #128] \n" - "ld1 {v26.4s}, [%6] \n" - - "st1 {v24.4s}, [%4], #16 \n" - - "fmla v25.4s, v16.4s, %42.s[0] \n" - "fmla v25.4s, v17.4s, %42.s[1] \n" - "fmla v25.4s, v18.4s, %42.s[2] \n" - "fmla v25.4s, v19.4s, %42.s[3] \n" - - "fmla v25.4s, v20.4s, %43.s[0] \n" - "fmla v25.4s, v21.4s, %43.s[1] \n" - "fmla v25.4s, v22.4s, %43.s[2] \n" - "fmla v25.4s, v23.4s, %43.s[3] \n" - - "prfm pldl1keep, [%7, #128] \n" - "ld1 {v24.4s}, [%7] \n" - - "st1 {v25.4s}, [%5], #16 \n" - - "fmla v26.4s, v16.4s, %44.s[0] \n" - "fmla v26.4s, v17.4s, %44.s[1] \n" - "fmla v26.4s, v18.4s, %44.s[2] \n" - "fmla v26.4s, v19.4s, %44.s[3] \n" - - "fmla v26.4s, v20.4s, %45.s[0] \n" - "fmla v26.4s, v21.4s, %45.s[1] \n" - "fmla v26.4s, v22.4s, %45.s[2] \n" - "fmla v26.4s, v23.4s, %45.s[3] \n" - - "prfm pldl1keep, [%8, #128] \n" - "ld1 {v25.4s}, [%8] \n" - - "st1 {v26.4s}, [%6], #16 \n" - - "fmla v24.4s, v16.4s, %46.s[0] \n" - "fmla v24.4s, v17.4s, %46.s[1] \n" - "fmla v24.4s, v18.4s, %46.s[2] \n" - "fmla v24.4s, v19.4s, %46.s[3] \n" - - "fmla v24.4s, v20.4s, %47.s[0] \n" - "fmla v24.4s, v21.4s, %47.s[1] \n" - "fmla v24.4s, v22.4s, %47.s[2] \n" - "fmla v24.4s, v23.4s, %47.s[3] \n" - - "st1 {v24.4s}, [%7], #16 \n" - - "fmla v25.4s, v16.4s, %48.s[0] \n" - "fmla v25.4s, v17.4s, %48.s[1] \n" - "fmla v25.4s, v18.4s, %48.s[2] \n" - "fmla v25.4s, v19.4s, %48.s[3] \n" - - "fmla v25.4s, v20.4s, %49.s[0] \n" - "fmla v25.4s, v21.4s, %49.s[1] \n" - "fmla v25.4s, v22.4s, %49.s[2] \n" - "fmla v25.4s, v23.4s, %49.s[3] \n" - - "st1 {v25.4s}, [%8], #16 \n" + "0: \n" - "subs %w0, %w0, #1 \n" - "bne 0b \n" - : "=r"(nw), // 0 + "prfm pldl1keep, [%3, #128] \n" + "ld1 {v20.4s}, [%3] \n" + "prfm pldl1keep, [%4, #128] \n" + "ld1 {v21.4s}, [%4] \n" + "prfm pldl1keep, [%5, #128] \n" + "ld1 {v22.4s}, [%5] \n" + "prfm pldl1keep, [%6, #128] \n" + "ld1 {v23.4s}, [%6] \n" + "prfm pldl1keep, [%7, #128] \n" + "ld1 {v24.4s}, [%7] \n" + "prfm pldl1keep, [%8, #128] \n" + "ld1 {v25.4s}, [%8] \n" + "prfm pldl1keep, [%10, #128] \n" + "ld1 {v17.4s}, [%10], #16 \n" + + "fmla v18.4s, v16.4s, %34.s[0] \n" + "fmla v19.4s, v16.4s, %35.s[0] \n" + "fmla v20.4s, v16.4s, %36.s[0] \n" + "fmla v21.4s, v16.4s, %37.s[0] \n" + + "fmla v22.4s, v16.4s, %38.s[0] \n" + "fmla v23.4s, v16.4s, %39.s[0] \n" + "fmla v24.4s, v16.4s, %40.s[0] \n" + "fmla v25.4s, v16.4s, %41.s[0] \n" + + "fmla v18.4s, v17.4s, %34.s[1] \n" + "fmla v19.4s, v17.4s, %35.s[1] \n" + "fmla v20.4s, v17.4s, %36.s[1] \n" + "fmla v21.4s, v17.4s, %37.s[1] \n" + + "prfm pldl1keep, [%11, #128] \n" + "ld1 {v16.4s}, [%11], #16 \n" + + "fmla v22.4s, v17.4s, %38.s[1] \n" + "fmla v23.4s, v17.4s, %39.s[1] \n" + "fmla v24.4s, v17.4s, %40.s[1] \n" + "fmla v25.4s, v17.4s, %41.s[1] \n" + + "fmla v18.4s, v16.4s, %34.s[2] \n" + "fmla v19.4s, v16.4s, %35.s[2] \n" + "fmla v20.4s, v16.4s, %36.s[2] \n" + "fmla v21.4s, v16.4s, %37.s[2] \n" + + "prfm pldl1keep, [%12, #128] \n" + "ld1 {v17.4s}, [%12], #16 \n" + + "fmla v22.4s, v16.4s, %38.s[2] \n" + "fmla v23.4s, v16.4s, %39.s[2] \n" + "fmla v24.4s, v16.4s, %40.s[2] \n" + "fmla v25.4s, v16.4s, %41.s[2] \n" + + "fmla v18.4s, v17.4s, %34.s[3] \n" + "fmla v19.4s, v17.4s, %35.s[3] \n" + "fmla v20.4s, v17.4s, %36.s[3] \n" + "fmla v21.4s, v17.4s, %37.s[3] \n" + + "prfm pldl1keep, [%13, #128] \n" + "ld1 {v16.4s}, [%13], #16 \n" + + "fmla v22.4s, v17.4s, %38.s[3] \n" + "fmla v23.4s, v17.4s, %39.s[3] \n" + "fmla v24.4s, v17.4s, %40.s[3] \n" + "fmla v25.4s, v17.4s, %41.s[3] \n" + + "fmla v18.4s, v16.4s, %42.s[0] \n" + "fmla v19.4s, v16.4s, %43.s[0] \n" + "fmla v20.4s, v16.4s, %44.s[0] \n" + "fmla v21.4s, v16.4s, %45.s[0] \n" + + "prfm pldl1keep, [%14, #128] \n" + "ld1 {v17.4s}, [%14], #16 \n" + + "fmla v22.4s, v16.4s, %46.s[0] \n" + "fmla v23.4s, v16.4s, %47.s[0] \n" + "fmla v24.4s, v16.4s, %48.s[0] \n" + "fmla v25.4s, v16.4s, %49.s[0] \n" + + "fmla v18.4s, v17.4s, %42.s[1] \n" + "fmla v19.4s, v17.4s, %43.s[1] \n" + "fmla v20.4s, v17.4s, %44.s[1] \n" + "fmla v21.4s, v17.4s, %45.s[1] \n" + + "prfm pldl1keep, [%15, #128] \n" + "ld1 {v16.4s}, [%15], #16 \n" + + "fmla v22.4s, v17.4s, %46.s[1] \n" + "fmla v23.4s, v17.4s, %47.s[1] \n" + "fmla v24.4s, v17.4s, %48.s[1] \n" + "fmla v25.4s, v17.4s, %49.s[1] \n" + + "fmla v18.4s, v16.4s, %42.s[2] \n" + "fmla v19.4s, v16.4s, %43.s[2] \n" + "fmla v20.4s, v16.4s, %44.s[2] \n" + "fmla v21.4s, v16.4s, %45.s[2] \n" + + "prfm pldl1keep, [%16, #128] \n" + "ld1 {v17.4s}, [%16], #16 \n" + + "fmla v22.4s, v16.4s, %46.s[2] \n" + "fmla v23.4s, v16.4s, %47.s[2] \n" + "fmla v24.4s, v16.4s, %48.s[2] \n" + "fmla v25.4s, v16.4s, %49.s[2] \n" + + "fmla v18.4s, v17.4s, %42.s[3] \n" + "fmla v19.4s, v17.4s, %43.s[3] \n" + "fmla v20.4s, v17.4s, %44.s[3] \n" + "fmla v21.4s, v17.4s, %45.s[3] \n" + + "st1 {v18.4s}, [%1], #16 \n" + "st1 {v19.4s}, [%2], #16 \n" + "st1 {v20.4s}, [%3], #16 \n" + "st1 {v21.4s}, [%4], #16 \n" + + "fmla v22.4s, v17.4s, %46.s[3] \n" + "fmla v23.4s, v17.4s, %47.s[3] \n" + "fmla v24.4s, v17.4s, %48.s[3] \n" + "fmla v25.4s, v17.4s, %49.s[3] \n" + + "st1 {v22.4s}, [%5], #16 \n" + "st1 {v23.4s}, [%6], #16 \n" + "st1 {v24.4s}, [%7], #16 \n" + "st1 {v25.4s}, [%8], #16 \n" + + "prfm pldl1keep, [%9, #128] \n" + "ld1 {v16.4s}, [%9], #16 \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v18.4s}, [%1] \n" + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v19.4s}, [%2] \n" + + "subs %w0, %w0, #1 \n" + "bne 0b \n" + : "=r"(nw), // 0 "=r"(c_ptr0), // 1 "=r"(c_ptr1), // 2 "=r"(c_ptr2), // 3 @@ -351,7 +345,7 @@ inline void GemmTile(const float *A, "=r"(b_ptr5), // 14 "=r"(b_ptr6), // 15 "=r"(b_ptr7) // 16 - : "0"(nw), // 17 + : "0"(nw), // 17 "1"(c_ptr0), // 18 "2"(c_ptr1), // 19 "3"(c_ptr2), // 20 @@ -369,20 +363,20 @@ inline void GemmTile(const float *A, "15"(b_ptr6), // 32 "16"(b_ptr7), // 33 "w"(a0), // 34 - "w"(a1), // 35 - "w"(a2), // 36 - "w"(a3), // 37 - "w"(a4), // 38 - "w"(a5), // 39 - "w"(a6), // 40 - "w"(a7), // 41 - "w"(a8), // 42 - "w"(a9), // 43 - "w"(a10), // 44 - "w"(a11), // 45 - "w"(a12), // 46 - "w"(a13), // 47 - "w"(a14), // 48 + "w"(a2), // 35 + "w"(a4), // 36 + "w"(a6), // 37 + "w"(a8), // 38 + "w"(a10), // 39 + "w"(a12), // 40 + "w"(a14), // 41 + "w"(a1), // 42 + "w"(a3), // 43 + "w"(a5), // 44 + "w"(a7), // 45 + "w"(a9), // 46 + "w"(a11), // 47 + "w"(a13), // 48 "w"(a15) // 49 : "cc", "memory", "v16", @@ -585,7 +579,6 @@ void Gemm(const float *A, } memset(C, 0, sizeof(float) * batch * height * width); - // It is better to use large block size if it fits for fast cache. // Assume l1 cache size is 32k, we load three blocks at a time (A, B, C), // the block size should be sqrt(32k / sizeof(T) / 3). diff --git a/mace/kernels/opencl/activation.cc b/mace/kernels/opencl/activation.cc index 5cee48620aa0aa6be6600bbbe331016a879c4c54..6b55696614201b24d4492275c2ae219a5038926e 100644 --- a/mace/kernels/opencl/activation.cc +++ b/mace/kernels/opencl/activation.cc @@ -45,7 +45,8 @@ void ActivationFunctor::operator()(const Tensor *input, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index 4587a2cb7b2c8a5cebe0470533a1457bb6937e1a..c47213f593cce3da126555993b5b500e95019414 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -58,7 +58,8 @@ void AddNFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc index f28c9ccc6cda25ec713c108bc1eae2ad3f9a38ed..80fafdbcb3f0f129f6ce97ab6cf57406cb617b60 100644 --- a/mace/kernels/opencl/batch_norm.cc +++ b/mace/kernels/opencl/batch_norm.cc @@ -56,7 +56,8 @@ void BatchNormFunctor::operator()(const Tensor *input, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc index b6d2b4b1855d2210cb25fb6b99921800d18a6cba..e50dcf58611360833ae2fbaeff2dc9d2d721c01b 100644 --- a/mace/kernels/opencl/bias_add.cc +++ b/mace/kernels/opencl/bias_add.cc @@ -49,7 +49,8 @@ void BiasAddFunctor::operator()(const Tensor *input, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -90,7 +91,8 @@ void BiasAddFunctor::operator()(const Tensor *input, } else { std::vector roundup_gws(lws.size()); for (size_t i = 0; i < lws.size(); ++i) { - roundup_gws[i] = RoundUp(gws[i], lws[i]); + if (lws[i] != 0) + roundup_gws[i] = RoundUp(gws[i], lws[i]); } error = runtime->command_queue().enqueueNDRangeKernel( diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index 1bce914c1b817a489f444e32b5284c25a7f0d527..bf629e373b77ce57f24640c342a6ce6fe2c5ab45 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -93,7 +93,8 @@ void BufferToImageFunctor::operator()( built_options.emplace("-DOUT_OF_RANGE_CHECK"); if (!kernel_error_) { kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index 7cb082544f55c2bf72711ec0fe6ec0e8448442eb..d16a3d8a73c14df3282c940f01db5b2848a78d34 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -56,7 +56,8 @@ void ChannelShuffleFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 96c15fd8adfe4369d483cc6be424d341fe59b743..239041008bcf08cd898bbe9b7f722d68cb4afdec 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -67,7 +67,8 @@ static void Concat2(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); @@ -148,7 +149,8 @@ static void ConcatN(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc index 52ed0368fc6fbe824ee2f254394ecfbc6324071b..5b79ea662a3f0d101ca4f4e3ab0faf2ce5f2ffd3 100644 --- a/mace/kernels/opencl/conv_2d_1x1.cc +++ b/mace/kernels/opencl/conv_2d_1x1.cc @@ -100,7 +100,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/conv_2d_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc index f5600883d850e7f05052e8830b2ab79815ffd15e..5386c4173ad5441be104c7e10c183113223915ef 100644 --- a/mace/kernels/opencl/conv_2d_3x3.cc +++ b/mace/kernels/opencl/conv_2d_3x3.cc @@ -86,7 +86,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/conv_2d_general.cc b/mace/kernels/opencl/conv_2d_general.cc index 2329984a0c53592c8d73a4b629a63167081c2a33..e44d898168fdfd096fa495d6b076a668f4f1a2fd 100644 --- a/mace/kernels/opencl/conv_2d_general.cc +++ b/mace/kernels/opencl/conv_2d_general.cc @@ -94,7 +94,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/deconv_2d_opencl.cc b/mace/kernels/opencl/deconv_2d_opencl.cc index bbcbec6c01126095af0e706986d5c47a1ffa707a..abb4b43effff898009bdc56436f9ae44b16cc40b 100644 --- a/mace/kernels/opencl/deconv_2d_opencl.cc +++ b/mace/kernels/opencl/deconv_2d_opencl.cc @@ -65,7 +65,8 @@ void Deconv2dOpencl(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/depth_to_space.cc b/mace/kernels/opencl/depth_to_space.cc index fd25f948c355999909bcd670e41ff249dc4e5aea..609ad20516444970013ecc5ba796eafb16c060f3 100644 --- a/mace/kernels/opencl/depth_to_space.cc +++ b/mace/kernels/opencl/depth_to_space.cc @@ -86,7 +86,8 @@ void DepthToSpaceOpFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc index 43a24e662828a5282914c230553220cc2adc30c9..c7800d0ae24e5d4ed7471cc1f1ea3829ddacd9fc 100644 --- a/mace/kernels/opencl/depthwise_conv.cc +++ b/mace/kernels/opencl/depthwise_conv.cc @@ -97,7 +97,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc index 94b4c322bc625fa82f9d2e482c99e5b95fdd41d3..4f0590466b02c1a400682a89b656394bdd7318b3 100644 --- a/mace/kernels/opencl/eltwise.cc +++ b/mace/kernels/opencl/eltwise.cc @@ -97,7 +97,8 @@ void EltwiseFunctor::operator()(const Tensor *input0, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/fully_connected.cc b/mace/kernels/opencl/fully_connected.cc index 0022b92380208a7f4eb06dff68a8ec45c18dff39..6e0678daff35642a9c673ecd1a76e4059858aa67 100644 --- a/mace/kernels/opencl/fully_connected.cc +++ b/mace/kernels/opencl/fully_connected.cc @@ -74,7 +74,8 @@ void FCWXKernel(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); @@ -200,7 +201,8 @@ void FCWTXKernel(cl::Kernel *kernel, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + (*kernel_error)->Allocate(1); (*kernel_error)->Map(nullptr); *((*kernel_error)->mutable_data()) = 0; (*kernel_error)->UnMap(); diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image_to_buffer.cc index 09b040dd454fbe1a2d24a85019c66116d3610e70..1cefff9e6ba56f00ed366e16aae268dcd0a78e16 100644 --- a/mace/kernels/opencl/image_to_buffer.cc +++ b/mace/kernels/opencl/image_to_buffer.cc @@ -86,7 +86,8 @@ void ImageToBufferFunctor::operator()( built_options.emplace("-DOUT_OF_RANGE_CHECK"); if (!kernel_error_) { kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index 9a16694a0284f1b6583ee633487b4725283bafea..cc63ed04962938f1c4b75e57ec7a618f06fbf2aa 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -54,7 +54,8 @@ void MatMulFunctor::operator()(const Tensor *A, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc index 012edd70cd9a8c73a409886d37fad0b29ef8411b..467a595309c32f26de3ae271040045092252d840 100644 --- a/mace/kernels/opencl/out_of_range_check_test.cc +++ b/mace/kernels/opencl/out_of_range_check_test.cc @@ -57,7 +57,8 @@ bool BufferToImageOpImpl(Tensor *buffer, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error->Allocate(1); kernel_error->Map(nullptr); *(kernel_error->mutable_data()) = 0; kernel_error->UnMap(); diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc index bc093c16e3f2b66017fe368436b5f172bb9b3d5f..34fbf659ebab12a5258f87234df7f131701f84cf 100644 --- a/mace/kernels/opencl/pad.cc +++ b/mace/kernels/opencl/pad.cc @@ -60,7 +60,8 @@ void PadFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc index df2fcbe9223f902c721de2ace2aa7d5b780498c1..8a9f91e90df8c79369793b35f59a49e39af7c6e1 100644 --- a/mace/kernels/opencl/pooling.cc +++ b/mace/kernels/opencl/pooling.cc @@ -72,7 +72,8 @@ void PoolingFunctor::operator()(const Tensor *input, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc index 1c36b27e37dbb5a5649203c2d8fa37a69f5f3266..0c86cae840ac15ce4a56be3727d8d747cfe9d179 100644 --- a/mace/kernels/opencl/resize_bilinear.cc +++ b/mace/kernels/opencl/resize_bilinear.cc @@ -78,7 +78,8 @@ void ResizeBilinearFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index 7944ee88b9845f93438d363069e9afbc4065f873..21fdbca1ecb6b2a492787ffe93601af9466bce96 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -51,7 +51,8 @@ void SliceFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc index 24329be41496ee740e34125a9cfaa08f793d5644..8e5be84509bd5ae4d49fdcfecbd0cdbd7e9f0359 100644 --- a/mace/kernels/opencl/softmax.cc +++ b/mace/kernels/opencl/softmax.cc @@ -70,7 +70,8 @@ void SoftmaxFunctor::operator()(const Tensor *logits, if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc index fa4850cb95348b460fefde519c77fcd97f818504..c3c45f0b2a18bce690cad718b2d61391d243a0f4 100644 --- a/mace/kernels/opencl/space_to_batch.cc +++ b/mace/kernels/opencl/space_to_batch.cc @@ -70,7 +70,8 @@ void SpaceToBatchFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index fcf815281ea783fc184819b8d86ac3480bcadd66..da7dea0b974c09e7cf8e7e45442ff44a95eadfe4 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -39,7 +39,8 @@ void WinogradTransformFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); @@ -138,7 +139,8 @@ void WinogradInverseTransformFunctor::operator()( if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); + kernel_error_->Allocate(1); kernel_error_->Map(nullptr); *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 97a65f1eae53b26d68c7c43e07833d4f01a9c33f..9c510b34d3d9b3d162e7e2ceae0dc0210aec264b 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -75,39 +75,38 @@ struct PoolingFunctor: PoolingFunctorBase { } void MaxPooling(const float *input, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t channels, - const index_t out_height, - const index_t out_width, - const int filter_height, - const int filter_width, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = channels * in_image_size; - const index_t out_batch_size = channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t c = 0; c < out_shape[1]; ++c) { const index_t out_base = b * out_batch_size + c * out_image_size; const index_t in_base = b * in_batch_size + c * in_image_size; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w < out_width; ++w) { const index_t out_offset = out_base + h * out_width + w; float res = std::numeric_limits::lowest(); - for (int fh = 0; fh < filter_height; ++fh) { - for (int fw = 0; fw < filter_width; ++fw) { - int inh = h * stride_h + dilation_h * fh - pad_top; - int inw = w * stride_w + dilation_w * fw - pad_left; + for (int fh = 0; fh < filter_hw[0]; ++fh) { + for (int fw = 0; fw < filter_hw[1]; ++fw) { + index_t inh = + h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = + w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { index_t input_offset = in_base + inh * in_width + inw; res = std::max(res, input[input_offset]); @@ -122,40 +121,38 @@ struct PoolingFunctor: PoolingFunctorBase { } void AvgPooling(const float *input, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t channels, - const index_t out_height, - const index_t out_width, - const int filter_height, - const int filter_width, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = channels * in_image_size; - const index_t out_batch_size = channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t c = 0; c < out_shape[1]; ++c) { const index_t out_base = b * out_batch_size + c * out_image_size; const index_t in_base = b * in_batch_size + c * in_image_size; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w < out_width; ++w) { const index_t out_offset = out_base + h * out_width + w; float res = 0; int block_size = 0; - for (int fh = 0; fh < filter_height; ++fh) { - for (int fw = 0; fw < filter_width; ++fw) { - int inh = h * stride_h + dilation_h * fh - pad_top; - int inw = w * stride_w + dilation_w * fw - pad_left; + for (int fh = 0; fh < filter_hw[0]; ++fh) { + for (int fw = 0; fw < filter_hw[1]; ++fw) { + index_t inh = + h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = + w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { index_t input_offset = in_base + inh * in_width + inw; res += input[input_offset]; @@ -200,59 +197,25 @@ struct PoolingFunctor: PoolingFunctorBase { const float *input = input_tensor->data(); float *output = output_tensor->mutable_data(); const index_t *input_shape = input_tensor->shape().data(); - index_t batch = output_shape[0]; - index_t channels = output_shape[1]; - index_t height = output_shape[2]; - index_t width = output_shape[3]; - - index_t input_height = input_shape[2]; - index_t input_width = input_shape[3]; - - int filter_h = kernels_[0]; - int filter_w = kernels_[1]; - - int stride_h = strides_[0]; - int stride_w = strides_[1]; - - int dilation_h = dilations_[0]; - int dilation_w = dilations_[1]; - - int pad_top = paddings[0] / 2; - int pad_left = paddings[1] / 2; + int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; if (pooling_type_ == PoolingType::MAX) { MaxPooling(input, - batch, - input_height, - input_width, - channels, - height, - width, - filter_h, - filter_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - pad_top, - pad_left, + input_shape, + output_shape.data(), + kernels_, + strides_, + dilations_, + pad_hw, output); } else if (pooling_type_ == PoolingType::AVG) { AvgPooling(input, - batch, - input_height, - input_width, - channels, - height, - width, - filter_h, - filter_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - pad_top, - pad_left, + input_shape, + output_shape.data(), + kernels_, + strides_, + dilations_, + pad_hw, output); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/activation.h b/mace/ops/activation.h index 5f07425a649992907f6c9d5103fbf156119cdf7f..7c6d3b5690df5928a835c6bb98c7f2db542da20d 100644 --- a/mace/ops/activation.h +++ b/mace/ops/activation.h @@ -38,7 +38,7 @@ class ActivationOp : public Operator { const Tensor *input_tensor = this->Input(0); const Tensor *alpha_tensor = this->InputSize() >= 2 ? this->Input(1) : nullptr; - Tensor *output_tensor = this->outputs_[0]; + Tensor *output_tensor = this->Output(0); output_tensor->ResizeLike(input_tensor); functor_(input_tensor, alpha_tensor, output_tensor, future); diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 319fc108a6699c8e704adec9dc694307a0047bfe..733673a7273dae6cc19f308e8561ef5c5fd16201 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -620,6 +620,8 @@ struct Expector { static void Near(const Tensor &x, const Tensor &y, const double rel_err, const double abs_err) { + MACE_UNUSED(rel_err); + MACE_UNUSED(abs_err); Equal(x, y); } }; diff --git a/mace/public/mace.h b/mace/public/mace.h index cfeeb0cadfb7314d774e9ddfd37e46c6d5c863da..46cf4ac32df4b311aac8741c5356b1a5308f673f 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -56,7 +56,11 @@ class RunMetadata { const char *MaceVersion(); -enum MaceStatus { MACE_SUCCESS = 0, MACE_INVALID_ARGS = 1 }; +enum MaceStatus { + MACE_SUCCESS = 0, + MACE_INVALID_ARGS = 1, + MACE_OUT_OF_RESOURCES = 2 +}; // MACE input/output tensor class MaceTensor { @@ -84,13 +88,14 @@ class MaceTensor { class MaceEngine { public: - explicit MaceEngine(const NetDef *net_def, - DeviceType device_type, - const std::vector &input_nodes, - const std::vector &output_nodes, - const unsigned char *model_data); + explicit MaceEngine(DeviceType device_type); ~MaceEngine(); + MaceStatus Init(const NetDef *net_def, + const std::vector &input_nodes, + const std::vector &output_nodes, + const unsigned char *model_data); + MaceStatus Run(const std::map &inputs, std::map *outputs); diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index 22e8c68034808115f71c915a9d36f2eedefe5e6a..4f65e971e0dd3148b6376c47ccf3e93046aa741e 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -128,7 +128,7 @@ def main(unused_args): FLAGS.weight_file) output_graph_def = converter.run() - print("Transform model to one that can better run on device.") + print("Transform model to one that can better run on device") if not FLAGS.runtime: cpu_graph_def = copy.deepcopy(output_graph_def) option.device = mace_pb2.CPU diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 277727d4041c97141d552740275488650ca4a3a2..8819128a946289907f73af77fb9c3c1cb67c7b61 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -136,23 +136,25 @@ class MaceKeyword(object): class TransformerRule(Enum): - REMOVE_IDENTITY_OP = 0 - TRANSFORM_GLOBAL_POOLING = 1 - FOLD_SOFTMAX = 2 - FOLD_BATCHNORM = 3, - FOLD_CONV_AND_BN = 4, - FOLD_DEPTHWISE_CONV_AND_BN = 5, - TRANSFORM_GPU_WINOGRAD = 6, - TRANSFORM_ADD_TO_BIASADD = 7, - FOLD_BIASADD = 8, - FOLD_ACTIVATION = 9, - TRANSPOSE_FILTERS = 10, - RESHAPE_FC_WEIGHT = 11, - TRANSPOSE_DATA_FORMAT = 12, - TRANSFORM_GLOBAL_CONV_TO_FC = 13, - TRANSFORM_BUFFER_IMAGE = 14, - ADD_DEVICE_AND_DATA_TYPE = 15, - SORT_BY_EXECUTION = 16 + REMOVE_USELESS_RESHAPE_OP = 0 + REMOVE_IDENTITY_OP = 1 + TRANSFORM_GLOBAL_POOLING = 2 + FOLD_RESHAPE = 3 + TRANSFORM_MATMUL_TO_FC = 4 + FOLD_BATCHNORM = 5 + FOLD_CONV_AND_BN = 6 + FOLD_DEPTHWISE_CONV_AND_BN = 7 + TRANSFORM_GPU_WINOGRAD = 8 + TRANSFORM_ADD_TO_BIASADD = 9 + FOLD_BIASADD = 10 + FOLD_ACTIVATION = 11 + TRANSPOSE_FILTERS = 12 + RESHAPE_FC_WEIGHT = 13 + TRANSPOSE_DATA_FORMAT = 14 + TRANSFORM_GLOBAL_CONV_TO_FC = 15 + TRANSFORM_BUFFER_IMAGE = 16 + ADD_DEVICE_AND_DATA_TYPE = 17 + SORT_BY_EXECUTION = 18 class ConverterInterface(object): @@ -199,9 +201,11 @@ class ConverterOption(object): self._device = mace_pb2.CPU self._winograd_enabled = False self._transformer_option = [ + TransformerRule.REMOVE_USELESS_RESHAPE_OP, TransformerRule.REMOVE_IDENTITY_OP, TransformerRule.TRANSFORM_GLOBAL_POOLING, - TransformerRule.FOLD_SOFTMAX, + TransformerRule.FOLD_RESHAPE, + TransformerRule.TRANSFORM_MATMUL_TO_FC, TransformerRule.FOLD_BATCHNORM, TransformerRule.FOLD_CONV_AND_BN, TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN, diff --git a/mace/python/tools/converter_tool/tensorflow_converter.py b/mace/python/tools/converter_tool/tensorflow_converter.py index 6f2247e6075982a28ae038493b20408e6e965f0b..2917562df353872b4a4b5b9f1a9ad398a1e992e3 100644 --- a/mace/python/tools/converter_tool/tensorflow_converter.py +++ b/mace/python/tools/converter_tool/tensorflow_converter.py @@ -101,9 +101,11 @@ class TensorflowConverter(base_converter.ConverterInterface): 'AvgPool': self.convert_pooling, 'MaxPool': self.convert_pooling, 'Squeeze': self.convert_identity, + 'MatMul': self.convert_matmul, 'Identity': self.convert_identity, 'Reshape': self.convert_reshape, 'Shape': self.convert_nop, + 'Transpose': self.convert_transpose, 'Softmax': self.convert_softmax, 'ResizeBilinear': self.convert_resize_bilinear, 'Placeholder': self.convert_nop, @@ -144,7 +146,8 @@ class TensorflowConverter(base_converter.ConverterInterface): for i in xrange(len(op.input)): if op.input[i][-2:] == ':0': op_name = op.input[i][:-2] - if op_name in self._option.input_nodes: + if op_name in self._option.input_nodes \ + or op_name in self._option.output_nodes: op.input[i] = op_name for i in xrange(len(op.output)): if op.output[i][-2:] == ':0': @@ -411,6 +414,10 @@ class TensorflowConverter(base_converter.ConverterInterface): self._skip_tensor.update(tf_op.inputs[-1].name) + def convert_matmul(self, tf_op): + op = self.convert_general_op(tf_op) + op.type = MaceOp.MatMul.name + def convert_reshape(self, tf_op): op = self.convert_general_op(tf_op) op.type = MaceOp.Reshape.name @@ -430,6 +437,20 @@ class TensorflowConverter(base_converter.ConverterInterface): shape_arg.ints.extend(shape_value) + def convert_transpose(self, tf_op): + perm = tf_op.inputs[1].eval().astype(np.int32) + ordered_perm = np.sort(perm) + + mace_check(np.array_equal(perm, ordered_perm), + "Transpose not supported yet, only internal transpose" + " in composed ops might be supported") + + op = self.convert_general_op(tf_op) + op.type = 'Identity' + del op.input[1:] + + self._skip_tensor.add(tf_op.inputs[1].name) + def convert_mean(self, tf_op): op = self.convert_general_op(tf_op) del op.input[1:] diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index a1776b2232583a660b304d5d8245d8e1967bc82c..825dd64f2171465247260d3d357d089d4b323fdf 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -53,9 +53,11 @@ class Transformer(base_converter.ConverterInterface): def __init__(self, option, model): # DO NOT reorder the following transformers self._registered_transformers_order = [ + TransformerRule.REMOVE_USELESS_RESHAPE_OP, TransformerRule.REMOVE_IDENTITY_OP, TransformerRule.TRANSFORM_GLOBAL_POOLING, - TransformerRule.FOLD_SOFTMAX, + TransformerRule.FOLD_RESHAPE, + TransformerRule.TRANSFORM_MATMUL_TO_FC, TransformerRule.FOLD_BATCHNORM, TransformerRule.FOLD_CONV_AND_BN, TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN, @@ -72,10 +74,14 @@ class Transformer(base_converter.ConverterInterface): TransformerRule.SORT_BY_EXECUTION, ] self._registered_transformers = { + TransformerRule.REMOVE_USELESS_RESHAPE_OP: + self.remove_useless_reshape_op, TransformerRule.REMOVE_IDENTITY_OP: self.remove_identity_op, TransformerRule.TRANSFORM_GLOBAL_POOLING: self.transform_global_pooling, - TransformerRule.FOLD_SOFTMAX: self.fold_softmax, + TransformerRule.FOLD_RESHAPE: self.fold_reshape, + TransformerRule.TRANSFORM_MATMUL_TO_FC: + self.transform_matmul_to_fc, TransformerRule.FOLD_BATCHNORM: self.fold_batchnorm, TransformerRule.FOLD_CONV_AND_BN: self.fold_conv_and_bn, # data_format related @@ -161,18 +167,26 @@ class Transformer(base_converter.ConverterInterface): for output_tensor in op.output: self._producer[output_tensor] = op for input_node in self._option.input_nodes.values(): - op = mace_pb2.OperatorDef() - op.name = self.normalize_op_name(input_node.name) - op.type = 'Input' - op.output.extend(input_node.name) - output_shape = op.output_shape.add() - output_shape.dims.extend(input_node.shape) - if self._option.device == mace_pb2.CPU: - self.transpose_shape(output_shape.dims, [0, 3, 1, 2]) - ConverterUtil.add_data_format_arg(op, DataFormat.NCHW) - else: - ConverterUtil.add_data_format_arg(op, DataFormat.NHWC) - self._producer[op.output[0]] = op + input_node_existed = False + for op in self._model.op: + if input_node.name in op.output: + input_node_existed = True + break + if not input_node_existed: + op = mace_pb2.OperatorDef() + op.name = self.normalize_op_name(input_node.name) + op.type = 'Input' + op.output.extend([input_node.name]) + output_shape = op.output_shape.add() + output_shape.dims.extend(input_node.shape) + if ConverterUtil.data_format( + self._consumers[input_node.name][0]) \ + == DataFormat.NCHW: + self.transpose_shape(output_shape.dims, [0, 3, 1, 2]) + ConverterUtil.add_data_format_arg(op, DataFormat.NCHW) + else: + ConverterUtil.add_data_format_arg(op, DataFormat.NHWC) + self._producer[op.output[0]] = op @staticmethod def replace(obj_list, source, target): @@ -191,6 +205,12 @@ class Transformer(base_converter.ConverterInterface): def normalize_op_name(name): return name.replace(':', '_') + def get_tensor_shape(self, tensor): + producer = self._producer[tensor] + for i in xrange(len(producer.output)): + if producer.output[i] == tensor: + return list(producer.output_shape[i].dims) + def consumer_count(self, tensor_name): return len(self._consumers.get(tensor_name, [])) @@ -203,23 +223,68 @@ class Transformer(base_converter.ConverterInterface): return False - def replace_output_node(self, op): - """if it is an output node, change output node to the op before it""" - if self.is_op_output_node(op): - real_output_node = self._producer[op.input[0]] - self.replace(real_output_node.output, op.input[0], op.output[0]) - print("change %s to %s" % (real_output_node.name, op.name)) + def safe_remove_node(self, op, replace_op): + """remove op. + 1. change the inputs of its consumers to the outputs of replace_op + 2. if the op is output node, change output node to replace op""" + + if replace_op is None: + # When no replace op specified, we change the inputs of + # its consumers to the input of the op. This handles the case + # that the op is identity op and its input is a tensor. + mace_check(len(op.output) == 1 and len(op.input) == 1, + "cannot remove op that w/o replace op specified" + " and input/output length > 1" + str(op)) + + for consumer_op in self._consumers.get(op.output[0], []): + self.replace(consumer_op.input, op.output[0], op.input[0]) + + mace_check(op.output[0] not in self._option.output_nodes, + "cannot remove op that is output node") + else: + mace_check(len(op.output) == len(replace_op.output), + "cannot remove op since len(op.output) " + "!= len(replace_op.output)") + + for i in xrange(len(op.output)): + for consumer_op in self._consumers.get(op.output[i], []): + self.replace(consumer_op.input, + op.output[i], + replace_op.output[i]) + + # if the op is output node, change replace_op output name to the op + # output name + for i in xrange(len(op.output)): + if op.output[i] in self._option.output_nodes: + for consumer in self._consumers.get( + replace_op.output[i], []): + self.replace(consumer.input, + replace_op.output[i], + op.output[i]) + replace_op.output[i] = op.output[i] + + self._model.op.remove(op) + + def remove_useless_reshape_op(self): + net = self._model + for op in net.op: + if op.type == MaceOp.Reshape.name: + shape = list(ConverterUtil.get_arg( + op, MaceKeyword.mace_shape_str).ints) + if shape == self.get_tensor_shape(op.input[0]): + print("Remove useless reshape: %s(%s)" + % (op.name, op.type)) + op.type = 'Identity' + + return False def remove_identity_op(self): net = self._model for op in net.op: if op.type == 'Identity': print("Remove identity: %s(%s)" % (op.name, op.type)) - for consumer_op in self._consumers.get(op.output[0], []): - Transformer.replace(consumer_op.input, op.output[0], - op.input[0]) - self.replace_output_node(op) - net.op.remove(op) + self.safe_remove_node(op, + self._producer.get(op.input[0], None)) return True return False @@ -264,10 +329,10 @@ class Transformer(base_converter.ConverterInterface): and len(self._consts[consumer_op.input[1]].dims) == 1: print("Fold batchnorm: %s(%s)" % (op.name, op.type)) consumer_op.type = MaceOp.FoldedBatchNorm.name - inputs = [op.input[0], op.input[1], consumer_op.input[1]] - consumer_op.input[:] = inputs[:] + consumer_op.input[:] = [op.input[0], op.input[1], + consumer_op.input[1]] - net.op.remove(op) + self.safe_remove_node(op, None) return True return False @@ -514,7 +579,7 @@ class Transformer(base_converter.ConverterInterface): filter.float_data[:] = weight_tensor_value.flat[:] filter.dims[:] = weight_tensor_value.shape[:] - net.op.remove(op) + self.safe_remove_node(op, iwt_op) return False @@ -544,10 +609,8 @@ class Transformer(base_converter.ConverterInterface): consumer_op = self._consumers[op.output[0]][0] if consumer_op.type == MaceOp.BiasAdd.name: print("Fold biasadd: %s(%s)" % (op.name, op.type)) - op.name = consumer_op.name op.input.append(consumer_op.input[1]) - op.output[0] = consumer_op.output[0] - net.op.remove(consumer_op) + self.safe_remove_node(consumer_op, op) return True return False @@ -575,7 +638,7 @@ class Transformer(base_converter.ConverterInterface): or arg.name == MaceKeyword.mace_activation_max_limit_str: # noqa op.arg.extend([arg]) - net.op.remove(consumer_op) + self.safe_remove_node(consumer_op, op) return True return False @@ -651,11 +714,14 @@ class Transformer(base_converter.ConverterInterface): op.output.extend([input_node.name]) output_shape = op.output_shape.add() output_shape.dims.extend(input_node.shape) + self.transpose_shape(output_shape.dims, [0, 3, 1, 2]) dims_arg = op.arg.add() dims_arg.name = MaceKeyword.mace_dims_str dims_arg.ints.extend([0, 3, 1, 2]) + ConverterUtil.add_data_format_arg(op, DataFormat.NCHW) + for output_node in self._option.output_nodes.values(): output_name = MaceKeyword.mace_output_node_name \ + '_' + output_node.name @@ -673,6 +739,8 @@ class Transformer(base_converter.ConverterInterface): dims_arg.name = MaceKeyword.mace_dims_str dims_arg.ints.extend([0, 2, 3, 1]) + ConverterUtil.add_data_format_arg(op, DataFormat.NHWC) + return False def transpose_filters(self): @@ -695,21 +763,29 @@ class Transformer(base_converter.ConverterInterface): filter_data = filter_data.transpose(3, 2, 0, 1) filter.float_data[:] = filter_data.flat filter.dims[:] = filter_data.shape + if op.type == MaceOp.FullyConnected.name: + weight = self._consts[op.input[1]] + weight_data = np.array(weight.float_data).reshape( + weight.dims) + weight_data = weight_data.transpose(1, 0) + weight.float_data[:] = weight_data.flat + weight.dims[:] = weight_data.shape self.set_filter_format(FilterFormat.OIHW) return False def reshape_fc_weight(self): + print("Reshape fully connected weight shape") net = self._model for op in net.op: if op.type == MaceOp.FullyConnected.name: weight = self._consts[op.input[1]] - # NCHW - input_shape = list(self._producer[op.input[0]] - .output_shape[0].dims) - weight_shape = [weight.dims[0]] + input_shape[1:] - del weight.dims[:] - weight.dims.extend(weight_shape) + input_op = self._producer[op.input[0]] + input_shape = list(input_op.output_shape[0].dims) + input_data_format = ConverterUtil.data_format(input_op) + weight.dims[:] = [weight.dims[0]] + input_shape[1:] + if input_data_format == DataFormat.NHWC: + self.transpose_shape(weight.dims, [0, 3, 1, 2]) return False @@ -788,6 +864,8 @@ class Transformer(base_converter.ConverterInterface): arg.name = MaceKeyword.mace_buffer_type arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value + ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC) + for output_node in self._option.output_nodes.values(): output_name = MaceKeyword.mace_output_node_name \ + '_' + output_node.name @@ -803,14 +881,16 @@ class Transformer(base_converter.ConverterInterface): arg.name = MaceKeyword.mace_buffer_type arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value + ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC) + return False - def fold_softmax(self): + def fold_reshape(self): changed = False net = self._model for op in net.op: - if op.type == MaceOp.Softmax.name: - print("Fold softmax: %s(%s)" % (op.name, op.type)) + if op.type == MaceOp.Softmax.name or op.type == MaceOp.MatMul.name: + print("Fold reshape: %s(%s)" % (op.name, op.type)) if self.consumer_count(op.output[0]) == 1: consumer = self._consumers[op.output[0]][0] if consumer.type == MaceOp.Reshape.name: @@ -818,15 +898,14 @@ class Transformer(base_converter.ConverterInterface): MaceKeyword.mace_shape_str).ints # noqa del op.output_shape[0].dims[:] op.output_shape[0].dims.extend(shape) - self.replace_output_node(consumer) - net.op.remove(consumer) + self.safe_remove_node(consumer, op) changed = True producer = self._producer[op.input[0]] if producer.type == MaceOp.Reshape.name: - op.input[0] = producer.input[0] - self.replace_output_node(producer) - net.op.remove(producer) + self.safe_remove_node(producer, + self._producer[ + producer.input[0]]) changed = True if len(op.output_shape[0].dims) < 4: @@ -839,6 +918,20 @@ class Transformer(base_converter.ConverterInterface): return False + def transform_matmul_to_fc(self): + net = self._model + for op in net.op: + if op.type == MaceOp.MatMul.name: + input_shape = self.get_tensor_shape(op.input[0]) + _, h, w, _ = self.sort_feature_map_shape(input_shape, + ConverterUtil.data_format(self._producer[op.input[0]])) # noqa + if h == 1 and w == 1 and op.input[1] in self._consts: + weight = self._consts[op.input[1]] + if len(weight.dims) == 2: + op.type = MaceOp.FullyConnected.name + + return False + def transform_global_conv_to_fc(self): """Transform global conv to fc should be placed after transposing input/output and filter""" @@ -917,4 +1010,8 @@ class Transformer(base_converter.ConverterInterface): del net.op[:] net.op.extend(sorted_nodes) + + print("Final ops:") + for op in net.op: + print("%s (%s)" % (op.name, op.type)) return False diff --git a/mace/python/tools/mace_engine_factory.h.jinja2 b/mace/python/tools/mace_engine_factory.h.jinja2 index d4eb7c309e43d8613200e52e44f965dbd31908fc..2e923d6816725b17ffd8f236083595692f278f72 100644 --- a/mace/python/tools/mace_engine_factory.h.jinja2 +++ b/mace/python/tools/mace_engine_factory.h.jinja2 @@ -64,26 +64,25 @@ MaceStatus CreateMaceEngineFromCode( } const unsigned char * model_data = nullptr; NetDef net_def; + MaceStatus status = MaceStatus::MACE_SUCCESS; switch (model_name_map[model_name]) { {% for i in range(model_tags |length) %} case {{ i }}: model_data = mace::{{model_tags[i]}}::LoadModelData(model_data_file); net_def = mace::{{model_tags[i]}}::CreateNet(); - - engine->reset( - new mace::MaceEngine(&net_def, device_type, input_nodes, output_nodes, - model_data)); + engine->reset(new mace::MaceEngine(device_type)); + status = (*engine)->Init(&net_def, input_nodes, output_nodes, model_data); if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) { mace::{{model_tags[i]}}::UnloadModelData(model_data); } break; {% endfor %} default: - return MaceStatus::MACE_INVALID_ARGS; + status = MaceStatus::MACE_INVALID_ARGS; } - return MaceStatus::MACE_SUCCESS; + return status; } {% else %} MaceStatus CreateMaceEngineFromCode( diff --git a/mace/python/tools/tf_dsp_converter_lib.py b/mace/python/tools/tf_dsp_converter_lib.py index 65369b4625a9ead2c176921d661844cbb731a3d8..493544da30cd95c0a10940db1f4b850f5988d824 100644 --- a/mace/python/tools/tf_dsp_converter_lib.py +++ b/mace/python/tools/tf_dsp_converter_lib.py @@ -95,12 +95,19 @@ def add_shape_const_node(net_def, op, values, name): def convert_op_outputs(mace_op_def, tf_op): + mace_op_def.out_max_byte_size.extend( + [max_elem_size(output) for output in tf_op.outputs]) mace_op_def.output_type.extend( [tf_dtype_2_mace_dtype(output.dtype) for output in tf_op.outputs]) output_shapes = [] for output in tf_op.outputs: output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(output.shape.as_list()) + shape_list = output.shape.as_list() + if not shape_list: + shape_list = [1] + elif len(shape_list) == 2: + shape_list = [1, 1, shape_list[0], shape_list[1]] + output_shape.dims.extend(shape_list) output_shapes.append(output_shape) mace_op_def.output_shape.extend(output_shapes) @@ -159,8 +166,6 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops): op_def.input.append(input_tensor.name) op_def.input.extend([t.name for t in s2b_op.inputs[1:]]) op_def.input.extend([min_tensor.name, max_tensor.name]) - op_def.out_max_byte_size.extend( - [max_elem_size(out) for out in quantize_op.outputs]) convert_op_outputs(op_def, quantize_op) elif len(first_op.outputs) > 0 and \ first_op.type == 'QuantizedReshape' and \ @@ -193,9 +198,71 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops): op_def.type = dsp_ops.map_nn_op('QuantizedSoftmax') op_def.input.extend( [input_tensor.name, min_tensor.name, max_tensor.name]) - op_def.out_max_byte_size.extend( - [max_elem_size(out) for out in quantize_reshape_op.outputs]) convert_op_outputs(op_def, quantize_reshape_op) + # remove Squeeze + elif len(first_op.outputs) > 0 and \ + first_op.type == 'Requantize' and \ + len(first_op.outputs[0].consumers()) > 0 and \ + first_op.outputs[0].consumers()[0].type == 'Dequantize' and \ + len(first_op.outputs[0].consumers()[0].outputs[0].consumers()) \ + > 0 and \ + first_op.outputs[0].consumers()[0].outputs[0].consumers()[0].type \ + == 'Squeeze': + dequantize_op = first_op.outputs[0].consumers()[0] + squeeze_op = dequantize_op.outputs[0].consumers()[0] + reshape_op = squeeze_op.outputs[0].consumers()[0] + min_op = reshape_op.outputs[0].consumers()[0] + max_op = reshape_op.outputs[0].consumers()[1] + quantize_op = min_op.outputs[0].consumers()[0] + + resolved_ops.add(dequantize_op.name) + resolved_ops.add(squeeze_op.name) + resolved_ops.add(reshape_op.name) + resolved_ops.add(min_op.name) + resolved_ops.add(max_op.name) + resolved_ops.add(quantize_op.name) + + op_def.name = quantize_op.name + op_def.input.extend([t.name for t in first_op.inputs]) + convert_op_outputs(op_def, quantize_op) + + # Squeeze -> Softmax + next_op = quantize_op.outputs[0].consumers()[0] \ + if len(quantize_op.outputs) > 0 else None + dequantize_op = next_op.outputs[0].consumers()[0] \ + if next_op and len(next_op.outputs) > 0 and \ + next_op.type == 'QuantizedReshape' and \ + len(next_op.outputs[0].consumers()) > 0 else None + softmax_op = dequantize_op.outputs[0].consumers()[0]\ + if dequantize_op and len(dequantize_op.outputs) > 0 and \ + dequantize_op.type == 'Dequantize' and \ + len(dequantize_op.outputs[0].consumers()) > 0 else None + if softmax_op and softmax_op.type == 'Softmax': + reshape_op = softmax_op.outputs[0].consumers()[0] + min_op = reshape_op.outputs[0].consumers()[0] + max_op = reshape_op.outputs[0].consumers()[1] + quantize_op = min_op.outputs[0].consumers()[0] + quantize_reshape_op = quantize_op.outputs[0].consumers()[0] + + resolved_ops.add(next_op.name) + resolved_ops.add(dequantize_op.name) + resolved_ops.add(softmax_op.name) + resolved_ops.add(reshape_op.name) + resolved_ops.add(min_op.name) + resolved_ops.add(max_op.name) + resolved_ops.add(quantize_op.name) + resolved_ops.add(quantize_reshape_op.name) + + softmax_op_def = net_def.op.add() + softmax_op_def.padding = padding_mode['NA'] + softmax_op_def.name = quantize_reshape_op.name + softmax_op_def.type = dsp_ops.map_nn_op('QuantizedSoftmax') + softmax_op_def.input.extend([ + get_tensor_name_from_op(op_def.name, 0), + get_tensor_name_from_op(op_def.name, 1), + get_tensor_name_from_op(op_def.name, 2)]) + convert_op_outputs(softmax_op_def, quantize_reshape_op) + elif len(first_op.outputs) > 0 and first_op.type == 'Dequantize' and \ len(first_op.outputs[0].consumers()) > 0 and \ first_op.outputs[0].consumers()[0].type == 'Tanh': @@ -220,8 +287,6 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops): op_def.type = dsp_ops.map_nn_op('Quantized' + tanh_op.type) op_def.input.extend( [input_tensor.name, min_tensor.name, max_tensor.name]) - op_def.out_max_byte_size.extend( - [max_elem_size(out) for out in quantize_op.outputs]) convert_op_outputs(op_def, quantize_op) # tanh is last op else: @@ -251,8 +316,6 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops): get_tensor_name_from_op(op_def.name, 1), get_tensor_name_from_op(op_def.name, 2) ]) - new_tanh_op_def.out_max_byte_size.extend( - [max_elem_size(tanh_op.outputs[0])]) convert_op_outputs(new_tanh_op_def, tanh_op) elif has_padding_and_strides(first_op): op_def.padding = padding_mode[first_op.get_attr('padding')] @@ -266,19 +329,13 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops): strides_tensor = add_shape_const_node(net_def, first_op, strides, 'strides') op_def.input.extend([strides_tensor]) - op_def.out_max_byte_size.extend( - [max_elem_size(out) for out in first_op.outputs]) convert_op_outputs(op_def, first_op) elif is_node_flatten_reshape(first_op): op_def.type = 'Flatten' - op_def.input.extend([t.name for t in first_op.inputs]) - op_def.out_max_byte_size.extend( - [max_elem_size(out) for out in first_op.outputs]) + op_def.input.extend([first_op.inputs[0].name]) convert_op_outputs(op_def, first_op) elif dsp_ops.has_op(first_op.type): op_def.input.extend([t.name for t in first_op.inputs]) - op_def.out_max_byte_size.extend( - [max_elem_size(out) for out in first_op.outputs]) convert_op_outputs(op_def, first_op) else: raise Exception('Unsupported op: ', first_op) @@ -478,7 +535,8 @@ def fuse_quantize(net_def, input_node, output_node): skip_ops = skip_ops.union( [flatten_op.name, minf_op.name, maxf_op.name]) skip_tensors = skip_tensors.union( - [flatten_op.input[1], minf_op.input[1], maxf_op.input[1]]) + [minf_op.input[0], maxf_op.input[0], + quantize_op.input[1], quantize_op.input[2]]) quantize_op.type = 'AutoQuantize' del quantize_op.input[1:] diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index 339aa2f870c5a8c3f8a2942f87d6db2cb04277e2..5d2b018c4704ed6761947b93c161f9baecb05168 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -318,8 +318,10 @@ void MaceRunFunc(const int in_out_size) { new FileStorageFactory(file_path)); mace::SetKVStorageFactory(storage_factory); - MaceEngine engine(&net_def, device, input_names, output_names, - reinterpret_cast(data.data())); + MaceEngine engine(device); + MaceStatus status = engine.Init(&net_def, input_names, output_names, + reinterpret_cast(data.data())); + ASSERT_EQ(status, MaceStatus::MACE_SUCCESS); std::map inputs; std::map outputs; diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index fa1d0e31cc32f75c2cdabd1b42118ddbd11cfe4b..df8270cba9896903496225bfdba19b8d611be589 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -323,8 +323,10 @@ void MaceRun(const int in_out_size, &net_def); } - MaceEngine engine(&net_def, device, input_names, output_names, - reinterpret_cast(data.data())); + MaceEngine engine(device); + MaceStatus status = engine.Init(&net_def, input_names, output_names, + reinterpret_cast(data.data())); + ASSERT_EQ(status, MaceStatus::MACE_SUCCESS); std::map inputs; std::map outputs; diff --git a/mace/utils/logging.cc b/mace/utils/logging.cc index 52ddca5f2021381dc79d4ce81ff2dbfa3040d470..a8b06e698a1772918467f74101de2921d25e8cb3 100644 --- a/mace/utils/logging.cc +++ b/mace/utils/logging.cc @@ -15,6 +15,7 @@ #include "mace/utils/logging.h" #include +#include #if defined(ANDROID) || defined(__ANDROID__) #include #include