diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 78acb3303658e96b8d8c5e81f741775a8809c9b3..403639c2cfeaaf505c4ffb05ba28c49ce8100ba9 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -82,7 +82,6 @@ extern void Register_BiasAdd(OperatorRegistry *op_registry); extern void Register_ChannelShuffle(OperatorRegistry *op_registry); extern void Register_Concat(OperatorRegistry *op_registry); extern void Register_Conv2D(OperatorRegistry *op_registry); -extern void Register_CWise(OperatorRegistry *op_registry); extern void Register_DepthToSpace(OperatorRegistry *op_registry); extern void Register_DepthwiseConv2d(OperatorRegistry *op_registry); extern void Register_Dequantize(OperatorRegistry *op_registry); @@ -125,7 +124,6 @@ OperatorRegistry::OperatorRegistry() { ops::Register_ChannelShuffle(this); ops::Register_Concat(this); ops::Register_Conv2D(this); - ops::Register_CWise(this); ops::Register_DepthToSpace(this); ops::Register_DepthwiseConv2d(this); ops::Register_Dequantize(this); diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 7a3bd994fa8baaae98a5878f92c73c0ef6ca74ae..c369b15cd00d4cde99bbb172d468d1e4b0147c9f 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -119,19 +119,20 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { tensor_map_[const_tensor.name()] = std::move(tensor); } - if (type == DeviceType::OPENCL) { - CreateImageOutputTensor(net_def); + if (type == DeviceType::CPU || type == DeviceType::OPENCL) { + CreateOutputTensorBuffer(net_def, type); } } -void Workspace::CreateImageOutputTensor(const NetDef &net_def) { +void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, + DeviceType device_type) { if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) { return; } DataType dtype = DataType::DT_INVALID; - // We use the data type of the first op (with mem id, must be image), - // as GPU have consistent data type for each layer for now. + // We use the data type of the first op with mem id, + // as CPU&GPU have consistent data type for each layer for now. // As DSP may have different data output type for each op, // we stick to the same concept. for (auto &op : net_def.op()) { @@ -148,11 +149,19 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) { } MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid."); for (auto &mem_block : net_def.mem_arena().mem_block()) { - std::unique_ptr image_buf( - new Image({mem_block.x(), mem_block.y()}, dtype)); - preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf)); + if (device_type == DeviceType::OPENCL) { + std::unique_ptr image_buf( + new Image({mem_block.x(), mem_block.y()}, dtype)); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), + std::move(image_buf)); + } else { + std::unique_ptr tensor_buf( + new Buffer(GetDeviceAllocator(device_type), mem_block.x())); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), + std::move(tensor_buf)); + } } - VLOG(3) << "Preallocate image to tensors"; + VLOG(3) << "Preallocate buffer to tensors"; for (auto &op : net_def.op()) { if (!op.mem_id().empty()) { auto mem_ids = op.mem_id(); @@ -161,15 +170,17 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) { std::unique_ptr tensor (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), dtype)); tensor->SetSourceOpName(op.name()); - VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")" - << " Mem: " << mem_ids[i] - << " Image shape: " - << dynamic_cast(tensor->UnderlyingBuffer()) - ->image_shape()[0] - << ", " - << dynamic_cast(tensor->UnderlyingBuffer()) - ->image_shape()[1]; tensor_map_[op.output(i)] = std::move(tensor); + if (device_type == DeviceType::OPENCL) { + VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")" + << " Mem: " << mem_ids[i] + << " Image shape: " + << dynamic_cast(tensor->UnderlyingBuffer()) + ->image_shape()[0] + << ", " + << dynamic_cast(tensor->UnderlyingBuffer()) + ->image_shape()[1]; + } } } } diff --git a/mace/core/workspace.h b/mace/core/workspace.h index b4b75995d25de4cabd90285d848baa4191c846ba..e9e11ea38810a8fd71c9871c07cb199c803d3dd2 100644 --- a/mace/core/workspace.h +++ b/mace/core/workspace.h @@ -52,7 +52,7 @@ class Workspace { ScratchBuffer *GetScratchBuffer(DeviceType device_type); private: - void CreateImageOutputTensor(const NetDef &net_def); + void CreateOutputTensorBuffer(const NetDef &net_def, DeviceType device_type); TensorMap tensor_map_; diff --git a/mace/kernels/arm/conv_2d_neon.h b/mace/kernels/arm/conv_2d_neon.h index a4b6d0282c9d05ef594316997460e63259bad4a0..0b02541297f3d1c7172015e6c0afe091e16a4834 100644 --- a/mace/kernels/arm/conv_2d_neon.h +++ b/mace/kernels/arm/conv_2d_neon.h @@ -51,6 +51,39 @@ extern void Conv2dNeonK3x3S2(const float *input, const index_t out_channels, float *output); +extern void Conv2dNeonK7x7S1(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + float *output); + +extern void Conv2dNeonK7x7S2(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + float *output); + +extern void Conv2dNeonK7x7S3(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + float *output); + } // namespace kernels } // namespace mace diff --git a/mace/kernels/arm/conv_2d_neon_7x7.cc b/mace/kernels/arm/conv_2d_neon_7x7.cc new file mode 100644 index 0000000000000000000000000000000000000000..7a8acaa8e8a22d297168ecd5d0313b6cb095b78a --- /dev/null +++ b/mace/kernels/arm/conv_2d_neon_7x7.cc @@ -0,0 +1,573 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if defined(MACE_ENABLE_NEON) +#include +#endif + +#include "mace/core/types.h" + +namespace mace { +namespace kernels { + +#define Conv2dNeonK7x7SnLoadCalc4 \ + /* load filter (4 outch x 1 height x 4 width) */ \ + float32x4_t vf00, vf01; \ + float32x4_t vf10, vf11; \ + float32x4_t vf20, vf21; \ + float32x4_t vf30, vf31; \ + vf00 = vld1q_f32(filter_ptr0); \ + vf01 = vld1q_f32(filter_ptr0 + 4); \ + vf10 = vld1q_f32(filter_ptr1); \ + vf11 = vld1q_f32(filter_ptr1 + 4); \ + vf20 = vld1q_f32(filter_ptr2); \ + vf21 = vld1q_f32(filter_ptr2 + 4); \ + vf30 = vld1q_f32(filter_ptr3); \ + vf31 = vld1q_f32(filter_ptr3 + 4); \ + \ + /* outch 0 */ \ + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \ + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \ + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \ + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \ + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 0); \ + vo0 = vmlaq_lane_f32(vo0, vi5, vget_low_f32(vf01), 1); \ + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 0); \ + \ + /* outch 1 */ \ + vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); \ + vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); \ + vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); \ + vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); \ + vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 0); \ + vo1 = vmlaq_lane_f32(vo1, vi5, vget_low_f32(vf11), 1); \ + vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 0); \ + \ + /* outch 2 */ \ + vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); \ + vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); \ + vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); \ + vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); \ + vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 0); \ + vo2 = vmlaq_lane_f32(vo2, vi5, vget_low_f32(vf21), 1); \ + vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 0); \ + \ + /* outch 3 */ \ + vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); \ + vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); \ + vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); \ + vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \ + vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 0); \ + vo3 = vmlaq_lane_f32(vo3, vi5, vget_low_f32(vf31), 1); \ + vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 0); + +#define Conv2dNeonK7x7SnLoadCalc1 \ + /* load filter (1 outch x 1 height x 4 width) */ \ + float32x4_t vf00, vf01; \ + vf00 = vld1q_f32(filter_ptr0); \ + vf01 = vld1q_f32(filter_ptr0 + 4); \ + \ + /* outch 0 */ \ + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); \ + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); \ + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); \ + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \ + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 0); \ + vo0 = vmlaq_lane_f32(vo0, vi5, vget_low_f32(vf01), 1); \ + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 0); + +inline void Conv2dCPUK7x7Calc(const float *in_ptr_base, + const float *filter_ptr0, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_image_size, + float *out_ptr0_base, + const index_t io, + const int stride) { + for (index_t ih = 0; ih < out_height; ++ih) { + for (index_t iw = 0; iw < out_width; ++iw) { + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 7; ++j) { + out_ptr0_base[io * out_image_size + ih * out_width + iw] += + in_ptr_base[(ih * stride + i) * in_width + (iw * stride + j)] * + filter_ptr0[io * in_channels * 49 + i * 7 + j]; + } + } + } + } +} + + +// Ho = 1, Wo = 4, Co = 4 +void Conv2dNeonK7x7S1(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + float *output) { + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for (index_t m = 0; m < out_channels; m += 4) { + if (m + 3 < out_channels) { + float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; + const float *filter_ptr1 = + filter + (m + 1) * in_channels * 49 + c * 49; + const float *filter_ptr2 = + filter + (m + 2) * in_channels * 49 + c * 49; + const float *filter_ptr3 = + filter + (m + 3) * in_channels * 49 + c * 49; +#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t in_offset = h * in_width + w; + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); + for (index_t r = 0; r < 7; ++r) { + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; + float32x4_t vi8; // for tmp use + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + + Conv2dNeonK7x7SnLoadCalc4; + + in_offset += in_width; + filter_ptr0 += 7; + filter_ptr1 += 7; + filter_ptr2 += 7; + filter_ptr3 += 7; + } // r + + vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + + filter_ptr0 -= 49; + filter_ptr1 -= 49; + filter_ptr2 -= 49; + filter_ptr3 -= 49; + } // w + } // h +#else + for (index_t io = 0; io < 4; ++io) { + Conv2dCPUK7x7Calc(in_ptr_base, filter_ptr0, in_width, in_channels, + out_height, out_width, out_image_size, + out_ptr0_base, io, 1); + } // for +#endif + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; +#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t in_offset = h * in_width + w; + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + for (index_t r = 0; r < 7; ++r) { + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; + float32x4_t vi8; // for tmp use + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + + Conv2dNeonK7x7SnLoadCalc1; + + in_offset += in_width; + filter_ptr0 += 7; + } // r + + vst1q_f32(out_ptr0_base + out_offset, vo0); + filter_ptr0 -= 49; + } // w + } // h +#else + Conv2dCPUK7x7Calc(in_ptr_base, filter_ptr0, in_width, in_channels, + out_height, out_width, out_image_size, + out_ptr0_base, 0, 1); +#endif + } // c + } // mm + } // if + } // m + } // b +} + +// Ho = 1, Wo = 4, Co = 4 +void Conv2dNeonK7x7S2(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + float *output) { + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for (index_t m = 0; m < out_channels; m += 4) { + if (m + 3 < out_channels) { + float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; + const float *filter_ptr1 = + filter + (m + 1) * in_channels * 49 + c * 49; + const float *filter_ptr2 = + filter + (m + 2) * in_channels * 49 + c * 49; + const float *filter_ptr3 = + filter + (m + 3) * in_channels * 49 + c * 49; +#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t in_h = h * 2; + index_t in_w = w * 2; + index_t in_offset = in_h * in_width + in_w; + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); + for (index_t r = 0; r < 7; ++r) { + // input (3 slide) + float32x4x2_t vvi0, vvi1; // to de-interleave + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; + // load input + // [0.2.4.6, 1.3.5.7] + vvi0 = vld2q_f32(in_ptr_base + in_offset); + // [8.10.12.14, 9.11.13.15] + vvi1 = vld2q_f32(in_ptr_base + in_offset + 8); + vi0 = vvi0.val[0]; // [0.2.4.6] + vi1 = vvi0.val[1]; // [1.3.5.7] + vi2 = vextq_f32(vi0, vvi1.val[0], 1); // [2.4.6.8] + vi3 = vextq_f32(vi1, vvi1.val[1], 1); // [3.5.7.9] + vi4 = vextq_f32(vi0, vvi1.val[0], 2); // [4.6.8.10] + vi5 = vextq_f32(vi1, vvi1.val[1], 2); // [5.7.9.11] + vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12] + + Conv2dNeonK7x7SnLoadCalc4; + + in_offset += in_width; + filter_ptr0 += 7; + filter_ptr1 += 7; + filter_ptr2 += 7; + filter_ptr3 += 7; + } // r + + vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + + filter_ptr0 -= 49; + filter_ptr1 -= 49; + filter_ptr2 -= 49; + filter_ptr3 -= 49; + } // w + } // h +#else + for (index_t io = 0; io < 4; ++io) { + Conv2dCPUK7x7Calc(in_ptr_base, filter_ptr0, in_width, in_channels, + out_height, out_width, out_image_size, + out_ptr0_base, io, 2); + } // for +#endif + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; +#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t in_h = h * 2; + index_t in_w = w * 2; + index_t in_offset = in_h * in_width + in_w; + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load ouput + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + for (index_t r = 0; r < 7; ++r) { + // input (3 slide) + float32x4x2_t vvi0, vvi1; // to de-interleave + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; + // load input + // [0.2.4.6, 1.3.5.7] + vvi0 = vld2q_f32(in_ptr_base + in_offset); + // [8.10.12.14, 9.11.13.15] + vvi1 = vld2q_f32(in_ptr_base + in_offset + 8); + vi0 = vvi0.val[0]; // [0.2.4.6] + vi1 = vvi0.val[1]; // [1.3.5.7] + vi2 = vextq_f32(vi0, vvi1.val[0], 1); // [2.4.6.8] + vi3 = vextq_f32(vi1, vvi1.val[1], 1); // [3.5.7.9] + vi4 = vextq_f32(vi0, vvi1.val[0], 2); // [4.6.8.10] + vi5 = vextq_f32(vi1, vvi1.val[1], 2); // [5.7.9.11] + vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12] + + Conv2dNeonK7x7SnLoadCalc1; + + in_offset += in_width; + filter_ptr0 += 7; + } // r + + vst1q_f32(out_ptr0_base + out_offset, vo0); + filter_ptr0 -= 49; + } // w + } // h +#else + Conv2dCPUK7x7Calc(in_ptr_base, filter_ptr0, in_width, in_channels, + out_height, out_width, out_image_size, + out_ptr0_base, 0, 2); +#endif + } // c + } // mm + } // if + } // m + } // b +} + +// Ho = 1, Wo = 4, Co = 4 +void Conv2dNeonK7x7S3(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + float *output) { + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for (index_t m = 0; m < out_channels; m += 4) { + if (m + 3 < out_channels) { + float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49; + const float *filter_ptr1 = + filter + (m + 1) * in_channels * 49 + c * 49; + const float *filter_ptr2 = + filter + (m + 2) * in_channels * 49 + c * 49; + const float *filter_ptr3 = + filter + (m + 3) * in_channels * 49 + c * 49; +#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t in_h = h * 3; + index_t in_w = w * 3; + index_t in_offset = in_h * in_width + in_w; + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); + for (index_t r = 0; r < 7; ++r) { + // input (3 slide) + float32x4x3_t vvi0, vvi1; // to de-interleave + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; + // load input + // [0.3.6.9, 1.4.7.10, 2.5.8.11] + vvi0 = vld3q_f32(in_ptr_base + in_offset); + // [12.15.xx.xx, 13.xx.xx.xx, 14.xx.xx.xx] + vvi1 = vld3q_f32(in_ptr_base + in_offset + 12); + vi0 = vvi0.val[0]; // [0.3.6.9] + vi1 = vvi0.val[1]; // [1.4.7.10] + vi2 = vvi0.val[2]; // [2.5.8.11] + vi3 = vextq_f32(vi0, vvi1.val[0], 1); // [3.6.9.12] + vi4 = vextq_f32(vi1, vvi1.val[1], 1); // [4.7.10.13] + vi5 = vextq_f32(vi2, vvi1.val[2], 1); // [5.8.11.14] + vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15] + + Conv2dNeonK7x7SnLoadCalc4; + + in_offset += in_width; + filter_ptr0 += 7; + filter_ptr1 += 7; + filter_ptr2 += 7; + filter_ptr3 += 7; + } // r + + vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + + filter_ptr0 -= 49; + filter_ptr1 -= 49; + filter_ptr2 -= 49; + filter_ptr3 -= 49; + } // w + } // h +#else + for (index_t io = 0; io < 4; ++io) { + Conv2dCPUK7x7Calc(in_ptr_base, filter_ptr0, in_width, in_channels, + out_height, out_width, out_image_size, + out_ptr0_base, io, 3); + } // for +#endif + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49; +#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t in_h = h * 3; + index_t in_w = w * 3; + index_t in_offset = in_h * in_width + in_w; + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + for (index_t r = 0; r < 7; ++r) { + // input (3 slide) + float32x4x3_t vvi0, vvi1; // to de-interleave + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; + // load input + // [0.3.6.9, 1.4.7.10, 2.5.8.11] + vvi0 = vld3q_f32(in_ptr_base + in_offset); + // [12.15.xx.xx, 13.xx.xx.xx, 14.xx.xx.xx] + vvi1 = vld3q_f32(in_ptr_base + in_offset + 12); + vi0 = vvi0.val[0]; // [0.3.6.9] + vi1 = vvi0.val[1]; // [1.4.7.10] + vi2 = vvi0.val[2]; // [2.5.8.11] + vi3 = vextq_f32(vi0, vvi1.val[0], 1); // [3.6.9.12] + vi4 = vextq_f32(vi1, vvi1.val[1], 1); // [4.7.10.13] + vi5 = vextq_f32(vi2, vvi1.val[2], 1); // [5.8.11.14] + vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15] + + Conv2dNeonK7x7SnLoadCalc1; + + in_offset += in_width; + filter_ptr0 += 7; + } // r + + vst1q_f32(out_ptr0_base + out_offset, vo0); + filter_ptr0 -= 49; + } // w + } // h +#else + Conv2dCPUK7x7Calc(in_ptr_base, filter_ptr0, in_width, in_channels, + out_height, out_width, out_image_size, + out_ptr0_base, 0, 3); +#endif + } // c + } // mm + } // if + } // m + } // b +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 99af6628e7b0840b2de03ebeebf7f469169ecf54..e2b177b696a81c74378efeb19cff2aa719501a6b 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -227,6 +227,12 @@ struct Conv2dFunctor : Conv2dFunctorBase { && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1; bool use_neon_1x1_s1 = filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; + bool use_neon_7x7_s1 = filter_h == 7 && filter_w == 7 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; + bool use_neon_7x7_s2 = filter_h == 7 && filter_w == 7 + && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1; + bool use_neon_7x7_s3 = filter_h == 7 && filter_w == 7 + && stride_h == 3 && stride_w == 3 && dilation_h == 1 && dilation_w == 1; std::vector transformed_input_shape; std::vector transformed_output_shape; @@ -291,6 +297,44 @@ struct Conv2dFunctor : Conv2dFunctorBase { if (extra_input_width != padded_input_width) { pad_right += (extra_input_width - padded_input_width); } + } else if (use_neon_7x7_s1) { + extra_output_height = height; + extra_input_height = + std::max(padded_input_height, extra_output_height + 6); + extra_output_width = RoundUp(width, 4); + extra_input_width = std::max(padded_input_width, extra_output_width + 6); + if (extra_input_height != padded_input_height) { + pad_bottom += (extra_input_height - padded_input_height); + } + if (extra_input_width != padded_input_width) { + pad_right += (extra_input_width - padded_input_width); + } + } else if (use_neon_7x7_s2) { + extra_output_height = height; + extra_input_height = + std::max(padded_input_height, (extra_output_height - 1) * 2 + 7); + extra_output_width = RoundUp(width, 4); + extra_input_width = + std::max(padded_input_width, (extra_output_width - 1) * 2 + 7); + if (extra_input_height != padded_input_height) { + pad_bottom += (extra_input_height - padded_input_height); + } + if (extra_input_width != padded_input_width) { + pad_right += (extra_input_width - padded_input_width); + } + } else if (use_neon_7x7_s3) { + extra_output_height = height; + extra_input_height = + std::max(padded_input_height, (extra_output_height - 1) * 3 + 7); + extra_output_width = RoundUp(width, 4); + extra_input_width = + std::max(padded_input_width, (extra_output_width - 1) * 3 + 7); + if (extra_input_height != padded_input_height) { + pad_bottom += (extra_input_height - padded_input_height); + } + if (extra_input_width != padded_input_width) { + pad_right += (extra_input_width - padded_input_width); + } } // decide scratch size before allocate it @@ -416,6 +460,45 @@ struct Conv2dFunctor : Conv2dFunctorBase { channels, pad_output); }; + } else if (use_neon_7x7_s1) { + conv_func = [=](const float *pad_input, float *pad_output) { + Conv2dNeonK7x7S1(pad_input, + filter_data, + batch, + extra_input_height, + extra_input_width, + input_channels, + extra_output_height, + extra_output_width, + channels, + pad_output); + }; + } else if (use_neon_7x7_s2) { + conv_func = [=](const float *pad_input, float *pad_output) { + Conv2dNeonK7x7S2(pad_input, + filter_data, + batch, + extra_input_height, + extra_input_width, + input_channels, + extra_output_height, + extra_output_width, + channels, + pad_output); + }; + } else if (use_neon_7x7_s3) { + conv_func = [=](const float *pad_input, float *pad_output) { + Conv2dNeonK7x7S3(pad_input, + filter_data, + batch, + extra_input_height, + extra_input_width, + input_channels, + extra_output_height, + extra_output_width, + channels, + pad_output); + }; } else { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dGeneral(pad_input, diff --git a/mace/kernels/cwise.h b/mace/kernels/cwise.h deleted file mode 100644 index dde27da7235b76b9187382f79b22224e6bfd688c..0000000000000000000000000000000000000000 --- a/mace/kernels/cwise.h +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_KERNELS_CWISE_H_ -#define MACE_KERNELS_CWISE_H_ - -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" - -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL - -namespace mace { -namespace kernels { - -enum CWiseType { - MUL = 0, - ADD = 1, - MAX = 2, - MIN = 3, - SUB = 4, - DIV = 5, - NEG = 6, - ABS = 7, -}; - -struct CWiseFunctorBase { - CWiseFunctorBase(const CWiseType type, const float coeff) - : type_(type), coeff_(coeff) {} - - CWiseType type_; - float coeff_; -}; - -template -struct CWiseFunctor : CWiseFunctorBase { - CWiseFunctor(const CWiseType type, const float coeff) - : CWiseFunctorBase(type, coeff) {} - - void operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - Tensor::MappingGuard input_guard(input); - Tensor::MappingGuard output_guard(output); - - const T *input_ptr = input->data(); - T *output_ptr = output->mutable_data(); - const index_t size = input->size(); - - switch (type_) { - case MUL: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = coeff_ * input_ptr[i]; - } - break; - case ADD: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = coeff_ + input_ptr[i]; - } - break; - case MAX: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::max(input_ptr[i], coeff_); - } - break; - case MIN: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::min(input_ptr[i], coeff_); - } - break; - case SUB: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = input_ptr[i] - coeff_; - } - break; - case DIV: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = input_ptr[i] / coeff_; - } - break; - case NEG: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = 0 - input_ptr[i]; - } - break; - case ABS: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - T val = input_ptr[i]; - output_ptr[i] = (val > 0)? val : 0 - val; - } - break; - default: - LOG(FATAL) << "CWise op not support type " << type_; - } - } -}; - -#ifdef MACE_ENABLE_OPENCL -template -struct CWiseFunctor : CWiseFunctorBase { - CWiseFunctor(const CWiseType type, const float coeff) - : CWiseFunctorBase(type, coeff) {} - - void operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); - - cl::Kernel kernel_; - uint32_t kwg_size_; - std::unique_ptr kernel_error_; - std::vector input_shape_; -}; -#endif // MACE_ENABLE_OPENCL - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_CWISE_H_ diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h index aa1a3e21c0be4a202f3fe1954b1a4bca1bb3b118..8ebb4364c830d094b5b2763ee98824de49400810 100644 --- a/mace/kernels/eltwise.h +++ b/mace/kernels/eltwise.h @@ -35,10 +35,15 @@ enum EltwiseType { MAX = 2, MIN = 3, SUB = 4, + DIV = 5, + NEG = 6, + ABS = 7, + SQR_DIFF = 8, }; struct EltwiseFunctorBase { - EltwiseFunctorBase(const EltwiseType type, const std::vector &coeff) + EltwiseFunctorBase(const EltwiseType type, + const std::vector &coeff) : type_(type), coeff_(coeff) {} EltwiseType type_; @@ -47,63 +52,195 @@ struct EltwiseFunctorBase { template struct EltwiseFunctor : EltwiseFunctorBase { - EltwiseFunctor(const EltwiseType type, const std::vector &coeff) + EltwiseFunctor(const EltwiseType type, + const std::vector &coeff) : EltwiseFunctorBase(type, coeff) {} void operator()(const Tensor *input0, const Tensor *input1, + const index_t start_axis, + const bool is_scaler, + const float value, + const bool swap, Tensor *output, StatsFuture *future) { - Tensor::MappingGuard input0_guard(input0); - Tensor::MappingGuard input1_guard(input1); - Tensor::MappingGuard output_guard(output); + if (is_scaler) { + Tensor::MappingGuard input0_guard(input0); + Tensor::MappingGuard output_guard(output); - const T *input0_ptr = input0->data(); - const T *input1_ptr = input1->data(); - T *output_ptr = output->mutable_data(); - const index_t size = input0->size(); - - switch (type_) { - case PROD: + const T *input0_ptr = input0->data(); + T *output_ptr = output->mutable_data(); + const index_t num = input0->size(); + switch (type_) { + case PROD: +#pragma omp parallel for + for (index_t i = 0; i < num; ++i) { + output_ptr[i] = input0_ptr[i] * value; + } + break; + case SUM: + if (coeff_.empty()) { #pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = input0_ptr[i] * input1_ptr[i]; - } - break; - case SUM: - if (coeff_.empty()) { + for (index_t i = 0; i < num; ++i) { + output_ptr[i] = input0_ptr[i] + value; + } + } else { + const float coeff_0 = swap ? coeff_[1] : coeff_[0]; + const float coeff_1 = swap ? coeff_[0] : coeff_[1]; #pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = input0_ptr[i] + input1_ptr[i]; + for (index_t i = 0; i < num; ++i) { + output_ptr[i] = coeff_0 * input0_ptr[i] + + coeff_1 * value; + } } - } else { + break; + case MAX: #pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = - coeff_[0] * input0_ptr[i] + coeff_[1] * input1_ptr[i]; + for (index_t i = 0; i < num; ++i) { + output_ptr[i] = std::max(input0_ptr[i], value); } - } - break; - case MAX: + break; + case MIN: #pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::max(input0_ptr[i], input1_ptr[i]); - } - break; - case MIN: + for (index_t i = 0; i < num; ++i) { + output_ptr[i] = std::min(input0_ptr[i], value); + } + break; + case SUB: #pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::min(input0_ptr[i], input1_ptr[i]); - } - break; - case SUB: + for (index_t i = 0; i < num; ++i) { + output_ptr[i] = swap ? value - input0_ptr[i] : + input0_ptr[i] - value; + } + break; + case DIV: + if (!swap) { + MACE_CHECK(fabs(value) > 1e-6, "cannot divided by 0."); #pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = input0_ptr[i] - input1_ptr[i]; - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type_; + for (index_t i = 0; i < num; ++i) { + output_ptr[i] = input0_ptr[i] / value; + } + } else { +#pragma omp parallel for + for (index_t i = 0; i < num; ++i) { + MACE_CHECK(fabs(input0_ptr[i]) > 1e-6, "cannot divided by 0."); + output_ptr[i] = value / input0_ptr[i]; + } + } + break; + case SQR_DIFF: +#pragma omp parallel for + for (index_t i = 0; i < num; ++i) { + const float tmp = input0_ptr[i] - value; + output_ptr[i] = tmp * tmp; + } + break; + default: + LOG(FATAL) << "Eltwise op not support type " << type_; + } + } else { + MACE_CHECK_NOTNULL(input0); + MACE_CHECK_NOTNULL(input1); + Tensor::MappingGuard input0_guard(input0); + Tensor::MappingGuard input1_guard(input1); + Tensor::MappingGuard output_guard(output); + + const T *input0_ptr = input0->data(); + const T *input1_ptr = input1->data(); + T *output_ptr = output->mutable_data(); + const index_t size0 = input0->size(); + const index_t size1 = input1->size(); + + const index_t num = size0 / size1; + switch (type_) { + case PROD: +#pragma omp parallel for collapse(2) + for (index_t i = 0; i < num; ++i) { + for (index_t j= 0; j < size1; ++j) { + output_ptr[i * size1 + j] = + input0_ptr[i * size1 + j] * input1_ptr[j]; + } + } + break; + case SUM: + if (coeff_.empty()) { +#pragma omp parallel for collapse(2) + for (index_t i = 0; i < num; ++i) { + for (index_t j = 0; j < size1; ++j) { + output_ptr[i * size1 + j] = + input0_ptr[i * size1 + j] + input1_ptr[j]; + } + } + } else { + const float coeff_0 = swap ? coeff_[1] : coeff_[0]; + const float coeff_1 = swap ? coeff_[0] : coeff_[1]; +#pragma omp parallel for collapse(2) + for (index_t i = 0; i < num; ++i) { + for (index_t j = 0; j < size1; ++j) { + output_ptr[i * size1 + j] = + coeff_0 * input0_ptr[i * size1 + j] + + coeff_1 * input1_ptr[j]; + } + } + } + break; + case MAX: +#pragma omp parallel for collapse(2) + for (index_t i = 0; i < num; ++i) { + for (index_t j = 0; j < size1; ++j) { + output_ptr[i * size1 + j] = + std::max(input0_ptr[i * size1 + j], input1_ptr[j]); + } + } + break; + case MIN: +#pragma omp parallel for collapse(2) + for (index_t i = 0; i < num; ++i) { + for (index_t j = 0; j < size1; ++j) { + output_ptr[i * size1 + j] = + std::min(input0_ptr[i * size1 + j], input1_ptr[j]); + } + } + break; + case SUB: +#pragma omp parallel for collapse(2) + for (index_t i = 0; i < num; ++i) { + for (index_t j = 0; j < size1; ++j) { + output_ptr[i * size1 + j] = swap ? + input0_ptr[i * size1 + j] - input1_ptr[j] : + input1_ptr[j] - input0_ptr[i * size1 + j]; + } + } + break; + case DIV: +#pragma omp parallel for collapse(2) + for (index_t i = 0; i < num; ++i) { + for (index_t j = 0; j < size1; ++j) { + if (!swap) { + MACE_CHECK(fabs(input1_ptr[j]) > 1e-6, "cannot divided by 0."); + output_ptr[i * size1 + j] = + input0_ptr[i * size1 + j] / input1_ptr[j]; + } else { + MACE_CHECK(fabs(input0_ptr[i * size1 + j]) > 1e-6, + "cannot divided by 0."); + output_ptr[i * size1 + j] = + input1_ptr[j] / input0_ptr[i * size1 + j]; + } + } + } + break; + case SQR_DIFF: +#pragma omp parallel for collapse(2) + for (index_t i = 0; i < num; ++i) { + for (index_t j = 0; j < size1; ++j) { + const T tmp = input0_ptr[i * size1 + j] - input1_ptr[j]; + output_ptr[i * size1 + j] = tmp * tmp; + } + } + break; + default: + LOG(FATAL) << "Eltwise op not support type " << type_; + } } } }; @@ -111,11 +248,16 @@ struct EltwiseFunctor : EltwiseFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct EltwiseFunctor : EltwiseFunctorBase { - EltwiseFunctor(const EltwiseType type, const std::vector &coeff) + EltwiseFunctor(const EltwiseType type, + const std::vector &coeff) : EltwiseFunctorBase(type, coeff) {} void operator()(const Tensor *input0, const Tensor *input1, + const index_t start_axis, + const bool is_scaler, + const float value, + const bool swap, Tensor *output, StatsFuture *future); diff --git a/mace/kernels/opencl/cl/cwise.cl b/mace/kernels/opencl/cl/cwise.cl deleted file mode 100644 index 2d3f3105cbddb0dfd9d8b3b208bf400772f60fb4..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/cl/cwise.cl +++ /dev/null @@ -1,56 +0,0 @@ -#include - -__kernel void cwise(KERNEL_ERROR_PARAMS - GLOBAL_WORK_GROUP_SIZE_DIM2 - __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ - __private const int width, - __private const int channel, - __private const float value, - __write_only image2d_t output) { - const int w = get_global_id(0); - const int hb = get_global_id(1); - -#ifndef NON_UNIFORM_WORK_GROUP - if (w >= global_size_dim0 || hb >= global_size_dim1) return; -#endif - - const int remain_chan = channel - mul24((w / width), 4); - - DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(w, hb)); - DATA_TYPE4 in1 = (DATA_TYPE4){value, value, value, value}; - DATA_TYPE4 out; - -#if CWISE_TYPE == 0 - out = in0 * in1; -#elif CWISE_TYPE == 1 - out = in0 + in1; -#elif CWISE_TYPE == 2 - out = fmax(in0, in1); -#elif CWISE_TYPE == 3 - out = fmin(in0, in1); -#elif CWISE_TYPE == 4 - out = in0 - in1; -#elif CWISE_TYPE == 5 - out = in0 / in1; -#elif CWISE_TYPE == 6 - in1 = (DATA_TYPE4)(0, 0, 0, 0); - out = in1 - in0; -#elif CWISE_TYPE == 7 - out = fabs(in0); -#endif - -#if CWISE_TYPE == 1 || CWISE_TYPE == 2 || CWISE_TYPE == 3 || CWISE_TYPE == 4 - if (remain_chan < 4) { - switch (remain_chan) { - case 1: - out.y = 0; - case 2: - out.z = 0; - case 3: - out.w = 0; - } - } -#endif - - WRITE_IMAGET(output, (int2)(w, hb), out); -} diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl index 58838a7d29aad87345706cb66ecea0d86d4c22a4..b2ebebeccd93c43c686f98f83997152305dd8a1f 100644 --- a/mace/kernels/opencl/cl/eltwise.cl +++ b/mace/kernels/opencl/cl/eltwise.cl @@ -1,30 +1,62 @@ #include __kernel void eltwise(KERNEL_ERROR_PARAMS - GLOBAL_WORK_GROUP_SIZE_DIM2 - __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ + GLOBAL_WORK_GROUP_SIZE_DIM3 + __read_only image2d_t input0, __read_only image2d_t input1, + __private const float value, + __private const int height, + __private const int width, + __private const int channel, #ifdef COEFF_SUM __private const float coeff0, __private const float coeff1, #endif __write_only image2d_t output) { - const int w = get_global_id(0); - const int hb = get_global_id(1); + const int c = get_global_id(0); + const int w = get_global_id(1); + const int hb = get_global_id(2); #ifndef NON_UNIFORM_WORK_GROUP - if (w >= global_size_dim0 || hb >= global_size_dim1) return; + if (c >= global_size_dim0 || w >= global_size_dim1 || hb >= global_size_dim2) + return; #endif - DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb)); - DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb)); + int pos_w; + int pos_h; +#if START_AXIS == 0 + pos_w = mad24(c, width, w); + pos_h = hb; +#elif START_AXIS == 1 + pos_w = mad24(c, width, w); + pos_h = hb % height; +#elif START_AXIS == 2 + pos_w = mad24(c, width, w); + pos_h = 0; +#elif START_AXIS == 3 + pos_w = c; + pos_h = 0; +#endif + const int pos = mad24(c, width, w); + const int remain_channel = channel - 4 * c; + DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(pos, hb)); + DATA_TYPE4 in1 ; +#if IS_SCALER == 1 + in1 = (DATA_TYPE4){value, value, value, value}; +#else + in1 = READ_IMAGET(input1, SAMPLER, (int2)(pos_w, pos_h)); +#endif DATA_TYPE4 out; #if ELTWISE_TYPE == 0 out = in0 * in1; #elif ELTWISE_TYPE == 1 #ifdef COEFF_SUM - out = mad(coeff0, in0, mad(coeff1, in1, 0)); + #if NEEDSWAP == 0 + out = mad(coeff0, in0, mad(coeff1, in1, 0)); + #else + out = mad(coeff1, in0, mad(coeff0, in1, 0)); + #endif #else out = in0 + in1; #endif @@ -34,8 +66,49 @@ __kernel void eltwise(KERNEL_ERROR_PARAMS #elif ELTWISE_TYPE == 3 out = fmin(in0, in1); #elif ELTWISE_TYPE == 4 - out = in0 - in1; + #if NEED_SWAP == 0 + out = in0 - in1; + #else + out = in1 - in0; + #endif +#elif ELTWISE_TYPE == 5 + #if NEED_SWAP == 0 + if (fabs(in1.x) > 0.000001f) + out.x = in0.x / in1.x; + if (fabs(in1.y) > 0.000001f) + out.y = in0.y / in1.y; + if (fabs(in1.z) > 0.000001f) + out.z = in0.z / in1.z; + if (fabs(in1.w) > 0.000001f) + out.w = in0.w / in1.w; + #else + if (fabs(in1.x) > 0.000001f) + out.x = in1.x / in0.x; + if (fabs(in1.y) > 0.000001f) + out.y = in1.y / in0.y; + if (fabs(in1.z) > 0.000001f) + out.z = in1.z / in0.z; + if (fabs(in1.w) > 0.000001f) + out.w = in1.w / in0.w; + #endif +#elif ELTWISE_TYPE == 8 + DATA_TYPE4 diff = in0 - in1; + out = diff * diff; +#endif + +#if ELTWISE_TYPE == 1 || ELTWISE_TYPE == 2 || ELTWISE_TYPE == 3 \ + || ELTWISE_TYPE == 4 || ELTWISE_TYPE == 8 + if (remain_channel < 4) { + switch (remain_channel) { + case 1: + out.y = 0; + case 2: + out.z = 0; + case 3: + out.w = 0; + } + } #endif - WRITE_IMAGET(output, (int2)(w, hb), out); + WRITE_IMAGET(output, (int2)(pos, hb), out); } diff --git a/mace/kernels/opencl/cwise_opencl.cc b/mace/kernels/opencl/cwise_opencl.cc deleted file mode 100644 index a9565a3d41c41a6f1d1975c6c744aafa5eb5a6e8..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/cwise_opencl.cc +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/kernels/cwise.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" - -namespace mace { -namespace kernels { - -template -void CWiseFunctor::operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t width_pixels = channel_blocks * width; - const index_t batch_height_pixels = batch * height; - - auto runtime = OpenCLRuntime::Global(); - const uint32_t gws[2] = {static_cast(width_pixels), - static_cast(batch_height_pixels)}; - if (kernel_.get() == nullptr) { - std::set built_options; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("cwise"); - built_options.emplace("-Dcwise=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - built_options.emplace(MakeString("-DCWISE_TYPE=", type_)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } - kernel_ = runtime->BuildKernel("cwise", kernel_name, built_options); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - } - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(width)); - kernel_.setArg(idx++, static_cast(channels)); - kernel_.setArg(idx++, static_cast(coeff_)); - kernel_.setArg(idx++, *(output->opencl_image())); - input_shape_ = input->shape(); - } - - const std::vector lws = {kwg_size_ / 16, 16, 0}; - std::stringstream ss; - ss << "cwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) - << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); - - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } -} - -template struct CWiseFunctor; -template struct CWiseFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc index 629ba89045b043f2b1f7965eefc875a32a78b8ae..0ec4a1e59e4925f8e94e1221360df9d1ac38fd50 100644 --- a/mace/kernels/opencl/eltwise_opencl.cc +++ b/mace/kernels/opencl/eltwise_opencl.cc @@ -23,6 +23,10 @@ namespace kernels { template void EltwiseFunctor::operator()(const Tensor *input0, const Tensor *input1, + const index_t start_axis, + const bool is_scaler, + const float value, + const bool swap, Tensor *output, StatsFuture *future) { const index_t batch = input0->dim(0); @@ -31,14 +35,15 @@ void EltwiseFunctor::operator()(const Tensor *input0, const index_t channels = input0->dim(3); const index_t channel_blocks = RoundUpDiv4(channels); - const index_t width_pixels = channel_blocks * width; const index_t batch_height_pixels = batch * height; - const uint32_t gws[2] = {static_cast(width_pixels), + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), static_cast(batch_height_pixels)}; + const int scaler = is_scaler ? 1 : 0; + const int need_swap = swap ? 1 : 0; auto runtime = OpenCLRuntime::Global(); - if (kernel_.get() == nullptr) { std::set built_options; auto dt = DataTypeToEnum::value; @@ -47,6 +52,9 @@ void EltwiseFunctor::operator()(const Tensor *input0, built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); + built_options.emplace(MakeString("-DSTART_AXIS=", start_axis)); + built_options.emplace(MakeString("-DIS_SCALER=", scaler)); + built_options.emplace(MakeString("-DNEEDSWAP=", need_swap)); if (runtime->IsOutOfRangeCheckEnabled()) { built_options.emplace("-DOUT_OF_RANGE_CHECK"); kernel_error_ = std::move(std::unique_ptr( @@ -73,9 +81,14 @@ void EltwiseFunctor::operator()(const Tensor *input0, if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); + kernel_.setArg(idx++, gws[2]); } kernel_.setArg(idx++, *(input0->opencl_image())); kernel_.setArg(idx++, *(input1->opencl_image())); + kernel_.setArg(idx++, value); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(channels)); if (!coeff_.empty()) { kernel_.setArg(idx++, coeff_[0]); kernel_.setArg(idx++, coeff_[1]); @@ -85,11 +98,11 @@ void EltwiseFunctor::operator()(const Tensor *input0, input_shape_ = input0->shape(); } - const std::vector lws = {kwg_size_ / 16, 16, 0}; + const std::vector lws = {8, kwg_size_ / 64, 8, 0}; std::stringstream ss; ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index 4b89361a11b33876dde98f27c20d7d3c1141f9a8..dcb981d828d41d1c3a51c0c6531a98824c64362a 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -152,6 +152,9 @@ BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128); BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128); BM_CONV_2D(1, 64, 32, 31, 15, 1, 1, 1, SAME, 128); BM_CONV_2D(1, 64, 32, 31, 1, 15, 1, 1, SAME, 128); +BM_CONV_2D(1, 64, 32, 31, 7, 7, 1, 1, SAME, 128); +BM_CONV_2D(1, 64, 32, 31, 7, 7, 2, 1, SAME, 128); +BM_CONV_2D(1, 64, 32, 31, 7, 7, 3, 1, SAME, 128); // 3 channels input BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, 1, VALID, 3); diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 51c63e4ee3309bee050048aea287eb2ca892980b..049e38d201cd8e7bed75b63cfc76a84e80d5d22f 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -878,7 +878,7 @@ void TestArbitraryPadConvNxN(const std::vector &shape, 1e-4, 1e-4); }; - for (int kernel_size : {3, 5}) { + for (int kernel_size : {3, 5, 7}) { for (int stride : {2, 3}) { func(kernel_size, kernel_size, stride, stride); } diff --git a/mace/ops/cwise.cc b/mace/ops/cwise.cc deleted file mode 100644 index fa975875d7322489edb3524790b94a5867d740f0..0000000000000000000000000000000000000000 --- a/mace/ops/cwise.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/cwise.h" - -namespace mace { -namespace ops { - -void Register_CWise(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("CWise") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - CWiseOp); - -#ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("CWise") - .Device(DeviceType::OPENCL) - .TypeConstraint("T") - .Build(), - CWiseOp); - - REGISTER_OPERATOR(op_registry, OpKeyBuilder("CWise") - .Device(DeviceType::OPENCL) - .TypeConstraint("T") - .Build(), - CWiseOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/cwise.h b/mace/ops/cwise.h deleted file mode 100644 index 8cef0e10814cc35cbe09677c606c406aceeefc21..0000000000000000000000000000000000000000 --- a/mace/ops/cwise.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_CWISE_H_ -#define MACE_OPS_CWISE_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/cwise.h" - -namespace mace { -namespace ops { - -template -class CWiseOp : public Operator { - public: - CWiseOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - x_(OperatorBase::GetSingleArgument("x", 1.0)), - functor_(static_cast( - OperatorBase::GetSingleArgument( - "type", static_cast( - kernels::CWiseType::ADD))), - this->x_) {} - - bool Run(StatsFuture *future) override { - const Tensor *input_tensor = this->Input(INPUT); - Tensor *output_tensor = this->Output(OUTPUT); - output_tensor->ResizeLike(input_tensor); - - functor_(input_tensor, output_tensor, future); - return true; - } - - protected: - const float x_; - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); - - private: - kernels::CWiseFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_CWISE_H_ diff --git a/mace/ops/cwise_benchmark.cc b/mace/ops/cwise_benchmark.cc deleted file mode 100644 index 8d41d85dd5a9704490a17bf0e446a9691bf62b5c..0000000000000000000000000000000000000000 --- a/mace/ops/cwise_benchmark.cc +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/core/operator.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/core/testing/test_benchmark.h" -#include "mace/ops/ops_test_util.h" - -namespace mace { -namespace ops { -namespace test { - -namespace { -template -void CWise(int iters, int batch, int channels, - int height, int width, float x, int type) { - mace::testing::StopTiming(); - - OpsTestNet net; - - // Add input data - net.AddRandomInput("Input", {batch, height, width, channels}); - - if (D == DeviceType::OPENCL) { - BufferToImage(&net, "Input", "InputImage", - kernels::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("CWise", "CWiseBM") - .Input("InputImage") - .Output("Output") - .AddIntArg("type", type) - .AddFloatArg("x", x) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("CWise", "CWiseBM") - .Input("Input") - .Output("Output") - .AddIntArg("type", type) - .AddFloatArg("x", x) - .Finalize(net.NewOperatorDef()); - } - - // Warm-up - for (int i = 0; i < 5; ++i) { - net.RunOp(D); - } - net.Sync(); - - mace::testing::StartTiming(); - while (iters--) { - net.RunOp(D); - } - net.Sync(); -} -} // namespace - -#define BM_CWISE_MACRO(N, C, H, W, X, G, TYPE, DEVICE) \ - static void \ - BM_CWISE_##N##_##C##_##H##_##W##_##X##_##G##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - CWise(iters, N, C, H, W, X, G); \ - } \ - BENCHMARK( \ - BM_CWISE_##N##_##C##_##H##_##W##_##X##_##G##_##TYPE##_##DEVICE) - -#define BM_CWISE(N, C, H, W, X, G) \ - BM_CWISE_MACRO(N, C, H, W, X, G, float, CPU); \ - BM_CWISE_MACRO(N, C, H, W, X, G, float, OPENCL); \ - BM_CWISE_MACRO(N, C, H, W, X, G, half, OPENCL); - -BM_CWISE(1, 1, 512, 512, 2, 0); -BM_CWISE(1, 3, 128, 128, 2, 1); -BM_CWISE(1, 3, 512, 512, 2, 4); -BM_CWISE(1, 32, 112, 112, 2, 5); -BM_CWISE(1, 32, 112, 112, 2, 6); -BM_CWISE(1, 32, 112, 112, 2, 7); -BM_CWISE(1, 64, 256, 256, 3, 0); -BM_CWISE(1, 64, 512, 512, 3, 1); -BM_CWISE(1, 128, 56, 56, 3, 4); -BM_CWISE(1, 128, 256, 256, 3, 5); -BM_CWISE(1, 64, 512, 512, 3, 6); -BM_CWISE(1, 64, 512, 512, 3, 7); -BM_CWISE(1, 256, 14, 14, 3, 0); -BM_CWISE(1, 512, 14, 14, 3, 1); -BM_CWISE(1, 1024, 7, 7, 3, 4); -BM_CWISE(32, 1, 256, 256, 3, 5); -BM_CWISE(32, 1, 256, 256, 3, 6); -BM_CWISE(32, 1, 256, 256, 3, 7); - -} // namespace test -} // namespace ops -} // namespace mace diff --git a/mace/ops/cwise_test.cc b/mace/ops/cwise_test.cc deleted file mode 100644 index e5510106ee71b02ef58edfc4894ef4e7b4a9973d..0000000000000000000000000000000000000000 --- a/mace/ops/cwise_test.cc +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/core/operator.h" -#include "mace/ops/ops_test_util.h" -#include "../kernels/cwise.h" - -namespace mace { -namespace ops { -namespace test { - -class CWiseOpTest : public OpsTestBase {}; - -namespace { -template -void Simple(const kernels::CWiseType type, - const std::vector &shape, - const std::vector &input0, - const float x, - const std::vector &output) { - // Construct graph - OpsTestNet net; - - // Add input data - net.AddInputFromArray("Input1", shape, input0); - - if (D == DeviceType::CPU) { - OpDefBuilder("CWise", "CWiseTest") - .Input("Input1") - .AddIntArg("type", static_cast(type)) - .AddFloatArg("x", x) - .Output("Output") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - } else { - BufferToImage(&net, "Input1", "InputImg1", - kernels::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("CWise", "CWiseTest") - .Input("InputImg1") - .AddIntArg("type", static_cast(type)) - .AddFloatArg("x", x) - .Output("OutputImg") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - - ImageToBuffer(&net, "OutputImg", "Output", - kernels::BufferType::IN_OUT_CHANNEL); - } - - auto expected = CreateTensor(shape, output); - - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-3); -} -} // namespace - -TEST_F(CWiseOpTest, CPUSimple) { - Simple(kernels::CWiseType::MUL, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, 0.1, {0.1, 0.2, .3, .4, .5, .6}); - - Simple(kernels::CWiseType::ADD, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, 2.0, {3, 4, 5, 6, 7, 8}); - - Simple(kernels::CWiseType::DIV, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, 0.1, {10, 20, 30, 40, 50, 60}); - - Simple(kernels::CWiseType::SUB, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, 2.0, {-1, 0, 1, 2, 3, 4}); - - Simple(kernels::CWiseType::NEG, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, 2.0, {-1, -2, -3, -4, -5, -6}); - - Simple(kernels::CWiseType::ABS, {1, 1, 2, 3}, - {1, -2, -0.0001, 4, 5, 6}, 2.0, {1, 2, 0.0001, 4, 5, 6}); -} - -TEST_F(CWiseOpTest, GPUSimple) { - Simple(kernels::CWiseType::MUL, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, 0.1, {0.1, 0.2, .3, .4, .5, .6}); - - Simple(kernels::CWiseType::ADD, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, 2.0, {3, 4, 5, 6, 7, 8}); - - Simple(kernels::CWiseType::DIV, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, 0.1, {10, 20, 30, 40, 50, 60}); - - Simple(kernels::CWiseType::SUB, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, 2.0, {-1, 0, 1, 2, 3, 4}); - - Simple(kernels::CWiseType::NEG, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, 2.0, {-1, -2, -3, -4, -5, -6}); - - Simple(kernels::CWiseType::ABS, {1, 1, 2, 3}, - {1, -2, -0.0001, 4, 5, 6}, 2.0, {1, 2, 0.0001, 4, 5, 6}); -} - -namespace { -template -void RandomTest(const kernels::CWiseType type, - const std::vector &shape) { - testing::internal::LogToStderr(); - srand(time(NULL)); - - // Construct graph - OpsTestNet net; - - // Add input data - net.AddRandomInput("Input1", shape); - - OpDefBuilder("CWise", "CWiseTest") - .Input("Input1") - .AddIntArg("type", static_cast(type)) - .AddFloatArg("x", 1.2) - .Output("Output") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(); - - BufferToImage(&net, "Input1", "InputImg1", - kernels::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("CWise", "CWiseTest") - .Input("InputImg1") - .AddIntArg("type", static_cast(type)) - .AddFloatArg("x", 1.2) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Output("OutputImg") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - - ImageToBuffer(&net, "OutputImg", "OPENCLOutput", - kernels::BufferType::IN_OUT_CHANNEL); - - if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(*net.GetTensor("Output"), - *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); - } else { - ExpectTensorNear(*net.GetTensor("Output"), - *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); - } -} -} // namespace - -TEST_F(CWiseOpTest, OPENCLRandomFloat) { - RandomTest(kernels::CWiseType::MUL, - {3, 23, 37, 19}); - RandomTest(kernels::CWiseType::ADD, - {13, 32, 32, 64}); - RandomTest(kernels::CWiseType::SUB, - {3, 32, 32, 64}); - RandomTest(kernels::CWiseType::DIV, - {13, 32, 32, 64}); - RandomTest(kernels::CWiseType::NEG, - {13, 32, 32, 64}); -} - -TEST_F(CWiseOpTest, OPENCLRandomHalf) { - RandomTest(kernels::CWiseType::MUL, - {3, 23, 37, 19}); - RandomTest(kernels::CWiseType::ADD, - {13, 32, 32, 64}); - RandomTest(kernels::CWiseType::SUB, - {3, 32, 32, 64}); - RandomTest(kernels::CWiseType::DIV, - {13, 32, 32, 64}); - RandomTest(kernels::CWiseType::NEG, - {13, 32, 32, 64}); -} - -} // namespace test -} // namespace ops -} // namespace mace diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h index 818fa5e533d219256f3528f01c5434e99d390a40..2972a83ad20f2a7c62dbfc28d1ee54ce79f27c9b 100644 --- a/mace/ops/eltwise.h +++ b/mace/ops/eltwise.h @@ -32,24 +32,53 @@ class EltwiseOp : public Operator { OperatorBase::GetRepeatedArgument("coeff")) {} bool Run(StatsFuture *future) override { - const Tensor *input0 = this->Input(0); - const Tensor *input1 = this->Input(1); - Tensor *output = this->Output(OUTPUT); - MACE_CHECK(input0->dim_size() == input1->dim_size()) + if (this->InputSize() == 1) { + const Tensor* input = this->Input(0); + Tensor *output = this->Output(OUTPUT); + start_axis_ = input->dim_size() - 1; + is_scaler_ = true; + output->ResizeLike(input); + const float x = OperatorBase::GetSingleArgument("x", 1.0); + functor_(input, nullptr, start_axis_, + is_scaler_, x, false, output, future); + } else { + const index_t size0 = this->Input(0)->size(); + const index_t size1 = this->Input(1)->size(); + const bool swap = (size0 < size1); + const Tensor *input0 = swap ? this->Input(1) : this->Input(0); + const Tensor *input1 = swap ? this->Input(0) : this->Input(1); + + Tensor *output = this->Output(OUTPUT); + MACE_CHECK(input0->dim_size() == input1->dim_size()) << "Inputs of Eltwise op must be same shape"; - for (int i = 0; i < input0->dim_size(); ++i) { - MACE_CHECK(input0->dim(i) == input1->dim(i)) - << "Inputs of Eltwise op must be same shape"; + start_axis_ = input0->dim_size() - 1; + is_scaler_ = (input1->size() == 1); + uint32_t compared_size = 1; + if (!is_scaler_) { + while (start_axis_ >= 0) { + MACE_CHECK(input0->dim(start_axis_) == input1->dim(start_axis_), + "Invalid inputs dimension at axis: ") << start_axis_ + << "input 0: " << input0->dim(start_axis_) + << "input 1: " << input1->dim(start_axis_); + compared_size *= input1->dim(start_axis_); + if (compared_size == input1->size()) { + break; + } + start_axis_--; + } + } + output->ResizeLike(input0); + const float x = OperatorBase::GetSingleArgument("x", 1.0); + functor_(input0, input1, start_axis_, + is_scaler_, x, swap, output, future); } - - output->ResizeLike(input0); - - functor_(input0, input1, output, future); return true; } private: kernels::EltwiseFunctor functor_; + index_t start_axis_; + bool is_scaler_; private: OP_OUTPUT_TAGS(OUTPUT); diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index ca24242b04bef9e55031a57e54cf76e79f06b7ff..6dd3b33da7f96da77d6e5dc26458e072042e6fcc 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -25,23 +25,26 @@ class EltwiseOpTest : public OpsTestBase {}; namespace { template void Simple(const kernels::EltwiseType type, - const std::vector &shape, + const std::vector &shape0, + const std::vector &shape1, const std::vector &input0, const std::vector &input1, const std::vector &output, + const float x = 1.f, const std::vector coeff = {}) { // Construct graph OpsTestNet net; // Add input data - net.AddInputFromArray("Input1", shape, input0); - net.AddInputFromArray("Input2", shape, input1); + net.AddInputFromArray("Input1", shape0, input0); + net.AddInputFromArray("Input2", shape1, input1); if (D == DeviceType::CPU) { OpDefBuilder("Eltwise", "EltwiseTest") .Input("Input1") .Input("Input2") .AddIntArg("type", static_cast(type)) + .AddFloatArg("x", x) .AddFloatsArg("coeff", coeff) .Output("Output") .Finalize(net.NewOperatorDef()); @@ -57,6 +60,7 @@ void Simple(const kernels::EltwiseType type, .Input("InputImg1") .Input("InputImg2") .AddIntArg("type", static_cast(type)) + .AddFloatArg("x", x) .AddFloatsArg("coeff", coeff) .Output("OutputImg") .Finalize(net.NewOperatorDef()); @@ -68,7 +72,7 @@ void Simple(const kernels::EltwiseType type, kernels::BufferType::IN_OUT_CHANNEL); } - auto expected = CreateTensor(shape, output); + auto expected = CreateTensor(shape0, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -76,53 +80,200 @@ void Simple(const kernels::EltwiseType type, TEST_F(EltwiseOpTest, CPUSimple) { Simple(kernels::EltwiseType::PROD, {1, 1, 2, 3}, + {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, {1, 4, 9, 16, 25, 36}); Simple(kernels::EltwiseType::SUM, {1, 1, 2, 3}, + {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}); Simple(kernels::EltwiseType::SUM, {1, 1, 2, 3}, + {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, - {3, 6, 9, 12, 15, 18}, {2, 1}); + {3, 6, 9, 12, 15, 18}, 1., {2, 1}); Simple(kernels::EltwiseType::MAX, {1, 1, 2, 3}, + {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6}, {1, 2, 3, 4, 6, 6}); Simple(kernels::EltwiseType::MIN, {1, 1, 2, 3}, + {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6}, {1, 1, 3, 3, 5, 6}); + Simple(kernels::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6}, + {0, 1, 0, 1, 1, 0}); + Simple(kernels::EltwiseType::DIV, {1, 1, 2, 3}, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, {1, 1, 3, 2, 10, 24}, + {1, 2, 1, 2, 0.5, 0.25}); + + Simple(kernels::EltwiseType::PROD, {1, 1, 2, 3}, + {1, 1, 1, 3}, + {1, 2, 3, 4, 5, 6}, {1, 2, 3}, + {1, 4, 9, 4, 10, 18}); + Simple(kernels::EltwiseType::SUM, {1, 1, 2, 3}, + {1, 1, 1, 3}, + {1, 2, 3, 4, 5, 6}, {1, 2, 3}, + {2, 4, 6, 5, 7, 9}); + Simple(kernels::EltwiseType::SUM, {1, 1, 2, 3}, + {1, 1, 1, 3}, + {1, 2, 3, 4, 5, 6}, {1, 2, 3}, + {3, 6, 9, 9, 12, 15}, 1., {2, 1}); + Simple(kernels::EltwiseType::MAX, {1, 1, 2, 3}, + {1, 1, 1, 3}, + {1, 2, 3, 4, 5, 6}, {1, 1, 3}, + {1, 2, 3, 4, 5, 6}); + Simple(kernels::EltwiseType::MIN, {1, 1, 2, 3}, + {1, 1, 1, 3}, + {1, 2, 3, 4, 5, 6}, {1, 1, 3}, + {1, 1, 3, 1, 1, 3}); + Simple(kernels::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, + {1, 1, 1, 3}, + {1, 2, 3, 4, 5, 6}, {1, 1, 3}, + {0, 1, 0, 9, 16, 9}); + Simple(kernels::EltwiseType::DIV, {1, 1, 2, 3}, + {1, 1, 1, 3}, + {1, 2, 3, 4, 5, 6}, {1, 1, 3}, + {1, 2, 1, 4, 5, 2}); + + Simple(kernels::EltwiseType::PROD, {1, 1, 2, 3}, + {1, 1, 1, 1}, + {1, 2, 3, 4, 5, 6}, {2}, + {2, 4, 6, 8, 10, 12}, 2); + Simple(kernels::EltwiseType::SUM, {1, 1, 2, 3}, + {1, 1, 1, 1}, + {1, 2, 3, 4, 5, 6}, {2}, + {3, 4, 5, 6, 7, 8}, 2); + Simple(kernels::EltwiseType::SUM, {1, 1, 2, 3}, + {1, 1, 1, 1}, + {1, 2, 3, 4, 5, 6}, {2}, + {4, 6, 8, 10, 12, 14}, 2, {2, 1}); + Simple(kernels::EltwiseType::MAX, {1, 1, 2, 3}, + {1, 1, 1, 1}, + {1, 2, 3, 4, 5, 6}, {3}, + {3, 3, 3, 4, 5, 6}, 3); + Simple(kernels::EltwiseType::MIN, {1, 1, 2, 3}, + {1, 1, 1, 1}, + {1, 2, 3, 4, 5, 6}, {3}, + {1, 2, 3, 3, 3, 3}, 3); + Simple(kernels::EltwiseType::DIV, {1, 1, 2, 3}, + {1, 1, 1, 1}, + {1, 2, 3, 4, 5, 6}, {0.5}, + {2, 4, 6, 8, 10, 12}, 0.5); + Simple(kernels::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, + {1, 1, 1, 1}, + {1, 2, 3, 4, 5, 6}, {3}, + {4, 1, 0, 1, 4, 9}, 3); } TEST_F(EltwiseOpTest, GPUSimple) { Simple(kernels::EltwiseType::PROD, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, - {1, 4, 9, 16, 25, 36}); + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, + {1, 4, 9, 16, 25, 36}); + Simple(kernels::EltwiseType::SUM, {1, 1, 2, 3}, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, + {2, 4, 6, 8, 10, 12}); + Simple(kernels::EltwiseType::SUM, {1, 1, 2, 3}, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, + {3, 6, 9, 12, 15, 18}, 1., {2, 1}); + Simple(kernels::EltwiseType::MAX, {1, 1, 2, 3}, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6}, + {1, 2, 3, 4, 6, 6}); + Simple(kernels::EltwiseType::MIN, {1, 1, 2, 3}, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6}, + {1, 1, 3, 3, 5, 6}); + Simple(kernels::EltwiseType::DIV, {1, 1, 2, 3}, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, {1, 1, 3, 2, 10, 24}, + {1, 2, 1, 2, 0.5, 0.25}); + Simple(kernels::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, + {1, 1, 2, 3}, + {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6}, + {0, 1, 0, 1, 1, 0}); + + Simple(kernels::EltwiseType::PROD, {1, 1, 2, 3}, + {1, 1, 1, 3}, + {1, 2, 3, 4, 5, 6}, {1, 2, 3}, + {1, 4, 9, 4, 10, 18}); Simple(kernels::EltwiseType::SUM, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, - {2, 4, 6, 8, 10, 12}); + {1, 1, 1, 3}, + {1, 2, 3, 4, 5, 6}, {1, 2, 3}, + {2, 4, 6, 5, 7, 9}); Simple(kernels::EltwiseType::SUM, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, - {3, 6, 9, 12, 15, 18}, {2, 1}); + {1, 1, 1, 3}, + {1, 2, 3, 4, 5, 6}, {1, 2, 3}, + {3, 6, 9, 9, 12, 15}, 1., {2, 1}); Simple(kernels::EltwiseType::MAX, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6}, - {1, 2, 3, 4, 6, 6}); + {1, 1, 1, 3}, + {1, 2, 3, 4, 5, 6}, {1, 1, 3}, + {1, 2, 3, 4, 5, 6}); Simple(kernels::EltwiseType::MIN, {1, 1, 2, 3}, - {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6}, - {1, 1, 3, 3, 5, 6}); + {1, 1, 1, 3}, + {1, 2, 3, 4, 5, 6}, {1, 1, 3}, + {1, 1, 3, 1, 1, 3}); + Simple(kernels::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, + {1, 1, 1, 3}, + {1, 2, 3, 4, 5, 6}, {1, 1, 3}, + {0, 1, 0, 9, 16, 9}); + Simple(kernels::EltwiseType::DIV, {1, 1, 2, 3}, + {1, 1, 1, 3}, + {1, 2, 3, 4, 5, 6}, {1, 1, 3}, + {1, 2, 1, 4, 5, 2}); + + Simple(kernels::EltwiseType::PROD, {1, 1, 2, 3}, + {1, 1, 1, 1}, + {1, 2, 3, 4, 5, 6}, {2}, + {2, 4, 6, 8, 10, 12}, 2); + Simple(kernels::EltwiseType::SUM, {1, 1, 2, 3}, + {1, 1, 1, 1}, + {1, 2, 3, 4, 5, 6}, {2}, + {3, 4, 5, 6, 7, 8}, 2); + Simple(kernels::EltwiseType::SUM, {1, 1, 2, 3}, + {1, 1, 1, 1}, + {1, 2, 3, 4, 5, 6}, {2}, + {4, 6, 8, 10, 12, 14}, 2, {2, 1}); + Simple(kernels::EltwiseType::MAX, {1, 1, 2, 3}, + {1, 1, 1, 1}, + {1, 2, 3, 4, 5, 6}, {3}, + {3, 3, 3, 4, 5, 6}, 3); + Simple(kernels::EltwiseType::MIN, {1, 1, 2, 3}, + {1, 1, 1, 1}, + {1, 2, 3, 4, 5, 6}, {3}, + {1, 2, 3, 3, 3, 3}, 3); + Simple(kernels::EltwiseType::SQR_DIFF, {1, 1, 2, 3}, + {1, 1, 1, 1}, + {1, 2, 3, 4, 5, 6}, {3}, + {4, 1, 0, 1, 4, 9}, 3); + Simple(kernels::EltwiseType::DIV, {1, 1, 2, 3}, + {1, 1, 1, 1}, + {1, 2, 3, 4, 5, 6}, {0.5}, + {2, 4, 6, 8, 10, 12}, 0.5); } namespace { template void RandomTest(const kernels::EltwiseType type, - const std::vector &shape) { + const std::vector &shape1, + const std::vector &shape2) { testing::internal::LogToStderr(); srand(time(NULL)); // Construct graph OpsTestNet net; + bool is_divide = (type == kernels::EltwiseType::DIV); + // Add input data - net.AddRandomInput("Input1", shape); - net.AddRandomInput("Input2", shape); + net.AddRandomInput("Input1", shape1, true, is_divide); + net.AddRandomInput("Input2", shape2, true, is_divide); + + OpDefBuilder("Eltwise", "EltwiseTest") .Input("Input1") @@ -166,24 +317,110 @@ void RandomTest(const kernels::EltwiseType type, TEST_F(EltwiseOpTest, OPENCLRandomFloat) { RandomTest(kernels::EltwiseType::PROD, + {3, 23, 37, 19}, {3, 23, 37, 19}); RandomTest(kernels::EltwiseType::SUM, + {13, 32, 32, 64}, {13, 32, 32, 64}); RandomTest(kernels::EltwiseType::MAX, + {3, 32, 32, 64}, {3, 32, 32, 64}); RandomTest(kernels::EltwiseType::MIN, + {13, 32, 32, 64}, + {13, 32, 32, 64}); + RandomTest(kernels::EltwiseType::DIV, + {13, 32, 32, 64}, + {13, 32, 32, 64}); + RandomTest(kernels::EltwiseType::SQR_DIFF, + {13, 32, 32, 64}, {13, 32, 32, 64}); + RandomTest(kernels::EltwiseType::PROD, + {3, 23, 37, 19}, + {1, 1, 37, 19}); + RandomTest(kernels::EltwiseType::SUM, + {13, 32, 32, 64}, + {1, 1, 32, 64}); + RandomTest(kernels::EltwiseType::MAX, + {3, 32, 32, 64}, + {1, 1, 32, 64}); + RandomTest(kernels::EltwiseType::MIN, + {13, 32, 32, 64}, + {1, 1, 32, 64}); + RandomTest(kernels::EltwiseType::DIV, + {13, 32, 32, 63}, + {1, 1, 32, 63}); + RandomTest(kernels::EltwiseType::SQR_DIFF, + {13, 32, 32, 64}, + {1, 1, 32, 64}); + RandomTest(kernels::EltwiseType::PROD, + {3, 23, 37, 19}, + {1, 1, 1, 19}); + RandomTest(kernels::EltwiseType::SUM, + {13, 32, 32, 64}, + {1, 1, 1, 64}); + RandomTest(kernels::EltwiseType::MAX, + {3, 32, 32, 64}, + {1, 1, 1, 64}); + RandomTest(kernels::EltwiseType::MIN, + {13, 32, 32, 64}, + {1, 1, 1, 64}); + RandomTest(kernels::EltwiseType::DIV, + {13, 32, 32, 64}, + {1, 1, 1, 64}); + RandomTest(kernels::EltwiseType::SQR_DIFF, + {13, 32, 32, 64}, + {1, 1, 1, 64}); } TEST_F(EltwiseOpTest, OPENCLRandomHalf) { RandomTest(kernels::EltwiseType::PROD, + {3, 23, 37, 19}, {3, 23, 37, 19}); + RandomTest(kernels::EltwiseType::PROD, + {3, 23, 37, 19}, + {1, 23, 37, 19}); + RandomTest(kernels::EltwiseType::PROD, + {3, 23, 37, 19}, + {1, 1, 37, 19}); + RandomTest(kernels::EltwiseType::PROD, + {3, 23, 37, 19}, + {1, 1, 1, 19}); RandomTest(kernels::EltwiseType::SUM, - {13, 32, 32, 64}); + {13, 32, 32, 64}, + {1, 1, 1, 1}); + RandomTest(kernels::EltwiseType::SUM, + {13, 32, 32, 64}, + {1, 1, 1, 64}); + RandomTest(kernels::EltwiseType::SUM, + {13, 32, 32, 64}, + {1, 1, 32, 64}); RandomTest(kernels::EltwiseType::MAX, + {3, 32, 32, 64}, {3, 32, 32, 64}); + RandomTest(kernels::EltwiseType::MAX, + {3, 32, 32, 64}, + {1, 1, 32, 64}); RandomTest(kernels::EltwiseType::MIN, + {13, 32, 32, 64}, + {13, 32, 32, 64}); + RandomTest(kernels::EltwiseType::SQR_DIFF, + {13, 32, 32, 64}, + {13, 32, 32, 64}); + RandomTest(kernels::EltwiseType::SQR_DIFF, + {13, 32, 32, 64}, + {1, 1, 1, 64}); + RandomTest(kernels::EltwiseType::SQR_DIFF, + {13, 32, 32, 64}, + {1, 1, 32, 64}); + RandomTest(kernels::EltwiseType::DIV, + {13, 32, 32, 64}, {13, 32, 32, 64}); + RandomTest(kernels::EltwiseType::DIV, + {13, 32, 32, 64}, + {1, 1, 1, 64}); + RandomTest(kernels::EltwiseType::DIV, + {13, 32, 32, 64}, + {1, 1, 32, 64}); } } // namespace test diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 162435705bef37254bcb1a68654c16b296777104..1439bf08c8adefca8524e68b6d34d74bec2deceb 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -150,7 +150,8 @@ class OpsTestNet { template void AddRandomInput(const std::string &name, const std::vector &shape, - bool positive = true) { + bool positive = true, + bool truncate = false) { Tensor *input = ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum::v()); input->Resize(shape); @@ -162,14 +163,24 @@ class OpsTestNet { std::normal_distribution nd(0, 1); if (DataTypeToEnum::value == DT_HALF) { std::generate( - input_data, input_data + input->size(), [&gen, &nd, positive] { - return half_float::half_cast(positive ? std::abs(nd(gen)) - : nd(gen)); + input_data, input_data + input->size(), + [&gen, &nd, positive, truncate] { + float d = nd(gen); + if (truncate) { + if (std::abs(d) > 100.f) d = 100.f; + if (std::abs(d) < 0.001f) d = 0.001f; + } + return half_float::half_cast(positive ?std::abs(d) : d); }); } else { std::generate(input_data, input_data + input->size(), - [&gen, &nd, positive] { - return positive ? std::abs(nd(gen)) : nd(gen); + [&gen, &nd, positive, truncate] { + float d = nd(gen); + if (truncate) { + if (std::abs(d) > 100.f) d = 100.f; + if (std::abs(d) < 0.001f) d = 0.001f; + } + return (positive ?std::abs(d) : d); }); } } diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py index 160579720b422d74a94eb8f8ba7fa59866b70604..cc961c36f334ab2c5080b34cfec41c6e210bbf98 100644 --- a/mace/python/tools/caffe_converter_lib.py +++ b/mace/python/tools/caffe_converter_lib.py @@ -1188,8 +1188,11 @@ def convert_to_mace_pb(model_file, weight_file, input_node_str, print "PB Converted." if device == 'gpu': print "start optimize memory." - mem_optimizer = memory_optimizer.MemoryOptimizer(net_def) - mem_optimizer.optimize() + memory_optimizer.optimize_gpu_memory(net_def) + print "Memory optimization done." + elif device == 'cpu': + print "start optimize memory." + memory_optimizer.optimize_cpu_memory(net_def) print "Memory optimization done." return net_def diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py index fddb50e276d9f23f00ced9b666681467585283ee..38e3a36b6550f76f441983fb2826fa7b2268a0a5 100644 --- a/mace/python/tools/memory_optimizer.py +++ b/mace/python/tools/memory_optimizer.py @@ -22,13 +22,13 @@ class MemoryOptimizer(object): self.net_def = net_def self.idle_mem = set() self.op_mem = {} # op_name->mem_id - self.mem_block = {} # mem_id->[x, y] + self.mem_block = {} # mem_id->[size] or mem_id->[x, y] self.total_mem_count = 0 self.ref_counter = {} consumers = {} for op in net_def.op: - if self.is_buffer_image_op(op): + if not self.op_need_optimize_memory(op): continue for ipt in op.input: if ipt not in consumers: @@ -36,7 +36,7 @@ class MemoryOptimizer(object): consumers[ipt].append(op) # only ref op's output tensor for op in net_def.op: - if self.is_buffer_image_op(op): + if not self.op_need_optimize_memory(op): continue for output in op.output: tensor_name = output @@ -45,29 +45,47 @@ class MemoryOptimizer(object): else: self.ref_counter[tensor_name] = 0 - def is_buffer_image_op(self, op): - if op.type == 'BufferToImage': - for arg in op.arg: - if arg.name == 'mode' and arg.i == 0: - return True - return op.type == 'ImageToBuffer' + def op_need_optimize_memory(self, op): + return True - def get_mem_size(self, op_type, output_shape): - mem_size = [0, 0] - if op_type == 'WinogradTransform' or op_type == 'MatMul': - mem_size[0] = output_shape[2] * output_shape[3] - mem_size[1] = output_shape[0] * int((output_shape[1] + 3) / 4) - else: - mem_size[0] = output_shape[2] * int((output_shape[3] + 3) / 4) - mem_size[1] = output_shape[0] * output_shape[1] - return mem_size + def get_op_mem_block(self, op_type, output_shape): + return [reduce(operator.mul, output_shape, 1)] + + def mem_size(self, memory_block): + return memory_block[0] + + def sub_mem_block(self, mem_block1, mem_block2): + return self.mem_size(mem_block1) - self.mem_size(mem_block2) + + def resize_mem_block(self, old_mem_block, op_mem_block): + return [max(old_mem_block[0], op_mem_block[0])] + + def add_net_mem_blocks(self): + for mem in self.mem_block: + arena = self.net_def.mem_arena + block = arena.mem_block.add() + block.mem_id = mem + block.x = self.mem_block[mem][0] + block.y = 1 - def mem_area(self, memory_size): - return memory_size[0] * memory_size[1] + def get_total_origin_mem_size(self): + origin_mem_size = 0 + for op in self.net_def.op: + if not self.op_need_optimize_memory(op): + continue + origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1) + return origin_mem_size + + def get_total_optimized_mem_size(self): + optimized_mem_size = 0 + for mem in self.mem_block: + print mem, self.mem_block[mem] + optimized_mem_size += self.mem_size(self.mem_block[mem]) + return optimized_mem_size def optimize(self): for op in self.net_def.op: - if self.is_buffer_image_op(op): + if not self.op_need_optimize_memory(op): continue if not op.output_shape: print('WARNING: There is no output shape information to ' @@ -78,38 +96,42 @@ class MemoryOptimizer(object): 'the number of output.') return for i in range(len(op.output)): - op_mem_size = self.get_mem_size(op.type, - op.output_shape[i].dims) + op_mem_block = self.get_op_mem_block(op.type, + op.output_shape[i].dims) mem_id = -1 if len(self.idle_mem) > 0: - best_mem_candidate_id = -1 - best_mem_candidate_delta_area = sys.maxint - best_mem_candidate_shape = [] + best_mem_add_size = sys.maxint + best_mem_waste_size = sys.maxint for mid in self.idle_mem: - reuse_mem_size = self.mem_block[mid] - resize_mem_size = [ - max(reuse_mem_size[0], op_mem_size[0]), - max(reuse_mem_size[1], op_mem_size[1]) - ] - delta_mem_area = self.mem_area( - resize_mem_size) - self.mem_area(reuse_mem_size) - if delta_mem_area < best_mem_candidate_delta_area: - best_mem_candidate_id = mid - best_mem_candidate_delta_area = delta_mem_area - best_mem_candidate_shape = resize_mem_size - - if best_mem_candidate_delta_area <= self.mem_area( - op_mem_size): - # reuse - self.mem_block[ - best_mem_candidate_id] = best_mem_candidate_shape - mem_id = best_mem_candidate_id + old_mem_block = self.mem_block[mid] + new_mem_block = self.resize_mem_block( + old_mem_block, op_mem_block) + add_mem_size = self.sub_mem_block(new_mem_block, + old_mem_block) + waste_mem_size = self.sub_mem_block(new_mem_block, + op_mem_block) + + # minimize add_mem_size; if best_mem_add_size is 0, + # then minimize waste_mem_size + if (best_mem_add_size > 0 and + add_mem_size < best_mem_add_size) \ + or (best_mem_add_size == 0 and + waste_mem_size < best_mem_waste_size): + best_mem_id = mid + best_mem_add_size = add_mem_size + best_mem_waste_size = waste_mem_size + best_mem_block = new_mem_block + + # if add mem size < op mem size, then reuse it + if best_mem_add_size <= self.mem_size(op_mem_block): + self.mem_block[best_mem_id] = best_mem_block + mem_id = best_mem_id self.idle_mem.remove(mem_id) if mem_id == -1: mem_id = self.total_mem_count self.total_mem_count += 1 - self.mem_block[mem_id] = op_mem_size + self.mem_block[mem_id] = op_mem_block op.mem_id.extend([mem_id]) self.op_mem[op.output[i]] = mem_id @@ -123,6 +145,43 @@ class MemoryOptimizer(object): elif self.ref_counter[ipt] < 0: raise Exception('ref count is less than 0') + self.add_net_mem_blocks() + + print('total op: %d', len(self.net_def.op)) + print('origin mem: %d, optimized mem: %d', + self.get_total_origin_mem_size(), + self.get_total_optimized_mem_size()) + + +class GPUMemoryOptimizer(MemoryOptimizer): + def op_need_optimize_memory(self, op): + if op.type == 'BufferToImage': + for arg in op.arg: + if arg.name == 'mode' and arg.i == 0: + return False + return op.type != 'ImageToBuffer' + + def get_op_mem_block(self, op_type, output_shape): + mem_block = [0, 0] + if op_type == 'WinogradTransform' or op_type == 'MatMul': + mem_block[0] = output_shape[2] * output_shape[3] + mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4) + else: + mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4) + mem_block[1] = output_shape[0] * output_shape[1] + return mem_block + + def mem_size(self, memory_block): + return memory_block[0] * memory_block[1] * 4 + + def resize_mem_block(self, old_mem_block, op_mem_block): + resize_mem_block = [ + max(old_mem_block[0], op_mem_block[0]), + max(old_mem_block[1], op_mem_block[1]) + ] + return resize_mem_block + + def add_net_mem_blocks(self): for mem in self.mem_block: arena = self.net_def.mem_arena block = arena.mem_block.add() @@ -130,21 +189,12 @@ class MemoryOptimizer(object): block.x = self.mem_block[mem][0] block.y = self.mem_block[mem][1] - print('total op: %d', len(self.net_def.op)) - origin_mem_size = 0 - optimized_mem_size = 0 - for op in self.net_def.op: - if self.is_buffer_image_op(op): - continue - origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1) - for mem in self.mem_block: - print mem, self.mem_block[mem] - optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4) - print('origin mem: %d, optimized mem: %d', origin_mem_size, - optimized_mem_size) +def optimize_gpu_memory(net_def): + mem_optimizer = GPUMemoryOptimizer(net_def) + mem_optimizer.optimize() -def optimize_memory(net_def): +def optimize_cpu_memory(net_def): mem_optimizer = MemoryOptimizer(net_def) mem_optimizer.optimize() diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py index 780dfa4d5e2aa99878b55bf3b3681c8bc20fdfe6..fc057dd0f29b8509ca403ea158f5f228066eca28 100644 --- a/mace/python/tools/tf_converter_lib.py +++ b/mace/python/tools/tf_converter_lib.py @@ -829,37 +829,25 @@ class TFConverter(object): self.resolved_ops[op.name] = 1 self.unused_tensor.add(get_input_tensor(op, 1).name) - def convert_math(self, op, math_type): + def convert_eltwise(self, op, math_type): op_def = self.net_def.op.add() arg = op_def.arg.add() arg.name = 'T' arg.i = self.dt op_def.name = op.name - - if len(op.inputs) == 1: - op_def.type = "CWise" - op_def.input.extend([input.name for input in op.inputs]) - x_arg = op_def.arg.add() - x_arg.name = 'x' - x_arg.f = 0 - elif len(op.inputs) >= 2: + op_def.type = "Eltwise" + op_def.input.extend([input.name for input in op.inputs]) + x_value = op.get_attr('x') + if len(op.inputs) >= 2: input_tensor0 = get_input_tensor(op, 0) input_tensor1 = get_input_tensor(op, 1) - if input_tensor0.shape == input_tensor1.shape: - op_def.type = "Eltwise" - op_def.input.extend([input.name for input in op.inputs]) - else: - op_def.type = "CWise" - x_value = 0 - if len(input_tensor1.shape) == 4: - op_def.input.extend([op.inputs[1].name]) - x_value = get_input_tensor(op, 0).eval().astype(np.float32) - else: - op_def.input.extend([op.inputs[0].name]) - x_value = get_input_tensor(op, 1).eval().astype(np.float32) - x_arg = op_def.arg.add() - x_arg.name = 'x' - x_arg.f = x_value + if len(input_tensor0) == 1: + x_value = input_tensor0.eval().astype(np.float32) + elif len(input_tensor1) == 1: + x_value = input_tensor1.eval().astype(np.float32) + x_arg = op_def.arg.add() + x_arg.name = 'x' + x_arg.f = x_value type_arg = op_def.arg.add() type_arg.name = 'type' type_arg.i = math_type_mode[math_type] @@ -1156,11 +1144,11 @@ class TFConverter(object): elif op.type == 'SpaceToDepth': self.convert_depth_to_space(op, False) elif op.type in ['Neg', 'neg', 'Negative', 'negative']: - self.convert_math(op, 'NEG') + self.convert_eltwise(op, 'NEG') elif op.type == 'Mul': - self.convert_math(op, 'MUL') + self.convert_eltwise(op, 'MUL') elif op.type == 'Sub': - self.convert_math(op, 'SUB') + self.convert_eltwise(op, 'SUB') elif self.is_softmax(op): self.convert_softmax(op) elif op.type in ['Relu', 'Sigmoid', 'Tanh']: @@ -1367,8 +1355,11 @@ def convert_to_mace_pb(model_file, input_node, input_shape, output_node, print "Model Converted." if device == 'gpu': print "start optimize memory." - mem_optimizer = memory_optimizer.MemoryOptimizer(net_def) - mem_optimizer.optimize() + memory_optimizer.optimize_gpu_memory(net_def) + print "Memory optimization done." + elif device == 'cpu': + print "start optimize memory." + memory_optimizer.optimize_cpu_memory(net_def) print "Memory optimization done." return net_def