Commit d66c971c authored by: 李滨

Merge branch 'fix_conv' into 'master'

Support padding for 1x1

See merge request !1023
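Background for the change: a 1x1 convolution is exactly a GEMM between the [out_channels x in_channels] filter matrix and the [in_channels x H*W] input plane, so "padding support" reduces to zero-padding each input plane before the multiply, which is what the new Conv2dK1x1::Compute below does with PadInput followed by gemm_.Compute. A minimal self-contained sketch of that reduction (plain C++; Conv1x1WithPad and all names are illustrative, not MACE API):

#include <cstddef>
#include <vector>

// Zero-pad each CHW input plane, then run the 1x1 convolution as a GEMM:
// out[oc][p] = sum_ic filter[oc][ic] * padded_in[ic][p], p over padded H*W.
std::vector<float> Conv1x1WithPad(const std::vector<float> &in,    // C*H*W
                                  const std::vector<float> &filt,  // OC*C
                                  std::size_t C, std::size_t H, std::size_t W,
                                  std::size_t OC,
                                  std::size_t pad_top, std::size_t pad_bottom,
                                  std::size_t pad_left, std::size_t pad_right) {
  const std::size_t PH = H + pad_top + pad_bottom;
  const std::size_t PW = W + pad_left + pad_right;
  // Pad: copy each input row into the interior of a zeroed plane.
  std::vector<float> padded(C * PH * PW, 0.0f);
  for (std::size_t c = 0; c < C; ++c)
    for (std::size_t h = 0; h < H; ++h)
      for (std::size_t w = 0; w < W; ++w)
        padded[(c * PH + h + pad_top) * PW + (w + pad_left)] =
            in[(c * H + h) * W + w];
  // GEMM: [OC x C] * [C x PH*PW] -> [OC x PH*PW]; with a 1x1 kernel the
  // output spatial size equals the padded input spatial size.
  std::vector<float> out(OC * PH * PW, 0.0f);
  for (std::size_t oc = 0; oc < OC; ++oc)
    for (std::size_t ic = 0; ic < C; ++ic)
      for (std::size_t p = 0; p < PH * PW; ++p)
        out[oc * PH * PW + p] += filt[oc * C + ic] * padded[ic * PH * PW + p];
  return out;
}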
......@@ -503,7 +503,7 @@ class ScratchBuffer: public Buffer {
virtual ~ScratchBuffer() {}
MaceStatus GrowSize(const index_t size) {
- if (size > size_) {
+ if (offset_ + size > size_) {
VLOG(1) << "Grow scratch size to: " << size;
MACE_CHECK(offset_ == 0, "scratch is being used, cannot grow size");
return Resize(size);
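// Rationale: Scratch() advances offset_, so a grow request has to be checked
// against the remaining capacity, not the total size_. Previously, with
// offset_ > 0, size <= size_ and offset_ + size > size_, the grow was
// skipped and the next Scratch(size) overran the buffer; now the
// MACE_CHECK above fails loudly instead of corrupting memory.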
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_CONV_2D_NEON_H_
#define MACE_OPS_ARM_CONV_2D_NEON_H_
#include "mace/core/types.h"
namespace mace {
namespace ops {
void Conv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK5x5S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK1x7S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x1S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x7S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x7S2(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x7S3(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK1x15S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK15x1S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
// accumulate one input channel's contribution into one output channel
inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
const float *filter_ptr,
const index_t in_width,
const index_t filter_height,
const index_t filter_width,
const index_t out_height,
const index_t out_width,
float *out_ptr,
const int stride) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
for (int i = 0; i < filter_height; ++i) {
for (int j = 0; j < filter_width; ++j) {
out_ptr[h * out_width + w] +=
in_ptr[(h * stride + i) * in_width + (w * stride + j)] *
filter_ptr[i * filter_width + j];
}
}
}
}
}
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_CONV_2D_NEON_H_
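A hedged usage sketch for the scalar fallback above (hypothetical sizes; ExampleKHxKW is illustrative and assumes this header is included). With valid padding and stride s, the kernel computes out(h, w) += sum over (i, j) of in(h*s + i, w*s + j) * f(i, j), so a 5x5 input with a 3x3 filter yields a 3x3 output:

void ExampleKHxKW() {
  float in[5 * 5] = {};    // fill with real data; zeroed here for brevity
  float filt[3 * 3] = {};
  float out[3 * 3] = {};   // must start zeroed: the kernel accumulates (+=)
  mace::ops::Conv2dCPUKHxKWCalc(in, filt, /*in_width=*/5,
                                /*filter_height=*/3, /*filter_width=*/3,
                                /*out_height=*/3, /*out_width=*/3,
                                out, /*stride=*/1);
}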
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/ops/arm/conv_2d_neon.h"
#include "mace/utils/math.h"
namespace mace {
namespace ops {
inline void Conv2dCPUK15x1Calc(const float *in_ptr,
const float *filter_ptr,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t w,
const index_t tile_width,
const index_t out_image_size,
float *out_ptr,
const index_t io,
const int stride) {
for (index_t ih = 0; ih < out_height; ++ih) {
for (index_t iw = 0; iw < tile_width && w + iw < out_width; ++iw) {
for (int i = 0; i < 15; ++i) {
for (int j = 0; j < 1; ++j) {
out_ptr[io * out_image_size + ih * out_width + w + iw] +=
in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)] *
filter_ptr[io * in_channels * 15 + i * 1 + j];
}
}
}
}
}
// Ho = 4, Wo = 1, Co = 1
void Conv2dNeonK15x1S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
const index_t tile_width =
out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3];
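// With fewer than 4 output channels there is little channel-level
// parallelism, so the width is split into roughly four tiles to keep the
// collapse(3) loop below busy; e.g. a single output channel with
// out_width 33 gives tile_width = RoundUpDiv4(33) = 9.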
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
for (index_t w = 0; w < out_shape[3]; w += tile_width) {
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr = filter + m * in_channels * 15 + c * 15;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
/* load the 15x1 filter taps as four overlapping float32x4 vectors */
float32x4_t vf0, vf1, vf2, vf3;
vf0 = vld1q_f32(filter_ptr);
vf1 = vld1q_f32(filter_ptr + 4);
vf2 = vld1q_f32(filter_ptr + 8);
vf3 = vld1q_f32(filter_ptr + 11);
for (index_t h = 0; h + 3 < out_height; h += 4) {
for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) {
// load output
index_t out_offset = h * out_width + w + wt;
// output (1 outch x 4 height x 1 width): vo_outch_height
float32x4_t vo = {out_ptr_base[out_offset],
out_ptr_base[out_offset + out_width],
out_ptr_base[out_offset + 2 * out_width],
out_ptr_base[out_offset + 3 * out_width]};
// input offset
index_t in_offset = h * in_width + w + wt;
// input (gather rows into five vectors; vext builds the shifted windows)
float32x4_t vi0 = {in_ptr_base[in_offset],
in_ptr_base[in_offset + in_width],
in_ptr_base[in_offset + 2 * in_width],
in_ptr_base[in_offset + 3 * in_width]};
float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width],
in_ptr_base[in_offset + 5 * in_width],
in_ptr_base[in_offset + 6 * in_width],
in_ptr_base[in_offset + 7 * in_width]};
float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width],
in_ptr_base[in_offset + 9 * in_width],
in_ptr_base[in_offset + 10 * in_width],
in_ptr_base[in_offset + 11 * in_width]};
float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width],
in_ptr_base[in_offset + 13 * in_width],
in_ptr_base[in_offset + 14 * in_width],
in_ptr_base[in_offset + 15 * in_width]};
float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width],
in_ptr_base[in_offset + 17 * in_width]};
float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
float32x4_t vi5 = vextq_f32(vi4, vi8, 1);
float32x4_t vi6 = vextq_f32(vi4, vi8, 2);
float32x4_t vi7 = vextq_f32(vi4, vi8, 3);
float32x4_t vi9 = vextq_f32(vi8, vi12, 1);
float32x4_t vi10 = vextq_f32(vi8, vi12, 2);
float32x4_t vi11 = vextq_f32(vi8, vi12, 3);
float32x4_t vi13 = vextq_f32(vi12, vi16, 1);
float32x4_t vi14 = vextq_f32(vi12, vi16, 2);
vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0);
vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1);
vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0);
vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1);
vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0);
vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1);
vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0);
vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1);
vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0);
vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1);
vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0);
vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1);
vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1);
vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0);
vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);
out_ptr_base[out_offset] = vo[0];
out_ptr_base[out_offset + out_width] = vo[1];
out_ptr_base[out_offset + 2 * out_width] = vo[2];
out_ptr_base[out_offset + 3 * out_width] = vo[3];
} // wt
} // h
#else
Conv2dCPUK15x1Calc(in_ptr_base, filter_ptr, in_width, in_channels,
out_height, out_width, w, tile_width,
out_image_size, out_ptr_base, 0, 1);
#endif
} // c
} // w
} // m
} // b
}
} // namespace ops
} // namespace mace
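The 15-tap kernels above build their fourteen shifted input windows from a handful of aligned vector loads using vextq_f32 rather than doing one unaligned load per tap. A minimal illustration of the primitive (assumes a NEON toolchain; shifted_window is an illustrative name):

#include <arm_neon.h>
// With a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3} taken back to back from
// one row, vextq_f32(a, b, 1) is the same window advanced by one element:
// {a1, a2, a3, b0}.
static inline float32x4_t shifted_window(float32x4_t a, float32x4_t b) {
  return vextq_f32(a, b, 1);
}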
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/ops/arm/conv_2d_neon.h"
#include "mace/utils/logging.h"
#include "mace/utils/math.h"
namespace mace {
namespace ops {
inline void Conv2dCPUK1x15Calc(const float *in_ptr,
const float *filter_ptr,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t h,
const index_t tile_height,
const index_t out_width,
const index_t out_image_size,
float *out_ptr,
const index_t io,
const int stride) {
for (index_t ih = 0; ih < tile_height && h + ih < out_height; ++ih) {
for (index_t iw = 0; iw < out_width; ++iw) {
for (int i = 0; i < 1; ++i) {
for (int j = 0; j < 15; ++j) {
out_ptr[io * out_image_size + (h + ih) * out_width + iw] +=
in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)] *
filter_ptr[io * in_channels * 15 + i * 15 + j];
}
}
}
}
}
// Ho = 1, Wo = 4, Co = 1
void Conv2dNeonK1x15S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
const index_t tile_height =
out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2];
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
for (index_t h = 0; h < out_shape[2]; h += tile_height) {
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr = filter + m * in_channels * 15 + c * 15;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
/* load the 1x15 filter taps as four overlapping float32x4 vectors */
float32x4_t vf0, vf1, vf2, vf3;
vf0 = vld1q_f32(filter_ptr);
vf1 = vld1q_f32(filter_ptr + 4);
vf2 = vld1q_f32(filter_ptr + 8);
vf3 = vld1q_f32(filter_ptr + 11);
for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo;
// load output
index_t out_offset = (h + ht) * out_width + w;
vo = vld1q_f32(out_ptr_base + out_offset);
// input (five contiguous loads; vext builds the shifted windows)
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9,
vi10, vi11, vi12, vi13, vi14, vi16;
// input offset
index_t in_offset = (h + ht) * in_width + w;
// load input
vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
vi12 = vld1q_f32(in_ptr_base + in_offset + 12);
vi16 = vld1q_f32(in_ptr_base + in_offset + 16);
vi1 = vextq_f32(vi0, vi4, 1);
vi2 = vextq_f32(vi0, vi4, 2);
vi3 = vextq_f32(vi0, vi4, 3);
vi5 = vextq_f32(vi4, vi8, 1);
vi6 = vextq_f32(vi4, vi8, 2);
vi7 = vextq_f32(vi4, vi8, 3);
vi9 = vextq_f32(vi8, vi12, 1);
vi10 = vextq_f32(vi8, vi12, 2);
vi11 = vextq_f32(vi8, vi12, 3);
vi13 = vextq_f32(vi12, vi16, 1);
vi14 = vextq_f32(vi12, vi16, 2);
vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0);
vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1);
vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0);
vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1);
vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0);
vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1);
vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0);
vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1);
vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0);
vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1);
vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0);
vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1);
vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1);
vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0);
vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);
vst1q_f32(out_ptr_base + out_offset, vo);
} // w
} // ht
#else
Conv2dCPUK1x15Calc(in_ptr_base, filter_ptr, in_width, in_channels,
out_height, h, tile_height, out_width,
out_image_size, out_ptr_base, 0, 1);
#endif
} // c
} // h
} // m
} // b
}
} // namespace ops
} // namespace mace
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/ops/arm/conv_2d_neon.h"
namespace mace {
namespace ops {
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK1x7S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
#if defined(MACE_ENABLE_NEON)
const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
/* load filter (4 outch x 1 height x 4 width) */
float32x4_t vf00, vf01;
float32x4_t vf10, vf11;
float32x4_t vf20, vf21;
float32x4_t vf30, vf31;
vf00 = vld1q_f32(filter_ptr0);
vf01 = vld1q_f32(filter_ptr0 + 3);
vf10 = vld1q_f32(filter_ptr1);
vf11 = vld1q_f32(filter_ptr1 + 3);
vf20 = vld1q_f32(filter_ptr2);
vf21 = vld1q_f32(filter_ptr2 + 3);
vf30 = vld1q_f32(filter_ptr3);
vf31 = vld1q_f32(filter_ptr3 + 3);
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3;
// load output
index_t out_offset = h * out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset);
vo1 = vld1q_f32(out_ptr1_base + out_offset);
vo2 = vld1q_f32(out_ptr2_base + out_offset);
vo3 = vld1q_f32(out_ptr3_base + out_offset);
// input (3 slide)
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8;
// input offset
index_t in_offset = h * in_width + w;
// load input
vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
vi1 = vextq_f32(vi0, vi4, 1);
vi2 = vextq_f32(vi0, vi4, 2);
vi3 = vextq_f32(vi0, vi4, 3);
vi5 = vextq_f32(vi4, vi8, 1);
vi6 = vextq_f32(vi4, vi8, 2);
#if defined(__aarch64__)
/* outch 0 */
vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0);
vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1);
vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2);
vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3);
vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1);
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);
vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
/* outch 1 */
vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0);
vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1);
vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2);
vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3);
vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1);
vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2);
vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3);
/* outch 2 */
vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0);
vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1);
vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2);
vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3);
vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1);
vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2);
vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3);
/* outch 3 */
vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0);
vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1);
vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2);
vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3);
vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1);
vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2);
vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3);
#else
/* outch 0 */
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);
vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1);
vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0);
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
/* outch 1 */
vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0);
vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1);
vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0);
vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1);
vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1);
vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0);
vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1);
/* outch 2 */
vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0);
vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1);
vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0);
vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1);
vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1);
vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0);
vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1);
/* outch 3 */
vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0);
vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1);
vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0);
vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1);
vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1);
vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0);
vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1);
#endif
vst1q_f32(out_ptr0_base + out_offset, vo0);
vst1q_f32(out_ptr1_base + out_offset, vo1);
vst1q_f32(out_ptr2_base + out_offset, vo2);
vst1q_f32(out_ptr3_base + out_offset, vo3);
} // w
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7,
in_width, 1, 7, out_height, out_width,
out_ptr0_base + oc * out_image_size, 1);
}
#endif
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
output + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + mm * in_channels * 7 + c * 7;
#if defined(MACE_ENABLE_NEON)
/* load filter (1 outch x 1 height x 4 width) */
float32x4_t vf00, vf01;
vf00 = vld1q_f32(filter_ptr0);
vf01 = vld1q_f32(filter_ptr0 + 3);
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0;
// load output
index_t out_offset = h * out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset);
// input (3 slide)
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8;
// input offset
index_t in_offset = h * in_width + w;
// load input
vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
vi1 = vextq_f32(vi0, vi4, 1);
vi2 = vextq_f32(vi0, vi4, 2);
vi3 = vextq_f32(vi0, vi4, 3);
vi5 = vextq_f32(vi4, vi8, 1);
vi6 = vextq_f32(vi4, vi8, 2);
#if defined(__aarch64__)
vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0);
vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1);
vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2);
vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3);
vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1);
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);
vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
#else
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);
vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1);
vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0);
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
#endif
vst1q_f32(out_ptr0_base + out_offset, vo0);
} // w
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 1, 7,
out_height, out_width, out_ptr0_base, 1);
#endif
} // c
}
} // if
} // m
} // b
}
} // namespace ops
} // namespace mace
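A note on the two intrinsic families used above: they perform the same multiply-accumulate, but aarch64's vfmaq_laneq_f32 can address any lane of a 128-bit register, while armv7's vmlaq_lane_f32 only takes a 64-bit register, hence the vget_low_f32/vget_high_f32 splitting. For example, for lane 2 of a float32x4_t vf:

// aarch64: vo = vfmaq_laneq_f32(vo, vi, vf, 2);
// armv7:   vo = vmlaq_lane_f32(vo, vi, vget_high_f32(vf), 0);  // vf lane 2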
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <utility>
#include <algorithm>
#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/utils/memory.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
void Conv2dBase::CalOutputShapeAndPadSize(const Tensor *input,
const Tensor *filter,
const int out_tile_height,
const int out_tile_width,
std::vector<index_t> *output_shape,
std::vector<int> *in_pad_size,
std::vector<int> *out_pad_size) {
in_pad_size->resize(4);
out_pad_size->resize(4);
output_shape->resize(4);
const index_t in_height = input->dim(2);
const index_t in_width = input->dim(3);
const index_t stride_h = strides_[0];
const index_t stride_w = strides_[1];
const index_t dilation_h = dilations_[0];
const index_t dilation_w = dilations_[1];
const index_t filter_h = filter->dim(2);
const index_t filter_w = filter->dim(3);
std::vector<int> paddings(2);
if (paddings_.empty()) {
CalcNCHWPaddingAndOutputSize(input->shape().data(),
filter->shape().data(),
dilations_.data(),
strides_.data(),
padding_type_,
output_shape->data(),
paddings.data());
} else {
paddings = paddings_;
CalcNCHWOutputSize(input->shape().data(),
filter->shape().data(),
paddings_.data(),
dilations_.data(),
strides_.data(),
RoundType::FLOOR,
output_shape->data());
}
const index_t out_height = (*output_shape)[2];
const index_t out_width = (*output_shape)[3];
const index_t
padded_out_height = RoundUp<index_t>(out_height, out_tile_height);
const index_t padded_out_width = RoundUp<index_t>(out_width, out_tile_width);
const index_t padded_in_height =
std::max(in_height + paddings[0], (padded_out_height - 1) * stride_h
+ (filter_h - 1) * dilation_h + 1);
const index_t padded_in_width =
std::max(in_width + paddings[1], (padded_out_width - 1) * stride_w
+ (filter_w - 1) * dilation_w + 1);
(*in_pad_size)[0] = paddings[0] >> 1;
(*in_pad_size)[1] =
static_cast<int>(padded_in_height - in_height - (*in_pad_size)[0]);
(*in_pad_size)[2] = paddings[1] >> 1;
(*in_pad_size)[3] =
static_cast<int>(padded_in_width - in_width - (*in_pad_size)[2]);
(*out_pad_size)[0] = 0;
(*out_pad_size)[1] = static_cast<int>(padded_out_height - out_height);
(*out_pad_size)[2] = 0;
(*out_pad_size)[3] = static_cast<int>(padded_out_width - out_width);
}
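// Worked example of the sizing above (all numbers hypothetical): a 3x3
// filter, stride 1, dilation 1, 7x7 input with paddings = {2, 2}, and
// out tiles of 2 (height) x 4 (width). The output is 7x7, so
// padded_out = RoundUp(7, 2) x RoundUp(7, 4) = 8x8, and
// padded_in_height = max(7 + 2, (8 - 1) * 1 + (3 - 1) * 1 + 1) = 10
// (likewise padded_in_width = 10). That yields in_pad_size = {1, 2, 1, 2}
// (top, bottom, left, right) and out_pad_size = {0, 1, 0, 1}; the extra
// bottom/right output rows are trimmed later by UnPadOutput.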
MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output,
const int out_tile_height,
const int out_tile_width,
std::unique_ptr<const Tensor>
*padded_input,
std::unique_ptr<Tensor>
*padded_output) {
std::vector<index_t> output_shape;
std::vector<int> in_pad_size;
std::vector<int> out_pad_size;
CalOutputShapeAndPadSize(input,
filter,
out_tile_height,
out_tile_width,
&output_shape,
&in_pad_size,
&out_pad_size);
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
const index_t batch = input->dim(0);
const index_t in_channels = input->dim(1);
const index_t in_height = input->dim(2);
const index_t in_width = input->dim(3);
const index_t out_channels = output->dim(1);
const index_t out_height = output->dim(2);
const index_t out_width = output->dim(3);
const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1];
const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3];
const index_t
padded_out_height = out_height + out_pad_size[0] + out_pad_size[1];
const index_t
padded_out_width = out_width + out_pad_size[2] + out_pad_size[3];
const bool is_in_padded =
padded_in_height != in_height || padded_in_width != in_width;
const bool is_out_padded =
padded_out_height != out_height || padded_out_width != out_width;
auto scratch_buffer = context->device()->scratch_buffer();
const index_t padded_in_size =
MACE_EXTRA_BUFFER_PAD_SIZE + (is_in_padded ? PadAlignSize(
sizeof(float) * batch * in_channels * padded_in_height
* padded_in_width) : 0);
const index_t padded_out_size = is_out_padded ? PadAlignSize(
sizeof(float) * batch * out_channels * padded_out_height
* padded_out_width) : 0;
scratch_buffer->Rewind();
scratch_buffer->GrowSize(padded_in_size + padded_out_size);
if (is_in_padded) {
std::unique_ptr<Tensor>
padded_in =
make_unique<Tensor>(scratch_buffer->Scratch(padded_in_size),
DataType::DT_FLOAT);
padded_in->Resize({batch, in_channels, padded_in_height, padded_in_width});
PadInput(*input, in_pad_size[0], in_pad_size[2], padded_in.get());
*padded_input = std::move(padded_in);
}
if (is_out_padded) {
std::unique_ptr<Tensor>
padded_out = make_unique<Tensor>(scratch_buffer->Scratch(padded_out_size),
DataType::DT_FLOAT);
padded_out->Resize({batch, out_channels, padded_out_height,
padded_out_width});
*padded_output = std::move(padded_out);
}
return MaceStatus::MACE_SUCCESS;
}
void Conv2dBase::PadInput(const Tensor &src,
const int pad_top,
const int pad_left,
mace::Tensor *dst) {
if (dst == &src) return;
const index_t batch = src.dim(0);
const index_t channels = src.dim(1);
const index_t height = src.dim(2);
const index_t width = src.dim(3);
const index_t padded_height = dst->dim(2);
const index_t padded_width = dst->dim(3);
const int pad_bottom = static_cast<int>(padded_height - height - pad_top);
const int pad_right = static_cast<int>(padded_width - width - pad_left);
auto in_data = src.data<float>();
auto padded_in_data = dst->mutable_data<float>();
const index_t img_size = height * width;
const index_t padded_img_size = padded_height * padded_width;
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t bc = b * channels + c;
const float *in_base = in_data + bc * img_size;
float *padded_in_base = padded_in_data + bc * padded_img_size;
memset(padded_in_base, 0, sizeof(float) * pad_top * padded_width);
padded_in_base += pad_top * padded_width;
for (index_t h = 0; h < height; ++h) {
memset(padded_in_base,
0,
sizeof(float) * pad_left);
memcpy(padded_in_base + pad_left,
in_base,
sizeof(float) * width);
memset(padded_in_base + pad_left + width,
0,
sizeof(float) * pad_right);
in_base += width;
padded_in_base += padded_width;
}
memset(padded_in_base, 0, sizeof(float) * pad_bottom * padded_width);
}
}
}
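// In effect, each padded plane is written as a zeroed top band, then per-row
// [zero left | data | zero right], then a zeroed bottom band. For example,
// with all four pads equal to 1, a 2x2 plane {a b; c d} becomes:
//   0 0 0 0
//   0 a b 0
//   0 c d 0
//   0 0 0 0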
void Conv2dBase::UnPadOutput(const mace::Tensor &src, mace::Tensor *dst) {
if (dst == &src) return;
const index_t batch = dst->dim(0);
const index_t channels = dst->dim(1);
const index_t height = dst->dim(2);
const index_t width = dst->dim(3);
const index_t padded_height = src.dim(2);
const index_t padded_width = src.dim(3);
auto padded_out_data = src.data<float>();
auto out_data = dst->mutable_data<float>();
const index_t img_size = height * width;
const index_t padded_img_size = padded_height * padded_width;
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t bc = (b * channels + c);
float *out_base = out_data + bc * img_size;
const float *padded_out_base = padded_out_data + bc * padded_img_size;
for (index_t h = 0; h < height; ++h) {
memcpy(out_base,
padded_out_base,
sizeof(float) * width);
out_base += width;
padded_out_base += padded_width;
} // h
} // c
} // b
}
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -15,10 +15,14 @@
#ifndef MACE_OPS_ARM_FP32_CONV_2D_H_
#define MACE_OPS_ARM_FP32_CONV_2D_H_
#include <vector>
#include <memory>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/gemm.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
namespace ops {
......@@ -27,13 +31,51 @@ namespace fp32 {
class Conv2dBase {
public:
Conv2dBase() = default;
Conv2dBase(const std::vector<int> strides,
const std::vector<int> dilations,
const std::vector<int> paddings,
const Padding padding_type)
: strides_(strides),
dilations_(dilations),
paddings_(paddings),
padding_type_(padding_type) {}
virtual ~Conv2dBase() = default;
virtual MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) = 0;
protected:
void CalOutputShapeAndPadSize(const Tensor *input,
const Tensor *filter,
const int out_tile_height,
const int out_tile_width,
std::vector<index_t> *output_shape,
std::vector<int> *in_pad_size,
std::vector<int> *out_pad_size);
MaceStatus ResizeOutAndPadInOut(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output,
const int out_tile_height,
const int out_tile_width,
std::unique_ptr<const Tensor> *padded_input,
std::unique_ptr<Tensor> *padded_output);
void PadInput(const Tensor &src,
const int pad_top,
const int pad_left,
Tensor *dst);
void UnPadOutput(const Tensor &src, Tensor *dst);
const std::vector<int> strides_;
const std::vector<int> dilations_;
const std::vector<int> paddings_;
const Padding padding_type_;
};
} // namespace fp32
......
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/fp32/conv_2d_1x1.h"
namespace mace {
......@@ -25,20 +24,68 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
const Tensor *filter,
Tensor *output) {
index_t batch = input->dim(0);
- index_t height = input->dim(2);
- index_t width = input->dim(3);
+ index_t in_height = input->dim(2);
+ index_t in_width = input->dim(3);
index_t in_channels = input->dim(1);
- index_t out_channels = filter->dim(0);
- MACE_RETURN_IF_ERROR(output->Resize({batch, out_channels, height, width}));
- context->device()->scratch_buffer()->Rewind();
std::vector<index_t> output_shape;
std::vector<int> in_pad_size;
std::vector<int> out_pad_size;
CalOutputShapeAndPadSize(input,
filter,
1,
1,
&output_shape,
&in_pad_size,
&out_pad_size);
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
const index_t out_channels = output_shape[1];
const index_t out_height = output_shape[2];
const index_t out_width = output_shape[3];
const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1];
const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3];
// pad input and transform input
const bool is_in_padded =
in_height != padded_in_height || in_width != padded_in_width;
auto scratch_buffer = context->device()->scratch_buffer();
const index_t padded_in_size = is_in_padded ? PadAlignSize(
sizeof(float) * batch * in_channels * padded_in_height
* padded_in_width) : 0;
const index_t pack_filter_size =
PadAlignSize(sizeof(float) * out_channels * in_channels);
const index_t pack_input_size =
PadAlignSize(
sizeof(float) * in_channels * padded_in_height * padded_in_width);
const index_t pack_output_size =
PadAlignSize(
sizeof(float) * out_channels * padded_in_height * padded_in_width);
const index_t gemm_pack_size =
pack_filter_size + pack_input_size + pack_output_size;
scratch_buffer->Rewind();
scratch_buffer->GrowSize(padded_in_size + gemm_pack_size);
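// Reserve the worst case up front: each Scratch() call below advances the
// buffer offset, and (per the GrowSize fix above) growth is refused while
// the buffer is in use, so all sub-allocations must be sized here.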
const Tensor *padded_in = input;
Tensor tmp_padded_in
(scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT);
if (is_in_padded) {
tmp_padded_in.Resize({batch, in_channels, padded_in_height,
padded_in_width});
PadInput(*input, in_pad_size[0], in_pad_size[2], &tmp_padded_in);
padded_in = &tmp_padded_in;
}
return gemm_.Compute(context,
filter,
- input,
+ padded_in,
batch,
out_channels,
in_channels,
in_channels,
- height * width,
+ out_height * out_width,
false,
false,
false,
......
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -15,6 +15,7 @@
#ifndef MACE_OPS_ARM_FP32_CONV_2D_1X1_H_
#define MACE_OPS_ARM_FP32_CONV_2D_1X1_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
......@@ -28,7 +29,8 @@ namespace fp32 {
class Conv2dK1x1 : public Conv2dBase {
public:
Conv2dK1x1() : gemm_(true) {}
Conv2dK1x1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK1x1() {}
MaceStatus Compute(
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
#define MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
class Conv2dK1x7S1 : public Conv2dBase {
public:
Conv2dK1x7S1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK1x7S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
class Conv2dK7x1S1 : public Conv2dBase {
public:
Conv2dK7x1S1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK7x1S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
class Conv2dK1x15S1 : public Conv2dBase {
public:
Conv2dK1x15S1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK1x15S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
class Conv2dK15x1S1 : public Conv2dBase {
public:
Conv2dK15x1S1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK15x1S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
- // Copyright 2018 The MACE Authors. All Rights Reserved.
+ // Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,22 +12,49 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/utils/macros.h"
#include "mace/ops/arm/conv_2d_neon.h"
#include <memory>
#include "mace/ops/arm/fp32/conv_2d_3x3.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
2,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
// Ho = 2, Wo = 4, Co = 2
- void Conv2dNeonK3x3S1(const float *input,
- const float *filter,
- const index_t *in_shape,
- const index_t *out_shape,
- float *output) {
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
......@@ -42,26 +69,26 @@ void Conv2dNeonK3x3S1(const float *input,
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 1 < out_channels) {
- float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
- #if defined(MACE_ENABLE_NEON)
+ float *out_ptr0_base =
+ output_data + b * out_batch_size + m * out_image_size;
float *out_ptr1_base =
- output + b * out_batch_size + (m + 1) * out_image_size;
- #endif
+ output_data + b * out_batch_size + (m + 1) * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
- const float *in_ptr0 = input + b * in_batch_size + c * in_image_size;
- const float *filter_ptr0 = filter + m * in_channels * 9 + c * 9;
+ const float
+ *in_ptr0 = input_data + b * in_batch_size + c * in_image_size;
+ const float *filter_ptr0 = filter_data + m * in_channels * 9 + c * 9;
- #if defined(MACE_ENABLE_NEON)
float *out_ptr1 = out_ptr1_base;
const float *in_ptr1 =
- input + b * in_batch_size + c * in_image_size + 1 * in_width;
+ input_data + b * in_batch_size + c * in_image_size + 1 * in_width;
const float *in_ptr2 =
- input + b * in_batch_size + c * in_image_size + 2 * in_width;
+ input_data + b * in_batch_size + c * in_image_size + 2 * in_width;
const float *in_ptr3 =
- input + b * in_batch_size + c * in_image_size + 3 * in_width;
- const float *filter_ptr1 = filter + (m + 1) * in_channels * 9 + c * 9;
- #endif
- #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+ input_data + b * in_batch_size + c * in_image_size + 3 * in_width;
+ const float
+ *filter_ptr1 = filter_data + (m + 1) * in_channels * 9 + c * 9;
+ #if defined(__aarch64__)
float *out_ptr0 = out_ptr0_base;
// load filter (2 outch x 3 height x 3 width): vf_outch_height
......@@ -179,7 +206,7 @@ void Conv2dNeonK3x3S1(const float *input,
out_ptr0 += out_width;
out_ptr1 += out_width;
} // h
- #elif defined(MACE_ENABLE_NEON) // arm v7
+ #else // arm v7
float *out_ptr0 = out_ptr0_base;
// load filter (2 outch x 3 height x 3 width): vf_outch_height
......@@ -301,32 +328,28 @@ void Conv2dNeonK3x3S1(const float *input,
out_ptr0 += out_width;
out_ptr1 += out_width;
} // h
- #else
- for (index_t oc = 0; oc < 2; ++oc) {
- Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0 + oc * in_channels * 9,
- in_width, 3, 3, out_height, out_width,
- out_ptr0_base + oc * out_image_size, 1);
- }
#endif
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
- output + b * out_batch_size + mm * out_image_size;
+ output_data + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr0 =
- input + b * in_batch_size + c * in_image_size;
- #if defined(MACE_ENABLE_NEON)
+ input_data + b * in_batch_size + c * in_image_size;
const float *in_ptr1 =
- input + b * in_batch_size + c * in_image_size + 1 * in_width;
+ input_data + b * in_batch_size + c * in_image_size
+ + 1 * in_width;
const float *in_ptr2 =
- input + b * in_batch_size + c * in_image_size + 2 * in_width;
+ input_data + b * in_batch_size + c * in_image_size
+ + 2 * in_width;
const float *in_ptr3 =
- input + b * in_batch_size + c * in_image_size + 3 * in_width;
- #endif
- const float *filter_ptr0 = filter + mm * in_channels * 9 + c * 9;
+ input_data + b * in_batch_size + c * in_image_size
+ + 3 * in_width;
+ const float
+ *filter_ptr0 = filter_data + mm * in_channels * 9 + c * 9;
- #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+ #if defined(__aarch64__)
float *out_ptr0 = out_ptr0_base;
// load filter (1 outch x 3 height x 3 width): vf_outch_height
......@@ -409,7 +432,7 @@ void Conv2dNeonK3x3S1(const float *input,
out_ptr0 += out_width;
} // h
- #elif defined(MACE_ENABLE_NEON) // arm v7
+ #else // arm v7
float *out_ptr0 = out_ptr0_base;
// load filter (1 outch x 3 height x 3 width): vf_outch_height
......@@ -494,22 +517,52 @@ void Conv2dNeonK3x3S1(const float *input,
out_ptr0 += out_width;
} // h
- #else
- Conv2dCPUKHxKWCalc(in_ptr0, filter_ptr0, in_width, 3, 3, out_height,
- out_width, out_ptr0_base, 1);
#endif
} // c
} // mm
} // if
} // m
} // b
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
- void Conv2dNeonK3x3S2(const float *input,
- const float *filter,
- const index_t *in_shape,
- const index_t *out_shape,
- float *output) {
+ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
+ const Tensor *input,
+ const Tensor *filter,
+ Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
......@@ -523,11 +576,12 @@ void Conv2dNeonK3x3S2(const float *input,
const index_t in_width = in_shape[3];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
- const float *in_base = input + b * in_batch_size + c * in_image_size;
- const float *filter_ptr = filter + m * in_channels * 9 + c * 9;
- float *out_base = output + b * out_batch_size + m * out_image_size;
+ const float
+ *in_base = input_data + b * in_batch_size + c * in_image_size;
+ const float *filter_ptr = filter_data + m * in_channels * 9 + c * 9;
+ float *out_base = output_data + b * out_batch_size + m * out_image_size;
- #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+ #if defined(__aarch64__)
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x4_t vf00, vf01, vf02;
vf00 = vld1q_f32(filter_ptr);
......@@ -587,7 +641,7 @@ void Conv2dNeonK3x3S2(const float *input,
vst1q_f32(out_base + out_offset, vo);
} // w
} // h
- #elif defined(MACE_ENABLE_NEON) // arm v7
+ #else // arm v7
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x2_t vf01, vf23, vf45, vf67, vf78;
vf01 = vld1_f32(filter_ptr);
......@@ -649,14 +703,16 @@ void Conv2dNeonK3x3S2(const float *input,
vst1q_f32(out_base + out_offset, vo);
} // w
} // h
- #else
- Conv2dCPUKHxKWCalc(in_base, filter_ptr, in_width, 3, 3, out_height,
- out_width, out_base, 2);
#endif
} // c
} // m
} // b
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
#define MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
class Conv2dK3x3S1 : public Conv2dBase {
public:
Conv2dK3x3S1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK3x3S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
class Conv2dK3x3S2 : public Conv2dBase {
public:
Conv2dK3x3S2(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK3x3S2() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
......@@ -34,13 +34,6 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
const index_t in_width = input->dim(3);
const index_t out_channels = filter->dim(0);
- index_t padded_in_height = in_height + pad_top_ + pad_bottom_;
- index_t padded_in_width = in_width + pad_left_ + pad_right_;
- index_t out_height = padded_in_height - 2;
- index_t out_width = padded_in_width - 2;
- output->Resize({batch, out_channels, out_height, out_width});
// When the input feature map is larger than 16x16, use a Winograd
// output tile size of 6 for higher performance.
index_t out_tile_size = 2;
......@@ -48,10 +41,35 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
out_tile_size = 6;
}
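// F(6x6, 3x3) Winograd transforms 8x8 input tiles and reduces the multiply
// count about 2.25x further than F(2x2, 3x3); the transform overhead only
// pays off once the feature map comfortably exceeds 16x16.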
- const index_t padded_out_height = RoundUp<index_t>(out_height, out_tile_size);
- const index_t padded_out_width = RoundUp<index_t>(out_width, out_tile_size);
- padded_in_height = std::max(padded_in_height, padded_out_height + 2);
- padded_in_width = std::max(padded_in_width, padded_out_width + 2);
std::vector<index_t> output_shape;
std::vector<int> in_pad_size;
std::vector<int> out_pad_size;
CalOutputShapeAndPadSize(input,
filter,
out_tile_size,
out_tile_size,
&output_shape,
&in_pad_size,
&out_pad_size);
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const index_t out_height = output_shape[2];
const index_t out_width = output_shape[3];
const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1];
const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3];
const index_t
padded_out_height = out_height + out_pad_size[0] + out_pad_size[1];
const index_t
padded_out_width = out_width + out_pad_size[2] + out_pad_size[3];
const int pad_top = in_pad_size[0];
const int pad_left = in_pad_size[2];
bool is_in_padded =
padded_in_height != in_height || padded_in_width != in_width;
bool is_out_padded =
padded_out_height != out_height || padded_out_width != out_width;
......@@ -63,8 +81,9 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
// pad input and transform input
auto scratch_buffer = context->device()->scratch_buffer();
- const index_t padded_in_size = PadAlignSize(
- sizeof(float) * batch * in_channels * padded_in_height * padded_in_width);
+ const index_t padded_in_size = is_in_padded ? PadAlignSize(
+ sizeof(float) * batch * in_channels * padded_in_height
+ * padded_in_width) : 0;
const index_t padded_out_size = is_out_padded ? PadAlignSize(
sizeof(float) * batch * out_channels * padded_out_height
* padded_out_width) : 0;
......@@ -81,8 +100,18 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
scratch_buffer->GrowSize(
padded_in_size + padded_out_size + transformed_in_size
+ transformed_out_size + gemm_pack_size);
- Tensor padded_in(scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT);
- padded_in.Resize({batch, in_channels, padded_in_height, padded_in_width});
const Tensor *padded_in = input;
Tensor tmp_padded_in
(scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT);
if (is_in_padded) {
tmp_padded_in.Resize({batch, in_channels, padded_in_height,
padded_in_width});
Tensor::MappingGuard guard(&tmp_padded_in);
PadInput(*input, pad_top, pad_left, &tmp_padded_in);
padded_in = &tmp_padded_in;
}
Tensor *padded_out = output;
Tensor tmp_padded_out
(scratch_buffer->Scratch(padded_out_size), DataType::DT_FLOAT);
......@@ -94,22 +123,10 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
auto transformed_in = scratch_buffer->Scratch(transformed_in_size);
auto transformed_out = scratch_buffer->Scratch(transformed_out_size);
- auto padded_in_data = padded_in.data<float>();
+ auto padded_in_data = padded_in->data<float>();
auto padded_out_data = padded_out->mutable_data<float>();
auto transformed_in_data = transformed_in.mutable_data<float>();
auto transformed_out_data = transformed_out.mutable_data<float>();
- const index_t padded_bottom = padded_in_height - in_height - pad_top_;
- const index_t padded_right = padded_in_width - in_width - pad_left_;
- ConstructNCHWInputWithSpecificPadding(input,
- pad_top_,
- padded_bottom,
- pad_left_,
- padded_right,
- &padded_in);
- Tensor::MappingGuard filter_guard(filter);
auto filter_data = filter->data<float>();
if (!filter->is_weight() || out_tile_size != out_tile_size_) {
......@@ -215,47 +232,11 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
default:MACE_NOT_IMPLEMENTED;
}
- if (is_out_padded) {
- UnPackOutput(*padded_out, output);
- }
+ UnPadOutput(*padded_out, output);
return MaceStatus::MACE_SUCCESS;
}
- void Conv2dK3x3Winograd::UnPackOutput(const Tensor &src, Tensor *dst) {
- const index_t batch = dst->dim(0);
- const index_t channels = dst->dim(1);
- const index_t height = dst->dim(2);
- const index_t width = dst->dim(3);
- const index_t padded_height = src.dim(2);
- const index_t padded_width = src.dim(3);
- if (height == padded_height && width == padded_width) {
- return;
- }
- auto padded_out_data = src.data<float>();
- auto out_data = dst->mutable_data<float>();
- const index_t img_size = height * width;
- const index_t padded_img_size = padded_height * padded_width;
- #pragma omp parallel for collapse(3) schedule(runtime)
- for (index_t b = 0; b < batch; ++b) {
- for (index_t c = 0; c < channels; ++c) {
- for (index_t h = 0; h < height; ++h) {
- memcpy(
- out_data + (b * channels + c) * img_size
- + h * width,
- padded_out_data
- + (b * channels + c) * padded_img_size
- + h * padded_width,
- sizeof(float) * width);
- } // h
- } // c
- } // b
- }
// OCHW => TOC
void Conv2dK3x3Winograd::TransformFilter4x4(const float *filter,
const index_t in_channels,
......
......@@ -15,6 +15,7 @@
#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
#define MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
#include <vector>
#include <memory>
#include "mace/public/mace.h"
......@@ -30,12 +31,10 @@ namespace fp32 {
class Conv2dK3x3Winograd : public Conv2dBase {
public:
Conv2dK3x3Winograd(int pad_top, int pad_bottom, int pad_left, int pad_right)
: gemm_(),
pad_top_(pad_top),
pad_bottom_(pad_bottom),
pad_left_(pad_left),
pad_right_(pad_right),
Conv2dK3x3Winograd(const std::vector<int> paddings,
const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type),
gemm_(),
transformed_filter_(nullptr),
out_tile_size_(0) {}
......@@ -48,9 +47,6 @@ class Conv2dK3x3Winograd : public Conv2dBase {
Tensor *output);
private:
void UnPackOutput(const Tensor &padded_output,
Tensor *output);
void TransformFilter4x4(const float *filter,
const index_t in_channels,
const index_t out_channels,
......@@ -94,10 +90,6 @@ class Conv2dK3x3Winograd : public Conv2dBase {
float *output);
Gemm gemm_;
int pad_top_;
int pad_bottom_;
int pad_left_;
int pad_right_;
std::unique_ptr<Tensor> transformed_filter_;
index_t out_tile_size_;
};
......
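Padding is no longer pre-resolved by the caller: the constructor forwards the raw paddings and padding_type to Conv2dBase together with the fixed {1, 1} strides and dilations. A usage sketch under those assumptions; the attribute values are hypothetical and would normally come from the model definition:

std::vector<int> paddings;          // empty: let padding_type decide
Padding padding_type = SAME;        // e.g. SAME or VALID
Conv2dK3x3Winograd winograd(paddings, padding_type);
// later: winograd.Compute(context, input, filter, output);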
// Copyright 2018 The MACE Authors. All Rights Reserved.
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,14 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/ops/arm/conv_2d_neon.h"
#include <memory>
#include "mace/ops/arm/fp32/conv_2d_5x5.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
#define MACE_Conv2dNeonK5x5SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
......@@ -76,12 +76,40 @@ namespace ops {
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1);
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK5x5S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
......@@ -96,26 +124,26 @@ void Conv2dNeonK5x5S1(const float *input,
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size;
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
output_data + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
output_data + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
output_data + b * out_batch_size + (m + 3) * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 25 + c * 25;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + m * in_channels * 25 + c * 25;
const float *filter_ptr1 =
filter + (m + 1) * in_channels * 25 + c * 25;
filter_data + (m + 1) * in_channels * 25 + c * 25;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * 25 + c * 25;
filter_data + (m + 2) * in_channels * 25 + c * 25;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * 25 + c * 25;
filter_data + (m + 3) * in_channels * 25 + c * 25;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -158,23 +186,16 @@ void Conv2dNeonK5x5S1(const float *input,
filter_ptr3 -= 25;
} // w
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 25,
in_width, 5, 5, out_height, out_width,
out_ptr0_base + oc * out_image_size, 1);
}
#endif
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
output + b * out_batch_size + mm * out_image_size;
output_data + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + mm * in_channels * 25 + c * 25;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -204,16 +225,17 @@ void Conv2dNeonK5x5S1(const float *input,
filter_ptr0 -= 25;
} // w
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 5, 5,
out_height, out_width, out_ptr0_base, 1);
#endif
} // c
} // mm
} // if
} // m
} // b
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
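The 5x5 kernel above, the three 7x7 variants, and Conv2dGeneral below all share one calling skeleton, condensed here once for reference. Treating the 1 and 4 arguments as the 1x4 output tile each NEON step produces is an assumption, as is anything about ResizeOutAndPadInOut beyond what the calls themselves show:

std::unique_ptr<const Tensor> padded_input;  // stays null if no pad needed
std::unique_ptr<Tensor> padded_output;       // stays null if already aligned
ResizeOutAndPadInOut(context, input, filter, output, 1, 4,
                     &padded_input, &padded_output);
const Tensor *in_tensor = padded_input ? padded_input.get() : input;
Tensor *out_tensor = padded_output ? padded_output.get() : output;
out_tensor->Clear();               // the kernels accumulate into the output
// ... NEON inner loops read in_tensor and write out_tensor ...
UnPadOutput(*out_tensor, output);  // crops back; no-op when shapes match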
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
#define MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
class Conv2dK5x5S1 : public Conv2dBase {
public:
Conv2dK5x5S1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK5x5S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP32_CONV_2D_5X5_H_
// Copyright 2018 The MACE Authors. All Rights Reserved.
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,14 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/ops/arm/conv_2d_neon.h"
#include <memory>
#include "mace/ops/arm/fp32/conv_2d_7x7.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \
......@@ -153,12 +153,40 @@ namespace ops {
vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK7x7S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
MaceStatus Conv2dK7x7S1::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
......@@ -173,26 +201,25 @@ void Conv2dNeonK7x7S1(const float *input,
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size;
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
output_data + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
output_data + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
output_data + b * out_batch_size + (m + 3) * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + m * in_channels * 49 + c * 49;
const float *filter_ptr1 =
filter + (m + 1) * in_channels * 49 + c * 49;
filter_data + (m + 1) * in_channels * 49 + c * 49;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * 49 + c * 49;
filter_data + (m + 2) * in_channels * 49 + c * 49;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * 49 + c * 49;
filter_data + (m + 3) * in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -243,23 +270,16 @@ void Conv2dNeonK7x7S1(const float *input,
filter_ptr3 -= 49;
} // w
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49,
in_width, 7, 7, out_height, out_width,
out_ptr0_base + oc * out_image_size, 1);
}
#endif
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
output + b * out_batch_size + mm * out_image_size;
output_data + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -297,23 +317,50 @@ void Conv2dNeonK7x7S1(const float *input,
filter_ptr0 -= 49;
} // w
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
out_height, out_width, out_ptr0_base, 1);
#endif
} // c
} // mm
} // if
} // m
} // b
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK7x7S2(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
MaceStatus Conv2dK7x7S2::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
......@@ -328,26 +375,25 @@ void Conv2dNeonK7x7S2(const float *input,
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size;
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
output_data + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
output_data + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
output_data + b * out_batch_size + (m + 3) * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + m * in_channels * 49 + c * 49;
const float *filter_ptr1 =
filter + (m + 1) * in_channels * 49 + c * 49;
filter_data + (m + 1) * in_channels * 49 + c * 49;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * 49 + c * 49;
filter_data + (m + 2) * in_channels * 49 + c * 49;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * 49 + c * 49;
filter_data + (m + 3) * in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -403,23 +449,16 @@ void Conv2dNeonK7x7S2(const float *input,
filter_ptr3 -= 49;
} // w
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49,
in_width, 7, 7, out_height, out_width,
out_ptr0_base + oc * out_image_size, 2);
}
#endif
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
output + b * out_batch_size + mm * out_image_size;
output_data + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -462,23 +501,50 @@ void Conv2dNeonK7x7S2(const float *input,
filter_ptr0 -= 49;
} // w
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
out_height, out_width, out_ptr0_base, 2);
#endif
} // c
} // mm
} // if
} // m
} // b
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK7x7S3(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
MaceStatus Conv2dK7x7S3::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
......@@ -493,26 +559,25 @@ void Conv2dNeonK7x7S3(const float *input,
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size;
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
output_data + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
output_data + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
output_data + b * out_batch_size + (m + 3) * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + m * in_channels * 49 + c * 49;
const float *filter_ptr1 =
filter + (m + 1) * in_channels * 49 + c * 49;
filter_data + (m + 1) * in_channels * 49 + c * 49;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * 49 + c * 49;
filter_data + (m + 2) * in_channels * 49 + c * 49;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * 49 + c * 49;
filter_data + (m + 3) * in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -568,23 +633,16 @@ void Conv2dNeonK7x7S3(const float *input,
filter_ptr3 -= 49;
} // w
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 49,
in_width, 7, 7, out_height, out_width,
out_ptr0_base + oc * out_image_size, 3);
}
#endif
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
output + b * out_batch_size + mm * out_image_size;
output_data + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + mm * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
input_data + b * in_batch_size + c * in_image_size;
const float
*filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -627,16 +685,17 @@ void Conv2dNeonK7x7S3(const float *input,
filter_ptr0 -= 49;
} // w
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 7, 7,
out_height, out_width, out_ptr0_base, 3);
#endif
} // c
} // mm
} // if
} // m
} // b
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
#define MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
class Conv2dK7x7S1 : public Conv2dBase {
public:
Conv2dK7x7S1(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK7x7S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
class Conv2dK7x7S2 : public Conv2dBase {
public:
Conv2dK7x7S2(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK7x7S2() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
class Conv2dK7x7S3 : public Conv2dBase {
public:
Conv2dK7x7S3(const std::vector<int> paddings, const Padding padding_type)
: Conv2dBase({3, 3}, {1, 1}, paddings, padding_type) {}
virtual ~Conv2dK7x7S3() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
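The three 7x7 classes differ only in the stride they forward to Conv2dBase ({1,1}, {2,2}, {3,3}). Selection could look like the hypothetical helper below; the real dispatch lives in the conv op setup, which this diff does not show, and MACE_INVALID_ARGS as the fallback status is also an assumption:

// Hypothetical stride dispatch over the concrete 7x7 kernels.
MaceStatus RunConv2dK7x7(int stride,
                         const std::vector<int> &paddings,
                         Padding padding_type,
                         const OpContext *context,
                         const Tensor *input,
                         const Tensor *filter,
                         Tensor *output) {
  if (stride == 1) {
    Conv2dK7x7S1 conv(paddings, padding_type);
    return conv.Compute(context, input, filter, output);
  }
  if (stride == 2) {
    Conv2dK7x7S2 conv(paddings, padding_type);
    return conv.Compute(context, input, filter, output);
  }
  if (stride == 3) {
    Conv2dK7x7S3 conv(paddings, padding_type);
    return conv.Compute(context, input, filter, output);
  }
  return MaceStatus::MACE_INVALID_ARGS;  // assumed fallback
}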
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include "mace/ops/arm/fp32/conv_general.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
MaceStatus Conv2dGeneral::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto in_shape = in_tensor->shape();
auto out_shape = out_tensor->shape();
auto filter_shape = filter->shape();
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = filter_shape[1] * in_image_size;
const index_t out_batch_size = filter_shape[0] * out_image_size;
const index_t filter_size = filter_shape[2] * filter_shape[3];
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < in_shape[0]; b++) {
for (index_t m = 0; m < filter_shape[0]; m += 4) {
const index_t in_width = in_shape[3];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t out_channels = filter_shape[0];
const index_t in_channels = filter_shape[1];
const int stride_h = strides_[0];
const int stride_w = strides_[1];
const int dilation_h = dilations_[0];
const int dilation_w = dilations_[1];
if (m + 3 < out_channels) {
float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size;
float *out_ptr1_base = out_ptr0_base + out_image_size;
float *out_ptr2_base = out_ptr1_base + out_image_size;
float *out_ptr3_base = out_ptr2_base + out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 =
filter_data + m * in_channels * filter_size + c * filter_size;
const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size;
const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size;
const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
index_t ih = h * stride_h;
index_t iw = w * stride_w;
index_t in_offset = ih * in_width + iw;
// output (4 outch x 1 height x 4 width): vo_outch_height
float vo0[4], vo1[4], vo2[4], vo3[4];
// load output
index_t out_offset = h * out_width + w;
for (index_t ow = 0; ow < 4; ++ow) {
vo0[ow] = out_ptr0_base[out_offset + ow];
vo1[ow] = out_ptr1_base[out_offset + ow];
vo2[ow] = out_ptr2_base[out_offset + ow];
vo3[ow] = out_ptr3_base[out_offset + ow];
}
// calc by row
for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
// outch 0
vo0[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr0[kw];
vo0[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr0[kw];
vo0[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr0[kw];
vo0[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr0[kw];
// outch 1
vo1[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr1[kw];
vo1[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr1[kw];
vo1[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr1[kw];
vo1[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr1[kw];
// outch 2
vo2[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr2[kw];
vo2[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr2[kw];
vo2[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr2[kw];
vo2[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr2[kw];
// outch 3
vo3[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr3[kw];
vo3[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr3[kw];
vo3[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr3[kw];
vo3[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr3[kw];
} // kw
in_offset += dilation_h * in_width;
filter_ptr0 += filter_shape[3];
filter_ptr1 += filter_shape[3];
filter_ptr2 += filter_shape[3];
filter_ptr3 += filter_shape[3];
} // kh
for (index_t ow = 0; ow < 4; ++ow) {
out_ptr0_base[out_offset + ow] = vo0[ow];
out_ptr1_base[out_offset + ow] = vo1[ow];
out_ptr2_base[out_offset + ow] = vo2[ow];
out_ptr3_base[out_offset + ow] = vo3[ow];
}
filter_ptr0 -= filter_size;
filter_ptr1 -= filter_size;
filter_ptr2 -= filter_size;
filter_ptr3 -= filter_size;
} // w
} // h
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 =
filter_data + mm * in_channels * filter_size + c * filter_size;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
index_t ih = h * stride_h;
index_t iw = w * stride_w;
index_t in_offset = ih * in_width + iw;
// output (1 outch x 1 height x 4 width): vo_outch_height
float vo0[4];
// load output
index_t out_offset = h * out_width + w;
for (index_t ow = 0; ow < 4; ++ow) {
vo0[ow] = out_ptr0_base[out_offset + ow];
}
// calc by row
for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
// outch 0
vo0[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr0[kw];
vo0[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr0[kw];
vo0[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr0[kw];
vo0[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr0[kw];
} // kw
in_offset += dilation_h * in_width;
filter_ptr0 += filter_shape[3];
} // kh
for (index_t ow = 0; ow < 4; ++ow) {
out_ptr0_base[out_offset + ow] = vo0[ow];
}
filter_ptr0 -= filter_size;
} // w
} // h
} // c
} // mm
} // if
} // m
} // b
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
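The 4-output-channel by 4-width register blocking above reduces to the plain NCHW convolution below. A minimal scalar reference of the same indexing, as a sketch: the input is assumed to be pre-padded by ResizeOutAndPadInOut, so no bounds checks appear:

#include <cstdint>

// out[b][m][h][w] = sum over c, kh, kw of
//   in[b][c][h*stride_h + kh*dilation_h][w*stride_w + kw*dilation_w]
//   * filter[m][c][kh][kw]
inline void Conv2dNCHWRef(const float *in, const float *filter, float *out,
                          int64_t batch, int64_t in_channels,
                          int64_t in_height, int64_t in_width,
                          int64_t out_channels, int64_t out_height,
                          int64_t out_width,
                          int64_t filter_h, int64_t filter_w,
                          int stride_h, int stride_w,
                          int dilation_h, int dilation_w) {
  for (int64_t b = 0; b < batch; ++b) {
    for (int64_t m = 0; m < out_channels; ++m) {
      for (int64_t h = 0; h < out_height; ++h) {
        for (int64_t w = 0; w < out_width; ++w) {
          float sum = 0.f;
          for (int64_t c = 0; c < in_channels; ++c) {
            for (int64_t kh = 0; kh < filter_h; ++kh) {
              for (int64_t kw = 0; kw < filter_w; ++kw) {
                const int64_t ih = h * stride_h + kh * dilation_h;
                const int64_t iw = w * stride_w + kw * dilation_w;
                sum += in[((b * in_channels + c) * in_height + ih)
                              * in_width + iw]
                    * filter[((m * in_channels + c) * filter_h + kh)
                                 * filter_w + kw];
              }
            }
          }
          out[((b * out_channels + m) * out_height + h) * out_width + w] =
              sum;
        }
      }
    }
  }
}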
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_CONV_GENERAL_H_
#define MACE_OPS_ARM_FP32_CONV_GENERAL_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
class Conv2dGeneral : public Conv2dBase {
public:
Conv2dGeneral(const std::vector<int> strides,
const std::vector<int> dilations,
const std::vector<int> paddings,
const Padding padding_type)
: Conv2dBase(strides, dilations, paddings, padding_type) {}
virtual ~Conv2dGeneral() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
};
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP32_CONV_GENERAL_H_
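Conv2dGeneral is the fallback for any stride/dilation combination the specialized kernels do not cover; construction mirrors the specialized classes but with caller-supplied strides and dilations. A usage sketch with hypothetical attribute values:

std::vector<int> strides = {2, 2};
std::vector<int> dilations = {2, 2};  // dilated conv falls through to here
std::vector<int> paddings;            // empty: derived from padding_type
Conv2dGeneral conv(strides, dilations, paddings, SAME);
// conv.Compute(context, input, filter, output);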
// Copyright 2018 The MACE Authors. All Rights Reserved.
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
This diff is collapsed.
......@@ -16,7 +16,6 @@
#include "mace/ops/ref/conv_2d.h"
#include <vector>
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
namespace ops {
......@@ -30,31 +29,36 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
const std::vector<index_t> in_shape = input->shape();
const std::vector<index_t> filter_shape = filter->shape();
const std::vector<index_t> out_shape = output->shape();
const std::vector<int> stride_hw{stride_h_, stride_w_};
const std::vector<int> dilation_hw{dilation_h_, dilation_w_};
const std::vector<int> paddings{pad_h_, pad_w_};
const index_t pad_top = pad_h_ >> 1;
const index_t pad_left = pad_w_ >> 1;
std::vector<index_t> output_shape(4);
CalcOutputSize(in_shape.data(),
NCHW,
filter_shape.data(),
OIHW,
paddings.data(),
dilation_hw.data(),
stride_hw.data(),
RoundType::FLOOR,
output_shape.data());
output->Resize(output_shape);
std::vector<index_t> out_shape(4);
std::vector<int> paddings(2);
if (paddings_.empty()) {
CalcNCHWPaddingAndOutputSize(input->shape().data(),
filter->shape().data(),
dilations_.data(),
strides_.data(),
padding_type_,
out_shape.data(),
paddings.data());
} else {
paddings = paddings_;
CalcNCHWOutputSize(input->shape().data(),
filter->shape().data(),
paddings_.data(),
dilations_.data(),
strides_.data(),
RoundType::FLOOR,
out_shape.data());
}
const index_t pad_top = paddings[0] >> 1;
const index_t pad_left = paddings[1] >> 1;
output->Resize(out_shape);
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = filter_shape[1] * in_image_size;
const index_t out_batch_size = filter_shape[0] * out_image_size;
const index_t filter_size = filter_shape[2] * filter_shape[3];
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard output_guard(output);
......@@ -86,8 +90,10 @@ MaceStatus Conv2d<float>::Compute(const OpContext *context,
for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
const index_t ih = -pad_top + h * stride_h_ + kh * dilation_h_;
const index_t iw = -pad_left + w * stride_w_ + kw * dilation_w_;
const index_t
ih = -pad_top + h * strides_[0] + kh * dilations_[0];
const index_t
iw = -pad_left + w * strides_[1] + kw * dilations_[1];
if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
sum += in_ptr_base[ih * in_width + iw] * filter_ptr[kw];
}
......
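In the hunk above, paddings holds the total padding per axis, and the right-shift assigns the smaller half to the top/left edge. A small illustration; pad_bottom and pad_right are not used by this kernel (the bounds check on ih/iw covers them) and are shown only to complete the picture:

const int pad_top = paddings[0] >> 1;          // e.g. total 3 -> top 1
const int pad_bottom = paddings[0] - pad_top;  //              bottom 2
const int pad_left = paddings[1] >> 1;
const int pad_right = paddings[1] - pad_left;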
......@@ -16,9 +16,12 @@
#ifndef MACE_OPS_REF_CONV_2D_H_
#define MACE_OPS_REF_CONV_2D_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/common/conv_pool_2d_util.h"
namespace mace {
namespace ops {
......@@ -27,30 +30,39 @@ namespace ref {
template<typename OUTPUT_TYPE>
class Conv2d {
public:
Conv2d(int stride_h, int stride_w, int dilation_h, int dilation_w);
Conv2d(const std::vector<int> strides,
const std::vector<int> dilations,
const std::vector<int> paddings,
const Padding padding_type)
: strides_(strides),
dilations_(dilations),
paddings_(paddings),
padding_type_(padding_type) {}
~Conv2d() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output);
private:
const std::vector<int> strides_;
const std::vector<int> dilations_;
const std::vector<int> paddings_;
const Padding padding_type_;
};
template<>
class Conv2d<float> {
public:
Conv2d(int pad_h,
int pad_w,
int stride_h,
int stride_w,
int dilation_h,
int dilation_w)
: pad_h_(pad_h),
pad_w_(pad_w),
stride_h_(stride_h),
stride_w_(stride_w),
dilation_h_(dilation_h),
dilation_w_(dilation_w) {}
Conv2d(const std::vector<int> strides,
const std::vector<int> dilations,
const std::vector<int> paddings,
const Padding padding_type)
: strides_(strides),
dilations_(dilations),
paddings_(paddings),
padding_type_(padding_type) {}
~Conv2d() {}
// Always row-major after transpose
MaceStatus Compute(
......@@ -60,12 +72,10 @@ class Conv2d<float> {
Tensor *output);
private:
int pad_h_;
int pad_w_;
int stride_h_;
int stride_w_;
int dilation_h_;
int dilation_w_;
const std::vector<int> strides_;
const std::vector<int> dilations_;
const std::vector<int> paddings_;
const Padding padding_type_;
};
} // namespace ref
......
......@@ -1969,7 +1969,8 @@ class Transformer(base_converter.ConverterInterface):
else:
print("Quantize op %s (%s)" % (op.name, op.type))
non_zero = self._option.device == DeviceType.CPU.value
non_zero = self._option.device == DeviceType.CPU.value \
and op.type == MaceOp.MatMul.name
for idx, input_tensor in enumerate(op.input):
quantized_inputs_names.append(input_tensor)
......