diff --git a/mace/core/memory_optimizer.cc b/mace/core/memory_optimizer.cc index 6d08ca907b55b209cc9b5d05b7638bbf9d27fd18..12581af6e979c87deac701f7dc92e8dad676bec5 100644 --- a/mace/core/memory_optimizer.cc +++ b/mace/core/memory_optimizer.cc @@ -90,7 +90,9 @@ MemoryBlock MemoryOptimizer::CreateMemoryBlock( if (shape.size() == 2) { shape = {shape[0], 1, 1, shape[1]}; } else { - MACE_CHECK(shape.size() == 4) << "GPU only support 2D/4D input"; + MACE_CHECK(shape.size() == 4) << "GPU only support 2D/4D input, " + << "op name: " << op_def->name() << ", " + << MakeString(shape); } OpenCLUtil::CalImage2DShape(shape, buffer_type, &image_shape); block.set_x(image_shape[0]); diff --git a/mace/ops/arm/fp32/bias_add.cc b/mace/ops/arm/fp32/bias_add.cc index de4b6d575b194b253243cdfb3ffe7ceebec3f045..f572b22bbd1cfe80b39ff7e2c76727aa8b437fac 100644 --- a/mace/ops/arm/fp32/bias_add.cc +++ b/mace/ops/arm/fp32/bias_add.cc @@ -62,34 +62,62 @@ void BiasAdd::AddBias(const OpContext *context, utils::ThreadPool &thread_pool = context->device()->cpu_runtime()->thread_pool(); - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { - for (index_t b = start0; b < end0; b += step0) { - for (index_t c = start1; c < end1; c += step1) { - const index_t offset = (b * channels + c) * image_size; - auto input_ptr = input_data + offset; - auto output_ptr = output_data + offset; - const float bias = bias_data[c]; - float32x4_t vbias = vdupq_n_f32(bias); + if (bias->dim_size() == 1) { + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + const index_t b_offset = b * channels; + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = (b_offset + c) * image_size; + auto input_ptr = input_data + offset; + auto output_ptr = output_data + offset; + const float bias = bias_data[c]; + float32x4_t vbias = vdupq_n_f32(bias); - for (index_t i = 0; i < block_count; ++i) { - float32x4_t v = vld1q_f32(input_ptr); - v = vaddq_f32(v, vbias); - vst1q_f32(output_ptr, v); + for (index_t i = 0; i < block_count; ++i) { + float32x4_t v = vld1q_f32(input_ptr); + v = vaddq_f32(v, vbias); + vst1q_f32(output_ptr, v); - input_ptr += 4; - output_ptr += 4; + input_ptr += 4; + output_ptr += 4; + } + for (index_t i = 0; i < remain; ++i) { + (*output_ptr++) = (*input_ptr++) + bias; + } } - for (index_t i = 0; i < remain; ++i) { - (*output_ptr++) = (*input_ptr++) + bias; + } + }, 0, batch, 1, 0, channels, 1); + } else { + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + const index_t b_offset = b * channels; + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = (b_offset + c) * image_size; + auto input_ptr = input_data + offset; + auto output_ptr = output_data + offset; + const float bias = bias_data[b * channels + c]; + float32x4_t vbias = vdupq_n_f32(bias); + + for (index_t i = 0; i < block_count; ++i) { + float32x4_t v = vld1q_f32(input_ptr); + v = vaddq_f32(v, vbias); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + for (index_t i = 0; i < remain; ++i) { + (*output_ptr++) = (*input_ptr++) + bias; + } } } - } - }, 0, batch, 1, 0, channels, 1); + }, 0, batch, 1, 0, channels, 1); + } } } // namespace fp32 } // namespace arm } // namespace ops } // namespace mace - diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index f8c5b352d54bd80dd622d8fba8d5a81b8429a88b..c17a6e49dd5ab74302933d31fea4d07a197c4a8f 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -49,15 +49,18 @@ class BiasAddOp : public Operation { MACE_UNUSED(context); const Tensor *input = this->Input(0); const Tensor *bias = this->Input(1); - - MACE_CHECK(bias->dim_size() == 1, "bias must be 1-dimensional. ", - bias->dim_size()); - Tensor *output = this->Output(0); - if (input->dim_size() == 4 && has_data_format_) { + if (input->dim_size() == 4 && (has_data_format_ + || input->data_format() == DataFormat::NCHW)) { // NCHW + MACE_CHECK(bias->dim_size() == 1 || bias->dim_size() == 2, + "bias must be 1-dimensional or n*c for caffee.", + MakeString(bias->shape())); bias_add_delegator_.Compute(context, input, bias, output); - } else { + } else { // NHWC + MACE_CHECK(bias->dim_size() == 1 || bias->dim_size() == 2, + "bias must be 1 or 2 dimensionals for caffee.", + bias->dim_size(), MakeString(bias->shape())); // TODO(liyin): remove it and tranform bias to add (eltwise) MACE_RETURN_IF_ERROR(output->ResizeLike(input)); @@ -70,16 +73,40 @@ class BiasAddOp : public Operation { float *output_ptr = output->mutable_data(); const std::vector &shape = input->shape(); - const index_t fused_batch = std::accumulate( - shape.begin(), shape.end() - 1, 1, std::multiplies()); const index_t channels = *shape.rbegin(); - - for (index_t n = 0; n < fused_batch; ++n) { - index_t pos = n * channels; - for (index_t c = 0; c < channels; ++c) { - output_ptr[pos] = input_ptr[pos] + bias_ptr[c]; - ++pos; - } + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + if (bias->dim_size() == 1) { + const index_t fused_batch = std::accumulate( + shape.begin(), shape.end() - 1, 1, std::multiplies()); + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t n = start; n < end; n += step) { + index_t pos = n * channels; + for (index_t c = 0; c < channels; ++c) { + output_ptr[pos] = input_ptr[pos] + bias_ptr[c]; + ++pos; + } + } + }, 0, fused_batch, 1); + } else { // bias is 2d + const auto n = shape[0]; + MACE_CHECK(n == bias->shape()[0]); + const index_t fused_hw = std::accumulate( + shape.begin() + 1, shape.end() - 1, 1, std::multiplies()); + const auto ch_size = bias->shape()[1]; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + auto offset = i * fused_hw; + auto bias_offset = i * ch_size; + for (index_t j = start1; j < end1; j += step1) { + index_t pos = (offset + i) * channels; + for (index_t c = 0; c < channels; ++c, ++pos) { + output_ptr[pos] = input_ptr[pos] + bias_ptr[bias_offset + c]; + } + } + } + }, 0, n, 1, 0, fused_hw, 1); } } @@ -109,21 +136,25 @@ class BiasAddOp : public Operation { } else { MACE_NOT_IMPLEMENTED; } - MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) - == MaceStatus::MACE_SUCCESS); + + // for const bias tensor + if (context->workspace()->GetTensor(operator_def_->input(1)) != nullptr) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS, "TransformFilter failed"); + } } + MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); const Tensor *bias = this->Input(1); - - MACE_CHECK(bias->dim_size() == 1, "bias must be 1-dimensional. ", - bias->dim_size()); - Tensor *output = this->Output(0); MACE_RETURN_IF_ERROR(output->ResizeLike(input)); MACE_CHECK(input->dim_size() == 4 && has_data_format_, "gpu only support biasadd for 4-dimensional NHWC format tensor"); + MACE_CHECK(bias->dim_size() == 1 || bias->dim_size() == 2, + "bias must be 1-dimensional or 2-dimensional for caffee. ", + MakeString(bias->shape())); return kernel_->Compute(context, input, bias, output); } @@ -151,6 +182,10 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) { *op, "has_data_format", 0); if (!has_data_format || op->output_shape(0).dims_size() != 4) { + LOG(INFO) << "BiasAdd only support cpu, has_data_format=" + << has_data_format + << ", op->output_shape(0).dims_size()=" + << op->output_shape(0).dims_size(); return {DeviceType::CPU}; } return {DeviceType::CPU, DeviceType::GPU}; diff --git a/mace/ops/lpnorm.cc b/mace/ops/lpnorm.cc new file mode 100644 index 0000000000000000000000000000000000000000..2c62ac194f688788502b1f8be19505ad87ab4402 --- /dev/null +++ b/mace/ops/lpnorm.cc @@ -0,0 +1,157 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include +#include + +#include "mace/core/operator.h" + +#ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/image/lpnorm.h" +#endif // MACE_ENABLE_OPENCL + +/** + * LpNormOp is a Normalization OP which support L1 and L2, which is a custom op + * of caffe (not exist in official caffe), please reference: + * https://github.com/freesouls/caffe/blob/master/src/caffe/layers/normalization_layer.cpp #noqa + */ + +namespace mace { +namespace ops { + +template +class LpNormOp; + +template<> +class LpNormOp : public Operation { + public: + explicit LpNormOp(OpConstructContext *context) + : Operation(context), + p_(Operation::GetOptionalArg("p", 2)), + axis_(Operation::GetOptionalArg("axis", -1)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + if (axis_ < 0) { + axis_ += input->dim_size(); + } + MACE_CHECK(axis_ < input->dim_size() && axis_ >= 0, + "The axis_ must be small than dim size"); + const std::vector &input_shape = input->shape(); + MACE_RETURN_IF_ERROR(output->Resize(input_shape)); + + Tensor::MappingGuard guard_input(input); + Tensor::MappingGuard guard_output(output); + + const auto *input_data = input->data(); + auto *output_data = output->mutable_data(); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + auto outer_loop = std::accumulate(input_shape.begin(), + input_shape.begin() + axis_, 1, + std::multiplies()); + auto inner_loop = std::accumulate(input_shape.begin() + axis_, + input_shape.end(), 1, + std::multiplies()); + + if (p_ == 1) { + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + output_data[i] = std::abs(input_data[i]); + } + }, 0, input->size(), 1); + } else if (p_ == 2) { + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + output_data[i] = input_data[i] * input_data[i]; + } + }, 0, input->size(), 1); + } else { + LOG(FATAL) << "LpNorm's p should be 1 or 2, current p is: " << p_; + } + + const float power = 1 / static_cast(p_); + auto norm_buffer = context->device()->scratch_buffer(); + norm_buffer->Rewind(); + MACE_RETURN_IF_ERROR(norm_buffer->GrowSize(outer_loop * sizeof(float))); + float *norm_ptr = norm_buffer->mutable_data(); + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + auto output_data_base = output_data + inner_loop * i; + norm_ptr[i] = std::accumulate(output_data_base, + output_data_base + inner_loop, 0.0f); + norm_ptr[i] = std::pow(norm_ptr[i], power); + norm_ptr[i] += 1e-6; + } + }, 0, outer_loop, 1); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + const auto offset = i * inner_loop; + for (index_t j = start1; j < end1; j += step1) { + output_data[offset + j] = input_data[offset + j] / norm_ptr[i]; + } + } + }, 0, outer_loop, 1, 0, inner_loop, 1); + + return MaceStatus::MACE_SUCCESS; + } + + private: + int p_; + int axis_; +}; + +#ifdef MACE_ENABLE_OPENCL +template<> +class LpNormOp : public Operation { + public: + explicit LpNormOp(OpConstructContext *context) + : Operation(context) { + const auto p = Operation::GetOptionalArg("p", 2); + const auto axis = Operation::GetOptionalArg("axis", -1); + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { + kernel_ = make_unique(p, axis); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + Tensor *output = this->Output(OUTPUT); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + return kernel_->Compute(context, input, output); + } + + private: + std::unique_ptr kernel_; + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; +#endif // MACE_ENABLE_OPENCL + +void RegisterLpNorm(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "LpNorm", LpNormOp, + DeviceType::CPU, float); + MACE_REGISTER_GPU_OP(op_registry, "LpNorm", LpNormOp); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/mvnorm.cc b/mace/ops/mvnorm.cc new file mode 100644 index 0000000000000000000000000000000000000000..ccb0018a5881d1341de931bead67a41a367af985 --- /dev/null +++ b/mace/ops/mvnorm.cc @@ -0,0 +1,175 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include +#include + +#include "mace/core/operator.h" + +#ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/image/mvnorm.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace ops { + +// Mean-Variance Normalization (MVN) +template +class MVNormOp; + +template<> +class MVNormOp : public Operation { + public: + explicit MVNormOp(OpConstructContext *context) + : Operation(context), + normalize_variance_( + Operation::GetOptionalArg("normalize_variance", true)), + across_channels_( + Operation::GetOptionalArg("across_channels", false)), + eps_(Operation::GetOptionalArg("epsilon", 1e-9)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->data_format() == DataFormat::NCHW, + "The MVN only suport NCHW"); + const std::vector &input_shape = input->shape(); + MACE_RETURN_IF_ERROR(output->Resize(input_shape)); + + Tensor::MappingGuard guard_input(input); + Tensor::MappingGuard guard_output(output); + const auto *input_data = input->data(); + auto *output_data = output->mutable_data(); + + const auto input_size = input->size(); + const auto outer_loop = + across_channels_ ? input_shape[0] : input_shape[0] * input_shape[1]; + const auto inner_loop = input_size / outer_loop; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + Buffer mean_buffer(context->device()->allocator()); + MACE_RETURN_IF_ERROR(mean_buffer.Allocate(outer_loop * sizeof(float))); + auto *mean_ptr = mean_buffer.mutable_data(); + + // compute EX + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + const auto offset = inner_loop * i; + mean_ptr[i] = std::accumulate(input_data + offset, + input_data + offset + inner_loop, 0.0f); + mean_ptr[i] /= inner_loop; + } + }, 0, outer_loop, 1); + + // compute (X - EX) + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + const auto offset = i * inner_loop; + for (index_t j = start1; j < end1; j += step1) { + output_data[offset + j] = input_data[offset + j] - mean_ptr[i]; + } + } + }, 0, outer_loop, 1, 0, inner_loop, 1); + + if (normalize_variance_) { + // compute (X - EX)^2 + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + output_data[i] = output_data[i] * output_data[i]; + } + }, 0, input_size, 1); + + auto mean_v_buffer = context->device()->scratch_buffer(); + mean_v_buffer->Rewind(); + MACE_RETURN_IF_ERROR( + mean_v_buffer->GrowSize(outer_loop * sizeof(float))); + float *mean_v_ptr = mean_v_buffer->mutable_data(); + // compute E((X - EX)^2)^0.5 + eps_ + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + auto output_data_base = output_data + inner_loop * i; + mean_v_ptr[i] = std::accumulate(output_data_base, + output_data_base + inner_loop, 0.0f); + mean_v_ptr[i] = std::pow(mean_v_ptr[i] / inner_loop, 0.5f) + eps_; + } + }, 0, outer_loop, 1); + + // compute (X - EX) / (E((X - EX)^2)^0.5 + eps_) + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + const auto offset = i * inner_loop; + for (index_t j = start1; j < end1; j += step1) { + output_data[offset + j] = + (input_data[offset + j] - mean_ptr[i]) / mean_v_ptr[i]; + } + } + }, 0, outer_loop, 1, 0, inner_loop, 1); + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + bool normalize_variance_; + bool across_channels_; + float eps_; +}; + +#ifdef MACE_ENABLE_OPENCL +template<> +class MVNormOp : public Operation { + public: + explicit MVNormOp(OpConstructContext *context) : Operation(context) { + auto normalize_variance = + Operation::GetOptionalArg("normalize_variance", true); + auto across_channels = + Operation::GetOptionalArg("across_channels", false); + auto eps = Operation::GetOptionalArg("epsilon", 1e-9); + + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { + kernel_ = make_unique( + normalize_variance, across_channels, eps); + } else { + MACE_NOT_IMPLEMENTED; + } + } + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + Tensor *output = this->Output(OUTPUT); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + return kernel_->Compute(context, input, output); + } + + private: + std::unique_ptr kernel_; + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; +#endif // MACE_ENABLE_OPENCL + +void RegisterMVNorm(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "MVNorm", MVNormOp, + DeviceType::CPU, float); + MACE_REGISTER_GPU_OP(op_registry, "MVNorm", MVNormOp); +} + +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/cl/bias_add.cl b/mace/ops/opencl/cl/bias_add.cl index 7ae00846a09305ada5ada9d057bae2f8662c6b1d..f7e175b7ae3ea50db46d77bb6f9b952436fc0ab7 100644 --- a/mace/ops/opencl/cl/bias_add.cl +++ b/mace/ops/opencl/cl/bias_add.cl @@ -2,25 +2,27 @@ // Supported data types: half/float __kernel void bias_add(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 + __private const int input_height, __read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t output) { const int ch_blk = get_global_id(0); - const int w = get_global_id(1); - const int hb = get_global_id(2); + const int width_idx = get_global_id(1); + const int hb_idx = get_global_id(2); #ifndef NON_UNIFORM_WORK_GROUP - if (ch_blk >= global_size_dim0 || w >= global_size_dim1 - || hb >= global_size_dim2) { + if (ch_blk >= global_size_dim0 || width_idx >= global_size_dim1 + || hb_idx >= global_size_dim2) { return; } #endif const int width = global_size_dim1; - const int pos = mad24(ch_blk, width, w); - DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); - DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(ch_blk, 0)); + const int pos = mad24(ch_blk, width, width_idx); + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb_idx)); + const int b_idx = select(0, hb_idx / input_height, input_height > 0); + DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(ch_blk, b_idx)); DATA_TYPE4 out = in + bias_value; - WRITE_IMAGET(output, (int2)(pos, hb), out); + WRITE_IMAGET(output, (int2)(pos, hb_idx), out); } diff --git a/mace/ops/opencl/cl/lpnorm.cl b/mace/ops/opencl/cl/lpnorm.cl new file mode 100644 index 0000000000000000000000000000000000000000..a2dbbc9de505bf89310a6e62cff0ee81354b920b --- /dev/null +++ b/mace/ops/opencl/cl/lpnorm.cl @@ -0,0 +1,81 @@ +#include + +DATA_TYPE4 compute_total(__read_only image2d_t input, const int hb_base, + const int chan_blks, const int width, const int height, + const int hb_idx, const int chan_blk_idx) { + DATA_TYPE4 total = 0.0f; +#if PARAM_AXIS == 1 + const int wc_blks = mul24(width, chan_blks); + + for (int h_idx = hb_base; h_idx < hb_base + height; ++h_idx) { + for (int pos = 0; pos < wc_blks; ++pos) { + DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(pos, h_idx)); +#if PARAM_P == 1 + total += fabs(in_data); +#else + total = mad(in_data, in_data, total); +#endif + } + } + DATA_TYPE total_all = total.x + total.y + total.z + total.w; + total = (DATA_TYPE4){total_all, total_all, total_all, total_all}; +#elif PARAM_AXIS == 2 + for (int h_idx = hb_base; h_idx < hb_base + height; ++h_idx) { + for (int w_idx = 0; w_idx < width; ++w_idx) { + int pos = mad24(chan_blk_idx, width, w_idx); + DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(pos, h_idx)); +#if PARAM_P == 1 + total = total + fabs(in_data); +#else + total = mad(in_data, in_data, total); +#endif + } + } +#elif PARAM_AXIS == 3 + for (int w_idx = 0; w_idx < width; ++x) { + int pos = mad24(chan_blk_idx, width, w_idx); + DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(pos, hb_idx)); +#if PARAM_P == 1 + total = total + fabs(in_data); +#else + total = mad(in_data, in_data, total); +#endif + } +#endif + + return total; +} + +__kernel void lpnorm(OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 + __read_only image2d_t input, + __private const int height, + __private const float eps, + __write_only image2d_t output) { + const int chan_blk_idx = get_global_id(0); + const int width_idx = get_global_id(1); + const int hb_idx = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 + || hb_idx >= global_size_dim2) { + return; + } +#endif + + const int chan_blks = global_size_dim0; + const int width = global_size_dim1; + const int hb = global_size_dim2; + const int hb_base = mul24(hb_idx / height, height); + DATA_TYPE4 total = compute_total(input, hb_base, chan_blks, width, height, + hb_idx, chan_blk_idx); + const int pos = mad24(chan_blk_idx, width, width_idx); + DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(pos, hb_idx)); + +#if PARAM_P == 1 + in_data = in_data / (total + eps); +#else + in_data = in_data / (sqrt(total) + eps); +#endif + WRITE_IMAGET(output, (int2)(pos, hb_idx), in_data); +} diff --git a/mace/ops/opencl/cl/mvnorm.cl b/mace/ops/opencl/cl/mvnorm.cl new file mode 100644 index 0000000000000000000000000000000000000000..d9e6ae5b8281149ab3ae4e9057c508b600153b48 --- /dev/null +++ b/mace/ops/opencl/cl/mvnorm.cl @@ -0,0 +1,129 @@ +#include + +DATA_TYPE4 compute_mean_image(image2d_t input, const int width_idx, + const int hb_idx, const int chan_blks, + const int height, const int width) { + DATA_TYPE4 total = 0.0f; + DATA_TYPE4 mean = 0.0f; + const int hb_base = mul24(hb_idx / height, height); + const int wc_blks = mul24(width, chan_blks); + +#ifdef ACROSS_CHANNELS + for (int h_idx = hb_base; h_idx < hb_base + height; ++h_idx) { + for (int pos = 0; pos < wc_blks; ++pos) { + DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(pos, h_idx)); + total += in_data; + } + } + DATA_TYPE total_value = total.x + total.y + total.z + total.w; + DATA_TYPE mean_value = total_value / (DATA_TYPE)(mul24(mul24(height, wc_blks), 4)); + mean = (DATA_TYPE4){mean_value, mean_value, mean_value, mean_value}; +#else + for (int h_idx = hb_base; h_idx < hb_base + height; ++h_idx) { + for (int w_idx = 0; w_idx < width; ++w_idx) { + int pos = mad24(w_idx, chan_blks, width_idx); + DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(pos, h_idx)); + total += in_data; + } + } + mean = total / mul24(height, width); +#endif + + return mean; +} + +__kernel void mvnorm_mean(OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 + __read_only image2d_t input, + __private const int height, + __write_only image2d_t output) { + const int chan_blk_idx = get_global_id(0); + const int width_idx = get_global_id(1); + const int hb_idx = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 + || hb_idx >= global_size_dim2) { + return; + } +#endif + + const int chan_blks = global_size_dim0; + const int width = global_size_dim1; + + DATA_TYPE4 mean = compute_mean_image(input, width_idx, + hb_idx, chan_blks, height, width); + const int pos = mad24(chan_blk_idx, width, width_idx); + DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(pos, hb_idx)); + in_data -= mean; + + WRITE_IMAGET(output, (int2)(pos, hb_idx), in_data); +} + +__kernel void mvnorm_vn_step1(OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 + __read_only image2d_t input, + __write_only image2d_t mean_image, // E(X) + __write_only image2d_t square_image, // (X - EX)^2 + __private const int height) { + const int chan_blk_idx = get_global_id(0); + const int width_idx = get_global_id(1); + const int hb_idx = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 + || hb_idx >= global_size_dim2) { + return; + } +#endif + + const int chan_blks = global_size_dim0; + const int width = global_size_dim1; + + DATA_TYPE4 mean = compute_mean_image(input, width_idx, + hb_idx, chan_blks, height, width); + const int pos = mad24(chan_blk_idx, width, width_idx); + DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(pos, hb_idx)); + in_data = in_data - mean; + DATA_TYPE4 pow_data = in_data * in_data; + if (hb_idx == 0 && width_idx == 0) { + WRITE_IMAGET(mean_image, (int2)(chan_blk_idx, 0), mean); + } + WRITE_IMAGET(square_image, (int2)(pos, hb_idx), pow_data); +} + + +__kernel void mvnorm_vn_step2(OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 + __read_only image2d_t input, + __read_only image2d_t mean_image, // E(X) + __read_only image2d_t square_image, // (X - EX)^2 + __private const int height, + __private const float eps, + __write_only image2d_t output) { + const int chan_blk_idx = get_global_id(0); + const int width_idx = get_global_id(1); + const int hb_idx = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 + || hb_idx >= global_size_dim2) { + return; + } +#endif + + const int chan_blks = global_size_dim0; + const int width = global_size_dim1; + + DATA_TYPE4 mean = READ_IMAGET(mean_image, SAMPLER, (int2)(chan_blk_idx, 0)); + const int pos = mad24(chan_blk_idx, width, width_idx); + DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(pos, hb_idx)); + in_data = in_data - mean; + + DATA_TYPE4 mean_v = compute_mean_image(square_image, width_idx, + hb_idx, chan_blks, height, width); + + DATA_TYPE4 norm_data = in_data / (sqrt(mean_v) + eps); + + WRITE_IMAGET(output, (int2)(pos, hb_idx), norm_data); +} diff --git a/mace/ops/opencl/image/bias_add.cc b/mace/ops/opencl/image/bias_add.cc index 1f62f592f41ede1df35532775f88bae761623447..cd17f26007426830d26a67c6901320b0738bf070 100644 --- a/mace/ops/opencl/image/bias_add.cc +++ b/mace/ops/opencl/image/bias_add.cc @@ -56,6 +56,7 @@ MaceStatus BiasAddKernel::Compute( uint32_t idx = 0; MACE_OUT_OF_RANGE_SET_ARGS(kernel_); MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, static_cast(bias->dim_size() > 1 ? height : 0)); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(bias->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image())); diff --git a/mace/ops/opencl/image/lpnorm.cc b/mace/ops/opencl/image/lpnorm.cc new file mode 100644 index 0000000000000000000000000000000000000000..00b91039b2651d80b49719b9516d94e679b55e0e --- /dev/null +++ b/mace/ops/opencl/image/lpnorm.cc @@ -0,0 +1,92 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/lpnorm.h" + +#include +#include +#include + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +LpNormKernel::LpNormKernel(const int p, const int axis) : p_(p), axis_(axis) { + MACE_CHECK(p_ == 1 || p_ == 2, "Current p is: ", p); +} + +MaceStatus LpNormKernel::Compute(OpContext *context, + const Tensor *input, Tensor *output) { + if (axis_ < 0) { + axis_ += input->dim_size(); + } + MACE_CHECK(axis_ == 1 || axis_ == 2 || axis_ == 3, + "Current axis is: ", axis_); + + const auto batch = input->dim(0); + const auto height = input->dim(1); + const auto width = input->dim(2); + const auto channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lpnorm"); + built_options.emplace("-Dlpnorm=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + std::stringstream param_p; + param_p << "-DPARAM_P=" << p_; + built_options.emplace(param_p.str()); + std::stringstream param_axis; + param_axis << "-DPARAM_AXIS=" << axis_; + built_options.emplace(param_axis.str()); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("lpnorm", kernel_name, + built_options, &kernel_)); + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_); + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(1e-6)); + kernel_.setArg(idx++, *(output->opencl_image())); + + std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("lpnorm_opencl_kernel", batch, height, width, channels, p_, axis_); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/lpnorm.h b/mace/ops/opencl/image/lpnorm.h new file mode 100644 index 0000000000000000000000000000000000000000..ae517c28f018b595e78e56b6a59ea939e23276df --- /dev/null +++ b/mace/ops/opencl/image/lpnorm.h @@ -0,0 +1,49 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef MACE_OPS_OPENCL_IMAGE_LPNORM_H_ +#define MACE_OPS_OPENCL_IMAGE_LPNORM_H_ + +#include "mace/core/op_context.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/core/tensor.h" +#include "mace/ops/opencl/helper.h" +#include "mace/ops/opencl/lpnorm.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +class LpNormKernel : public OpenCLLpNormKernel { + public: + explicit LpNormKernel(const int p, const int axis); + ~LpNormKernel() = default; + + MaceStatus Compute( + OpContext *context, const Tensor *input, Tensor *output) override; + + private: + int p_; + int axis_; + + cl::Kernel kernel_; + uint32_t kwg_size_; +}; + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_OPENCL_IMAGE_LPNORM_H_ diff --git a/mace/ops/opencl/image/mvnorm.cc b/mace/ops/opencl/image/mvnorm.cc new file mode 100644 index 0000000000000000000000000000000000000000..b0356bf6fc0a3c47d03a49fbcd2d10f365a38c2c --- /dev/null +++ b/mace/ops/opencl/image/mvnorm.cc @@ -0,0 +1,237 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/mvnorm.h" + +#include +#include +#include +#include + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +namespace { + +MaceStatus BuildMVNKernel(OpenCLRuntime *runtime, cl::Kernel *kernel, + const char *kernel_name, + std::set *built_options, + bool across_channel) { + std::stringstream micro_name; + micro_name << "-Dmvnorm=" << kernel_name; + built_options->emplace(micro_name.str()); + built_options->emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options->emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + if (across_channel) { + built_options->emplace("-DACROSS_CHANNELS"); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("mvnorm", kernel_name, + *built_options, kernel)); + return MaceStatus::MACE_SUCCESS; +} + +std::unique_ptr CreateImage( + OpContext *context, const DataType dt, + const std::vector &buffer_shape) { + std::unique_ptr image = + make_unique(context->device()->allocator()); + std::vector shape; + OpenCLUtil::CalImage2DShape( + buffer_shape, OpenCLBufferType::IN_OUT_CHANNEL, &shape); + MACE_CHECK(image->Allocate(shape, dt) == MaceStatus::MACE_SUCCESS); + VLOG(1) << "MVNormKernel::CreateImage allocate image_:" << MakeString(shape); + + return image; +} + +} // namespace + +MVNormKernel::MVNormKernel(bool normalize_variance, + bool across_channels, float eps) + : normalize_variance_(normalize_variance), + across_channels_(across_channels), + eps_(eps) {} + +void MVNormKernel::CheckImage(OpContext *context, const DataType dt, + const std::vector &square_shape, + const std::vector &mean_shape) { + if (square_image_ == nullptr) { + square_image_ = CreateImage(context, dt, square_shape); + } + + if (mean_image_ == nullptr) { + mean_image_ = CreateImage(context, dt, mean_shape); + } +} + +MaceStatus MVNormKernel::Compute(OpContext + *context, + const Tensor *input, Tensor + *output) { + const auto batch = input->dim(0); + const auto height = input->dim(1); + const auto width = input->dim(2); + const auto channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + + if (normalize_variance_) { + const std::vector &square_shape = input->buffer_shape(); + const std::vector mean_shape = {1, 1, 1, channels}; + CheckImage(context, input->dtype(), square_shape, mean_shape); + // compute the (X - EX)^2 + MACE_RETURN_IF_ERROR(ExecuteVarianceNormStep1Kernel( + context, runtime, gws, input)); + // compute the compute (X - EX) / (E((X - EX)^2)^0.5 + eps_) + MACE_RETURN_IF_ERROR(ExecuteVarianceNormStep2Kernel( + context, runtime, gws, input, output)); + } else { + MACE_RETURN_IF_ERROR(ExecuteMeanNormKernel( + context, runtime, gws, input, output)); + } + + return + MaceStatus::MACE_SUCCESS; +} + +MaceStatus MVNormKernel::ExecuteMeanNormKernel(OpContext *context, + OpenCLRuntime *runtime, + const uint32_t (&gws)[3], + const Tensor *input, + Tensor *output) { + const auto height = input->dim(1); + MACE_OUT_OF_RANGE_DEFINITION; + if (kernel_step1_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + MACE_RETURN_IF_ERROR(BuildMVNKernel(runtime, &kernel_step1_, "mvnorm_mean", + &built_options, across_channels_)); + kwg_size_step1_ = static_cast( + runtime->GetKernelMaxWorkGroupSize(kernel_step1_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_step1_); + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_step1_); + MACE_SET_3D_GWS_ARGS(kernel_step1_, gws); + kernel_step1_.setArg(idx++, *(input->opencl_image())); + kernel_step1_.setArg(idx++, static_cast(height)); + kernel_step1_.setArg(idx++, *(output->opencl_image())); + + std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_step1_); + std::string + tuning_key = Concat("mvnorm_mean_opencl_kernel", gws[0], gws[1], gws[2], + normalize_variance_, across_channels_); + + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_step1_, tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +// The first step of compute Variance Norm, compute the (X - EX)^2 +// store them into the square_image_ +MaceStatus MVNormKernel::ExecuteVarianceNormStep1Kernel( + OpContext *context, OpenCLRuntime *runtime, + const uint32_t (&gws)[3], const Tensor *input) { + const auto height = input->dim(1); + MACE_OUT_OF_RANGE_DEFINITION; + if (kernel_step1_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + MACE_RETURN_IF_ERROR(BuildMVNKernel(runtime, &kernel_step1_, + "mvnorm_vn_step1", + &built_options, across_channels_)); + kwg_size_step1_ = static_cast( + runtime->GetKernelMaxWorkGroupSize(kernel_step1_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_step1_); + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_step1_); + MACE_SET_3D_GWS_ARGS(kernel_step1_, gws); + kernel_step1_.setArg(idx++, *(input->opencl_image())); + cl::Image *mean_image = static_cast(mean_image_->buffer()); + kernel_step1_.setArg(idx++, *mean_image); + cl::Image *square_image = static_cast(square_image_->buffer()); + kernel_step1_.setArg(idx++, *square_image); + kernel_step1_.setArg(idx++, static_cast(height)); + + std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_step1_); + std::string + tuning_key = Concat("mvnorm_v_step1_opencl_kernel", gws[0], gws[1], + gws[2], normalize_variance_, + across_channels_); + + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_step1_, tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +// The second step of compute Variance Norm, read the (X - EX)^2 from +// square_image_ and compute (X - EX) / (E((X - EX)^2)^0.5 + eps_) +MaceStatus MVNormKernel::ExecuteVarianceNormStep2Kernel( + OpContext *context, OpenCLRuntime *runtime, const uint32_t (&gws)[3], + const Tensor *input, Tensor *output) { + const auto height = input->dim(1); + MACE_OUT_OF_RANGE_DEFINITION; + if (kernel_step2_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + MACE_RETURN_IF_ERROR(BuildMVNKernel(runtime, &kernel_step2_, + "mvnorm_vn_step2", + &built_options, across_channels_)); + kwg_size_step2_ = static_cast( + runtime->GetKernelMaxWorkGroupSize(kernel_step2_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_step2_); + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_step2_); + MACE_SET_3D_GWS_ARGS(kernel_step2_, gws); + kernel_step2_.setArg(idx++, *(input->opencl_image())); + cl::Image *mean_image = static_cast(mean_image_->buffer()); + kernel_step2_.setArg(idx++, *mean_image); + cl::Image *square_image = static_cast(square_image_->buffer()); + kernel_step2_.setArg(idx++, *square_image); + kernel_step2_.setArg(idx++, static_cast(height)); + kernel_step2_.setArg(idx++, static_cast(eps_)); + kernel_step2_.setArg(idx++, *(output->opencl_image())); + + std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_step2_); + std::string + tuning_key = Concat("mvnorm_v_step2_opencl_kernel", gws[0], gws[1], + gws[2], normalize_variance_, + across_channels_); + + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_step2_, tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/mvnorm.h b/mace/ops/opencl/image/mvnorm.h new file mode 100644 index 0000000000000000000000000000000000000000..9ff6d47f4547227dc54c22e4709b1ca26ff4d3b9 --- /dev/null +++ b/mace/ops/opencl/image/mvnorm.h @@ -0,0 +1,80 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef MACE_OPS_OPENCL_IMAGE_MVNORM_H_ +#define MACE_OPS_OPENCL_IMAGE_MVNORM_H_ + +#include +#include + +#include "mace/core/op_context.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/core/tensor.h" +#include "mace/ops/opencl/helper.h" +#include "mace/ops/opencl/mvnorm.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +class MVNormKernel : public OpenCLMVNormKernel { + public: + explicit MVNormKernel(bool normalize_variance_, + bool across_channels, float eps); + ~MVNormKernel() = default; + + MaceStatus Compute( + OpContext *context, const Tensor *input, Tensor *output) override; + + private: + void CheckImage(OpContext *context, const DataType dt, + const std::vector &square_shape, + const std::vector &mean_shape); + MaceStatus ExecuteMeanNormKernel(OpContext *context, + OpenCLRuntime *runtime, + const uint32_t (&gws)[3], + const Tensor *input, + Tensor *output); + MaceStatus ExecuteVarianceNormStep1Kernel(OpContext *context, + OpenCLRuntime *runtime, + const uint32_t (&gws)[3], + const Tensor *input); + MaceStatus ExecuteVarianceNormStep2Kernel(OpContext *context, + OpenCLRuntime *runtime, + const uint32_t (&gws)[3], + const Tensor *input, + Tensor *output); + + private: + bool normalize_variance_; + bool across_channels_; + float eps_; + + cl::Kernel kernel_step1_; + uint32_t kwg_size_step1_; + cl::Kernel kernel_step2_; + uint32_t kwg_size_step2_; + + // the cache of (X - EX)^2 + std::unique_ptr square_image_; + // the cache of EX + std::unique_ptr mean_image_; +}; + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_OPENCL_IMAGE_MVNORM_H_ diff --git a/mace/ops/opencl/lpnorm.h b/mace/ops/opencl/lpnorm.h new file mode 100644 index 0000000000000000000000000000000000000000..d216e4cb6f74707a24f03b6b886eb68772e10ffa --- /dev/null +++ b/mace/ops/opencl/lpnorm.h @@ -0,0 +1,39 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_OPENCL_LPNORM_H_ +#define MACE_OPS_OPENCL_LPNORM_H_ + +#include "mace/public/mace.h" +#include "mace/utils/math.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace ops { +class OpenCLLpNormKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLLpNormKernel); +}; + +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_OPENCL_LPNORM_H_ diff --git a/mace/ops/opencl/mvnorm.h b/mace/ops/opencl/mvnorm.h new file mode 100644 index 0000000000000000000000000000000000000000..433ef0a2b1d95ffb8a9285f5009c5eaa000ca157 --- /dev/null +++ b/mace/ops/opencl/mvnorm.h @@ -0,0 +1,39 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_OPENCL_MVNORM_H_ +#define MACE_OPS_OPENCL_MVNORM_H_ + +#include "mace/public/mace.h" +#include "mace/utils/math.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace ops { +class OpenCLMVNormKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLMVNormKernel); +}; + +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_OPENCL_MVNORM_H_ diff --git a/mace/ops/ref/bias_add.cc b/mace/ops/ref/bias_add.cc index c3be25b08d070ac2791fe971325cce57d96de831..efc56f74b412814da9643eccd9e4ce459299c622 100644 --- a/mace/ops/ref/bias_add.cc +++ b/mace/ops/ref/bias_add.cc @@ -56,12 +56,13 @@ void BiasAdd::AddBias(const OpContext *context, const index_t width = output->dim(3); const index_t image_size = height * width; + auto bias_b = bias->dim_size() == 1 ? 0 : bias->shape()[1]; for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { const index_t offset = (b * channels + c) * image_size; auto input_ptr = input_data + offset; auto output_ptr = output_data + offset; - const float bias = bias_data[c]; + const float bias = bias_data[bias_b * channels + c]; for (index_t i = 0; i < image_size; ++i) { (*output_ptr++) = (*input_ptr++) + bias; diff --git a/mace/ops/registry/ops_registry.cc b/mace/ops/registry/ops_registry.cc index b7b7f5f64d27df9297c06257822aded93be5e6c3..e2dcc276b6a4c38497ec00d77b7eb6e851cc40b4 100644 --- a/mace/ops/registry/ops_registry.cc +++ b/mace/ops/registry/ops_registry.cc @@ -46,8 +46,10 @@ extern void RegisterDelay(OpRegistryBase *op_registry); extern void RegisterInferConv2dShape(OpRegistryBase *op_registry); extern void RegisterKaldiBatchNorm(OpRegistryBase *op_registry); extern void RegisterLocalResponseNorm(OpRegistryBase *op_registry); +extern void RegisterLpNorm(OpRegistryBase *op_registry); extern void RegisterLSTMNonlinear(OpRegistryBase *op_registry); extern void RegisterMatMul(OpRegistryBase *op_registry); +extern void RegisterMVNorm(OpRegistryBase *op_registry); extern void RegisterOneHot(OpRegistryBase *op_registry); extern void RegisterPad(OpRegistryBase *op_registry); extern void RegisterPadContext(OpRegistryBase *op_registry); @@ -121,8 +123,10 @@ OpRegistry::OpRegistry() : OpRegistryBase() { ops::RegisterInferConv2dShape(this); ops::RegisterKaldiBatchNorm(this); ops::RegisterLocalResponseNorm(this); + ops::RegisterLpNorm(this); ops::RegisterLSTMNonlinear(this); ops::RegisterMatMul(this); + ops::RegisterMVNorm(this); ops::RegisterOneHot(this); ops::RegisterPad(this); ops::RegisterPadContext(this); diff --git a/mace/ops/reshape.cc b/mace/ops/reshape.cc index 32597fc9694c5747fcdc0f83841ce8568d2a0923..8e469b268dc49f3b0eeb324986c734f4246752b6 100644 --- a/mace/ops/reshape.cc +++ b/mace/ops/reshape.cc @@ -159,8 +159,10 @@ void RegisterReshape(OpRegistryBase *op_registry) { auto tensor_shape_info = context->tensor_shape_info(); const std::string &input_0 = op->input(0); - if (4 == op->output_shape(0).dims_size() && - 4 == tensor_shape_info->at(input_0).size()) { + const auto out_dims_size = + op->output_shape(0).dims_size(); + if (4 == tensor_shape_info->at(input_0).size() + && (out_dims_size == 4 || out_dims_size == 2)) { return {DeviceType::CPU, DeviceType::GPU}; } return {DeviceType::CPU}; diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index 3784d2ddadce96f3e9f002477b107c0f9186e3bc..cfbde681eaac06aff6a5d84a8d5cc7afc45021b0 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -82,12 +82,13 @@ class SoftmaxOp : public Operation { index_t batch_stride = class_size; index_t batch_size = batch_stride * input->dim(0); - Buffer cache_buffer(context->device()->allocator()); - MACE_RETURN_IF_ERROR(cache_buffer.Allocate(hw_size * sizeof(float))); + auto cache_buffer = context->device()->scratch_buffer(); + cache_buffer->Rewind(); + MACE_RETURN_IF_ERROR(cache_buffer->GrowSize(hw_size * sizeof(float))); utils::ThreadPool &thread_pool = context->device()->cpu_runtime()->thread_pool(); float std_lowest = std::numeric_limits::lowest(); - float *cache_ptr = cache_buffer.mutable_data(); + float *cache_ptr = cache_buffer->mutable_data(); for (index_t b_offset = 0; b_offset < batch_size; b_offset += batch_stride) { diff --git a/mace/python/tools/encrypt_opencl_codegen.py b/mace/python/tools/encrypt_opencl_codegen.py index 24c8564498a9951eb74fc1f065ddb49d8ca5f999..56cbd4ceefb9a8f507d8c5fd4ca5cb6af85321e3 100644 --- a/mace/python/tools/encrypt_opencl_codegen.py +++ b/mace/python/tools/encrypt_opencl_codegen.py @@ -84,7 +84,7 @@ def encrypt_opencl_codegen(cl_kernel_dir, output_path): for file_name in os.listdir(cl_kernel_dir): file_path = os.path.join(cl_kernel_dir, file_name) module_key = get_module_key(file_name) - if len(module_key) > 0: + if module_key is not None and len(module_key) > 0: with open(file_path, "r") as f: code_str = "" headers = [] diff --git a/repository/opencl-kernel/opencl_kernel_configure.bzl b/repository/opencl-kernel/opencl_kernel_configure.bzl index a39a1703640ea8e0247364833d8ba11f2b7a9daf..db8e9730d2d28ef74505cdb809f7d2fc5dec2c3d 100644 --- a/repository/opencl-kernel/opencl_kernel_configure.bzl +++ b/repository/opencl-kernel/opencl_kernel_configure.bzl @@ -63,6 +63,8 @@ def _opencl_encrypt_kernel_impl(repository_ctx): unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/space_to_depth.cl")) unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/sqrdiff_mean.cl")) unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/winograd_transform.cl")) + unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/lpnorm.cl")) + unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/mvnorm.cl")) python_bin_path = repository_ctx.which("python") diff --git a/test/ccunit/mace/ops/bias_add_test.cc b/test/ccunit/mace/ops/bias_add_test.cc index 0126abb9d20645c51925e218bdc881fc3801fd5b..07f6172608ada11ea9ef3d817498b4a0abb0c690 100644 --- a/test/ccunit/mace/ops/bias_add_test.cc +++ b/test/ccunit/mace/ops/bias_add_test.cc @@ -62,11 +62,60 @@ void BiasAddSimple() { ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } + +template +void BiasAddSimple2D() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray("Input", {2, 3, 1, 2}, + {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); + net.AddInputFromArray("Bias", {2, 2}, + {0.1f, 0.2f, 0.3f, 0.4f}, true); + if (D == DeviceType::CPU) { + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); + OpDefBuilder("BiasAdd", "BiasAddTest") + .Input("InputNCHW") + .Input("Bias") + .AddIntArg("has_data_format", 1) + .Output("OutputNCHW") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); + } else if (D == DeviceType::GPU) { + OpDefBuilder("BiasAdd", "BiasAddTest") + .Input("Input") + .Input("Bias") + .Output("Output") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } else { + MACE_NOT_IMPLEMENTED; + } + + // Check + auto expected = net.CreateTensor( + {2, 3, 1, 2}, + {5.1, 5.2, 7.1, 7.2, 9.1, 9.2, 11.3, 11.4, 13.3, 13.4, 15.3, 15.4}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} + } // namespace TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple(); } -TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) { BiasAddSimple(); } +TEST_F(BiasAddOpTest, BiasAddSimple2DCPU) { + BiasAddSimple2D(); +} + +TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) { + BiasAddSimple(); +} TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { // generate random input diff --git a/test/ccunit/mace/ops/lpnorm_test.cc b/test/ccunit/mace/ops/lpnorm_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..c397ab94e6eb7e43b3912b25e2ded667d421b79b --- /dev/null +++ b/test/ccunit/mace/ops/lpnorm_test.cc @@ -0,0 +1,126 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class LpNormOpTest : public OpsTestBase {}; + +namespace { +template +void TestLpNorm(const std::vector &input_shape, + const std::vector &input, + const int p, + const int axis, + const std::vector &output) { + OpsTestNet net; + net.AddInputFromArray(MakeString("Input"), input_shape, input); + + if (D == DeviceType::GPU) { + net.TransformDataFormat( + "Input", DataFormat::NCHW, "InputNHWC", DataFormat::NHWC); + } + + OpDefBuilder("LpNorm", "LpNormTest") + .Input(D == DeviceType::CPU ? "Input" : "InputNHWC") + .AddIntArg("p", p) + .AddIntArg("axis", axis) + .Output(D == DeviceType::CPU ? "Output" : "OutputNHWC") + .Finalize(net.NewOperatorDef()); + + net.RunOp(D); + + if (D == DeviceType::GPU) { + net.TransformDataFormat( + "OutputNHWC", DataFormat::NHWC, "Output", DataFormat::NCHW); + } + + net.AddInputFromArray("ExpectedOutput", input_shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} +} // namespace + +TEST_F(LpNormOpTest, SimpleTestFabs) { + TestLpNorm( + {1, 8, 1, 2}, // NCHW + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + 1, 1, + {0.00735294, 0.0147059, 0.0220588, 0.0294118, + 0.0367647, 0.0441176, 0.0514706, 0.0588235, + 0.0661765, 0.0735294, 0.0808824, 0.0882353, + 0.0955882, 0.102941, 0.110294, 0.117647}); +} + +TEST_F(LpNormOpTest, SimpleTestSquare) { + TestLpNorm( + {1, 8, 1, 2}, // NCHW + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + 2, 1, + {0.0258544, 0.0517088, 0.0775632, 0.103418, + 0.129272, 0.155126, 0.180981, 0.206835, + 0.232689, 0.258544, 0.284398, 0.310253, + 0.336107, 0.361961, 0.387816, 0.41367}); +} + +TEST_F(LpNormOpTest, SimpleTestPSquare2) { +TestLpNorm( + {1, 8, 1, 2}, // NCHW + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + 2, 2, + {0.447214, 0.894427, 0.600000, 0.800000, + 0.640184, 0.768221, 0.658505, 0.752577, + 0.668965, 0.743294, 0.675725, 0.737154, + 0.680451, 0.732793, 0.683941, 0.729537}); +} + +TEST_F(LpNormOpTest, SimpleTestFabsOpenCL) { + TestLpNorm( + {1, 8, 1, 2}, // NCHW + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + 1, 1, + {0.00735294, 0.0147059, 0.0220588, 0.0294118, + 0.0367647, 0.0441176, 0.0514706, 0.0588235, + 0.0661765, 0.0735294, 0.0808824, 0.0882353, + 0.0955882, 0.102941, 0.110294, 0.117647}); +} + +TEST_F(LpNormOpTest, SimpleTestSquareOpenCL) { + TestLpNorm( + {1, 8, 1, 2}, // NCHW + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + 2, 1, + {0.0258544, 0.0517088, 0.0775632, 0.103418, + 0.129272, 0.155126, 0.180981, 0.206835, + 0.232689, 0.258544, 0.284398, 0.310253, + 0.336107, 0.361961, 0.387816, 0.41367}); +} + +TEST_F(LpNormOpTest, SimpleTestSquareOpenCL2) { + TestLpNorm( + {1, 8, 1, 2}, // NCHW + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + 2, 2, + {0.447214, 0.894427, 0.600000, 0.800000, + 0.640184, 0.768221, 0.658505, 0.752577, + 0.668965, 0.743294, 0.675725, 0.737154, + 0.680451, 0.732793, 0.683941, 0.729537}); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/test/ccunit/mace/ops/mvnorm_test.cc b/test/ccunit/mace/ops/mvnorm_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..14b73e2e29fef63aa12428f3cdea6c53736d0995 --- /dev/null +++ b/test/ccunit/mace/ops/mvnorm_test.cc @@ -0,0 +1,166 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class MVNormOpTest : public OpsTestBase {}; + +namespace { +template +void TestMVNorm(const std::vector &input_shape, + const std::vector &input, + bool normalize_variance, + bool across_channels, + const std::vector &output) { + OpsTestNet net; + net.AddInputFromArray(MakeString("Input"), input_shape, input); + + if (D == DeviceType::CPU) { + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); + } + OpDefBuilder("MVNorm", "MVNormTest") + .Input(D == DeviceType::CPU ? "InputNCHW" : "Input") + .AddIntArg("normalize_variance", normalize_variance) + .AddIntArg("across_channels", across_channels) + .Output(D == DeviceType::CPU ? "OutputNCHW" : "Output") + .Finalize(net.NewOperatorDef()); + + net.RunOp(D); + + if (D == DeviceType::CPU) { + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); + } + + net.AddInputFromArray("ExpectedOutput", input_shape, output); + ExpectTensorNear(*net.GetOutput("ExpectedOutput"), + *net.GetOutput("Output")); +} +} // namespace + +TEST_F(MVNormOpTest, SimpleTestMean) { + TestMVNorm( + {1, 1, 5, 12}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, + false, true, + {-9.5, -8.5, -7.5, -6.5, -5.5, + -4.5, -3.5, -2.5, -1.5, -0.5, + 0.5, 1.5, -7.5, -6.5, -5.5, + -4.5, -3.5, -2.5, -1.5, -0.5, + 0.5, 1.5, 2.5, 3.5, -5.5, + -4.5, -3.5, -2.5, -1.5, -0.5, + 0.5, 1.5, 2.5, 3.5, 4.5, + 5.5, -3.5, -2.5, -1.5, -0.5, + 0.5, 1.5, 2.5, 3.5, 4.5, + 5.5, 6.5, 7.5, -1.5, -0.5, + 0.5, 1.5, 2.5, 3.5, 4.5, + 5.5, 6.5, 7.5, 8.5, 9.5}); +} + +TEST_F(MVNormOpTest, SimpleTestVariance) { + TestMVNorm( + {1, 1, 5, 12}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, + true, true, + {-2.1287, -1.90463, -1.68056, -1.45648, -1.23241, + -1.00833, -0.784259, -0.560185, -0.336111, -0.112037, + 0.112037, 0.336111, -1.68056, -1.45648, -1.23241, + -1.00833, -0.784259, -0.560185, -0.336111, -0.112037, + 0.112037, 0.336111, 0.560185, 0.784259, -1.23241, + -1.00833, -0.784259, -0.560185, -0.336111, -0.112037, + 0.112037, 0.336111, 0.560185, 0.784259, 1.00833, + 1.23241, -0.784259, -0.560185, -0.336111, -0.112037, + 0.112037, 0.336111, 0.560185, 0.784259, 1.00833, + 1.23241, 1.45648, 1.68056, -0.336111, -0.112037, + 0.112037, 0.336111, 0.560185, 0.784259, 1.00833, + 1.23241, 1.45648, 1.68056, 1.90463, 2.1287}); +} + +TEST_F(MVNormOpTest, SimpleTestVariance2) { + TestMVNorm( + {1, 1, 1, 16}, + {-0.63984936, -0.5024374 , -2.1083345, 2.6399455, + -0.63989604, -0.63280314, 2.905462, 1.0263479, + -0.502281 , -0.58158046, -0.5358325 , -0.50097936, + 1.2043145 , -0.53840625, -0.50652033, -0.48295242}, + true, true, + {-0.485057, -0.376699, -1.643057, 2.101283, + -0.485094, -0.479501, 2.310661, 0.828852, + -0.376575, -0.439108, -0.403033, -0.375549, + 0.969191, -0.405062, -0.379918, -0.361333}); +} + +TEST_F(MVNormOpTest, SimpleTestMeanOpenCL) { + TestMVNorm( + {1, 1, 5, 12}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, + false, true, + {-9.5, -8.5, -7.5, -6.5, -5.5, + -4.5, -3.5, -2.5, -1.5, -0.5, + 0.5, 1.5, -7.5, -6.5, -5.5, + -4.5, -3.5, -2.5, -1.5, -0.5, + 0.5, 1.5, 2.5, 3.5, -5.5, + -4.5, -3.5, -2.5, -1.5, -0.5, + 0.5, 1.5, 2.5, 3.5, 4.5, + 5.5, -3.5, -2.5, -1.5, -0.5, + 0.5, 1.5, 2.5, 3.5, 4.5, + 5.5, 6.5, 7.5, -1.5, -0.5, + 0.5, 1.5, 2.5, 3.5, 4.5, + 5.5, 6.5, 7.5, 8.5, 9.5}); +} + + +TEST_F(MVNormOpTest, SimpleTestVarianceOpenCL) { + TestMVNorm( + {1, 1, 5, 12}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, + true, true, + {-2.1287, -1.90463, -1.68056, -1.45648, -1.23241, + -1.00833, -0.784259, -0.560185, -0.336111, -0.112037, + 0.112037, 0.336111, -1.68056, -1.45648, -1.23241, + -1.00833, -0.784259, -0.560185, -0.336111, -0.112037, + 0.112037, 0.336111, 0.560185, 0.784259, -1.23241, + -1.00833, -0.784259, -0.560185, -0.336111, -0.112037, + 0.112037, 0.336111, 0.560185, 0.784259, 1.00833, + 1.23241, -0.784259, -0.560185, -0.336111, -0.112037, + 0.112037, 0.336111, 0.560185, 0.784259, 1.00833, + 1.23241, 1.45648, 1.68056, -0.336111, -0.112037, + 0.112037, 0.336111, 0.560185, 0.784259, 1.00833, + 1.23241, 1.45648, 1.68056, 1.90463, 2.1287}); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/test/ccutils/mace/ops/ops_test_util.h b/test/ccutils/mace/ops/ops_test_util.h index 0869cea1bbd412b05d7fd95075dce1bbdd2679cd..a9f8a9842890c4d9737040c89bd8d68c4fb5d7a1 100644 --- a/test/ccutils/mace/ops/ops_test_util.h +++ b/test/ccutils/mace/ops/ops_test_util.h @@ -124,7 +124,8 @@ class OpsTestNet { input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); - MACE_CHECK(static_cast(input->size()) == data.size()); + MACE_CHECK(static_cast(input->size()) == data.size(), + input->size(), " VS ", data.size()); memcpy(input_data, data.data(), data.size() * sizeof(T)); input->SetScale(scale); input->SetZeroPoint(zero_point); diff --git a/third_party/caffe/caffe.proto b/third_party/caffe/caffe.proto index c972c9f66bd27c8145b919da2778d40668cf50ff..8be551bdb9031492e2ef51615cfc2aec33f5cd0d 100644 --- a/third_party/caffe/caffe.proto +++ b/third_party/caffe/caffe.proto @@ -544,6 +544,7 @@ message LayerParameter { optional VideoDataParameter video_data_param = 207; optional WindowDataParameter window_data_param = 129; optional ShuffleChannelParameter shuffle_channel_param = 164; + optional L2NormalizationParameter l2normalization_param = 208; } // Message that stores parameters used to apply transformation @@ -1934,3 +1935,7 @@ message PReLUParameter { message ShuffleChannelParameter { optional uint32 group = 1[default = 1]; // The number of group } + +message L2NormalizationParameter { + optional int32 axis = 1 [default = 1]; +} diff --git a/tools/device.py b/tools/device.py index 574ff186eeef58a312d165eb155f0153885c08ac..2952177f69536a9a2b3c5b734285b5d783dc4b91 100644 --- a/tools/device.py +++ b/tools/device.py @@ -748,7 +748,6 @@ class DeviceWrapper: ) if YAMLKeyword.dockerfile_path \ in model_config \ else ("third_party/caffe", "lastest") - sh_commands.validate_model( abi=target_abi, device=self, diff --git a/tools/python/transform/base_converter.py b/tools/python/transform/base_converter.py index 0d6a89debeacbe7b7b6986145480d98148ebcd35..c8aca0377d020ed6329639442c79fb3df120a74b 100644 --- a/tools/python/transform/base_converter.py +++ b/tools/python/transform/base_converter.py @@ -152,6 +152,8 @@ MaceSupportedOps = [ 'Transpose', 'Cumsum', 'Tile', + 'LpNorm', + 'MVNorm', ] MaceOp = Enum('MaceOp', [(op, op) for op in MaceSupportedOps], type=str) @@ -169,7 +171,9 @@ MaceFixedDataFormatOps = [MaceOp.BatchNorm, MaceOp.ResizeBilinear, MaceOp.ResizeNearestNeighbor, MaceOp.SpaceToBatchND, - MaceOp.SpaceToDepth] + MaceOp.SpaceToDepth, + MaceOp.LpNorm, + MaceOp.MVNorm] MaceTransposableDataFormatOps = [MaceOp.Activation, MaceOp.AddN, @@ -180,6 +184,7 @@ MaceTransposableDataFormatOps = [MaceOp.Activation, MaceOp.Eltwise, MaceOp.Pad, MaceOp.Reduce, + MaceOp.Reshape, MaceOp.Softmax, MaceOp.Split, MaceOp.Squeeze, @@ -264,6 +269,9 @@ class MaceKeyword(object): mace_reverse_str = 'reverse' mace_const_data_num_arg_str = 'const_data_num' mace_coeff_str = 'coeff' + mace_p_str = 'p' + mace_nor_var_str = 'normalize_variance' + mace_across_ch_str = 'across_channels' class TransformerRule(Enum): diff --git a/tools/python/transform/caffe_converter.py b/tools/python/transform/caffe_converter.py index fd75501b737d298186453919733e6852d2feb4f6..3eb6e42936efd8fef9f6fb1333046b10fd7dfebd 100644 --- a/tools/python/transform/caffe_converter.py +++ b/tools/python/transform/caffe_converter.py @@ -191,6 +191,10 @@ class CaffeConverter(base_converter.ConverterInterface): 'Flatten': self.convert_flatten, 'PriorBox': self.convert_prior_box, 'Reshape': self.convert_reshape, + 'L2Normalization': self.convert_lpnorm, + 'L1Normalization': self.convert_lpnorm, + 'MVN': self.convert_MVN, + 'Bias': self.convert_Bias, } self._option = option self._mace_net_def = mace_pb2.NetDef() @@ -815,3 +819,57 @@ class CaffeConverter(base_converter.ConverterInterface): num_axes_arg.i = -1 if param.HasField('num_axes'): num_axes_arg.i = param.num_axes + + def convert_lpnorm(self, caffe_op): + op = self.convert_general_op(caffe_op) + param = caffe_op.layer.l2normalization_param + op.type = MaceOp.LpNorm.name + + axis_arg = op.arg.add() + axis_arg.name = MaceKeyword.mace_axis_str + axis_arg.i = -1 + if param.HasField('axis'): + axis_arg.i = param.axis + + p_arg = op.arg.add() + p_arg.name = MaceKeyword.mace_p_str + if caffe_op.type == 'L1Normalization': + p_arg.i = 1 + elif caffe_op.type == 'L2Normalization': + p_arg.i = 2 + else: + mace_check(False, "Can not support %s" % caffe_op.type) + + def convert_MVN(self, caffe_op): + op = self.convert_general_op(caffe_op) + param = caffe_op.layer.mvn_param + op.type = MaceOp.MVNorm.name + + if param.HasField('normalize_variance'): + nv_arg = op.arg.add() + nv_arg.name = MaceKeyword.mace_nor_var_str + nv_arg.i = param.normalize_variance + + if param.HasField('across_channels'): + across_ch_arg = op.arg.add() + across_ch_arg.name = MaceKeyword.mace_across_ch_str + across_ch_arg.i = param.across_channels + + if param.HasField('eps'): + eps_arg = op.arg.add() + eps_arg.name = MaceKeyword.mace_epsilon_str + eps_arg.f = param.eps + + def convert_Bias(self, caffe_op): + op = self.convert_general_op(caffe_op) + op.type = MaceOp.BiasAdd.name + param = caffe_op.layer.bias_param + mace_check(not param.axis or param.axis == 0 or param.axis == 1, + "BiasAdd only support axis with 0 or 1.") + axis_arg = op.arg.add() + axis_arg.name = MaceKeyword.mace_axis_str + axis_arg.i = 1 + if param.axis is not None: + mace_check(param.axis == 0 or param.axis == 1, + "BiasAdd only support axis with 0 or 1.") + axis_arg.i = param.axis diff --git a/tools/python/transform/onnx_converter.py b/tools/python/transform/onnx_converter.py index c1a7b0d680beb6836e70f5d20fe0cd79a089dbed..aa780d0b0dc082ea27bebccf6a0c33d9577b9cdb 100644 --- a/tools/python/transform/onnx_converter.py +++ b/tools/python/transform/onnx_converter.py @@ -113,7 +113,7 @@ OnnxSupportedOps = [ # 'Log', 'LogSoftmax', # 'Loop', - # 'LpNormalization', + 'LpNormalization', # 'LpPool', 'MatMul', 'Max', @@ -353,6 +353,7 @@ class OnnxConverter(base_converter.ConverterInterface): OnnxOpType.ImageScaler.name: self.convert_imagescaler, OnnxOpType.LeakyRelu.name: self.convert_activation, OnnxOpType.LogSoftmax.name: self.convert_softmax, + OnnxOpType.LpNormalization: self.convert_lpnormalization, OnnxOpType.LstmNonlinear.name: self.convert_lstm_nonlinear, OnnxOpType.DynamicLSTM.name: self.convert_dynamic_lstm, OnnxOpType.Max.name: self.convert_eltwise, @@ -1435,6 +1436,18 @@ class OnnxConverter(base_converter.ConverterInterface): use_log_arg.name = 'use_log' use_log_arg.i = 1 + def convert_lpnormalization(self, node): + op = self.convert_general_op(node) + op.type = MaceOp.LpNorm.name + + axis_arg = op.arg.add() + axis_arg.name = MaceKeyword.mace_axis_str + axis_arg.i = node.attrs.get('axis', -1) + + p_arg = op.arg.add() + p_arg.name = MaceKeyword.mace_p_str + p_arg.i = node.attrs.get('p', 2) + def convert_splice(self, node): op = self.convert_general_op(node) op.type = MaceOp.Splice.name @@ -1565,7 +1578,6 @@ class OnnxConverter(base_converter.ConverterInterface): op.input.append(size_tensor_name) else: op.type = MaceOp.ResizeBilinear.name - size_arg = op.arg.add() size_arg.name = MaceKeyword.mace_resize_size_str size_arg.ints.extend(output_size.tolist()) diff --git a/tools/python/transform/shape_inference.py b/tools/python/transform/shape_inference.py index ac97859bcdc82942c09d053e89f7ac2f2aa9663d..2a7b43b6b29d36c2de20f4fe8a70d6636f4a479a 100644 --- a/tools/python/transform/shape_inference.py +++ b/tools/python/transform/shape_inference.py @@ -52,6 +52,8 @@ class ShapeInference(object): MaceOp.PriorBox.name: self.infer_shape_prior_box, MaceOp.Reshape.name: self.infer_shape_reshape, MaceOp.ResizeBilinear.name: self.infer_shape_resize_bilinear, + MaceOp.LpNorm.name: self.infer_shape_general, + MaceOp.MVNorm.name: self.infer_shape_general, } self._net = net @@ -206,7 +208,7 @@ class ShapeInference(object): def infer_shape_slice(self, op): output_shape = self._output_shape_cache[op.input[0]] axis = ConverterUtil.get_arg(op, MaceKeyword.mace_axis_str).i - output_shape[axis] /= len(op.output) + output_shape[axis] = (int)(output_shape[axis] / len(op.output)) output_shapes = [] for _ in op.output: output_shapes.append(output_shape) diff --git a/tools/python/transform/transformer.py b/tools/python/transform/transformer.py index 0d8d58cf4f9689c9ffb9007c916505c664c303de..bb9154f8e09bdfc7b83ac2aa3dfd8cb775fc6478 100644 --- a/tools/python/transform/transformer.py +++ b/tools/python/transform/transformer.py @@ -1376,6 +1376,16 @@ class Transformer(base_converter.ConverterInterface): out_shape.dims for out_shape in op.output_shape])) return False + def is_transposable_data_format_ops(self, op): + if op.type == MaceOp.Reshape: + input_op = self._producer[op.input[0]] + out_dims_len = len(op.output_shape[0].dims) + if len(input_op.output_shape[0].dims) != 4 \ + or (out_dims_len != 4 and out_dims_len != 2): + print("In this model, reshape is not transposable op.") + return False + return op.type in MaceTransposableDataFormatOps + def update_data_format(self): print("update data format") net = self._model @@ -1387,7 +1397,7 @@ class Transformer(base_converter.ConverterInterface): df_arg.name = MaceKeyword.mace_data_format_str if op.type in MaceFixedDataFormatOps: df_arg.i = DataFormat.AUTO.value - elif op.type in MaceTransposableDataFormatOps: + elif self.is_transposable_data_format_ops(op): input_df = DataFormat.AUTO.value for input_tensor in op.input: if input_tensor in self._consts: