diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.cc
new file mode 100644
index 0000000000000000000000000000000000000000..97e3149ea6d5669bceadbd5c6ed4fec15c7a4b1d
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.cc
@@ -0,0 +1,129 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "src/runtime/kernel/arm/fp16/cast_fp16.h"
+#include <vector>
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"
+#include "src/runtime/kernel/arm/nnacl/op_base.h"
+#include "src/runtime/runtime_api.h"
+#include "include/errorcode.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_Cast;
+
+namespace mindspore::kernel {
+namespace {
+int CastRun(int thread_id, LiteParallelGroupEnv *penv, void *cdata) {
+  if (cdata == nullptr) {
+    MS_LOG(ERROR) << "input cdata is nullptr!";
+    return RET_ERROR;
+  }
+
+  return reinterpret_cast<CastFp16CPUKernel *>(cdata)->DoCast(thread_id);
+}
+}  // namespace
+
+int CastFp16CPUKernel::Init() {
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+  return ReSize();
+}
+
+int CastFp16CPUKernel::ReSize() {
+  data_num_ = in_tensors_[0]->ElementsNum();
+  if (data_num_ == 0) {
+    return RET_OK;
+  }
+  op_parameter_->thread_num_ = MSMIN(op_parameter_->thread_num_, static_cast<int>(data_num_));
+  stride_ = UP_DIV(data_num_, op_parameter_->thread_num_);
+  return RET_OK;
+}
+
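+// Each worker casts one contiguous slice: thread t covers elements starting
+// at t * stride_, with stride_ = UP_DIV(data_num_, thread_num_). The last
+// slice may be short or empty (e.g. data_num_ = 5 with 4 threads gives
+// slices of 2, 2, 1, 0), hence the clamp at the top of DoCast.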
MS_LOG(ERROR) << "Input context is nullptr!"; + return nullptr; + } + if (ctx->thread_num_ == 0) { + MS_LOG(ERROR) << "context thread num is 0!"; + return nullptr; + } + auto *kernel = new (std::nothrow) CastFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive); + if (kernel == nullptr) { + MS_LOG(ERROR) << "new CastFp16CPUKernel fail!"; + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + delete kernel; + MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " + << schema::EnumNamePrimitiveType(static_cast(opParameter->type_)); + return nullptr; + } + return kernel; +} + +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Cast, CpuCastFp16KernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.h new file mode 100644 index 0000000000000000000000000000000000000000..e3e100289082f9e7f843e45f4a847f99d1fe8fb3 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.h @@ -0,0 +1,43 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CAST_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CAST_FP16_H_ + +#include +#include "src/lite_kernel.h" + +namespace mindspore::kernel { +class CastFp16CPUKernel : public LiteKernel { + public: + CastFp16CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::Context *ctx, + const lite::Primitive *primitive) + : LiteKernel(parameter, inputs, outputs, ctx, primitive) {} + + ~CastFp16CPUKernel() = default; + + int Init() override; + int ReSize() override; + int Run() override; + int DoCast(int thread_id); + + private: + uint32_t stride_; + uint32_t data_num_; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CAST_FP16_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc new file mode 100644 index 0000000000000000000000000000000000000000..365b254d6a07fcfcba6b43652048a1cba5affe73 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc @@ -0,0 +1,149 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+kernel::LiteKernel *CpuCastFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                             const std::vector<lite::tensor::Tensor *> &outputs,
+                                             OpParameter *opParameter, const lite::Context *ctx,
+                                             const kernel::KernelKey &desc, const lite::Primitive *primitive) {
+  if (opParameter == nullptr) {
+    MS_LOG(ERROR) << "Input opParameter is nullptr!";
+    return nullptr;
+  }
+  if (ctx == nullptr) {
+    MS_LOG(ERROR) << "Input context is nullptr!";
+    return nullptr;
+  }
+  if (ctx->thread_num_ == 0) {
+    MS_LOG(ERROR) << "context thread num is 0!";
+    return nullptr;
+  }
+  auto *kernel = new (std::nothrow) CastFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << "new CastFp16CPUKernel failed!";
+    return nullptr;
+  }
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    delete kernel;
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Cast, CpuCastFp16KernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.h
new file mode 100644
index 0000000000000000000000000000000000000000..e3e100289082f9e7f843e45f4a847f99d1fe8fb3
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.h
@@ -0,0 +1,43 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CAST_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CAST_FP16_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+
+namespace mindspore::kernel {
+class CastFp16CPUKernel : public LiteKernel {
+ public:
+  CastFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                    const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
+                    const lite::Primitive *primitive)
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {}
+
+  ~CastFp16CPUKernel() = default;
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+  int DoCast(int thread_id);
+
+ private:
+  uint32_t stride_;
+  uint32_t data_num_;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CAST_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc
new file mode 100644
index 0000000000000000000000000000000000000000..365b254d6a07fcfcba6b43652048a1cba5affe73
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc
@@ -0,0 +1,149 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "src/runtime/kernel/arm/fp16/pooling_fp16.h"
+#include <vector>
+#include "src/runtime/kernel/arm/nnacl/fp16/pooling_fp16.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/runtime_api.h"
+#include "include/errorcode.h"
+#include "src/runtime/kernel/arm/nnacl/op_base.h"
+#include "nnacl/fp16/cast_fp16.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_Pooling;
+
+namespace mindspore::kernel {
+int PoolingFp16CPUKernel::InitBuffer() {
+  // Init() may be re-entered through ReSize(); release buffers from a
+  // previous shape first so repeated resizing does not leak.
+  if (fp16_input_ != nullptr) {
+    free(fp16_input_);
+    fp16_input_ = nullptr;
+  }
+  if (fp16_output_ != nullptr) {
+    free(fp16_output_);
+    fp16_output_ = nullptr;
+  }
+  int in_batch = pooling_param_->input_batch_;
+  int in_h = pooling_param_->input_h_;
+  int in_w = pooling_param_->input_w_;
+  int in_channel = pooling_param_->input_channel_;
+  fp16_input_ = reinterpret_cast<float16_t *>(malloc(in_batch * in_h * in_w * in_channel * sizeof(float16_t)));
+  if (fp16_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
+    return RET_ERROR;
+  }
+
+  int out_batch = pooling_param_->output_batch_;
+  int out_h = pooling_param_->output_h_;
+  int out_w = pooling_param_->output_w_;
+  int out_channel = pooling_param_->output_channel_;
+  fp16_output_ = reinterpret_cast<float16_t *>(malloc(out_batch * out_h * out_w * out_channel * sizeof(float16_t)));
+  if (fp16_output_ == nullptr) {
+    MS_LOG(ERROR) << "malloc fp16_output_ failed.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int PoolingFp16CPUKernel::Init() {
+  if (context_->infer_shape_interrupt_ && !context_->running_) {
+    set_need_reinit();
+    return RET_OK;
+  }
+  auto ret = PoolingBaseCPUKernel::Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "PoolingBase Init failed.";
+    return ret;
+  }
+
+  ret = InitBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init Buffer failed.";
+    return ret;
+  }
+  return RET_OK;
+}
+
+int PoolingFp16CPUKernel::ReSize() {
+  auto ret = Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Pooling resize init failed.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int PoolingFp16CPUKernel::RunImpl(int task_id) {
+  if (pooling_param_->max_pooling_) {
+    MaxPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id);
+  } else {
+    AvgPoolingFp16(fp16_input_, fp16_output_, pooling_param_, task_id);
+  }
+  return RET_OK;
+}
+
+int PoolingFp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+  auto pooling = reinterpret_cast<PoolingFp16CPUKernel *>(cdata);
+  auto error_code = pooling->RunImpl(task_id);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "Pooling Run error task_id[" << task_id << "] error_code[" << error_code << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
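+// Run() is a convert-compute-convert pipeline: cast the fp32 input tensor
+// into the preallocated fp16 buffer, pool in fp16 across thread_count_
+// workers, then cast the result back into the fp32 output tensor. One
+// extra rounding step buys half-width arithmetic in the inner loops.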
+int PoolingFp16CPUKernel::Run() {
+  auto prepare_ret = Prepare();
+  if (prepare_ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare failed! ret: " << prepare_ret;
+    return prepare_ret;
+  }
+  auto ele_num = in_tensors_.front()->ElementsNum();
+  auto input_ptr = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->Data());
+  Float32ToFloat16(input_ptr, fp16_input_, ele_num);
+
+  int error_code = LiteBackendParallelLaunch(PoolingFp16Impl, this, thread_count_);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "pooling error error_code[" << error_code << "]";
+    return RET_ERROR;
+  }
+
+  auto out_ele_num = out_tensors_.front()->ElementsNum();
+  auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
+  Float16ToFloat32(fp16_output_, output_ptr, out_ele_num);
+  return RET_OK;
+}
+
+kernel::LiteKernel *CpuPoolingFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                                const std::vector<lite::tensor::Tensor *> &outputs,
+                                                OpParameter *opParameter, const Context *ctx,
+                                                const kernel::KernelKey &desc, const lite::Primitive *primitive) {
+  if (opParameter == nullptr) {
+    MS_LOG(ERROR) << "Input opParameter is nullptr!";
+    return nullptr;
+  }
+  MS_ASSERT(desc.type == schema::PrimitiveType_Pooling);
+  auto *kernel = new (std::nothrow) PoolingFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << "new PoolingFp16CPUKernel failed!";
+    return nullptr;
+  }
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    delete kernel;
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Pooling, CpuPoolingFp16KernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c78e0fee589a88ce9c5551943b6db87f99a7c25
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.h
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_POOLING_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_POOLING_FP16_H_
+
+#include <arm_neon.h>
+#include <vector>
+#include "src/lite_kernel.h"
+#include "src/runtime/kernel/arm/base/pooling_base.h"
+
+namespace mindspore::kernel {
+class PoolingFp16CPUKernel : public PoolingBaseCPUKernel {
+ public:
+  PoolingFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                       const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
+                       const lite::Primitive *primitive)
+      : PoolingBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~PoolingFp16CPUKernel() override {
+    if (fp16_input_ != nullptr) {
+      free(fp16_input_);
+    }
+    if (fp16_output_ != nullptr) {
+      free(fp16_output_);
+    }
+  }
+
+  int Init() override;
+  int InitBuffer();
+  int ReSize() override;
+  int Run() override;
+  int RunImpl(int task_id);
+
+ private:
+  float16_t *fp16_input_ = nullptr;
+  float16_t *fp16_output_ = nullptr;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_POOLING_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.h b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.h
index 1833764e2d34aec3449a2061f91d14d3ba79ff21..2448657d6ce34eab8a754562a816c4545c37f4d3 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.h
@@ -21,15 +21,8 @@
 #include "src/runtime/kernel/arm/base/pooling_base.h"
 #include "src/lite_kernel.h"
 #include "ir/anf.h"
-#include "include/context.h"
 
 namespace mindspore::kernel {
-using mindspore::lite::Context;
-using mindspore::schema::PadMode;
-using mindspore::schema::PoolMode;
-using mindspore::schema::QuantType;
-using mindspore::schema::RoundMode;
-
 class PoolingCPUKernel : public PoolingBaseCPUKernel {
  public:
   PoolingCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.cc
new file mode 100644
index 0000000000000000000000000000000000000000..75770dbd9e8fc1d576cb20870173c67221046776
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.cc
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "nnacl/fp16/cast_fp16.h"
+
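+// Scalar fp32 <-> fp16 converters. float16_t comes from <arm_neon.h> via the
+// header; the per-element cast typically lowers to a scalar fcvt on AArch64,
+// and the loops are simple enough for the compiler to auto-vectorize.
+// Round-trip usage sketch (hypothetical buffer names):
+//   float src[1024];          // fp32 source, filled by the caller
+//   float16_t half[1024];     // fp16 scratch
+//   Float32ToFloat16(src, half, 1024);
+//   Float16ToFloat32(half, src, 1024);  // back to fp32, rounded once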
+void Float32ToFloat16(const float *input, float16_t *output, int number) {
+  for (int i = 0; i < number; ++i) {
+    output[i] = (float16_t)input[i];
+  }
+}
+
+void Float16ToFloat32(const float16_t *input, float *output, int number) {
+  for (int i = 0; i < number; ++i) {
+    output[i] = (float)input[i];
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h
new file mode 100644
index 0000000000000000000000000000000000000000..f95b3bb597a9bc9b8d10a4b052a48f4c932e12de
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_CAST_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_CAST_FP16_H_
+
+#include <arm_neon.h>
+#include "nnacl/op_base.h"
+#include "nnacl/fp32/cast.h"
+
+void Float32ToFloat16(const float *input, float16_t *output, int number);
+void Float16ToFloat32(const float16_t *input, float *output, int number);
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_CAST_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pooling_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pooling_fp16.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7eff550924261ce954feb65788d308ce3ff6d3ac
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pooling_fp16.cc
@@ -0,0 +1,276 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "nnacl/fp16/pooling_fp16.h"
+#include <float.h>
+#include <math.h>
+
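+// Channel tiling for NHWC data: each output position handles its channels
+// 8 at a time (float16x8_t), then 4 at a time (float16x4_t), then one by
+// one, i.e. channel = c8 * C8NUM + c4 * C4NUM + remainder. Output positions
+// are grouped into TILE_NUM-sized tiles and round-robined across threads
+// via (task_id, thread_num), so workers never write the same tile.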
+void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id) {
+  int stride_w = pooling_param->stride_w_;
+  int stride_h = pooling_param->stride_h_;
+  int pad_w = pooling_param->pad_l_;
+  int pad_h = pooling_param->pad_u_;
+  int win_w = pooling_param->window_w_;
+  int win_h = pooling_param->window_h_;
+  int channel = pooling_param->input_channel_;
+  int c8 = channel / C8NUM;
+  int c8_res = channel % C8NUM;
+  int c4 = c8_res / C4NUM;
+
+  int in_w = pooling_param->input_w_;
+  int in_h = pooling_param->input_h_;
+  int output_w = pooling_param->output_w_;
+  int output_h = pooling_param->output_h_;
+  int output_batch = pooling_param->output_batch_;
+  int out_plane = output_w * output_h;
+  int out_tile_count = UP_DIV(out_plane, TILE_NUM);
+  int thread_num = pooling_param->thread_num_;
+  // input channel is equal to output channel
+
+  for (int batch = 0; batch < output_batch; batch++) {
+    int in_batch_offset = batch * in_h * in_w * channel;
+    int out_batch_offset = batch * output_h * output_w * channel;
+    for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) {
+      int cal_start_index = thread_id * TILE_NUM;
+      int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
+      for (int i = 0; i < real_cal_num; i++) {
+        int index = cal_start_index + i;
+        int out_w_index = index % output_w;
+        int out_h_index = index / output_w;
+        int in_w_index = out_w_index * stride_w - pad_w;
+        int in_h_index = out_h_index * stride_h - pad_h;
+        int out_plane_offset = out_batch_offset + index * channel;
+        for (int j = 0; j < c8; j++) {
+          int in_channel_offset = in_batch_offset + j * C8NUM;
+          int out_channel_offset = out_plane_offset + j * C8NUM;
+#ifdef ENABLE_NEON
+          float16x8_t tmp_avg = vdupq_n_f16(0);
+#else
+          float16_t tmp_avg[8]{0};
+#endif
+          int real_count = 0;
+          for (int h = 0; h < win_h; h++) {
+            for (int w = 0; w < win_w; w++) {
+              if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
+                  (in_w_index + w) >= in_w) {
+                continue;
+              } else {
+                int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
+#ifdef ENABLE_NEON
+                tmp_avg = vaddq_f16(tmp_avg, vld1q_f16(input_ptr + in_offset));
+#else
+                for (int t = 0; t < C8NUM; t++) {
+                  tmp_avg[t] += *(input_ptr + in_offset + t);
+                }
+#endif
+                ++real_count;
+              }
+            }  // win_w loop
+          }    // win_h loop
+#ifdef ENABLE_NEON
+          vst1q_f16(output_ptr + out_channel_offset, tmp_avg / vdupq_n_f16(real_count));
+#else
+          for (int t = 0; t < C8NUM; ++t) {
+            *(output_ptr + out_channel_offset + t) = tmp_avg[t] / (float16_t)real_count;
+          }
+#endif
+        }  // c8 loop
+
+        int c4_offset = c8 * C8NUM;
+        for (int l = 0; l < c4; ++l) {
+          int in_channel_offset = in_batch_offset + c4_offset + l * C4NUM;
+          int out_channel_offset = out_plane_offset + c4_offset + l * C4NUM;
+#ifdef ENABLE_NEON
+          float16x4_t tmp_avg = vdup_n_f16(0);
+#else
+          float16_t tmp_avg[4]{0};
+#endif
+          int real_count = 0;
+          for (int h = 0; h < win_h; h++) {
+            for (int w = 0; w < win_w; w++) {
+              if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
+                  (in_w_index + w) >= in_w) {
+                continue;
+              } else {
+                int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
+#ifdef ENABLE_NEON
+                tmp_avg = vadd_f16(tmp_avg, vld1_f16(input_ptr + in_offset));
+#else
+                for (int t = 0; t < C4NUM; ++t) {
+                  // Index each lane; the original dropped "+ t" here and
+                  // accumulated the same scalar into every lane.
+                  tmp_avg[t] += *(input_ptr + in_offset + t);
+                }
+#endif
+                ++real_count;
+              }
+            }  // win_w loop
+          }    // win_h loop
+#ifdef ENABLE_NEON
+          vst1_f16(output_ptr + out_channel_offset, tmp_avg / vdup_n_f16(real_count));
+#else
+          for (int t = 0; t < C4NUM; ++t) {
+            *(output_ptr + out_channel_offset + t) = tmp_avg[t] / (float16_t)real_count;
+          }
+#endif
+        }  // c4 loop
+
+        int channel_s = c8 * C8NUM + c4 * C4NUM;
+        for (int k = channel_s; k < channel; k++) {
+          int in_channel_offset = in_batch_offset + k;
+          int out_channel_offset = out_plane_offset + k;
+          float16_t tmp_avg = 0;
+          int real_count = 0;
+          for (int h = 0; h < win_h; h++) {
+            for (int w = 0; w < win_w; w++) {
+              if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
+                  (in_w_index + w) >= in_w) {
+                continue;
+              } else {
+                int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
+                tmp_avg += *(input_ptr + in_offset);
+                ++real_count;
+              }
+            }  // win_w loop
+          }    // win_h loop
+          *(output_ptr + out_channel_offset) = tmp_avg / (float16_t)real_count;
+        }  // channel_res loop
+      }    // real_cal_num loop
+    }      // out_plane loop
+  }        // out_batch loop
+}
+
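+// -FLT_MAX is outside float16's range, so vdupq_n_f16(-FLT_MAX) effectively
+// splats -infinity; that is still a valid identity element for max, since
+// every finite input compares greater.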
+void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id) {
+  int stride_w = pooling_param->stride_w_;
+  int stride_h = pooling_param->stride_h_;
+  int pad_w = pooling_param->pad_l_;
+  int pad_h = pooling_param->pad_u_;
+  int win_w = pooling_param->window_w_;
+  int win_h = pooling_param->window_h_;
+  int channel = pooling_param->input_channel_;
+  int in_w = pooling_param->input_w_;
+  int in_h = pooling_param->input_h_;
+  int output_w = pooling_param->output_w_;
+  int output_h = pooling_param->output_h_;
+  int output_batch = pooling_param->output_batch_;
+  int out_plane = output_w * output_h;
+  int out_tile_count = UP_DIV(out_plane, TILE_NUM);
+  int thread_num = pooling_param->thread_num_;
+  int c8 = channel / C8NUM;
+  int c8_res = channel % C8NUM;
+  int c4 = c8_res / C4NUM;
+  // input channel is equal to output channel
+
+  for (int batch = 0; batch < output_batch; batch++) {
+    int in_batch_offset = batch * in_h * in_w * channel;
+    int out_batch_offset = batch * output_h * output_w * channel;
+    for (int thread_id = task_id; thread_id < out_tile_count; thread_id += thread_num) {
+      int cal_start_index = thread_id * TILE_NUM;
+      int real_cal_num = (out_plane - cal_start_index) > TILE_NUM ? TILE_NUM : (out_plane - cal_start_index);
+      for (int i = 0; i < real_cal_num; i++) {
+        int index = cal_start_index + i;
+        int out_w_index = index % output_w;
+        int out_h_index = index / output_w;
+        int in_w_index = out_w_index * stride_w - pad_w;
+        int in_h_index = out_h_index * stride_h - pad_h;
+        int out_plane_offset = out_batch_offset + index * channel;
+        for (int j = 0; j < c8; j++) {
+          int in_channel_offset = in_batch_offset + j * C8NUM;
+          int out_channel_offset = out_plane_offset + j * C8NUM;
+#ifdef ENABLE_NEON
+          float16x8_t tmp_max = vdupq_n_f16(-FLT_MAX);
+#else
+          // Fill every lane: a brace initializer {-FLT_MAX} would set only
+          // lane 0 and zero the rest, corrupting all-negative windows.
+          float16_t tmp_max[8];
+          for (int m = 0; m < C8NUM; m++) {
+            tmp_max[m] = -FLT_MAX;
+          }
+#endif
+          for (int h = 0; h < win_h; h++) {
+            for (int w = 0; w < win_w; w++) {
+              if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
+                  (in_w_index + w) >= in_w) {
+                continue;
+              } else {
+                int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
+#ifdef ENABLE_NEON
+                tmp_max = vmaxq_f16(tmp_max, vld1q_f16(input_ptr + in_offset));
+#else
+                for (int k = 0; k < C8NUM; k++) {
+                  tmp_max[k] = fmax(tmp_max[k], *(input_ptr + in_offset + k));
+                }
+#endif
+              }
+            }  // win_w loop
+          }    // win_h loop
+#ifdef ENABLE_NEON
+          vst1q_f16(output_ptr + out_channel_offset, tmp_max);
+#else
+          for (int l = 0; l < C8NUM; ++l) {
+            *(output_ptr + out_channel_offset + l) = tmp_max[l];
+          }
+#endif
+        }  // c8 loop
+
+        int c4_offset = c8 * C8NUM;
+        for (int j = 0; j < c4; j++) {
+          int in_channel_offset = in_batch_offset + c4_offset + j * C4NUM;
+          int out_channel_offset = out_plane_offset + c4_offset + j * C4NUM;
+#ifdef ENABLE_NEON
+          float16x4_t tmp_max = vdup_n_f16(-FLT_MAX);
+#else
+          float16_t tmp_max[4];
+          for (int m = 0; m < C4NUM; m++) {
+            tmp_max[m] = -FLT_MAX;
+          }
+#endif
+          for (int h = 0; h < win_h; h++) {
+            for (int w = 0; w < win_w; w++) {
+              if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
+                  (in_w_index + w) >= in_w) {
+                continue;
+              } else {
+                int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
+#ifdef ENABLE_NEON
+                tmp_max = vmax_f16(tmp_max, vld1_f16(input_ptr + in_offset));
+#else
+                for (int k = 0; k < C4NUM; k++) {
+                  tmp_max[k] = fmax(tmp_max[k], *(input_ptr + in_offset + k));
+                }
+#endif
+              }
+            }  // win_w loop
+          }    // win_h loop
+#ifdef ENABLE_NEON
+          vst1_f16(output_ptr + out_channel_offset, tmp_max);
+#else
+          for (int l = 0; l < C4NUM; ++l) {
+            *(output_ptr + out_channel_offset + l) = tmp_max[l];
+          }
+#endif
+        }  // c4 loop
+
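+        // Channels left over after the C8 and C4 blocks (at most C4NUM - 1)
+        // are reduced by this plain scalar loop.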
+        int channel_s = c8 * C8NUM + c4 * C4NUM;
+        for (int k = channel_s; k < channel; k++) {
+          int in_channel_offset = in_batch_offset + k;
+          int out_channel_offset = out_plane_offset + k;
+          float16_t tmp_max = -FLT_MAX;
+          for (int h = 0; h < win_h; h++) {
+            for (int w = 0; w < win_w; w++) {
+              if ((in_h_index + h) < 0 || (in_h_index + h) >= in_h || (in_w_index + w) < 0 ||
+                  (in_w_index + w) >= in_w) {
+                continue;
+              } else {
+                int in_offset = in_channel_offset + ((in_h_index + h) * in_w + in_w_index + w) * channel;
+                tmp_max = fmax(tmp_max, *(input_ptr + in_offset));
+              }
+            }  // win_w loop
+          }    // win_h loop
+          *(output_ptr + out_channel_offset) = tmp_max;
+        }  // channel_res loop
+      }    // real_cal_num loop
+    }      // out_plane loop
+  }        // out_batch loop
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pooling_fp16.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pooling_fp16.h
new file mode 100644
index 0000000000000000000000000000000000000000..74a4e98f50d6503eaf52d16b113f963ffb1a2328
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pooling_fp16.h
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_POOLING_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_POOLING_FP16_H_
+
+#include <arm_neon.h>
+#include "nnacl/pooling_parameter.h"
+
+void AvgPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id);
+
+void MaxPoolingFp16(const float16_t *input_ptr, float16_t *output_ptr, PoolingParameter *pooling_param, int task_id);
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_POOLING_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/cast.cc b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/cast.cc
index 4a8af888af0657652242be8739ea3981acaaf330..821c8b6feace233992f08c4767ad36e0bcc69ec5 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/cast.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/cast.cc
@@ -45,17 +45,3 @@ void Float32ToInt32(const float *input, int32_t *output, int number) {
     output[i] = (int32_t)input[i];
   }
 }
-
-#ifdef ENABLE_FP16
-void Float32ToFloat16(const float *input, float16_t *output, int number) {
-  for (int i = 0; i < number; ++i) {
-    output[i] = (float16_t)input[i];
-  }
-}
-
-void Float16ToFloat32(const float16_t *input, float *output, int number) {
-  for (int i = 0; i < number; ++i) {
-    output[i] = (float)input[i];
-  }
-}
-#endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/cast.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/cast.h
index 3c744a670313d31929695477f4079367287bbe5e..cbdb572aa0c83d06f8983035d7b55856bc92fe50 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/cast.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/cast.h
@@ -33,9 +33,5 @@ void Uint8ToInt8(const uint8_t *input, int8_t *output, int number);
 void Int8ToUint8(const int8_t *input, uint8_t *output, int number);
 void Int32ToFloat32(const int32_t *input, float *output, int number);
 void Float32ToInt32(const float *input, int32_t *output, int number);
-#ifdef ENABLE_FP16
-void Float32ToFloat16(const float *input, float16_t *output, int number);
-void Float16ToFloat32(const float16_t *input, float *output, int number);
-#endif
 
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_CAST_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/pooling.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/pooling.h
index 8b167a3d65cf96fb7b7de7fedc6611347cba70fc..9c2b0d1e77b9d3cec2d9fa4e2176a542e0bd2645 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/pooling.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/pooling.h
@@ -21,35 +21,9 @@
 #include <arm_neon.h>
 #endif
 #include "nnacl/op_base.h"
+#include "nnacl/pooling_parameter.h"
 #include "nnacl/quantization/quantize.h"
 
-typedef struct PoolingParameter {
-  OpParameter op_parameter_;
-  QuantArg **quant_args_;
-  bool global_;
-  bool max_pooling_;
-  bool avg_pooling_;
-  bool round_ceil_;
-  bool round_floor_;
-  int window_w_;
-  int window_h_;
-  int input_w_;
-  int input_h_;
-  int input_batch_;
-  int input_channel_;
-  int output_w_;
-  int output_h_;
-  int output_batch_;
-  int output_channel_;
-  int pad_u_;
-  int pad_d_;
-  int pad_l_;
-  int pad_r_;
-  int stride_w_;
-  int stride_h_;
-  int thread_num_;
-} PoolingParameter;
-
 void AvgPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
 
 void MaxPooling(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param, int task_id);
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/pooling_parameter.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/pooling_parameter.h
new file mode 100644
index 0000000000000000000000000000000000000000..6473469103114bc561dc9f9cfb0a984258d9bbdd
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/pooling_parameter.h
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_POOLING_PARAMETER_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_POOLING_PARAMETER_H_
+
+#include "nnacl/op_base.h"
+#include "nnacl/quantization/quantize.h"
+
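+// Moved verbatim from nnacl/fp32/pooling.h so the fp32 and fp16 pooling
+// kernels share one parameter definition instead of duplicating it.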
+typedef struct PoolingParameter {
+  OpParameter op_parameter_;
+  QuantArg **quant_args_;
+  bool global_;
+  bool max_pooling_;
+  bool avg_pooling_;
+  bool round_ceil_;
+  bool round_floor_;
+  int window_w_;
+  int window_h_;
+  int input_w_;
+  int input_h_;
+  int input_batch_;
+  int input_channel_;
+  int output_w_;
+  int output_h_;
+  int output_batch_;
+  int output_channel_;
+  int pad_u_;
+  int pad_d_;
+  int pad_l_;
+  int pad_r_;
+  int stride_w_;
+  int stride_h_;
+  int thread_num_;
+} PoolingParameter;
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_POOLING_PARAMETER_H_