Merge branch 'fp16' into 'master'

Fix fp16 delegator registration. See merge request applied-machine-learning/sysml/mace!1318

Merge branch 'fp16' into 'master'
Fix fp16 delegator registration. See merge request applied-machine-learning/sysml/mace!1318
0945e498 · 卢旭辉 · 4f5c284a · ef748943 · 0945e498 · 0945e498
6 changed files
--- a/mace/ops/arm/base/depthwise_conv_2d_3x3.cc
+++ b/mace/ops/arm/base/depthwise_conv_2d_3x3.cc
@@ -20,16 +20,6 @@ namespace mace {
 namespace ops {
 namespace arm {
-extern template
-MaceStatus DepthwiseConv2dK3x3S1<float16_t>::DoCompute(
-    const DepthwiseConvComputeParam &p, const float16_t *filter_data,
-    const float16_t *input_data, float16_t *output_data);
-extern template
-MaceStatus DepthwiseConv2dK3x3S2<float16_t>::DoCompute(
-    const DepthwiseConvComputeParam &p, const float16_t *filter_data,
-    const float16_t *input_data, float16_t *output_data);
 namespace {
 template<typename T>
 void DepthwiseConv2d3x3Pixel(const T *in_base,
@@ -474,16 +464,6 @@ void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
      delegator::DepthwiseConv2dParam,
      MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
                            BFloat16, ImplType::NEON, K3x3S2));
-  MACE_REGISTER_FP16_DELEGATOR(
-      registry, DepthwiseConv2dK3x3S1<float16_t>,
-      delegator::DepthwiseConv2dParam,
-      MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
-                            float16_t, ImplType::NEON, K3x3S1));
-  MACE_REGISTER_FP16_DELEGATOR(
-      registry, DepthwiseConv2dK3x3S2<float16_t>,
-      delegator::DepthwiseConv2dParam,
-      MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
-                            float16_t, ImplType::NEON, K3x3S2));
 }
 }  // namespace arm

--- a/mace/ops/arm/base/gemm.cc
+++ b/mace/ops/arm/base/gemm.cc
@@ -23,24 +23,6 @@ namespace mace {
 namespace ops {
 namespace arm {
-extern template void Gemm<float16_t>::Pack8x4(
-    const MatrixMap<const float16_t> &matrix,
-    MatrixMajor dst_major, float16_t *packed_matrix);
-extern template void Gemm<float16_t>::Unpack8x8(
-    const float16_t *packed_output, MatrixMap<float16_t> *output);
-extern template void Gemm<float16_t>::PackLhs(
-    const MatrixMap<const float16_t> &lhs, float16_t *packed_lhs);
-extern template void Gemm<float16_t>::PackRhs(
-    const MatrixMap<const float16_t> &rhs, float16_t *packed_rhs);
-extern template void Gemm<float16_t>::UnpackOutput(
-    const float16_t *packed_output, MatrixMap<float16_t> *output);
-extern template MaceStatus Gemm<float16_t>::Compute(
-    const OpContext *context, const Tensor *lhs, const Tensor *rhs,
-    const index_t batch, const index_t rows, const index_t cols,
-    const index_t depth, const MatrixMajor lhs_major,
-    const MatrixMajor rhs_major, const MatrixMajor output_major,
-    const bool lhs_batched, const bool rhs_batched, Tensor *output);
 template<typename T>
 void Gemm<T>::Pack4x4(const MatrixMap<const T> &matrix,
                      MatrixMajor dst_major, T *packed_matrix) {
@@ -719,12 +701,7 @@ void RegisterGemmDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_BF16_DELEGATOR(
      registry, Gemm<BFloat16>, delegator::GemmParam,
      MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, BFloat16, ImplType::NEON));
-  MACE_REGISTER_FP16_DELEGATOR(
-      registry, Gemm<float16_t>, delegator::GemmParam,
-      MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float16_t, ImplType::NEON));
 }
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
--- a/mace/ops/arm/fp16/depthwise_conv_2d_3x3.cc
+++ b/mace/ops/arm/fp16/depthwise_conv_2d_3x3.cc
@@ -403,6 +403,18 @@ MaceStatus DepthwiseConv2dK3x3S2<float16_t>::DoCompute(
  return MaceStatus::MACE_SUCCESS;
 }
+void RegisterFP16DepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_FP16_DELEGATOR(  // float16_t delegator, CPU/NEON key, tag K3x3S1
+      registry, DepthwiseConv2dK3x3S1<float16_t>,
+      delegator::DepthwiseConv2dParam,
+      MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
+                            float16_t, ImplType::NEON, K3x3S1));
+  MACE_REGISTER_FP16_DELEGATOR(  // float16_t delegator, CPU/NEON key, tag K3x3S2
+      registry, DepthwiseConv2dK3x3S2<float16_t>,
+      delegator::DepthwiseConv2dParam,
+      MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
+                            float16_t, ImplType::NEON, K3x3S2));
+}  // these two registrations were moved here from arm/base/depthwise_conv_2d_3x3.cc
 }  // namespace arm
 }  // namespace ops

--- a/mace/ops/arm/fp16/gemm.cc
+++ b/mace/ops/arm/fp16/gemm.cc
--- a/mace/ops/registry/op_delegators_registry.cc
+++ b/mace/ops/registry/op_delegators_registry.cc
@@ -69,7 +69,11 @@ extern void RegisterGroupDeconv2dGeneralDelegator(
 extern void RegisterGemmDelegator(OpDelegatorRegistry *registry);
 extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
+#ifdef MACE_ENABLE_FP16
+extern void RegisterFP16DepthwiseConv2dK3x3Delegator(
+    OpDelegatorRegistry *registry);
+extern void RegisterFP16GemmDelegator(OpDelegatorRegistry *registry);
+#endif
 #ifdef MACE_ENABLE_QUANTIZE
 namespace q8 {
 extern void RegisterEltwiseDelegator(OpDelegatorRegistry *registry);
@@ -89,7 +93,6 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
  ref::RegisterDepthwiseDeconv2dDelegator(registry);
  ref::RegisterGemmDelegator(registry);
  ref::RegisterGemvDelegator(registry);
 #ifdef MACE_ENABLE_QUANTIZE
  ref::q8::RegisterEltwiseDelegator(registry);
  ref::q8::RegisterGemvDelegator(registry);
@@ -123,7 +126,10 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
  arm::RegisterGemmDelegator(registry);
  arm::RegisterGemvDelegator(registry);
+#ifdef MACE_ENABLE_FP16
+  arm::RegisterFP16DepthwiseConv2dK3x3Delegator(registry);
+  arm::RegisterFP16GemmDelegator(registry);
+#endif
 #ifdef MACE_ENABLE_QUANTIZE
  arm::q8::RegisterEltwiseDelegator(registry);
  arm::q8::RegisterGemvDelegator(registry);

--- a/tools/bazel_build_standalone_lib.sh
+++ b/tools/bazel_build_standalone_lib.sh
@@ -115,7 +115,7 @@ $(echo "$1" | cut -d '=' -f -1)"
  esac
 done
-if [[ "${enable_apu}" == true ]];then
+# Force rpcmem off for APU builds, or when abi is neither armeabi-v7a nor
+# arm64-v8a. The two ABI tests must be AND-ed: OR-ing two "!=" tests against
+# different values is true for every abi and would disable rpcmem always.
+if [[ "${enable_apu}" == true || ( "${abi}" != armeabi-v7a && "${abi}" != arm64-v8a ) ]];then
  enable_rpcmem=false
 fi