Commit 3e9bb73e authored by 叶剑武

Merge branch 'fp16' into 'master'

Fp16

See merge request !1073
......@@ -71,7 +71,7 @@ cc_library(
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfpu=neon-fp16",
"-mfloat-abi=softfp",
]),
linkopts = ["-ldl"],
......
......@@ -46,28 +46,28 @@ namespace mace {
break; \
}
#ifdef MACE_ENABLE_OPENCL
#define MACE_TYPE_ENUM_SWITCH( \
TYPE_ENUM, STATEMENTS, INVALID_STATEMENTS, DEFAULT_STATEMENTS) \
switch (TYPE_ENUM) { \
MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_CASE(float, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_CASE(uint8_t, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_CASE(int32_t, MACE_SINGLE_ARG(STATEMENTS)) \
case DT_INVALID: \
INVALID_STATEMENTS; \
break; \
default: \
DEFAULT_STATEMENTS; \
break; \
}
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS) \
MACE_CASE(float16_t, MACE_SINGLE_ARG(STATEMENTS))
#else
#define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS)
#endif
#if MACE_ENABLE_OPENCL
#define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \
MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS))
#else
#define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS)
#endif
#define MACE_TYPE_ENUM_SWITCH( \
TYPE_ENUM, STATEMENTS, INVALID_STATEMENTS, DEFAULT_STATEMENTS) \
switch (TYPE_ENUM) { \
MACE_CASE(float, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_CASE(uint8_t, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_CASE(int32_t, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS) \
MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \
case DT_INVALID: \
INVALID_STATEMENTS; \
break; \
......@@ -75,7 +75,6 @@ namespace mace {
DEFAULT_STATEMENTS; \
break; \
}
#endif
// `TYPE_ENUM` will be converted to template `T` in `STATEMENTS`
#define MACE_RUN_WITH_TYPE_ENUM(TYPE_ENUM, STATEMENTS) \
......
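The refactor above splits the type switch into per-backend case macros: `MACE_TYPE_ENUM_SWITCH_CASE_NEON` and `MACE_TYPE_ENUM_SWITCH_CASE_OPENCL` each expand to an extra `MACE_CASE` only when the corresponding backend is compiled in, and both are then spliced into a single `MACE_TYPE_ENUM_SWITCH` definition. A minimal, self-contained sketch of the same composition pattern (illustrative names only, not MACE's actual macros):

```cpp
// Sketch of conditional switch-case composition (illustrative names only).
#include <cstdio>

enum DataType { DT_FLOAT, DT_INT32, DT_HALF };

#define HANDLE_CASE(DT, STMT) case DT: { STMT; } break;

// An optional backend contributes a case only when its feature flag is set.
#ifdef ENABLE_GPU
#define HANDLE_CASE_GPU(STMT) HANDLE_CASE(DT_HALF, STMT)
#else
#define HANDLE_CASE_GPU(STMT)
#endif

#define TYPE_SWITCH(DT, STMT, DEFAULT_STMT) \
  switch (DT) {                             \
    HANDLE_CASE(DT_FLOAT, STMT)             \
    HANDLE_CASE(DT_INT32, STMT)             \
    HANDLE_CASE_GPU(STMT)                   \
    default: { DEFAULT_STMT; } break;       \
  }

int main() {
  DataType dt = DT_FLOAT;
  TYPE_SWITCH(dt, std::printf("supported\n"), std::printf("unsupported\n"));
  return 0;
}
```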
......@@ -47,6 +47,10 @@ size_t GetEnumTypeSize(const DataType dt) {
return sizeof(float);
case DT_HALF:
return sizeof(half);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
case DT_FLOAT16:
return sizeof(float16_t);
#endif
case DT_UINT8:
return sizeof(uint8_t);
case DT_INT32:
......
......@@ -17,6 +17,9 @@
#include <cstdint>
#include <string>
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#include <arm_neon.h>
#endif
#include "mace/proto/mace.pb.h"
#include "include/half.hpp"
......@@ -51,6 +54,9 @@ struct EnumToDataType;
};
MACE_MAPPING_DATA_TYPE_AND_ENUM(half, DT_HALF);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_MAPPING_DATA_TYPE_AND_ENUM(float16_t, DT_FLOAT16);
#endif
MACE_MAPPING_DATA_TYPE_AND_ENUM(float, DT_FLOAT);
MACE_MAPPING_DATA_TYPE_AND_ENUM(uint8_t, DT_UINT8);
MACE_MAPPING_DATA_TYPE_AND_ENUM(int32_t, DT_INT32);
......
......@@ -38,7 +38,7 @@ cc_library(
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfpu=neon-fp16",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
......@@ -61,7 +61,20 @@ cc_library(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
],
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon-fp16",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
deps = [
"//mace/core",
"@gtest",
......@@ -89,7 +102,7 @@ cc_library(
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfpu=neon-fp16",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
......@@ -111,6 +124,7 @@ cc_library(
srcs = glob(
[
"arm/fp32/*.cc",
"arm/fp16/gemv.h",
],
exclude = [
"arm/fp32/*_test.cc",
......@@ -141,7 +155,7 @@ cc_library(
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfpu=neon-fp16",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
......@@ -186,7 +200,7 @@ cc_library(
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfpu=neon-fp16",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
......@@ -221,7 +235,7 @@ cc_library(
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfpu=neon-fp16",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
......@@ -256,7 +270,7 @@ cc_library(
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfpu=neon-fp16",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
......@@ -324,7 +338,7 @@ cc_library(
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfpu=neon-fp16",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
......@@ -368,7 +382,7 @@ cc_library(
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfpu=neon-fp16",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
......@@ -398,7 +412,7 @@ cc_library(
] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfpu=neon-fp16",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
......@@ -439,7 +453,7 @@ cc_test(
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfpu=neon-fp16",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
......@@ -476,7 +490,7 @@ cc_test(
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
"-mfpu=neon-fp16",
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP16_GEMV_H_
#define MACE_OPS_ARM_FP16_GEMV_H_
#include "mace/core/types.h"
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#include <arm_neon.h>
#endif
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) && defined(__ANDROID__)
#define vaddvq_f32(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3])
#endif
namespace mace {
namespace ops {
template<typename INPUT_TYPE_LEFT,
typename INPUT_TYPE_RIGHT,
typename OUTPUT_TYPE>
void FP16Gemv(const INPUT_TYPE_LEFT *m_ptr,
const INPUT_TYPE_RIGHT *v_ptr,
const index_t height,
const index_t width,
OUTPUT_TYPE *result);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
template<>
void FP16Gemv<float16_t, float, float>(const float16_t *m_ptr,
const float *v_ptr,
const index_t height,
const index_t width,
float *out_ptr) {
#pragma omp parallel for
for (index_t h = 0; h < height; ++h) {
const float16_t *m_ptr0 = m_ptr + h * width;
const float *v_ptr0 = v_ptr;
float *out_ptr0 = out_ptr + h;
float sum0 = 0;
float32x4_t vm0, vm1, vm2, vm3;
float32x4_t vv0, vv1, vv2, vv3;
float32x4_t vsum0 = vdupq_n_f32(0.f);
float32x4_t vsum1 = vdupq_n_f32(0.f);
float32x4_t vsum2 = vdupq_n_f32(0.f);
float32x4_t vsum3 = vdupq_n_f32(0.f);
index_t w;
for (w = 0; w + 15 < width; w += 16) {
vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
vv0 = vld1q_f32(v_ptr0);
vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4));
vv1 = vld1q_f32(v_ptr0 + 4);
vm2 = vcvt_f32_f16(vld1_f16(m_ptr0 + 8));
vv2 = vld1q_f32(v_ptr0 + 8);
vm3 = vcvt_f32_f16(vld1_f16(m_ptr0 + 12));
vv3 = vld1q_f32(v_ptr0 + 12);
vsum0 = vmlaq_f32(vsum0, vm0, vv0);
vsum1 = vmlaq_f32(vsum1, vm1, vv1);
vsum2 = vmlaq_f32(vsum2, vm2, vv2);
vsum3 = vmlaq_f32(vsum3, vm3, vv3);
m_ptr0 += 16;
v_ptr0 += 16;
}
for (; w + 7 < width; w += 8) {
vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
vv0 = vld1q_f32(v_ptr0);
vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4));
vv1 = vld1q_f32(v_ptr0 + 4);
vsum0 = vmlaq_f32(vsum0, vm0, vv0);
vsum1 = vmlaq_f32(vsum1, vm1, vv1);
m_ptr0 += 8;
v_ptr0 += 8;
}
for (; w + 3 < width; w += 4) {
vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
vv0 = vld1q_f32(v_ptr0);
vsum0 = vmlaq_f32(vsum0, vm0, vv0);
m_ptr0 += 4;
v_ptr0 += 4;
}
vsum0 += vsum1;
vsum2 += vsum3;
vsum0 += vsum2;
sum0 = vaddvq_f32(vsum0);
for (; w < width; ++w) {
sum0 += m_ptr0[0] * v_ptr0[0];
m_ptr0++;
v_ptr0++;
}
*out_ptr0++ = sum0;
}
}
#endif
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP16_GEMV_H_
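The NEON kernel above widens each fp16 matrix row to fp32 with `vcvt_f32_f16`, keeps four partial accumulators to hide latency, and finishes with a horizontal add plus a scalar tail loop. For reference, a plain scalar version of the same matrix-vector product might look like the following (an illustrative sketch, not part of MACE; the template parameters mirror `FP16Gemv`'s signature):

```cpp
#include <cstddef>

// Scalar reference for the fp16 x fp32 GEMV above: out[h] is the dot product
// of matrix row h (widened to the output type) with the vector.
template <typename LhsT, typename RhsT, typename OutT>
void ReferenceGemv(const LhsT *m_ptr, const RhsT *v_ptr,
                   std::ptrdiff_t height, std::ptrdiff_t width,
                   OutT *out_ptr) {
  for (std::ptrdiff_t h = 0; h < height; ++h) {
    OutT sum = 0;
    for (std::ptrdiff_t w = 0; w < width; ++w) {
      sum += static_cast<OutT>(m_ptr[h * width + w]) *
             static_cast<OutT>(v_ptr[w]);
    }
    out_ptr[h] = sum;
  }
}
```

With `LhsT = float16_t`, `RhsT = float`, and `OutT = float` this computes the same result as the specialization the NEON code accelerates.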
......@@ -14,6 +14,10 @@
#include "mace/core/operator.h"
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#include <arm_neon.h>
#endif
namespace mace {
namespace ops {
......@@ -55,6 +59,10 @@ void RegisterCast(OpRegistryBase *op_registry) {
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Cast", CastOp,
DeviceType::CPU, int32_t);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP(op_registry, "Cast", CastOp,
DeviceType::CPU, float16_t);
#endif
}
} // namespace ops
......
......@@ -93,6 +93,10 @@ void RegisterGather(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Gather", GatherOp,
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP(op_registry, "Gather", GatherOp,
DeviceType::CPU, float16_t);
#endif
}
} // namespace ops
......
......@@ -44,6 +44,9 @@
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/matmul.h"
#endif // MACE_ENABLE_OPENCL
#ifdef MACE_ENABLE_NEON
#include "mace/ops/arm/fp16/gemv.h"
#endif
namespace mace {
namespace ops {
......@@ -510,6 +513,86 @@ class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase {
};
#endif // MACE_ENABLE_OPENCL
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
template <>
class MatMulOp<CPU, float16_t> : public MatMulOpBase {
public:
explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context) {}
MaceStatus Run(OpContext *context) override {
MACE_CHECK_NOTNULL(context);
Validate();
const Tensor *A = this->Input(INPUT_A);
const Tensor *B = this->Input(INPUT_B);
Tensor *C = this->Output(OUTPUT);
index_t batch;
index_t height;
index_t K;
index_t width;
index_t rank = A->dim_size();
height = A->dim(rank - 2);
K = A->dim(rank - 1);
if (transpose_a_) {
std::swap(height, K);
}
if (transpose_b_) {
width = B->dim(rank - 2);
} else {
width = B->dim(rank - 1);
}
batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
std::multiplies<index_t>());
std::vector<index_t> c_shape = A->shape();
c_shape[rank - 2] = height;
c_shape[rank - 1] = width;
MACE_RETURN_IF_ERROR(C->Resize(c_shape));
Tensor::MappingGuard guarda(A);
Tensor::MappingGuard guardb(B);
Tensor::MappingGuard guardc(C);
auto *c_ptr_base = C->mutable_data<float>();
MACE_CHECK(batch == 1, "matmul fp16 only support batch = 1 now");
if (width == 1 && !transpose_a_ && A->dtype() == DT_FLOAT16 &&
B->dtype() == DT_FLOAT) {
auto *a_ptr_base = A->data<float16_t>();
auto *b_ptr_base = B->data<float>();
FP16Gemv(a_ptr_base,
b_ptr_base,
height,
K,
c_ptr_base);
return MaceStatus::MACE_SUCCESS;
} else if (height == 1 && transpose_b_ && A->dtype() == DT_FLOAT &&
B->dtype() == DT_FLOAT16) {
auto *b_ptr_base = B->data<float16_t>();
auto *a_ptr_base = A->data<float>();
FP16Gemv(b_ptr_base,
a_ptr_base,
width,
K,
c_ptr_base);
return MaceStatus::MACE_SUCCESS;
} else {
LOG(INFO) << "Matmul fp16 gemv args: " << height << " " << width << " "
<< transpose_a_ << " " << transpose_b_;
LOG(FATAL) << "Matmul fp16 Op only support fp32[1,k]·fp16[w,k]T or"
" fp16[w,k]·fp32[k,1] now!";
return MaceStatus::MACE_INVALID_ARGS;
}
}
private:
};
#endif // MACE_ENABLE_NEON
void RegisterMatMul(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, float);
......@@ -518,6 +601,19 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, float16_t);
#endif // MACE_ENABLE_NEON
}
} // namespace ops
......
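The `MatMulOp<CPU, float16_t>` specialization only dispatches to `FP16Gemv` for the two shape/type combinations named in its fatal log. The predicate below restates those checks from `Run()` as a standalone helper (a sketch with hypothetical names; this function does not exist in the source):

```cpp
#include <cstdint>

// Restates the dispatch conditions of MatMulOp<CPU, float16_t>::Run():
// either fp16 weight x fp32 vector (width == 1, no transpose on A), or
// fp32 vector x transposed fp16 weight (height == 1, transpose on B).
bool Fp16GemvApplies(bool transpose_a, bool transpose_b,
                     int64_t batch, int64_t height, int64_t width,
                     bool a_is_fp16, bool b_is_fp16) {
  if (batch != 1) return false;  // the op checks batch == 1 first
  const bool fp16_weight_times_fp32_vector =
      width == 1 && !transpose_a && a_is_fp16 && !b_is_fp16;
  const bool fp32_vector_times_fp16_weight =
      height == 1 && transpose_b && !a_is_fp16 && b_is_fp16;
  return fp16_weight_times_fp32_vector || fp32_vector_times_fp16_weight;
}
```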
......@@ -263,8 +263,17 @@ void MatMulBenchmark(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("A", {batch, height, channels});
net.AddRandomInput<D, T>("B", {batch, channels, out_width});
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
if (DataTypeToEnum<T>::value == DT_FLOAT16) {
net.AddRandomInput<D, float16_t>("A", {batch, height, channels});
net.AddRandomInput<D, float>("B", {batch, channels, out_width});
} else {
#endif
net.AddRandomInput<D, T>("A", {batch, height, channels});
net.AddRandomInput<D, T>("B", {batch, channels, out_width});
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
}
#endif
net.GetTensor("A")->SetIsWeight(true);
net.GetTensor("B")->SetIsWeight(true);
if (DataTypeToEnum<T>::value == DT_UINT8) {
......@@ -305,8 +314,17 @@ void MatMulTransposeBenchmark(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("A", {batch, height, channels});
net.AddRandomInput<D, T>("B", {batch, out_width, channels});
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
if (DataTypeToEnum<T>::value == DT_FLOAT16) {
net.AddRandomInput<D, float>("A", {batch, height, channels});
net.AddRandomInput<D, float16_t>("B", {batch, out_width, channels});
} else {
#endif
net.AddRandomInput<D, T>("A", {batch, height, channels});
net.AddRandomInput<D, float>("B", {batch, out_width, channels});
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
}
#endif
net.GetTensor("A")->SetIsWeight(true);
net.GetTensor("B")->SetIsWeight(true);
if (DataTypeToEnum<T>::value == DT_UINT8) {
......@@ -381,10 +399,10 @@ void MatMulTransposeBenchmark(
#ifdef MACE_ENABLE_QUANTIZE
#define MACE_BM_MATMUL_TRANPOSE(N, H, C, W) \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU); \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU)
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);
#else
#define MACE_BM_MATMUL_TRANPOSE(N, H, C, W) \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU)
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU);
#endif
MACE_BM_MATMUL_OP(1, 30000, 256, 1);
......@@ -407,6 +425,21 @@ MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 49);
MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 961);
MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 3969);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#define MACE_BM_MATMUL_TRANPOSE_FP16(N, H, C, W) \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float16_t, CPU);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 30000);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 256);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 256, 2048);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 2048, 256);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 30000);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 512);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 512, 2048);
MACE_BM_MATMUL_TRANPOSE_FP16(1, 1, 2048, 512);
#endif // MACE_ENABLE_NEON
} // namespace test
} // namespace ops
} // namespace mace
......@@ -330,6 +330,69 @@ void QuantOutputInt32(const std::vector<index_t> &batch,
}
} // namespace
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
namespace {
void FloatOutput16(const std::vector<index_t> &batch,
const index_t rows,
const index_t depth,
const index_t cols,
const bool transpose_lhs,
const bool transpose_rhs,
const bool lhs_batched = true,
const bool rhs_batched = true) {
// Construct graph
OpsTestNet net;
index_t lhs_rows = transpose_lhs ? depth : rows;
index_t lhs_cols = transpose_lhs ? rows : depth;
index_t rhs_rows = transpose_rhs ? cols : depth;
index_t rhs_cols = transpose_rhs ? depth: cols;
std::vector<index_t> lhs_shape = {lhs_rows, lhs_cols};
std::vector<index_t> rhs_shape = {rhs_rows, rhs_cols};
if (lhs_batched) {
lhs_shape.insert(lhs_shape.begin(), batch.begin(), batch.end());
}
if (rhs_batched) {
rhs_shape.insert(rhs_shape.begin(), batch.begin(), batch.end());
}
net.AddRandomInput<CPU, float>("A", lhs_shape);
net.AddRandomInput<CPU, float>("B", rhs_shape);
OpDefBuilder("MatMul", "MatMulTest")
.Input("A")
.AddIntArg("transpose_a", transpose_lhs ? 1 : 0)
.Input("B")
.AddIntArg("transpose_b", transpose_rhs ? 1 : 0)
.Output("Output")
.AddIntArg("T", DT_FLOAT)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
OpDefBuilder("Cast", "CastTest")
.Input("B")
.Output("HalveB")
.OutputType({DT_FLOAT16})
.AddIntArg("T", DT_FLOAT)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("MatMul", "Float16MatMulTest")
.Input("A")
.AddIntArg("transpose_a", transpose_lhs ? 1 : 0)
.Input("HalveB")
.AddIntArg("transpose_b", transpose_rhs ? 1 : 0)
.Output("Float16Output")
.AddIntArg("T", DT_FLOAT16)
.OutputType({DT_FLOAT})
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
ExpectTensorSimilar<float>(*net.GetOutput("Output"),
*net.GetTensor("Float16Output"), 0.01);
}
} // namespace
#endif // MACE_ENABLE_NEON
TEST_F(MatMulOpTest, QuantOutputUint8) {
QuantOutputUint8({1}, 64, 128, 32, false, false);
QuantOutputUint8({1}, 64, 32, 128, false, false);
......@@ -381,6 +444,19 @@ TEST_F(MatMulOpTest, QuantOutputInt32) {
QuantOutputInt32({2, 3}, 31, 61, 67, true, true, false, true);
}
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
TEST_F(MatMulOpTest, FloatOutput16) {
FloatOutput16({1}, 1, 512, 30745, false, true, false, false);
FloatOutput16({1}, 1, 256, 30000, false, true, false, false);
FloatOutput16({1}, 1, 256, 2048, false, true, false, false);
FloatOutput16({1}, 1, 2048, 256, false, true, false, false);
FloatOutput16({1}, 1, 512, 30000, false, true, false, false);
FloatOutput16({1}, 1, 512, 512, false, true, false, false);
FloatOutput16({1}, 1, 512, 2048, false, true, false, false);
FloatOutput16({1}, 1, 2048, 512, false, true, false, false);
}
#endif // MACE_ENABLE_NEON
} // namespace test
} // namespace ops
} // namespace mace
......@@ -162,7 +162,8 @@ class OpsTestNet {
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
if (DataTypeToEnum<T>::value == DT_HALF) {
if (DataTypeToEnum<T>::value == DT_HALF ||
DataTypeToEnum<T>::value == DT_FLOAT16) {
std::generate(
input_data, input_data + input->size(),
[&gen, &nd, positive, truncate, truncate_min, truncate_max] {
......
......@@ -13,6 +13,7 @@ enum DataType {
DT_UINT8 = 2;
DT_HALF = 3;
DT_INT32 = 4;
DT_FLOAT16 = 5;
}
enum MemoryType {
......
......@@ -314,6 +314,8 @@ class TransformerRule(Enum):
TRANSFORM_CHANNEL_SHUFFLE = 38
UPDATE_DATA_FORMAT = 39
QUANTIZE_SPECIFIC_OPS_ONLY = 40
FP16_MATMUL_WEIGHT = 41
FP16_GATHER_WEIGHT = 42
class ConverterInterface(object):
......
......@@ -106,6 +106,10 @@ class Transformer(base_converter.ConverterInterface):
self.transform_channel_shuffle,
TransformerRule.QUANTIZE_SPECIFIC_OPS_ONLY:
self.quantize_specific_ops_only,
TransformerRule.FP16_MATMUL_WEIGHT:
self.fp16_matmul_weight,
TransformerRule.FP16_GATHER_WEIGHT:
self.fp16_gather_weight,
}
self._option = option
......@@ -1853,6 +1857,102 @@ class Transformer(base_converter.ConverterInterface):
op.quantize_info[i].minval,
op.quantize_info[i].maxval))
def fp16_gather_weight(self):
for op in self._model.op:
if op.type != MaceOp.Gather.name:
continue
if op.input[0] not in self._consts:
raise KeyError("Not in const tensor: " + str(op.input[0]))
const_tensor = self._consts[op.input[0]]
if const_tensor.data_type == mace_pb2.DT_FLOAT16:
print(str(const_tensor.name) + " is already float16")
continue
print("FP16 Embedding Lookup Weights: %s" % const_tensor.name)
op_outputs = [x for x in op.output]
new_gather_name = op.name + '_fp16'
new_gather_output_name = new_gather_name + ":0"
dehalve_name = op.name
# fp16 weights
const_tensor.data_type = mace_pb2.DT_FLOAT16
# change gather
op.name = new_gather_name
op.output[:] = [new_gather_output_name]
# op.output.extend([new_gather_output_name])
data_type_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_op_data_type_str) # noqa
if data_type_arg is None:
data_type_arg = op.arg.add()
data_type_arg.name = MaceKeyword.mace_op_data_type_str
data_type_arg.i = mace_pb2.DT_FLOAT16
# add dehalve
dehalve_op = self._model.op.add()
dehalve_op.name = dehalve_name
dehalve_op.type = MaceOp.Cast.name
dehalve_op.input.extend([new_gather_output_name])
dehalve_op.output.extend(op_outputs)
dehalve_op.output_shape.extend(op.output_shape)
dehalve_op.output_type.extend([mace_pb2.DT_FLOAT])
data_type_arg = dehalve_op.arg.add()
data_type_arg.name = MaceKeyword.mace_op_data_type_str
data_type_arg.i = mace_pb2.DT_FLOAT16
def fp16_matmul_weight(self):
if self._option.device != DeviceType.CPU.value:
return
print('Convert matmul weights to fp16 for specific matmul: activation + weights') # noqa
for op in self._model.op:
if op.type != MaceOp.MatMul.name:
continue
if op.input[0] not in self._consts and op.input[1] not in self._consts: # noqa
continue
if op.input[0] in self._consts and op.input[1] in self._consts:
continue
# Matmul fp16 Op only support fp32[1,k] x fp16[w,k]T or fp16[w,k] x fp32[k,1] now! # noqa
transpose_a_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_transpose_a_str) # noqa
transpose_b_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_transpose_b_str) # noqa
transpose_a = transpose_a_arg is not None and transpose_a_arg.i == 1 # noqa
transpose_b = transpose_b_arg is not None and transpose_b_arg.i == 1 # noqa
left_tensor = op.input[0]
right_tensor = op.input[1]
left_shape = self.get_tensor_shape(left_tensor)
right_shape = self.get_tensor_shape(right_tensor)
height = left_shape[-1] if transpose_a else left_shape[-2]
width = right_shape[-2] if transpose_b else right_shape[-1]
batch = reduce(lambda x, y: x * y, left_shape[: -2], 1)
if batch != 1:
continue
if left_tensor in self._consts:
if width != 1 or transpose_a:
continue
const_tensor = self._consts[left_tensor]
else:
if height != 1 or not transpose_b:
continue
const_tensor = self._consts[right_tensor]
print('Convert Matmul Weights to fp16: %s' % op.name)
const_tensor.data_type = mace_pb2.DT_FLOAT16
data_type_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_op_data_type_str) # noqa
if data_type_arg is None:
data_type_arg = op.arg.add()
data_type_arg.name = MaceKeyword.mace_op_data_type_str
data_type_arg.i = mace_pb2.DT_FLOAT16
op.output_type.extend([mace_pb2.DT_FLOAT])
def add_opencl_informations(self):
print("Add OpenCL informations")
......@@ -2052,10 +2152,7 @@ class Transformer(base_converter.ConverterInterface):
data_type_arg = quantize_op.arg.add()
data_type_arg.name = MaceKeyword.mace_non_zero
if non_zero:
data_type_arg.i = 1
else:
data_type_arg.i = 0
data_type_arg.i = 0
find_range_arg = quantize_op.arg.add()
find_range_arg.name = \
......@@ -2064,8 +2161,6 @@ class Transformer(base_converter.ConverterInterface):
quantized_inputs_names[-1] = quantize_output_name
non_zero = False
del op.input[:]
op.input.extend(quantized_inputs_names)
......
......@@ -117,6 +117,9 @@ class TensorInfo:
elif tensor.data_type == mace_pb2.DT_UINT8:
self.data = bytearray(
np.array(tensor.int32_data).astype(np.uint8).tolist())
elif tensor.data_type == mace_pb2.DT_FLOAT16:
self.data = bytearray(
np.array(tensor.float_data).astype(np.float16).tobytes())
else:
raise Exception('Tensor data type %s not supported' %
tensor.data_type)
......@@ -139,7 +142,8 @@ def update_tensor_infos(net_def, data_type):
offset += padding
if tensor.data_type == mace_pb2.DT_FLOAT \
or tensor.data_type == mace_pb2.DT_HALF:
or tensor.data_type == mace_pb2.DT_HALF \
or tensor.data_type == mace_pb2.DT_FLOAT16:
tensor.data_size = len(tensor.float_data)
elif tensor.data_type == mace_pb2.DT_INT32:
tensor.data_size = len(tensor.int32_data)
......@@ -178,7 +182,8 @@ def save_model_data(net_def, model_tag, output_dir):
def save_model_to_proto(net_def, model_tag, output_dir):
for tensor in net_def.tensors:
if tensor.data_type == mace_pb2.DT_FLOAT \
or tensor.data_type == mace_pb2.DT_HALF:
or tensor.data_type == mace_pb2.DT_HALF \
or tensor.data_type == mace_pb2.DT_FLOAT16:
del tensor.float_data[:]
elif tensor.data_type == mace_pb2.DT_INT32:
del tensor.int32_data[:]
......