提交 ccd6b9a4 编写于 作者: Z zhanyuan

Add fp32 & int8 ops of Matmul(Batchmatmul)

上级 201bcdd9
......@@ -33,29 +33,30 @@ int MatMul::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tensor
auto output = outputs_.front();
MS_ASSERT(output != nullptr);
std::vector<int> x_shape = input0->shape();
std::vector<int> w_shape = input1->shape();
if (x_shape.size() < 2 || w_shape.size() < 2) {
std::vector<int> a_shape = input0->shape();
std::vector<int> b_shape = input1->shape();
if (a_shape.size() < 3 || b_shape.size() < 3) {
MS_LOG(ERROR) << "inputs shape is invalid";
return RET_INPUT_TENSOR_ERROR;
}
for (int i = 0; i < a_shape.size() - 2; ++i) {
if (a_shape[i] != b_shape[i]) {
MS_LOG(ERROR) << "Op MatMul's dimensions must be equal";
return RET_INPUT_TENSOR_ERROR;
}
}
auto matmul_prim = this->primitive->value_as_MatMul();
if (matmul_prim->transposeA()) {
int tmp = x_shape.back();
x_shape[x_shape.size() - 1] = x_shape[x_shape.size() - 2];
x_shape[x_shape.size() - 2] = tmp;
std::swap(a_shape[a_shape.size() - 1], a_shape[a_shape.size() - 2]);
}
if (matmul_prim->transposeB()) {
int tmp = w_shape.back();
w_shape[w_shape.size() - 1] = w_shape[w_shape.size() - 2];
w_shape[w_shape.size() - 2] = tmp;
std::swap(b_shape[b_shape.size() - 1], b_shape[b_shape.size() - 2]);
}
auto y_shape_size = std::max(x_shape.size(), w_shape.size());
std::vector<int> y_shape(y_shape_size);
y_shape = x_shape;
y_shape[y_shape_size - 1] = w_shape[w_shape.size() - 1];
output->set_shape(y_shape);
std::vector<int> c_shape(a_shape);
c_shape[c_shape.size() - 1] = b_shape[b_shape.size() - 1];
output->set_shape(c_shape);
output->set_data_type(input0->data_type());
output->SetFormat(input0->GetFormat());
......
......@@ -139,6 +139,8 @@ Primitive *Primitive::CreatePrimitive(schema::Primitive *primitive) {
return new lite::SpaceToBatch(const_cast<schema::Primitive *>(primitive));
case schema::PrimitiveType_QuantDTypeCast:
return new lite::QuantDTypeCast(const_cast<schema::Primitive *>(primitive));
case schema::PrimitiveType_MatMul:
return new lite::MatMul(const_cast<schema::Primitive *>(primitive));
default:
break;
}
......
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/base/matmul_base.h"
#include "src/runtime/kernel/arm/fp32/matmul.h"
#include "src/runtime/kernel/arm/int8/matmul_int8.h"
#include "src/kernel_factory.h"
#include "include/errorcode.h"
#include "include/context.h"
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_MatMul;
namespace mindspore::kernel {
/**
 * Creates and initialises the CPU MatMul kernel that matches the data type of the first input tensor.
 *
 * Int8/UInt8 inputs get a MatmulInt8CPUKernel, Float32 inputs get a MatmulCPUKernel.
 *
 * @param inputs       input tensors; the tensor at kInputIndex selects the kernel flavour
 * @param outputs      output tensors forwarded to the kernel
 * @param opParameter  operator parameter block forwarded to the kernel
 * @param ctx          runtime context forwarded to the kernel
 * @param desc         kernel key; only inspected by the debug assertion
 * @return the initialised kernel, or nullptr when the data type is unsupported, allocation fails,
 *         or the kernel's Init() does not return RET_OK
 */
kernel::LiteKernel *CpuMatmulKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                           const std::vector<lite::tensor::Tensor *> &outputs,
                                           OpParameter *opParameter, const lite::Context *ctx,
                                           const kernel::KernelKey &desc) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_MatMul);
  auto input_tensor = inputs.at(kInputIndex);
  auto data_type = input_tensor->data_type();
  kernel::LiteKernel *kernel = nullptr;
  switch (data_type) {
    case kNumberTypeInt8:
    case kNumberTypeUInt8:
      kernel = new (std::nothrow) MatmulInt8CPUKernel(opParameter, inputs, outputs, ctx);
      break;
    case kNumberTypeFloat32:
      kernel = new (std::nothrow) MatmulCPUKernel(opParameter, inputs, outputs, ctx);
      break;
    default:
      // Bail out here: falling through would call Init() on a null kernel.
      MS_LOG(ERROR) << "Unsupported data type for MatMul: " << static_cast<int>(data_type);
      return nullptr;
  }
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel is nullptr.";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    delete kernel;
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    return nullptr;
  }
  return kernel;
}
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_MatMul, CpuMatmulKernelCreator)
} // namespace mindspore::kernel
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATMUL_BASE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATMUL_BASE_H_
#include <vector>
#include "src/lite_kernel.h"
#include "include/context.h"
#include "src/runtime/kernel/arm/opclib/matmul.h"
using mindspore::lite::Context;
namespace mindspore::kernel {
// Common base for the CPU MatMul kernels: keeps the typed parameter block, the runtime context
// and the threading configuration that the fp32 and int8 implementations share.
class MatmulBaseCPUKernel : public LiteKernel {
 public:
  // `ctx` must be non-null: its threadNum seeds thread_count_. The pointer is stored, not owned.
  MatmulBaseCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
      : LiteKernel(parameter, inputs, outputs), thread_count_(ctx->threadNum), ctx_(ctx) {
    params_ = reinterpret_cast<MatMulParameter *>(opParameter);
  }
  ~MatmulBaseCPUKernel() = default;

  // The base implementations do nothing and report success (0); subclasses override them.
  int Init() override { return 0; }
  int ReSize() override { return 0; }
  int Run() override { return 0; }

 protected:
  // Members are listed in initialisation order; every one has a defined value after construction.
  MatMulParameter *params_ = nullptr;  // operator parameter viewed as MatMulParameter
  int thread_count_ = 1;               // number of parallel tasks to launch
  int thread_stride_ = 0;              // per-task share of work, filled in by the subclasses' Init()
  const Context *ctx_ = nullptr;       // runtime context (non-owning)
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATMUL_BASE_H_
......@@ -15,44 +15,102 @@
*/
#include "src/runtime/kernel/arm/fp32/matmul.h"
#include <vector>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "src/runtime/kernel/arm/opclib/fp32/matmul.h"
#include "src/runtime/runtime_api.h"
#include "include/errorcode.h"
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_MatMul;
namespace mindspore::kernel {
// Hands the three scratch buffers (packed A, packed B, packed C) back to the context allocator.
MatmulCPUKernel::~MatmulCPUKernel() {
  float *const scratch_buffers[] = {a_c8_ptr_, b_r8_ptr_, c_r8x8_ptr_};
  for (float *buffer : scratch_buffers) {
    ctx_->allocator->Free(buffer);
  }
}
// Nothing is recomputed on resize; always reports success.
int MatmulCPUKernel::ReSize() {
  return RET_OK;
}
int MatmulCPUKernel::Run() { return RET_OK; }
int MatmulCPUKernel::Init() {
int batch = 1;
auto x_shape = inputs_[0]->shape();
auto o_shape = outputs_[0]->shape();
for (int i = 0; i < x_shape.size() - 2; ++i) {
batch *= x_shape[i];
}
params_->batch = batch;
params_->row_ = o_shape[o_shape.size() - 2];
params_->col_ = o_shape[o_shape.size() - 1];
params_->deep_ = params_->a_transpose_ ? x_shape[x_shape.size() - 2] : x_shape[x_shape.size() - 1];
params_->row_8_ = UP_ROUND(params_->row_, 8);
params_->col_8_ = UP_ROUND(params_->col_, 8);
thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_8_, 8));
thread_stride_ = UP_DIV(UP_DIV(params_->col_8_, 8), thread_count_);
int MatmulCPUKernel::Init() { return RET_OK; }
a_c8_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->row_8_ * params_->deep_ * sizeof(float)));
if (!a_c8_ptr_) {
return RET_MEMORY_FAILED;
}
memset(a_c8_ptr_, 0, params_->row_8_ * params_->deep_ * sizeof(float));
b_r8_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->col_8_ * params_->deep_ * sizeof(float)));
if (!b_r8_ptr_) {
return RET_MEMORY_FAILED;
}
memset(b_r8_ptr_, 0, params_->col_8_ * params_->deep_ * sizeof(float));
c_r8x8_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->row_8_ * params_->col_8_ * sizeof(float)));
if (!c_r8x8_ptr_) {
return RET_MEMORY_FAILED;
}
memset(c_r8x8_ptr_, 0, params_->row_8_ * params_->col_8_ * sizeof(float));
return RET_OK;
}
kernel::LiteKernel *CpuMatmulFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs,
OpParameter *opParameter, const lite::Context *ctx,
const kernel::KernelKey &desc) {
MS_ASSERT(desc.type == schema::PrimitiveType_MatMul);
auto *kernel = new (std::nothrow) MatmulCPUKernel(opParameter, inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "new MatmulCPUKernel fail!";
return nullptr;
/**
 * Computes the slice of output columns owned by one parallel task.
 *
 * Work is split in groups of 8 output columns; each task handles up to thread_stride_ groups,
 * starting at group task_id * thread_stride_. Tasks that fall past the last group do nothing.
 *
 * @param task_id  index of the parallel task
 * @return RET_OK
 */
int MatmulCPUKernel::RunImpl(int task_id) {
  int cur_oc = MSMIN(thread_stride_, UP_DIV(params_->col_8_, C8NUM) - task_id * thread_stride_);
  if (cur_oc <= 0) {
    return RET_OK;
  }
  // Offsets into the packed B and packed C buffers for this task's first column group.
  auto cur_b = b_r8_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_;
  auto cur_c = c_r8x8_ptr_ + task_id * thread_stride_ * C8NUM * params_->row_8_;
  // No bias and no activation are applied here.
  MatMul(a_c8_ptr_, cur_b, cur_c, nullptr, ActType_No, params_->deep_, params_->row_8_, cur_oc * C8NUM);
  return RET_OK;
}
/**
 * Parallel-launch entry point: forwards one task to MatmulCPUKernel::RunImpl.
 *
 * @param task_id  index of the parallel task
 * @param penv     parallel group environment (unused)
 * @param cdata    the MatmulCPUKernel instance passed to the launcher
 * @return RET_OK on success, RET_ERROR when RunImpl reports a failure
 */
int MatmulFloatRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
  auto op = reinterpret_cast<MatmulCPUKernel *>(cdata);
  auto error_code = op->RunImpl(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_MatMul, CpuMatmulFp32KernelCreator)
int MatmulCPUKernel::Run() {
auto a_ptr = reinterpret_cast<float *>(inputs_[0]->Data());
auto b_ptr = reinterpret_cast<float *>(inputs_[1]->Data());
auto c_ptr = reinterpret_cast<float *>(outputs_[0]->Data());
auto a_stride = params_->row_ * params_->deep_;
auto b_stride = params_->deep_ * params_->col_;
auto c_stride = params_->row_ * params_->col_;
for (int i = 0; i < params_->batch; ++i) {
auto cur_a_ptr = a_ptr + i * a_stride;
auto cur_b_ptr = b_ptr + i * b_stride;
auto cur_c_ptr = c_ptr + i * c_stride;
if (params_->a_transpose_) {
RowMajor2Row8Major(cur_a_ptr, a_c8_ptr_, params_->deep_, params_->row_);
} else {
RowMajor2Col8Major(cur_a_ptr, a_c8_ptr_, params_->row_, params_->deep_);
}
if (params_->b_transpose_) {
RowMajor2Col8Major(cur_b_ptr, b_r8_ptr_, params_->col_, params_->deep_);
} else {
RowMajor2Row8Major(cur_b_ptr, b_r8_ptr_, params_->deep_, params_->col_);
}
LiteBackendParallelLaunch(MatmulFloatRun, this, thread_count_);
Row8x8Major2RowMajor(c_r8x8_ptr_, cur_c_ptr, params_->row_, params_->col_);
}
return RET_OK;
}
} // namespace mindspore::kernel
......@@ -19,27 +19,26 @@
#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/opclib/matmul.h"
#include "src/runtime/kernel/arm/base/matmul_base.h"
namespace mindspore::kernel {
class MatmulCPUKernel : public LiteKernel {
class MatmulCPUKernel : public MatmulBaseCPUKernel {
public:
explicit MatmulCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs)
: LiteKernel(parameter, inputs, outputs) {
matmul_param_ = reinterpret_cast<MatMulParameter *>(parameter);
}
~MatmulCPUKernel() override = default;
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
: MatmulBaseCPUKernel(parameter, inputs, outputs, ctx) {}
~MatmulCPUKernel() override;
int Init() override;
int ReSize() override;
int Run() override;
int RunImpl(int task_id);
private:
MatMulParameter *matmul_param_;
float *a_c8_ptr_;
float *b_r8_ptr_;
float *c_r8x8_ptr_;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_H_
......@@ -42,7 +42,7 @@ class FullconnectionInt8CPUKernel : public FullconnectionBaseCPUKernel {
int RunImpl(int task_id);
private:
FcQuantArg quant_params_;
MatmulQuantArg quant_params_;
int8_t *a_c8_ptr_;
int8_t *b_r8_ptr_;
int *c_r8x8_ptr_;
......
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/int8/matmul_int8.h"
#include "src/runtime/kernel/arm/opclib/int8/matmul.h"
#include "src/runtime/kernel/arm/opclib/common_func.h"
#include "src/runtime/runtime_api.h"
#include "include/errorcode.h"
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;
namespace mindspore::kernel {
// Hands the three scratch buffers (packed A, packed B, int32 accumulator) back to the context allocator.
MatmulInt8CPUKernel::~MatmulInt8CPUKernel() {
  auto *allocator_view = &ctx_->allocator;
  (*allocator_view)->Free(a_c8_ptr_);
  (*allocator_view)->Free(b_r8_ptr_);
  (*allocator_view)->Free(c_r8x8_ptr_);
}
int MatmulInt8CPUKernel::Init() {
int batch = 1;
auto x_shape = inputs_[0]->shape();
auto o_shape = outputs_[0]->shape();
for (int i = 0; i < x_shape.size() - 2; ++i) {
batch *= x_shape[i];
}
params_->batch = batch;
params_->row_ = o_shape[o_shape.size() - 2];
params_->col_ = o_shape[o_shape.size() - 1];
params_->deep_ = params_->a_transpose_ ? x_shape[x_shape.size() - 2] : x_shape[x_shape.size() - 1];
params_->row_8_ = UP_ROUND(params_->row_, 8);
params_->col_8_ = UP_ROUND(params_->col_, 8);
thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_8_, 8));
thread_stride_ = UP_DIV(UP_DIV(params_->col_8_, 8), thread_count_);
a_c8_ptr_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(params_->row_8_ * params_->deep_ * sizeof(int8_t)));
if (!a_c8_ptr_) {
return RET_MEMORY_FAILED;
}
memset(a_c8_ptr_, 0, params_->row_8_ * params_->deep_ * sizeof(int8_t));
b_r8_ptr_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(params_->col_8_ * params_->deep_ * sizeof(int8_t)));
if (!b_r8_ptr_) {
return RET_MEMORY_FAILED;
}
memset(b_r8_ptr_, 0, params_->col_8_ * params_->deep_ * sizeof(int8_t));
c_r8x8_ptr_ = reinterpret_cast<int *>(ctx_->allocator->Malloc(params_->row_8_ * params_->col_8_ * sizeof(int)));
if (!c_r8x8_ptr_) {
return RET_MEMORY_FAILED;
}
memset(c_r8x8_ptr_, 0, params_->row_8_ * params_->col_8_ * sizeof(int));
auto input_tensor = inputs_[0];
auto params = input_tensor->GetQuantParams();
MS_ASSERT(params.size() == 1);
quant_params_.input.zp_ = params.front().zeroPoint;
quant_params_.input.scale_ = params.front().scale;
auto weight_tensor = inputs_[1];
params = weight_tensor->GetQuantParams();
MS_ASSERT(params.size() == 1);
quant_params_.weight.zp_ = params.front().zeroPoint;
quant_params_.weight.scale_ = params.front().scale;
auto output_tensor = outputs_[0];
params = output_tensor->GetQuantParams();
MS_ASSERT(params.size() == 1);
quant_params_.output.zp_ = params.front().zeroPoint;
quant_params_.output.scale_ = params.front().scale;
double real_multiplier = quant_params_.input.scale_ * quant_params_.weight.scale_ / quant_params_.output.scale_;
QuantizeRoundParameter(real_multiplier, &quant_params_.quant_multiplier, &quant_params_.left_shift,
&quant_params_.right_shift);
return RET_OK;
}
// Nothing is recomputed on resize; always reports success.
int MatmulInt8CPUKernel::ReSize() {
  return RET_OK;
}
/**
 * Computes the slice of output columns owned by one parallel task.
 *
 * Work is split in groups of 8 output columns; each task handles up to thread_stride_ groups,
 * starting at group task_id * thread_stride_. Tasks that fall past the last group do nothing.
 *
 * @param task_id  index of the parallel task
 * @return RET_OK
 */
int MatmulInt8CPUKernel::RunImpl(int task_id) {
  const int first_group = task_id * thread_stride_;
  const int group_count = MSMIN(thread_stride_, UP_DIV(params_->col_8_, 8) - first_group);
  if (group_count > 0) {
    // Offsets into the packed B and accumulator buffers for this task's first column group.
    int8_t *task_b = b_r8_ptr_ + first_group * C8NUM * params_->deep_;
    int *task_c = c_r8x8_ptr_ + first_group * C8NUM * params_->row_8_;
    MatMulInt8(a_c8_ptr_, task_b, task_c, params_->row_8_, group_count * 8, params_->deep_,
               quant_params_.input.zp_, quant_params_.weight.zp_);
  }
  return RET_OK;
}
int MatmulInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
auto op = reinterpret_cast<MatmulInt8CPUKernel *>(cdata);
auto ret = op->RunImpl(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "MatmulInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
return ret;
}
return RET_OK;
}
int MatmulInt8CPUKernel::Run() {
auto a_ptr = reinterpret_cast<int8_t *>(inputs_[0]->Data());
auto b_ptr = reinterpret_cast<int8_t *>(inputs_[1]->Data());
auto c_ptr = reinterpret_cast<int8_t *>(outputs_[0]->Data());
auto a_stride = params_->row_ * params_->deep_;
auto b_stride = params_->deep_ * params_->col_;
auto c_stride = params_->row_ * params_->col_;
for (int i = 0; i < params_->batch; ++i) {
auto cur_a_ptr = a_ptr + i * a_stride;
auto cur_b_ptr = b_ptr + i * b_stride;
auto cur_c_ptr = c_ptr + i * c_stride;
if (params_->a_transpose_) {
RowMajor2Row8MajorInt8(cur_a_ptr, a_c8_ptr_, params_->deep_, params_->row_);
} else {
RowMajor2Col8MajorInt8(cur_a_ptr, a_c8_ptr_, params_->row_, params_->deep_);
}
if (params_->b_transpose_) {
RowMajor2Col8MajorInt8(cur_b_ptr, b_r8_ptr_, params_->col_, params_->deep_);
} else {
RowMajor2Row8MajorInt8(cur_b_ptr, b_r8_ptr_, params_->deep_, params_->col_);
}
LiteBackendParallelLaunch(MatmulInt8Run, this, thread_count_);
auto &q = quant_params_;
SimplePostFuncInt8(c_r8x8_ptr_, cur_c_ptr, params_->col_, params_->row_, params_->row_8_, q.quant_multiplier,
q.left_shift, q.right_shift, q.output.zp_);
}
return RET_OK;
}
} // namespace mindspore::kernel
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_INT8_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_INT8_H_
#include <vector>
#include "include/context.h"
#include "src/runtime/kernel/arm/opclib/quantization/quantize.h"
#include "src/runtime/kernel/arm/base/matmul_base.h"
using mindspore::lite::Context;
namespace mindspore::kernel {
// int8 MatMul / BatchMatMul CPU kernel built on MatmulBaseCPUKernel.
class MatmulInt8CPUKernel : public MatmulBaseCPUKernel {
 public:
  MatmulInt8CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
      : MatmulBaseCPUKernel(parameter, inputs, outputs, ctx) {}
  ~MatmulInt8CPUKernel() override;

  int Init() override;
  int ReSize() override;
  int Run() override;
  // Computes the output-column slice that belongs to parallel task `task_id`.
  int RunImpl(int task_id);

 private:
  // Quantisation parameters of input, weight and output plus the derived output multiplier;
  // value-initialised here and filled in by Init().
  MatmulQuantArg quant_params_{};
  // Scratch buffers allocated in Init() and released in the destructor. They start as nullptr so
  // the destructor never passes an indeterminate pointer to the allocator when Init() was not
  // called or failed part-way.
  int8_t *a_c8_ptr_ = nullptr;  // packed copy of matrix A
  int8_t *b_r8_ptr_ = nullptr;  // packed copy of matrix B
  int *c_r8x8_ptr_ = nullptr;   // packed int32 accumulator
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_INT8_H_
......@@ -236,3 +236,20 @@ void PostFuncInt8(const int *in, const int *bias, int8_t *out, int oc, int plane
}
return;
}
/*
 * Requantises a packed int32 accumulator into a row-major int8 matrix.
 *
 * The input is laid out in blocks of 8 columns: element (r, c) lives at
 * (c / 8) * plane8 * 8 + r * 8 + (c % 8). Every value is scaled with
 * MultiplyByQuantizedMultiplier, offset by zp, saturated to the int8 range and stored at
 * out[r * oc + c].
 *
 * in          packed int32 accumulator
 * out         row-major int8 destination holding plane * oc elements
 * oc          number of columns
 * plane       number of rows
 * plane8      row count of the packed input (used as the block stride)
 * multiplier, left_shift, right_shift   fixed-point scale passed to MultiplyByQuantizedMultiplier
 * zp          output zero point
 */
void SimplePostFuncInt8(const int *in, int8_t *out, int oc, int plane, int plane8, int32_t multiplier,
                        int32_t left_shift, int32_t right_shift, int32_t zp) {
  /* (int32_t)row8x8-major * multiplier => (int8_t)row-major */
  for (int r = 0; r < plane; r++) {
    for (int c = 0; c < oc; c++) {
      int c8div = c / 8, c8mod = c % 8;
      int src_index = c8div * plane8 * 8 + r * 8 + c8mod;
      int dst_index = r * oc + c;
      int32_t value = in[src_index];
      value = MultiplyByQuantizedMultiplier(value, multiplier, left_shift, right_shift) + zp;
      /* Saturate with the signed-char limits: CHAR_MIN/CHAR_MAX are 0/255 on targets where plain
       * char is unsigned, which would not describe the int8_t range. */
      value = MSMIN(SCHAR_MAX, value);
      value = MSMAX(SCHAR_MIN, value);
      out[dst_index] = (int8_t)value;
    }
  }
}
......@@ -33,6 +33,8 @@ void ReluFp32(float *data, int ele_num);
void Relu6Fp32(float *data, int ele_num);
void PostFuncInt8(const int *in, const int *bias, int8_t *out, int oc, int plane, int plane8, int32_t multiplier,
int32_t left_shift, int32_t right_shift, int32_t zp, int8_t mini, int8_t maxi);
void SimplePostFuncInt8(const int *in, int8_t *out, int oc, int plane, int plane8, int32_t multiplier,
int32_t left_shift, int32_t right_shift, int32_t zp);
void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight, const float *bias, size_t step,
size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
size_t relu6);
......
......@@ -65,9 +65,7 @@ void MatMul8x8(const float *a, const float *b, float *c, const float *bias, ActT
size_t bi = c8div * deep * 8 + d * 8 + c8mod;
value = value + a[ai] * b[bi];
}
if (bias != nullptr) {
value += bias[col];
}
if (bias != nullptr) value += bias[col];
if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
if (act_type != ActType_No) value = MSMAX(0.0f, value);
c[ci] = value;
......
......@@ -18,6 +18,17 @@
#include <limits.h>
#include "src/runtime/kernel/arm/opclib/quantization/fixed_point.h"
/*
 * Repacks a row-major row x col int8 matrix into blocks of 8 columns.
 *
 * Element (r, c) is written to dst_ptr[(c / 8) * 8 * row + r * 8 + (c % 8)]. Destination slots
 * that correspond to columns >= col inside the last block are left untouched, so dst_ptr must
 * provide room for ceil(col / 8) * 8 * row elements.
 */
void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
  const int block_stride = 8 * row;
  for (int r = 0; r < row; ++r) {
    const int8_t *line = src_ptr + r * col;
    /* Walk the line one 8-column block at a time; the last block may be partial. */
    for (int block_start = 0; block_start < col; block_start += 8) {
      const int lanes = (col - block_start < 8) ? (col - block_start) : 8;
      const int dst_base = (block_start / 8) * block_stride + r * 8;
      for (int lane = 0; lane < lanes; ++lane) {
        dst_ptr[dst_base + lane] = line[block_start + lane];
      }
    }
  }
}
void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
for (int r = 0; r < row; r++) {
int rd8 = r / 8;
......@@ -26,7 +37,6 @@ void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col)
dst_ptr[rd8 * col * 8 + c * 8 + rm8] = src_ptr[r * col + c];
}
}
return;
}
void MatMulInt8(const int8_t *a, const int8_t *b, int32_t *c, const int row8, const int col8, const int deep,
......@@ -46,5 +56,4 @@ void MatMulInt8(const int8_t *a, const int8_t *b, int32_t *c, const int row8, co
c[ci] = value;
}
}
return;
}
......@@ -22,7 +22,7 @@
void MatMulInt8(const int8_t *a, const int8_t *b, int32_t *c, const int row8, const int col8, const int deep,
const int32_t a_zp, const int32_t b_zp);
void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
#endif // MINDSPORE_LITE_SRC_BACKEND_ARM_OPCLIB_INT8_MATMUL_H_
......@@ -29,6 +29,7 @@ struct MatMulParameter {
int col_8_;
int deep_;
bool has_bias_;
int batch;
bool a_transpose_; /* false : row-major */
bool b_transpose_; /* true : col-major */
ActType act_type_;
......
......@@ -22,6 +22,7 @@
#include <stdlib.h>
#include <limits.h>
#include <limits>
#include "src/runtime/kernel/arm/opclib/op_base.h"
struct QuantArg {
double scale_;
......@@ -49,7 +50,7 @@ struct ConcatQuantArg {
QuantArg out_quant_args_;
};
struct FcQuantArg {
struct MatmulQuantArg {
QuantArg input;
QuantArg weight;
QuantArg output;
......@@ -130,4 +131,22 @@ inline void CalculateActivationRangeQuantized(bool is_relu, bool is_relu6, int32
*mini = min;
*maxi = max;
}
// Quantizes `length` floats to int8: q = round(x / scale + zero_point), saturated to
// [SCHAR_MIN, SCHAR_MAX].
// The value is clamped BEFORE it is narrowed to int8_t; clamping afterwards cannot work because
// the narrowing conversion has already wrapped out-of-range values. The signed-char limits are
// used because CHAR_MIN/CHAR_MAX are 0/255 on targets where plain char is unsigned.
inline void Quantize(float *input_data, int length, float scale, int zero_point, int8_t *output_data) {
  for (int i = 0; i < length; ++i) {
    double r = round(input_data[i] / scale + zero_point);
    if (r > SCHAR_MAX) {
      r = SCHAR_MAX;
    } else if (r < SCHAR_MIN) {
      r = SCHAR_MIN;
    }
    output_data[i] = (int8_t)r;
  }
}
// Dequantizes `length` int8 values to float: x = scale * (q - zero_point).
inline void Dequantize(int8_t *input_data, int length, float scale, int zero_point, float *output_data) {
  const int8_t *src = input_data;
  float *dst = output_data;
  for (int remaining = length; remaining > 0; --remaining) {
    const int centered = *src - zero_point;
    *dst = scale * centered;
    ++src;
    ++dst;
  }
}
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_QUANTIZATION_QUANTIZE_H_
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include "mindspore/core/utils/log_adapter.h"
#include "common/common_test.h"
#include "mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h"
#include "src/kernel_registry.h"
#include "src/lite_kernel.h"
namespace mindspore {
// Test fixture for the fp32 MatMul kernel; adds nothing on top of the common test base.
class TestMatMulFp32 : public mindspore::Common {
 public:
  TestMatMulFp32() = default;
};
// Builds the tensors for one fp32 MatMul test case.
// Two input tensors (shapes a_shape / b_shape, filled from a_ptr / b_ptr) are appended to
// *inputs_ and one output tensor (shape c_shape, data allocated but not filled) is appended to
// *outputs_. The tensors are heap-allocated and handed to the caller through the vectors.
// Returns the element count of the output tensor.
int MMTestInit(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
               float *a_ptr, float *b_ptr, std::vector<int> a_shape, std::vector<int> b_shape,
               std::vector<int> c_shape) {
  // Creates a tensor of the given shape with its data buffer already allocated.
  auto make_tensor = [](const std::vector<int> &shape) {
    auto *tensor =
      new lite::tensor::Tensor(kNumberTypeFloat, shape, schema::Format_NHWC, static_cast<schema::NodeType>(1));
    tensor->MallocData();
    return tensor;
  };

  auto *lhs = make_tensor(a_shape);
  memcpy(lhs->Data(), a_ptr, sizeof(float) * lhs->ElementsNum());
  inputs_->push_back(lhs);

  auto *rhs = make_tensor(b_shape);
  memcpy(rhs->Data(), b_ptr, sizeof(float) * rhs->ElementsNum());
  inputs_->push_back(rhs);

  auto *result = make_tensor(c_shape);
  outputs_->push_back(result);
  return result->ElementsNum();
}
// Single batch, no transposes: A is {1, 2, 8}, B is {1, 8, 3}, expected C is {1, 2, 3}.
TEST_F(TestMatMulFp32, simple) {
  std::vector<lite::tensor::Tensor *> inputs_;
  std::vector<lite::tensor::Tensor *> outputs_;
  auto matmul_param = new MatMulParameter();
  matmul_param->a_transpose_ = false;
  matmul_param->b_transpose_ = false;
  matmul_param->has_bias_ = false;
  float a[] = {-3.2366564, -4.7733846, -7.8329225, 16.146885, 5.060793,  -6.1471,  -1.7680453, -6.5721383,
               17.87506,   -5.1192183, 10.742863,  1.4536934, 19.693445, 19.45783, 5.063163,   0.5234792};
  float b[] = {-0.0024438887, 0.0006738146, -0.008169129, 0.0021510671,   -0.012470592,   -0.0053063435,
               0.006050155,   0.008656233,  0.012911413,  -0.0028635843,  -0.00034080597, -0.0010622552,
               -0.012254699,  -0.01312836,  0.0025241964, -0.004706142,   0.002451482,    -0.009558459,
               0.004481974,   0.0033251503, -0.011705584, -0.001720293,   -0.0039410214,  -0.0073637343};
  std::vector<int> a_shape = {1, 2, 8};
  std::vector<int> b_shape = {1, 8, 3};
  std::vector<int> c_shape = {1, 2, 3};
  int total_size = MMTestInit(&inputs_, &outputs_, a, b, a_shape, b_shape, c_shape);
  auto ctx = new lite::Context;
  ctx->threadNum = 2;
  auto mm = new kernel::MatmulCPUKernel(reinterpret_cast<OpParameter *>(matmul_param), inputs_, outputs_, ctx);
  mm->Init();
  mm->Run();
  float correct[] = {-0.1256939023733139, -0.07744802534580231,  0.07410638779401779,
                     -0.3049793541431427, -0.027687929570674896, -0.18109679222106934};
  CompareOutputData(reinterpret_cast<float *>(outputs_[0]->Data()), correct, total_size, 0.0001);
  delete matmul_param;
  delete mm;
  // The kernel destructor frees its buffers through ctx, so the context is released after it.
  delete ctx;
  for (auto t : inputs_) delete t;
  for (auto t : outputs_) delete t;
}
// Single batch with B transposed: A is {1, 2, 8}, B is stored as {1, 3, 8}, expected C is {1, 2, 3}.
TEST_F(TestMatMulFp32, simple_transb) {
  std::vector<lite::tensor::Tensor *> inputs_;
  std::vector<lite::tensor::Tensor *> outputs_;
  auto matmul_param = new MatMulParameter();
  matmul_param->a_transpose_ = false;
  matmul_param->b_transpose_ = true;
  matmul_param->has_bias_ = false;
  float a[] = {-3.2366564, -4.7733846, -7.8329225, 16.146885, 5.060793,  -6.1471,  -1.7680453, -6.5721383,
               17.87506,   -5.1192183, 10.742863,  1.4536934, 19.693445, 19.45783, 5.063163,   0.5234792};
  float b[] = {-0.0024438887, 0.0006738146, -0.008169129, 0.0021510671,   -0.012470592,   -0.0053063435,
               0.006050155,   0.008656233,  0.012911413,  -0.0028635843,  -0.00034080597, -0.0010622552,
               -0.012254699,  -0.01312836,  0.0025241964, -0.004706142,   0.002451482,    -0.009558459,
               0.004481974,   0.0033251503, -0.011705584, -0.001720293,   -0.0039410214,  -0.0073637343};
  std::vector<int> a_shape = {1, 2, 8};
  std::vector<int> b_shape = {1, 3, 8};
  std::vector<int> c_shape = {1, 2, 3};
  int total_size = MMTestInit(&inputs_, &outputs_, a, b, a_shape, b_shape, c_shape);
  auto ctx = new lite::Context;
  ctx->threadNum = 2;
  auto mm = new kernel::MatmulCPUKernel(reinterpret_cast<OpParameter *>(matmul_param), inputs_, outputs_, ctx);
  mm->Init();
  mm->Run();
  float correct[] = {0.00533547, 0.002545945, 0.062974121, -0.445441471, -0.246223617, -0.142070031};
  CompareOutputData(reinterpret_cast<float *>(outputs_[0]->Data()), correct, total_size, 0.0001);
  delete matmul_param;
  delete mm;
  // The kernel destructor frees its buffers through ctx, so the context is released after it.
  delete ctx;
  for (auto t : inputs_) delete t;
  for (auto t : outputs_) delete t;
}
// Three batches with B transposed: A is {3, 2, 8}, B is stored as {3, 3, 8}, expected C is {3, 2, 3}.
TEST_F(TestMatMulFp32, batch) {
  std::vector<lite::tensor::Tensor *> inputs_;
  std::vector<lite::tensor::Tensor *> outputs_;
  auto matmul_param = new MatMulParameter();
  matmul_param->a_transpose_ = false;
  matmul_param->b_transpose_ = true;
  matmul_param->has_bias_ = false;
  float a[] = {-4.946672525326248,  11.154420027909701,  -7.831129637356922,  17.309845099949953,  -10.46177877610444,
               2.5412751480833897,  2.700113860276929,   -12.616715572097341, -15.513316568881574, -9.513294738065516,
               17.931148376418896,  -10.83801964632579,  -14.023733862948017, -14.50805001403956,  0.7952221556310306,
               6.619720423569035,   -19.277904230909357, -13.450479287024839, 19.914652156692625,  16.542571697048878,
               -2.9715041389268926, 4.949555349889412,   -1.9408110276290103, -15.062828261031868, 0.20012569643335,
               8.260383531209776,   3.1092344458607357,  16.742272486091487,  17.31277252415167,   -16.60303202099434,
               -8.980314693173042,  -11.735087989358268, -14.918976184088514, -11.347592686892733, 11.808756029220604,
               -18.76179414554809,  7.579758962360987,   3.13240880962163,    6.528181981442103,   -16.802624652419794,
               -14.323146919914901, -16.197579076296144, 9.738053920125779,   -12.245780062949866, 8.817905278096319,
               0.5261391331275007,  -18.26152522535471,  -2.400461208771226};
  float b[] = {
    -0.895183867395529,  -0.8146900207660068,  -0.27931593219652817,  0.783554361201179,    -0.05080215007779798,
    -0.9879631271568501, 0.07710949009001333,  -0.9562579726211344,   0.29505553318356825,  -0.26651960351085124,
    -0.12755456259718279, -0.8221417897250098, -0.5094334041431876,   -0.9117373380256013,  0.991501784215064,
    0.20131976450979394, 0.07889260559412059,  -0.8138407752750305,   -0.047622075866657454, -0.2778043115153188,
    -0.6269973420163957, -0.44345812666611617, -0.8571568605933642,   0.020192166011526735, 0.4860054298402434,
    0.41525925469513614, -0.40270506445219967, -0.8716538067535347,   0.5276448387223114,   0.6064500154192936,
    -0.9553204135772526, 0.3253219646257437,   -0.7237956595774822,   0.3271284879679077,   -0.534543967339336,
    -0.4076498484281894, 0.01574797075171963,  -0.37322004720586244,  0.16425071396119928,  -0.5328652244800547,
    0.7389336170615435,  -0.6552069958923377,  -0.042305872596973604, -0.6714941466767734,  -0.9281411415119043,
    -0.7748558258281224, -0.6209799945964443,  0.02526428593887675,   -0.44984776800225856, 0.6281401952319337,
    0.9907258228680276,  0.6288646615999687,   -0.82076880150175,     0.3065944740797497,   -0.29201038744043584,
    -0.025685501802048982, -0.07273175145419652, 0.9370449239208709,  -0.8233807408078093,  -0.4195634619023012,
    0.9799555630257346,  -0.23461882935715228, -0.8884793313829993,   -0.4760267734754635,  -0.2874539543614072,
    -0.8795685985480997, -0.08099698251915255, -0.1626521023321741,   -0.9337167240793414,  0.40924842916829207,
    -0.7375713045221615, -0.0065659291539015285};
  std::vector<int> a_shape = {3, 2, 8};
  std::vector<int> b_shape = {3, 3, 8};
  std::vector<int> c_shape = {3, 2, 3};
  int total_size = MMTestInit(&inputs_, &outputs_, a, b, a_shape, b_shape, c_shape);
  auto ctx = new lite::Context;
  ctx->threadNum = 1;
  auto mm = new kernel::MatmulCPUKernel(reinterpret_cast<OpParameter *>(matmul_param), inputs_, outputs_, ctx);
  mm->Init();
  mm->Run();
  float correct[] = {21.38518524169922,  -14.514888763427734, -11.040614128112793, 16.91403579711914,
                     27.07421112060547,  23.35394287109375,   -39.006141662597656, -2.021998405456543,
                     -17.63555145263672, -8.490625381469727,  5.317771911621094,   -14.561882019042969,
                     -7.251564025878906, -2.508212089538574,  5.86458683013916,    -3.466249465942383,
                     8.869029998779297,  25.034008026123047};
  float *output = reinterpret_cast<float *>(outputs_[0]->Data());
  // Dump the computed values to ease debugging of mismatches.
  for (int i = 0; i < 18; ++i) printf("%f ", output[i]);
  CompareOutputData(reinterpret_cast<float *>(outputs_[0]->Data()), correct, total_size, 0.0001);
  delete matmul_param;
  delete mm;
  // The kernel destructor frees its buffers through ctx, so the context is released after it.
  delete ctx;
  for (auto t : inputs_) delete t;
  for (auto t : outputs_) delete t;
}
} // namespace mindspore
......@@ -13,13 +13,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include <memory>
#include "utils/log_adapter.h"
#include "common/common_test.h"
#include "mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h"
#include "mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.h"
#include "mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h"
#include "mindspore/lite/src/runtime/kernel/arm/opclib/quantization/quantize.h"
#include "mindspore/lite/src/kernel_registry.h"
#include "mindspore/lite/src/lite_kernel.h"
......@@ -30,21 +28,6 @@ class TestFcInt8 : public mindspore::Common {
TestFcInt8() {}
};
// Quantizes `length` floats into int8 values.
// Each element is divided by `scale`, shifted by `zero_point`, rounded with std::round and then
// saturated to the representable int8_t range before it is written to `output_data`.
void Quantize(float *input_data, int length, float scale, int zero_point, int8_t *output_data) {
  const float lowest = std::numeric_limits<int8_t>::min();
  const float highest = std::numeric_limits<int8_t>::max();
  int idx = 0;
  while (idx < length) {
    const float rounded = std::round(zero_point + (input_data[idx] / scale));
    // Cap from above first, then from below, so out-of-range values stick to the int8 limits.
    const float capped = std::min<float>(highest, rounded);
    const float clamped = std::max<float>(lowest, capped);
    output_data[idx] = static_cast<int8_t>(clamped);
    ++idx;
  }
}
// Converts `length` int8 values back to floats: `zero_point` is subtracted from every element and the
// difference is multiplied by `scale`; results are written to `output_data`.
void Dequantize(int8_t *input_data, int length, float scale, int zero_point, float *output_data) {
  int idx = 0;
  while (idx < length) {
    const int centered = input_data[idx] - zero_point;
    output_data[idx] = scale * centered;
    ++idx;
  }
}
int FcInt8TestInit(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
MatMulParameter *matmal_param, float **correct, double *scale, int *zeropoint) {
float input_max = 20;
......
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "utils/log_adapter.h"
#include "common/common_test.h"
#include "mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h"
#include "mindspore/lite/src/runtime/kernel/arm/opclib/quantization/quantize.h"
#include "mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h"
#include "mindspore/lite/src/kernel_registry.h"
#include "mindspore/lite/src/lite_kernel.h"
namespace mindspore {
// Fixture used by the int8 MatMul tests; it adds no state or behaviour of its own on top of
// mindspore::Common.
class TestMatmulInt8 : public mindspore::Common {
 public:
  TestMatmulInt8() = default;
};
// Builds the tensors, parameters and reference output for the int8 MatMul test.
//
// Creates an int8 input tensor of shape {1, 2, 8}, an int8 weight tensor of shape {1, 3, 8} and an int8
// output tensor of shape {1, 2, 3}. The float reference data for input and weight is quantized into the
// first two tensors, and quantization arguments are attached to all three.
//
//   inputs_      - receives the input tensor followed by the weight tensor (heap-allocated; the caller
//                  deletes them).
//   outputs_     - receives the output tensor (heap-allocated; the caller deletes it).
//   matmal_param - set to b_transpose_ = true, a_transpose_ = false, has_bias_ = false.
//   correct      - receives a malloc'ed buffer with the expected float results; the caller must free() it.
//   scale        - receives the output quantization scale.
//   zeropoint    - receives the output quantization zero point.
//
// Returns the number of elements of the output tensor.
int MMInt8TestInit(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
                   MatMulParameter *matmal_param, float **correct, double *scale, int *zeropoint) {
  // Value ranges that the int8 quantization of each tensor is derived from.
  float input_max = 20;
  float input_min = -20;
  float weight_max = 1;
  float weight_min = -1;
  float output_max = 30;
  float output_min = -30;

  // scale = (max - min) / (number of int8 steps); zero point = int8 max - max / scale (truncated to int).
  double input_scale =
    (input_max - input_min) / (std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min());
  int input_zp = std::numeric_limits<int8_t>::max() - input_max / input_scale;
  double weight_scale =
    (weight_max - weight_min) / (std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min());
  int weight_zp = std::numeric_limits<int8_t>::max() - weight_max / weight_scale;
  double output_scale =
    (output_max - output_min) / (std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min());
  int output_zp = std::numeric_limits<int8_t>::max() - output_max / output_scale;
  *scale = output_scale;
  *zeropoint = output_zp;

  // Input tensor.
  auto in_t =
    new lite::tensor::Tensor(kNumberTypeInt8, {1, 2, 8}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  in_t->MallocData();
  float in[] = {6.583835634764597,   11.337275140963907,  -4.125256949459629, 10.994337291530833,
                19.086065139532636,  3.620842999158455,   13.167624585590346, -18.326739299407755,
                14.877693740734841,  -17.092677920571653, 19.24147072807235,  -15.14805323833401,
                -18.075654829688737, -0.9164404591894204, -3.836646280336332, -10.870298671273918};
  Quantize(in, in_t->ElementsNum(), input_scale, input_zp, reinterpret_cast<int8_t *>(in_t->Data()));
  // The QuantArg objects live on the stack so that nothing is leaked: AddQuantParam is handed the object
  // itself (not a pointer), so the tensor is presumed to keep its own copy.
  // NOTE(review): confirm AddQuantParam does not retain the address of its argument.
  mindspore::lite::tensor::QuantArg in_quant_arg{};
  in_quant_arg.zeroPoint = input_zp;
  in_quant_arg.scale = input_scale;
  in_t->AddQuantParam(in_quant_arg);
  inputs_->push_back(in_t);

  // Weight tensor.
  auto weight_t =
    new lite::tensor::Tensor(kNumberTypeInt8, {1, 3, 8}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  weight_t->MallocData();
  float weight[] = {0.3651070698591563,    -0.5856943921727129,  -0.7472032663840145,  0.9489992871641959,
                    -0.8179490270358738,   -0.873058811259344,   0.39876672713807215,  -0.1816769383004213,
                    -0.13584645926733696,  -0.7614673836659709,  -0.2535825872616164,  -0.05265760030895916,
                    0.28558728305658754,   0.15404213943520118,  -0.1634824450738006,  -0.5068199082730189,
                    -0.026961256849111326, -0.1508441942453307,  0.9375335677537737,   0.3304690744194263,
                    -0.5091563780251127,   0.029887336278646925, -0.39540496207319276, 0.46094065001445084};
  Quantize(weight, weight_t->ElementsNum(), weight_scale, weight_zp, reinterpret_cast<int8_t *>(weight_t->Data()));
  mindspore::lite::tensor::QuantArg weight_quant_arg{};
  weight_quant_arg.zeroPoint = weight_zp;
  weight_quant_arg.scale = weight_scale;
  weight_t->AddQuantParam(weight_quant_arg);
  inputs_->push_back(weight_t);

  // Output tensor; its data is left for the kernel under test to fill in.
  auto out_t =
    new lite::tensor::Tensor(kNumberTypeInt8, {1, 2, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  out_t->MallocData();
  mindspore::lite::tensor::QuantArg output_quant_arg{};
  output_quant_arg.zeroPoint = output_zp;
  output_quant_arg.scale = output_scale;
  out_t->AddQuantParam(output_quant_arg);
  outputs_->push_back(out_t);

  // Expected float results, copied into a malloc'ed buffer that the caller releases with free().
  *correct = reinterpret_cast<float *>(malloc(out_t->ElementsNum() * sizeof(float)));
  float nchw_co[] = {-0.912632942, 4.08398056, -25.385608673, 2.720281124, 7.745952606, 20.893184662};
  memcpy(*correct, nchw_co, out_t->ElementsNum() * sizeof(float));

  matmal_param->b_transpose_ = true;
  matmal_param->a_transpose_ = false;
  matmal_param->has_bias_ = false;
  return out_t->ElementsNum();
}
// Runs the int8 MatMul kernel on the fixture built by MMInt8TestInit using two threads, dequantizes the
// kernel output and checks it against the float reference with an error bound argument of 0.3.
TEST_F(TestMatmulInt8, mmint8) {
  std::vector<lite::tensor::Tensor *> inputs_;
  std::vector<lite::tensor::Tensor *> outputs_;
  auto matmul_param = new MatMulParameter();
  // Out-parameters of MMInt8TestInit; initialized so they never hold indeterminate values.
  float *correct = nullptr;
  double output_scale = 0;
  int output_zp = 0;
  int total_size = MMInt8TestInit(&inputs_, &outputs_, matmul_param, &correct, &output_scale, &output_zp);
  auto ctx = new lite::Context;
  ctx->threadNum = 2;
  kernel::MatmulInt8CPUKernel *mm =
    new kernel::MatmulInt8CPUKernel(reinterpret_cast<OpParameter *>(matmul_param), inputs_, outputs_, ctx);
  mm->Init();
  mm->Run();

  // total_size is the output tensor's element count as returned by MMInt8TestInit; sizing the buffer and
  // the comparison from it keeps them in step with what Dequantize writes, instead of a hard-coded 6.
  std::vector<float> fout(total_size, 0.0f);
  Dequantize(reinterpret_cast<int8_t *>(outputs_[0]->Data()), outputs_[0]->ElementsNum(), output_scale, output_zp,
             fout.data());
  CompareOutputData(fout.data(), correct, total_size, 0.3);

  delete matmul_param;
  delete mm;
  // Released after the kernel, which was constructed with this pointer; previously it was never freed.
  delete ctx;
  for (auto t : inputs_) delete t;
  for (auto t : outputs_) delete t;
  free(correct);
}
} // namespace mindspore
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册