Unverified commit c0fa7ada, authored by HappyAngel, committed by GitHub

[ARM] fix calibration error, values from [-128,127] to [-127,127] (#2927)

* fix calibration error, values from [-128, 127] to [-127, 127], test=develop
* add classify demo and detection demo, test=develop
Parent b94839b0
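In effect, the quantization paths now clamp to the symmetric int8 range [-127, 127] instead of the full [-128, 127] range. A minimal scalar sketch of that clamp, for orientation only (the helper name and signature below are illustrative, not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Round to nearest, then clamp the lower bound to -127 so the quantized
// range stays symmetric: [-127, 127] rather than [-128, 127].
inline int8_t quantize_symmetric_int8(float value, float inv_scale) {
  float scaled = std::round(value * inv_scale);
  scaled = std::min(127.f, std::max(-127.f, scaled));
  return static_cast<int8_t>(scaled);
}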
......@@ -330,6 +330,30 @@ if(NOT IOS)
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -341,6 +365,7 @@ if(NOT IOS)
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/core/profile/timer.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE
using paddle::lite::profile::Timer;
DEFINE_string(input_shape,
"1,3,224,224",
"input shapes, separated by colon and comma");
DEFINE_bool(use_optimize_nb,
false,
"optimized & naive buffer model for mobile devices");
DEFINE_string(arg_name, "", "the arg name");
DEFINE_string(threshold, "0.5", "threshold value default 0.5f");
DEFINE_string(in_txt, "", "input text");
DEFINE_string(out_txt, "", "output text");
DEFINE_string(label_file, "", "label file path");
DEFINE_int32(topk, 1, "topk num");
namespace paddle {
namespace lite_api {
void OutputOptModel(const std::string& load_model_dir,
const std::string& save_optimized_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)},
});
auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model
int ret = system(
paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str())
.c_str());
if (ret == 0) {
LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
}
predictor->SaveOptimizedModel(save_optimized_model_dir,
LiteModelType::kNaiveBuffer);
LOG(INFO) << "Load model from " << load_model_dir;
LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
std::vector<std::string> load_labels(std::string label_path) {
FILE* fp = fopen(label_path.c_str(), "r");
if (fp == nullptr) {
LOG(FATAL) << "load label file failed! " << label_path;
}
std::vector<std::string> labels;
char str[1024];
// read the label file line by line; stop when fgets hits EOF
while (fgets(str, sizeof(str), fp) != nullptr) {
std::string str_s(str);
if (str_s.length() > 0) {
for (int i = 0; i < str_s.length(); i++) {
if (str_s[i] == ' ') {
std::string strr = str_s.substr(i, str_s.length() - i - 1);
labels.push_back(strr);
i = str_s.length();
}
}
}
}
fclose(fp);
return labels;
}
void print_topk(const float* scores,
const int size,
const int topk,
const std::vector<std::string> labels) {
std::vector<std::pair<float, int>> vec;
vec.resize(size);
for (int i = 0; i < size; i++) {
vec[i] = std::make_pair(scores[i], i);
}
std::partial_sort(vec.begin(),
vec.begin() + topk,
vec.end(),
std::greater<std::pair<float, int>>());
// print topk and score
std::string name = FLAGS_out_txt + "_accu.txt";
FILE* fp = fopen(name.c_str(), "w");
fprintf(fp, "%d \n", topk);
for (int i = 0; i < topk; i++) {
float score = vec[i].first;
int index = vec[i].second;
fprintf(fp, "%d ", index);
fprintf(fp, "%f \n", score);
LOG(INFO) << i << ": " << index << " " << labels[index] << " " << score;
}
fclose(fp);
}
void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const PowerMode power_mode,
const int thread_num,
const int repeat,
const int warmup_times = 0) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_power_mode(power_mode);
config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config);
bool flag_in = true;
bool flag_out = true;
if (FLAGS_in_txt == "") {
flag_in = false;
}
if (FLAGS_out_txt == "") {
flag_out = false;
}
printf("flag_in: %d, flag_out: %d \n", flag_in, flag_out);
for (int j = 0; j < input_shapes.size(); ++j) {
auto input_tensor = predictor->GetInput(j);
input_tensor->Resize(input_shapes[j]);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (int i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
FILE* fp_r = nullptr;
if (flag_in) {
fp_r = fopen(FLAGS_in_txt.c_str(), "r");
}
for (int i = 0; i < input_num; ++i) {
if (flag_in) {
fscanf(fp_r, "%f\n", &input_data[i]);
} else {
input_data[i] = 1.f;
}
}
if (flag_in) {
fclose(fp_r);
}
}
for (int i = 0; i < warmup_times; ++i) {
predictor->Run();
}
Timer ti;
for (int j = 0; j < repeat; ++j) {
ti.Start();
predictor->Run();
float t = ti.Stop();
LOG(INFO) << "iter: " << j << ", time: " << t << " ms";
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir
<< ", power_mode: " << static_cast<int>(power_mode)
<< ", threads num " << thread_num << ", warmup: " << warmup_times
<< ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg()
<< " ms"
<< ", min time: " << ti.LapTimes().Min() << " ms"
<< ", max time: " << ti.LapTimes().Max() << " ms.";
auto output = predictor->GetOutput(0);
auto out = output->data<float>();
auto output_shape = output->shape();
int output_num = 1;
for (int i = 0; i < output_shape.size(); ++i) {
output_num *= output_shape[i];
}
// classify
printf("load_labels \n");
std::vector<std::string> labels = load_labels(FLAGS_label_file);
printf("print_topk \n");
print_topk(out, output_num, FLAGS_topk, labels);
LOG(INFO) << "output_num: " << output_num;
LOG(INFO) << "out " << out[0];
LOG(INFO) << "out " << out[1];
FILE* fp = nullptr;
if (flag_out) {
fp = fopen(FLAGS_out_txt.c_str(), "w");
}
double sum1 = 0.f;
for (int i = 0; i < output_num; ++i) {
if (flag_out) {
fprintf(fp, "%f\n", out[i]);
}
sum1 += out[i];
}
if (flag_out) {
fclose(fp);
}
printf("out mean: %f \n", sum1 / output_num);
FILE* fp_w = fopen("time.txt", "a+");
if (!fp_w) {
printf("open file failed \n");
return;
}
fprintf(fp_w,
"model: %s, threads: %d, avg: %f ms, min: %f ms, max: %f ms \n",
model_dir.c_str(),
thread_num,
ti.LapTimes().Avg(),
ti.LapTimes().Min(),
ti.LapTimes().Max());
fclose(fp_w);
// please turn off memory_optimize_pass to use this feature.
if (FLAGS_arg_name != "") {
auto arg_tensor = predictor->GetTensor(FLAGS_arg_name);
auto arg_shape = arg_tensor->shape();
int arg_num = 1;
std::ostringstream os;
os << "{";
for (int i = 0; i < arg_shape.size(); ++i) {
arg_num *= arg_shape[i];
os << arg_shape[i] << ",";
}
os << "}";
float sum = 0.;
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << std::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
}
}
#endif
} // namespace lite_api
} // namespace paddle
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") {
LOG(INFO) << "usage: "
<< "--model_dir /path/to/your/model";
exit(0);
}
std::string save_optimized_model_dir = "";
if (FLAGS_use_optimize_nb) {
save_optimized_model_dir = FLAGS_model_dir;
} else {
save_optimized_model_dir = FLAGS_model_dir + "opt2";
}
auto split_string =
[](const std::string& str_in) -> std::vector<std::string> {
std::vector<std::string> str_out;
std::string tmp_str = str_in;
while (!tmp_str.empty()) {
size_t next_offset = tmp_str.find(":");
str_out.push_back(tmp_str.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return str_out;
};
auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
std::vector<int64_t> shape;
std::string tmp_str = str_shape;
while (!tmp_str.empty()) {
int dim = atoi(tmp_str.data());
shape.push_back(dim);
size_t next_offset = tmp_str.find(",");
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return shape;
};
LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i]));
}
if (!FLAGS_use_optimize_nb) {
// Output optimized model
paddle::lite_api::OutputOptModel(
FLAGS_model_dir, save_optimized_model_dir, input_shapes);
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Run inference using optimized model
paddle::lite_api::Run(
input_shapes,
save_optimized_model_dir,
static_cast<paddle::lite_api::PowerMode>(FLAGS_power_mode),
FLAGS_threads,
FLAGS_repeats,
FLAGS_warmup);
#endif
return 0;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/core/profile/timer.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE
using paddle::lite::profile::Timer;
DEFINE_string(input_shape,
"1,3,224,224",
"input shapes, separated by colon and comma");
DEFINE_bool(use_optimize_nb,
false,
"optimized & naive buffer model for mobile devices");
DEFINE_string(arg_name, "", "the arg name");
DEFINE_string(threshold, "0.5", "threshold value default 0.5f");
DEFINE_string(in_txt, "", "input text");
DEFINE_string(out_txt, "", "output text");
DEFINE_int32(orih, 1920, "input image height");
DEFINE_int32(oriw, 1080, "input image width");
namespace paddle {
namespace lite_api {
struct Object {
float x;
float y;
float width;
float height;
float class_id;
float prob;
};
void OutputOptModel(const std::string& load_model_dir,
const std::string& save_optimized_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)},
});
auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model
int ret = system(
paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str())
.c_str());
if (ret == 0) {
LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
}
predictor->SaveOptimizedModel(save_optimized_model_dir,
LiteModelType::kNaiveBuffer);
LOG(INFO) << "Load model from " << load_model_dir;
LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
void detect_choose(const float* dout,
std::vector<int64_t> dims,
const float thresh) {
std::string name = FLAGS_out_txt + "_accu.txt";
FILE* fp = fopen(name.c_str(), "w");
for (int iw = 0; iw < dims[0]; iw++) {
const float* values = dout + iw * dims[1];
if (values[1] > thresh) { // prob > thresh
fprintf(fp, "%f \n", values[0]);
fprintf(fp, "%f \n", values[1]);
fprintf(fp, "%f \n", values[2]);
fprintf(fp, "%f \n", values[3]);
fprintf(fp, "%f \n", values[4]);
fprintf(fp, "%f \n", values[5]);
}
}
fclose(fp);
}
void detect_object(const float* dout,
std::vector<int64_t> dims,
const float thresh,
int orih,
int oriw) {
std::vector<Object> objects;
for (int iw = 0; iw < dims[0]; iw++) {
Object object;
const float* values = dout + iw * dims[1];
object.class_id = values[0];
object.prob = values[1];
object.x = values[2] * oriw;
object.y = values[3] * orih;
object.width = values[4] * oriw - object.x;
object.height = values[5] * orih - object.y;
objects.push_back(object);
}
std::string name = FLAGS_out_txt + "_accu.txt";
FILE* fp = fopen(name.c_str(), "w");
for (int i = 0; i < objects.size(); ++i) {
Object object = objects.at(i);
if (object.prob > thresh && object.x > 0 && object.y > 0 &&
object.width > 0 && object.height > 0) {
if (object.x >= oriw || object.width >= oriw || object.y >= orih ||
object.height >= orih)
continue;
fprintf(fp, "%f \n", object.x);
fprintf(fp, "%f \n", object.y);
fprintf(fp, "%f \n", object.width);
fprintf(fp, "%f \n", object.height);
fprintf(fp, "%f \n", object.prob);
fprintf(fp, "%f \n", object.class_id);
LOG(INFO) << "object id: " << object.class_id << ", image size: " << oriw
<< ", " << orih << ", detect object: " << object.prob
<< ", location: x=" << object.x << ", y=" << object.y
<< ", width=" << object.width << ", height=" << object.height;
}
}
fclose(fp);
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const PowerMode power_mode,
const int thread_num,
const int repeat,
const int warmup_times = 0) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_power_mode(power_mode);
config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config);
bool flag_in = true;
bool flag_out = true;
if (FLAGS_in_txt == "") {
flag_in = false;
}
if (FLAGS_out_txt == "") {
flag_out = false;
}
printf("flag_in: %d, flag_out: %d \n", flag_in, flag_out);
for (int j = 0; j < input_shapes.size(); ++j) {
auto input_tensor = predictor->GetInput(j);
input_tensor->Resize(input_shapes[j]);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (int i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
FILE* fp_r = nullptr;
if (flag_in) {
fp_r = fopen(FLAGS_in_txt.c_str(), "r");
}
for (int i = 0; i < input_num; ++i) {
if (flag_in) {
fscanf(fp_r, "%f\n", &input_data[i]);
} else {
input_data[i] = 1.f;
}
}
if (flag_in) {
fclose(fp_r);
}
}
for (int i = 0; i < warmup_times; ++i) {
predictor->Run();
}
Timer ti;
for (int j = 0; j < repeat; ++j) {
ti.Start();
predictor->Run();
float t = ti.Stop();
LOG(INFO) << "iter: " << j << ", time: " << t << " ms";
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir
<< ", power_mode: " << static_cast<int>(power_mode)
<< ", threads num " << thread_num << ", warmup: " << warmup_times
<< ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg()
<< " ms"
<< ", min time: " << ti.LapTimes().Min() << " ms"
<< ", max time: " << ti.LapTimes().Max() << " ms.";
auto output = predictor->GetOutput(0);
auto out = output->data<float>();
auto output_shape = output->shape();
// detect
detect_object(
out, output_shape, atof(FLAGS_threshold.data()), FLAGS_orih, FLAGS_oriw);
// detect_choose(out, output_shape, atof(FLAGS_threshold.data()));
LOG(INFO) << "out " << out[0];
LOG(INFO) << "out " << out[1];
int output_num = 1;
for (int i = 0; i < output_shape.size(); ++i) {
output_num *= output_shape[i];
}
LOG(INFO) << "output_num: " << output_num;
FILE* fp = nullptr;
if (flag_out) {
fp = fopen(FLAGS_out_txt.c_str(), "w");
}
double sum1 = 0.f;
for (int i = 0; i < output_num; ++i) {
if (flag_out) {
fprintf(fp, "%f\n", out[i]);
}
sum1 += out[i];
}
if (flag_out) {
fclose(fp);
}
printf("out mean: %f \n", sum1 / output_num);
FILE* fp_w = fopen("time.txt", "a+");
if (!fp_w) {
printf("open file failed \n");
return;
}
fprintf(fp_w,
"model: %s, threads: %d, avg: %f ms, min: %f ms, max: %f ms \n",
model_dir.c_str(),
thread_num,
ti.LapTimes().Avg(),
ti.LapTimes().Min(),
ti.LapTimes().Max());
fclose(fp_w);
// please turn off memory_optimize_pass to use this feature.
if (FLAGS_arg_name != "") {
auto arg_tensor = predictor->GetTensor(FLAGS_arg_name);
auto arg_shape = arg_tensor->shape();
int arg_num = 1;
std::ostringstream os;
os << "{";
for (int i = 0; i < arg_shape.size(); ++i) {
arg_num *= arg_shape[i];
os << arg_shape[i] << ",";
}
os << "}";
float sum = 0.;
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << std::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
}
}
#endif
} // namespace lite_api
} // namespace paddle
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") {
LOG(INFO) << "usage: "
<< "--model_dir /path/to/your/model";
exit(0);
}
std::string save_optimized_model_dir = "";
if (FLAGS_use_optimize_nb) {
save_optimized_model_dir = FLAGS_model_dir;
} else {
save_optimized_model_dir = FLAGS_model_dir + "opt2";
}
auto split_string =
[](const std::string& str_in) -> std::vector<std::string> {
std::vector<std::string> str_out;
std::string tmp_str = str_in;
while (!tmp_str.empty()) {
size_t next_offset = tmp_str.find(":");
str_out.push_back(tmp_str.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return str_out;
};
auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
std::vector<int64_t> shape;
std::string tmp_str = str_shape;
while (!tmp_str.empty()) {
int dim = atoi(tmp_str.data());
shape.push_back(dim);
size_t next_offset = tmp_str.find(",");
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return shape;
};
LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i]));
}
if (!FLAGS_use_optimize_nb) {
// Output optimized model
paddle::lite_api::OutputOptModel(
FLAGS_model_dir, save_optimized_model_dir, input_shapes);
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Run inference using optimized model
paddle::lite_api::Run(
input_shapes,
save_optimized_model_dir,
static_cast<paddle::lite_api::PowerMode>(FLAGS_power_mode),
FLAGS_threads,
FLAGS_repeats,
FLAGS_warmup);
#endif
return 0;
}
......@@ -572,6 +572,25 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
#define GEMM_INT8_INT8_OUT \
GEMM_TRANS_INT32_TO_FP32 \
GEMM_INT8_RELU \
"ld1 {v8.4s}, [%[vmax]] \n" /* v8 = -127 */ \
/* data >= -127 */ \
"fcmge v0.4s, v16.4s, v8.4s\n" \
"fcmge v1.4s, v17.4s, v8.4s\n" \
"fcmge v2.4s, v18.4s, v8.4s\n" \
"fcmge v3.4s, v19.4s, v8.4s\n" \
"fcmge v4.4s, v20.4s, v8.4s\n" \
"fcmge v5.4s, v21.4s, v8.4s\n" \
"fcmge v6.4s, v22.4s, v8.4s\n" \
"fcmge v7.4s, v23.4s, v8.4s\n" \
/* choose data */ \
"bif v16.16b, v8.16b, v0.16b \n" \
"bif v17.16b, v8.16b, v1.16b \n" \
"bif v18.16b, v8.16b, v2.16b \n" \
"bif v19.16b, v8.16b, v3.16b \n" \
"bif v20.16b, v8.16b, v4.16b \n" \
"bif v21.16b, v8.16b, v5.16b \n" \
"bif v22.16b, v8.16b, v6.16b \n" \
"bif v23.16b, v8.16b, v7.16b \n" \
"fcvtas v0.4s, v16.4s\n" /* 00, cvt to int */ \
"fcvtas v1.4s, v17.4s\n" /* 01, cvt to int */ \
"fcvtas v2.4s, v18.4s\n" /* 02, cvt to int */ \
......@@ -580,6 +599,24 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
"fcvtas v5.4s, v21.4s\n" /* 11, cvt to int */ \
"fcvtas v6.4s, v22.4s\n" /* 12, cvt to int */ \
"fcvtas v7.4s, v23.4s\n" /* 13, cvt to int */ \
/* data >= -127 */ \
"fcmge v16.4s, v24.4s, v8.4s\n" \
"fcmge v17.4s, v25.4s, v8.4s\n" \
"fcmge v18.4s, v26.4s, v8.4s\n" \
"fcmge v19.4s, v27.4s, v8.4s\n" \
"fcmge v20.4s, v28.4s, v8.4s\n" \
"fcmge v21.4s, v29.4s, v8.4s\n" \
"fcmge v22.4s, v30.4s, v8.4s\n" \
"fcmge v23.4s, v31.4s, v8.4s\n" \
/* choose data */ \
"bif v24.16b, v8.16b, v16.16b\n" \
"bif v25.16b, v8.16b, v17.16b\n" \
"bif v26.16b, v8.16b, v18.16b\n" \
"bif v27.16b, v8.16b, v19.16b\n" \
"bif v28.16b, v8.16b, v20.16b\n" \
"bif v29.16b, v8.16b, v21.16b\n" \
"bif v30.16b, v8.16b, v22.16b\n" \
"bif v31.16b, v8.16b, v23.16b\n" \
"sqxtn v16.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \
"fcvtas v8.4s, v24.4s\n" /* 20, cvt to int */ \
"sqxtn2 v16.8h, v1.4s\n" /* 01, cvt int32 to int16 */ \
......@@ -648,7 +685,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
"v9","v10","v11","v12","v13","v14",
"v15","v16","v17","v18","v19","v20",
"v21","v22","v23","v24","v25","v26",
"v27","v28","v29","v30","v31","cc");
"v27","v28","v29","v30","v31","cc", "memory");
// clang-format on
}
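Each kernel applies the same clamp before fcvtas rounds the fp32 accumulators to int32: fcmge builds a per-lane mask for data >= -127, and bif replaces the lanes where the mask is false with -127. A rough NEON-intrinsics equivalent of that pattern, for reference only (not part of the patch):

#include <arm_neon.h>

// Clamp every fp32 lane to the symmetric int8 lower bound (-127.f),
// mirroring the fcmge + bif sequence used in the assembly above.
static inline float32x4_t clamp_lanes_to_neg127(float32x4_t v) {
  const float32x4_t vmin = vdupq_n_f32(-127.f);
  uint32x4_t ge = vcgeq_f32(v, vmin);  // mask: lane >= -127.f
  return vbslq_f32(ge, v, vmin);       // keep lane where mask set, else -127.f
}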
......@@ -665,6 +702,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
int k,
int rem) {
// clang-format off
float vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT
: [a_ptr] "+r"(a_ptr),
[b_ptr] "+r"(b_ptr),
......@@ -676,13 +714,14 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
: [is_relu] "r"(is_relu),
[bias] "r"(bias),
[rem] "r"(rem),
[scale] "r"(scale)
[scale] "r"(scale),
[vmax] "r"(vmax)
: "v0","v1","v2","v3","v4","v5","v6","v7",
"v8","v9","v10","v11","v12",
"v13","v14","v15","v16","v17",
"v18","v19","v20","v21","v22",
"v23","v24","v25","v26","v27",
"v28","v29","v30","v31","cc");
"v28","v29","v30","v31","cc", "memory");
// clang-format on
}
......@@ -1179,6 +1218,25 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
#define GEMM_SDOT_INT8_OUT \
GEMM_SDOT_CVT_INT32_TO_FP32 \
GEMM_SDOT_RELU \
"ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \
/* data >= -127 */ \
"fcmge v0.4s, v8.4s, v6.4s\n" \
"fcmge v1.4s, v9.4s, v6.4s\n" \
"fcmge v2.4s, v10.4s, v6.4s\n" \
"fcmge v3.4s, v11.4s, v6.4s\n" \
"fcmge v4.4s, v12.4s, v6.4s\n" \
"fcmge v5.4s, v13.4s, v6.4s\n" \
"fcmge v7.4s, v14.4s, v6.4s\n" \
/* choose data */ \
"bif v8.16b, v6.16b, v0.16b\n" \
"fcmge v0.4s, v15.4s, v6.4s\n" \
"bif v9.16b, v6.16b, v1.16b\n" \
"bif v10.16b, v6.16b, v2.16b\n" \
"bif v11.16b, v6.16b, v3.16b\n" \
"bif v12.16b, v6.16b, v4.16b\n" \
"bif v13.16b, v6.16b, v5.16b\n" \
"bif v14.16b, v6.16b, v7.16b\n" \
"bif v15.16b, v6.16b, v0.16b \n" \
"fcvtas v0.4s, v8.4s\n" /* 00, cvt to int */ \
"fcvtas v1.4s, v9.4s\n" /* 01, cvt to int */ \
"fcvtas v2.4s, v10.4s\n" /* 02, cvt to int */ \
......@@ -1194,7 +1252,30 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
"sqxtn2 v12.8h, v4.4s\n" /* 11, cvt int32 to int16 */ \
"sqxtn v13.4h, v5.4s\n" /* 12, cvt int32 to int16 */ \
"sqxtn v14.4h, v6.4s\n" /* 20, cvt int32 to int16 */ \
"ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \
"sqxtn2 v14.8h, v7.4s\n" /* 21, cvt int32 to int16 */ \
/* data >= -127 */ \
"fcmge v0.4s, v16.4s, v6.4s\n" \
"fcmge v1.4s, v17.4s, v6.4s\n" \
"fcmge v2.4s, v18.4s, v6.4s\n" \
"fcmge v3.4s, v19.4s, v6.4s\n" \
"fcmge v4.4s, v20.4s, v6.4s\n" \
"fcmge v5.4s, v21.4s, v6.4s\n" \
"fcmge v7.4s, v22.4s, v6.4s\n" \
"fcmge v8.4s, v23.4s, v6.4s\n" \
"fcmge v9.4s, v24.4s, v6.4s\n" \
/* choose data */ \
"bif v16.16b, v6.16b, v0.16b\n" \
"fcmge v0.4s, v25.4s, v6.4s\n" \
"bif v17.16b, v6.16b, v1.16b\n" \
"bif v18.16b, v6.16b, v2.16b\n" \
"bif v19.16b, v6.16b, v3.16b\n" \
"bif v20.16b, v6.16b, v4.16b\n" \
"bif v21.16b, v6.16b, v5.16b\n" \
"bif v22.16b, v6.16b, v7.16b\n" \
"bif v23.16b, v6.16b, v8.16b\n" \
"bif v24.16b, v6.16b, v9.16b\n" \
"bif v25.16b, v6.16b, v0.16b\n" \
"fcvtas v0.4s, v16.4s\n" /* 22, cvt to int */ \
"fcvtas v1.4s, v17.4s\n" /* 30, cvt to int */ \
"fcvtas v2.4s, v18.4s\n" /* 31, cvt to int */ \
......@@ -1214,7 +1295,22 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
"sqxtn v19.4h, v6.4s\n" /* 42, cvt int32 to int16 */ \
"sqxtn v20.4h, v7.4s\n" /* 50, cvt int32 to int16 */ \
"sqxtn2 v20.8h, v8.4s\n" /* 51, cvt int32 to int16 */ \
"ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \
"sqxtn v21.4h, v9.4s\n" /* 52, cvt int32 to int16 */ \
/* data >= -127 */ \
"fcmge v0.4s, v26.4s, v6.4s\n" \
"fcmge v1.4s, v27.4s, v6.4s\n" \
"fcmge v2.4s, v28.4s, v6.4s\n" \
"fcmge v3.4s, v29.4s, v6.4s\n" \
"fcmge v4.4s, v30.4s, v6.4s\n" \
"fcmge v5.4s, v31.4s, v6.4s\n" \
/* choose data */ \
"bif v26.16b, v6.16b, v0.16b\n" \
"bif v27.16b, v6.16b, v1.16b\n" \
"bif v28.16b, v6.16b, v2.16b\n" \
"bif v29.16b, v6.16b, v3.16b\n" \
"bif v30.16b, v6.16b, v4.16b\n" \
"bif v31.16b, v6.16b, v5.16b\n" \
"fcvtas v0.4s, v26.4s\n" /* 60, cvt to int */ \
"fcvtas v1.4s, v27.4s\n" /* 61, cvt to int */ \
"fcvtas v2.4s, v28.4s\n" /* 62, cvt to int */ \
......@@ -1318,6 +1414,7 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
int k,
int tail) {
// clang-format off
float32_t vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile(GEMM_SDOT_INT8_KERNEL GEMM_SDOT_INT8_OUT
: [a_ptr] "+r"(a_ptr),
[b_ptr] "+r"(b_ptr),
......@@ -1331,7 +1428,7 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
[c_ptr5] "+r"(c_ptr5),
[c_ptr6] "+r"(c_ptr6),
[c_ptr7] "+r"(c_ptr7)
: [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu)
: [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu), [vmax] "r"(vmax)
: "cc","memory","v0","v1","v2","v3",
"v4","v5","v6","v7","v8","v9","v10",
"v11","v12","v13","v14","v15","v16","v17",
......@@ -1614,6 +1711,24 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
"vadd.f32 q3, q11, q3\n" /* r21, add offset */ \
"vadd.f32 q4, q12, q4\n" /* r30, add offset */ \
"vadd.f32 q5, q13, q5\n" /* r31, add offset */ \
"vld1.32 {d12-d13}, [%[vmax]]\n" /* set q4 = -127 \n"*/ \
"vcge.f32 q7, q8, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q10, q9, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q11, q0, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q12, q1, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q13, q2, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q14, q3, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q15, q4, q6\n" /* @ q8 >= -127 \n */ \
/* choose data */ \
"vbif q8, q6, q7\n" /* @ choose */ \
"vcge.f32 q7, q5, q6\n" /* @ q8 >= -127 \n */ \
"vbif q9, q6, q10\n" /* @ choose */ \
"vbif q0, q6, q11\n" /* @ choose */ \
"vbif q1, q6, q12\n" /* @ choose */ \
"vbif q2, q6, q13\n" /* @ choose */ \
"vbif q3, q6, q14\n" /* @ choose */ \
"vbif q4, q6, q15\n" /* @ choose */ \
"vbif q5, q6, q7\n" /* @ choose */ \
"vcvt.s32.f32 q6, q8\n" /* r00, fp32->int32 */ \
"vcvt.s32.f32 q7, q9\n" /* r01, fp32->int32 */ \
"vcvt.s32.f32 q10, q0\n" /* r10, fp32->int32 */ \
......@@ -1682,7 +1797,8 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
"q14",
"q15",
"r0",
"cc");
"cc",
"memory");
}
template <>
......@@ -1697,6 +1813,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
bool is_relu,
int k,
int rem) {
float vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT
: [a_ptr] "+r"(a_ptr),
[b_ptr] "+r"(b_ptr),
......@@ -1708,6 +1825,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
: [is_relu] "r"(is_relu),
[bias] "r"(bias),
[rem] "r"(rem),
[vmax] "r"(vmax),
[scale] "r"(scale)
: "q0",
"q1",
......@@ -1726,7 +1844,8 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
"q14",
"q15",
"r0",
"cc");
"cc",
"memory");
}
#endif // __aarch64__ // NOLINT
......
......@@ -79,6 +79,7 @@ inline void write_gemv_out(const int* in,
for (int i = 0; i < size; ++i) {
out[0] =
saturate_cast<signed char>(roundf(*(in++) * *(scale++) + *(bias++)));
out[0] = out[0] < -127 ? -127 : out[0]; // clamp to [-127, 127]
if (flag_relu) {
out[0] = out[0] > 0 ? out[0] : 0;
}
......@@ -87,6 +88,7 @@ inline void write_gemv_out(const int* in,
} else {
for (int i = 0; i < size; ++i) {
out[0] = saturate_cast<signed char>(roundf(*(in++) * *(scale++)));
out[0] = out[0] < -127 ? -127 : out[0]; // clamp to [-127, 127]
if (flag_relu) {
out[0] = out[0] > 0 ? out[0] : 0;
}
......
......@@ -40,13 +40,11 @@ void fp32_to_int8(const float* din,
int cnt = inner_size / 16;
int remain = inner_size & 15;
int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for
for (int j = 0; j < loop_size; ++j) {
float inv_scale = 1.f / scale[j % axis_size];
float32x4_t vzero = vdupq_n_f32(0.f);
float32x4_t vscale = vdupq_n_f32(inv_scale);
float32x4_t vmax = vdupq_n_f32(-127.f);
float32x4_t vpoff = vdupq_n_f32(0.5f);
float32x4_t vnoff = vdupq_n_f32(-0.5f);
const float* din_c = din + j * inner_size;
......@@ -56,6 +54,7 @@ void fp32_to_int8(const float* din,
const float* din_ptr = din_c;
signed char* dout_ptr = dout_c;
#ifdef __aarch64__
float32x4_t vmax = vdupq_n_f32(-127.0);
asm volatile(
"ldp q0, q1, [%[in]], #32 \n"
"ldp q2, q3, [%[in]], #32 \n"
......@@ -64,16 +63,19 @@ void fp32_to_int8(const float* din,
"fmul v5.4s, v1.4s, %[scale].4s \n"
"fmul v6.4s, v2.4s, %[scale].4s \n"
"fmul v7.4s, v3.4s, %[scale].4s \n"
/* data >= -127 */
"fcmge v8.4s, v4.4s, %[vmax].4s \n"
"fcmge v9.4s, v5.4s, %[vmax].4s \n"
"fcmge v10.4s, v6.4s, %[vmax].4s \n"
"fcmge v11.4s, v7.4s, %[vmax].4s \n"
/* choose data */
"bif v4.16b, %[vmax].16b, v8.16b \n"
"bif v5.16b, %[vmax].16b, v9.16b \n"
"bif v6.16b, %[vmax].16b, v10.16b \n"
"bif v7.16b, %[vmax].16b, v11.16b \n"
"ldp q0, q1, [%[in]], #32 \n"
"subs %[cnt], %[cnt], #1 \n"
/* fp32 - int32 */
"FCVTAS v8.4s, v4.4s \n"
"FCVTAS v9.4s, v5.4s \n"
"FCVTAS v10.4s, v6.4s \n"
......@@ -89,7 +91,9 @@ void fp32_to_int8(const float* din,
"bne 0b \n"
: [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w"(vscale), [vmax] "w"(vmax)
: "v0",
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
......@@ -102,6 +106,7 @@ void fp32_to_int8(const float* din,
"v10",
"v11");
#else
float vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
"vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n"
......@@ -113,23 +118,27 @@ void fp32_to_int8(const float* din,
"vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n"
"vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n"
"vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n"
"vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n"
"vbif.f32 q4, %q[vnoff], q8 @ get right offset\n"
"vcgt.f32 q8, q3, %q[vzero] @ get mask > 0, in3\n"
"vbif.f32 q5, %q[vnoff], q9 @ get right offset\n"
"vbif.f32 q6, %q[vnoff], q10 @ get right offset\n"
"vbif.f32 q7, %q[vnoff], q8 @ get right offset\n"
"vbif.f32 q7, %q[vnoff], q11 @ get right offset\n"
"vmla.f32 q4, q0, %q[vscale] @ mul scale\n"
"vld1.32 {d0-d1}, [%[vmax]] @ set q0 = -127 \n"
"vmla.f32 q5, q1, %q[vscale] @ mul scale\n"
"vmla.f32 q6, q2, %q[vscale] @ mul scale\n"
"vmla.f32 q7, q3, %q[vscale] @ mul scale\n"
"vcge.f32 q8, q4, %q[vmax] @ q4 >= vmax \n"
"vcge.f32 q9, q5, %q[vmax] @ q4 >= vmax \n"
"vcge.f32 q10, q6, %q[vmax] @ q4 >= vmax \n"
"vbif q4, %q[vmax], q8 @ choose \n"
"vcge.f32 q8, q7, %q[vmax] @ q4 >= vmax \n"
"vbif q5, %q[vmax], q9 @ choose \n"
"vbif q6, %q[vmax], q10 @ choose \n"
"vbif q7, %q[vmax], q8 @ choose \n"
/* data >= -127 */
"vcge.f32 q8, q4, q0 @ q4 >= -127 \n"
"vcge.f32 q9, q5, q0 @ q4 >= -127 \n"
"vcge.f32 q10, q6, q0 @ q4 >= -127 \n"
"vcge.f32 q11, q7, q0 @ q4 >= -127 \n"
/* choose data */
"vbif q4, q0, q8 @ choose \n"
"vbif q5, q0, q9 @ choose \n"
"vbif q6, q0, q10 @ choose \n"
"vbif q7, q0, q11 @ choose \n"
/* fp32 - int32 */
"vcvt.s32.f32 q0, q4 @ cvt to int32\n"
"vcvt.s32.f32 q1, q5 @ cvt to int32\n"
"vcvt.s32.f32 q2, q6 @ cvt to int32\n"
......@@ -150,9 +159,22 @@ void fp32_to_int8(const float* din,
: [vscale] "w"(vscale),
[vpoff] "w"(vpoff),
[vnoff] "w"(vnoff),
[vzero] "w"(vzero),
[vmax] "w"(vmax)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10");
[vmax] "r"(vmax),
[vzero] "w"(vzero)
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11");
#endif
}
const float* din_r = din_c + 16 * cnt;
......@@ -203,7 +225,7 @@ void fp32_to_int16(const float* din,
"bne 0b \n"
: [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w"(vscale)
: "v0", "v1", "v4", "v5", "v8", "v9");
: "cc", "memory", "v0", "v1", "v4", "v5", "v8", "v9");
#else
asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
......@@ -232,7 +254,7 @@ void fp32_to_int16(const float* din,
[vpoff] "w"(vpoff),
[vnoff] "w"(vnoff),
[vzero] "w"(vzero)
: "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
: "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
#endif
}
const float* din_r = din_c + 8 * cnt;
......@@ -294,7 +316,9 @@ void int8_to_fp32(const int8_t* in,
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0",
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
......@@ -335,7 +359,7 @@ void int8_to_fp32(const int8_t* in,
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
}
const signed char* din_r = din_c + 16 * cnt;
......@@ -394,7 +418,18 @@ void int16_to_fp32(const int16_t* in,
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
: "cc",
"memory",
"v0",
"v1",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11");
#else
asm volatile(
"vld1.32 {d0-d3}, [%[in]]! @ load 16 int16\n"
......@@ -422,7 +457,7 @@ void int16_to_fp32(const int16_t* in,
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
}
const int16_t* din_r = din_c + 16 * cnt;
......@@ -473,7 +508,9 @@ void int32_to_fp32(const int* din,
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0",
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
......@@ -506,7 +543,9 @@ void int32_to_fp32(const int* din,
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "q0",
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
......@@ -551,41 +590,53 @@ void int32_to_int8(const int* din,
const int* din_ptr = din_c;
int8_t* dout_ptr = dout_c;
#ifdef __aarch64__
float32x4_t vmax = vdupq_n_f32(-127.0);
asm volatile(
"0: \n"
"ld1 {v0.4s, v1.4s}, [%[in]], #32 \n"
"ld1 {v2.4s, v3.4s}, [%[in]], #32 \n"
/* int32 - fp32 */
"scvtf v4.4s, v0.4s \n"
"scvtf v5.4s, v1.4s \n"
"scvtf v6.4s, v2.4s \n"
"scvtf v7.4s, v3.4s \n"
/* mul scale */
"fmul v0.4s, v4.4s, %[scale].4s \n"
"fmul v1.4s, v5.4s, %[scale].4s \n"
"fmul v2.4s, v6.4s, %[scale].4s \n"
"fmul v3.4s, v7.4s, %[scale].4s \n"
/* data >= -127 */
"fcmge v4.4s, v0.4s, %[vmax].4s \n"
"fcmge v5.4s, v1.4s, %[vmax].4s \n"
"fcmge v6.4s, v2.4s, %[vmax].4s \n"
"fcmge v7.4s, v3.4s, %[vmax].4s \n"
/* choose data */
"bif v0.16b, %[vmax].16b, v4.16b \n"
"bif v1.16b, %[vmax].16b, v5.16b \n"
"bif v2.16b, %[vmax].16b, v6.16b \n"
"bif v3.16b, %[vmax].16b, v7.16b \n"
/* fp32 - int32 */
"fcvtas v4.4s, v0.4s \n"
"fcvtas v5.4s, v1.4s \n"
"fcvtas v6.4s, v2.4s \n"
"fcvtas v7.4s, v3.4s \n"
/* int32 - int16 */
"sqxtn v0.4h, v4.4s \n"
"sqxtn2 v0.8h, v5.4s \n"
"sqxtn v1.4h, v6.4s \n"
"sqxtn2 v1.8h, v7.4s \n"
/* int16 - int8 */
"sqxtn v2.8b, v0.8h \n"
"sqxtn2 v2.16b, v1.8h \n"
/* store */
"st1 {v2.16b}, [%[out]], #16 \n"
"subs %[loop], %[loop], #1 \n"
"bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
: [scale] "w"(vscale), [vmax] "w"(vmax)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
float vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
"vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n"
......@@ -607,9 +658,21 @@ void int32_to_int8(const int* din,
"vbif.f32 q2, %q[vnoff], q10 @ get right offset\n"
"vbif.f32 q3, %q[vnoff], q11 @ get right offset\n"
"vmla.f32 q0, q4, %q[vscale] @ mul scale\n"
"vld1.32 {d8-d9}, [%[vmax]] @ set q4 = -127 \n"
"vmla.f32 q1, q5, %q[vscale] @ mul scale\n"
"vmla.f32 q2, q6, %q[vscale] @ mul scale\n"
"vmla.f32 q3, q7, %q[vscale] @ mul scale\n"
/* data >= -127 */
"vcge.f32 q8, q0, q4 @ q0 >= -127 \n"
"vcge.f32 q9, q1, q4 @ q1 >= -127 \n"
"vcge.f32 q10, q2, q4 @ q2 >= -127 \n"
"vcge.f32 q11, q3, q4 @ q3 >= -127 \n"
/* choose data */
"vbif q0, q4, q8 @ choose \n"
"vbif q1, q4, q9 @ choose \n"
"vbif q2, q4, q10 @ choose \n"
"vbif q3, q4, q11 @ choose \n"
/* fp32 - int32 */
"vcvt.s32.f32 q4, q0 @ cvt to int32\n"
"vcvt.s32.f32 q5, q1 @ cvt to int32\n"
"vcvt.s32.f32 q6, q2 @ cvt to int32\n"
......@@ -628,9 +691,12 @@ void int32_to_int8(const int* din,
: [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr)
: [vscale] "w"(vscale),
[vzero] "w"(vzero),
[vmax] "r"(vmax),
[vnoff] "w"(vnoff),
[vpoff] "w"(vpoff)
: "q0",
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
......@@ -648,6 +714,7 @@ void int32_to_int8(const int* din,
int8_t* dout_r = dout_c + 16 * cnt;
for (int i = 0; i < remain; ++i) {
dout_r[i] = saturate_cast<int8_t>(roundf(in_scale * din_r[i]));
dout_r[i] = dout_r[i] < -127 ? -127 : dout_r[i];
}
}
}
......@@ -682,7 +749,7 @@ float compute_max_kernel(const float* din, int64_t size) {
"bne 0b \n"
: [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
asm volatile(
"vld1.32 {d0-d3}, [%[in]]! @ load 8 float\n"
......@@ -703,7 +770,7 @@ float compute_max_kernel(const float* din, int64_t size) {
: [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif
float32x2_t vmax_p =
vpmax_f32(vget_high_f32(vmax_val), vget_low_f32(vmax_val));
......
......@@ -34,7 +34,7 @@ DEFINE_int32(power_mode,
DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_bool(basic_test, false, "do all tests");
DEFINE_bool(basic_test, true, "do all tests");
DEFINE_bool(check_result, true, "check the result");
DEFINE_int32(batch, 1, "batch size");
......@@ -457,7 +457,7 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
const std::vector<int>& power_mode) {}
#endif // LITE_WITH_ARM
#if 0 /// 3x3dw
#if 1 /// 3x3dw
TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) {
if (FLAGS_basic_test) {
for (auto& stride : {1, 2}) {
......@@ -525,7 +525,7 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
}
#endif /// 5x5dw
#if 0 /// conv1x1s1
#if 1 /// conv1x1s1
TEST(TestConv1x1s1Int8, test_conv1x1s1) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 32}) {
......@@ -562,7 +562,7 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) {
}
#endif /// conv1x1s1
#if 0 /// conv3x3s1
#if 1 /// conv3x3s1
TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 33}) {
......@@ -602,7 +602,7 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
}
#endif /// conv3x3s1
#if 0 /// conv3x3s2
#if 1 /// conv3x3s2
TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 31}) {
......
......@@ -37,7 +37,7 @@ DEFINE_int32(power_mode,
DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_bool(basic_test, false, "do all tests");
DEFINE_bool(basic_test, true, "do all tests");
DEFINE_bool(check_result, true, "check the result");
DEFINE_int32(M, 512, "gemm: M");
......