未验证 提交 c0fa7ada 编写于 作者: H HappyAngel 提交者: GitHub

[ARM] fix calibration error, values from [-128,127] to [-127,127] (#2927)

* fix calibration error, values from [-128, 127] to [-127, 127], test=develop
* add classify demo and detection demo, test=develop
上级 b94839b0
...@@ -330,6 +330,30 @@ if(NOT IOS) ...@@ -330,6 +330,30 @@ if(NOT IOS)
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels})
lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${ops} ${host_kernels}
...@@ -341,6 +365,7 @@ if(NOT IOS) ...@@ -341,6 +365,7 @@ if(NOT IOS)
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels})
lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${ops} ${host_kernels}
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <sstream>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/core/profile/timer.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE
using paddle::lite::profile::Timer;
// Input tensor shapes, e.g. "1,3,224,224"; multiple inputs are separated by
// ':' and dims within one shape by ','.
DEFINE_string(input_shape,
              "1,3,224,224",
              "input shapes, separated by colon and comma");
// When true, --model_dir already points at an optimized naive-buffer model.
DEFINE_bool(use_optimize_nb,
            false,
            "optimized & naive buffer model for mobile devices");
// Name of an intermediate tensor to dump (requires memory_optimize_pass off).
DEFINE_string(arg_name, "", "the arg name");
// NOTE(review): this flag is not referenced anywhere in this binary.
DEFINE_string(threshold, "0.5", "threshold value default 0.5f");
DEFINE_string(in_txt, "", "input text");  // input values file; empty = fill inputs with 1.f
DEFINE_string(out_txt, "", "output text");  // output dump path/prefix; empty = no dump
DEFINE_string(label_file, "", "label file path");
DEFINE_int32(topk, 1, "topk num");
namespace paddle {
namespace lite_api {
// Converts the fp32 model in |load_model_dir| to Paddle-Lite's optimized
// naive-buffer format and writes it to |save_optimized_model_dir|.
// |input_shapes| is not used here; it is kept for a uniform call signature.
void OutputOptModel(const std::string& load_model_dir,
                    const std::string& save_optimized_model_dir,
                    const std::vector<std::vector<int64_t>>& input_shapes) {
  lite_api::CxxConfig config;
  config.set_model_dir(load_model_dir);
  // Only the ARM float place is needed to generate the optimized model.
  config.set_valid_places({
      Place{TARGET(kARM), PRECISION(kFloat)},
  });
  auto predictor = lite_api::CreatePaddlePredictor(config);
  // delete old optimized model
  // system() returns 0 when the shell command succeeded.
  int ret = system(
      paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str())
          .c_str());
  if (ret == 0) {
    LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
  }
  predictor->SaveOptimizedModel(save_optimized_model_dir,
                                LiteModelType::kNaiveBuffer);
  LOG(INFO) << "Load model from " << load_model_dir;
  LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Reads a label file where each line looks like "<id> <label text>" and
// returns the label part of every such line (the text from the first space
// onward, trailing newline stripped).
std::vector<std::string> load_labels(std::string label_path) {
  FILE* fp = fopen(label_path.c_str(), "r");
  if (fp == nullptr) {
    LOG(FATAL) << "load label file failed! " << label_path;
  }
  std::vector<std::string> labels;
  char str[1024];
  // Loop on fgets() directly: the original `while (!feof(fp))` pattern
  // processed the last line twice and read an uninitialized buffer when the
  // final fgets() failed.
  while (fgets(str, sizeof(str), fp) != nullptr) {
    std::string line(str);
    // Strip the line terminator explicitly; blindly dropping the last
    // character corrupted the label when a final line had no trailing '\n'.
    while (!line.empty() && (line.back() == '\n' || line.back() == '\r')) {
      line.pop_back();
    }
    // Keep only lines that contain a space (id/label separator); the label
    // starts at the first space, matching the original substr behavior.
    size_t pos = line.find(' ');
    if (pos != std::string::npos) {
      labels.push_back(line.substr(pos));
    }
  }
  fclose(fp);
  return labels;
}
void print_topk(const float* scores,
const int size,
const int topk,
const std::vector<std::string> labels) {
std::vector<std::pair<float, int>> vec;
vec.resize(size);
for (int i = 0; i < size; i++) {
vec[i] = std::make_pair(scores[i], i);
}
std::partial_sort(vec.begin(),
vec.begin() + topk,
vec.end(),
std::greater<std::pair<float, int>>());
// print topk and score
std::string name = FLAGS_out_txt + "_accu.txt";
FILE* fp = fopen(name.c_str(), "w");
fprintf(fp, "%d \n", topk);
for (int i = 0; i < topk; i++) {
float score = vec[i].first;
int index = vec[i].second;
fprintf(fp, "%d ", index);
fprintf(fp, "%f \n", score);
LOG(INFO) << i << ": " << index << " " << labels[index] << " " << score;
}
fclose(fp);
}
void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const PowerMode power_mode,
const int thread_num,
const int repeat,
const int warmup_times = 0) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_power_mode(power_mode);
config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config);
bool flag_in = true;
bool flag_out = true;
if (FLAGS_in_txt == "") {
flag_in = false;
}
if (FLAGS_out_txt == "") {
flag_out = false;
}
printf("flag_in: %d, flag_out: %d \n", flag_in, flag_out);
for (int j = 0; j < input_shapes.size(); ++j) {
auto input_tensor = predictor->GetInput(j);
input_tensor->Resize(input_shapes[j]);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (int i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
FILE* fp_r = nullptr;
if (flag_in) {
fp_r = fopen(FLAGS_in_txt.c_str(), "r");
}
for (int i = 0; i < input_num; ++i) {
if (flag_in) {
fscanf(fp_r, "%f\n", &input_data[i]);
} else {
input_data[i] = 1.f;
}
}
if (flag_in) {
fclose(fp_r);
}
}
for (int i = 0; i < warmup_times; ++i) {
predictor->Run();
}
Timer ti;
for (int j = 0; j < repeat; ++j) {
ti.Start();
predictor->Run();
float t = ti.Stop();
LOG(INFO) << "iter: " << j << ", time: " << t << " ms";
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir
<< ", power_mode: " << static_cast<int>(power_mode)
<< ", threads num " << thread_num << ", warmup: " << warmup_times
<< ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg()
<< " ms"
<< ", min time: " << ti.LapTimes().Min() << " ms"
<< ", max time: " << ti.LapTimes().Max() << " ms.";
auto output = predictor->GetOutput(0);
auto out = output->data<float>();
auto output_shape = output->shape();
int output_num = 1;
for (int i = 0; i < output_shape.size(); ++i) {
output_num *= output_shape[i];
}
// classify
printf("load_labels \n");
std::vector<std::string> labels = load_labels(FLAGS_label_file);
printf("print_topk \n");
print_topk(out, output_num, FLAGS_topk, labels);
LOG(INFO) << "output_num: " << output_num;
LOG(INFO) << "out " << out[0];
LOG(INFO) << "out " << out[1];
FILE* fp = nullptr;
if (flag_out) {
fp = fopen(FLAGS_out_txt.c_str(), "w");
}
double sum1 = 0.f;
for (int i = 0; i < output_num; ++i) {
if (flag_out) {
fprintf(fp, "%f\n", out[i]);
}
sum1 += out[i];
}
if (flag_out) {
fclose(fp);
}
printf("out mean: %f \n", sum1 / output_num);
FILE* fp_w = fopen("time.txt", "a+");
if (!fp_w) {
printf("open file failed \n");
return;
}
fprintf(fp_w,
"model: %s, threads: %d, avg: %f ms, min: %f ms, max: %f ms \n",
model_dir.c_str(),
thread_num,
ti.LapTimes().Avg(),
ti.LapTimes().Min(),
ti.LapTimes().Max());
fclose(fp_w);
// please turn off memory_optimize_pass to use this feature.
if (FLAGS_arg_name != "") {
auto arg_tensor = predictor->GetTensor(FLAGS_arg_name);
auto arg_shape = arg_tensor->shape();
int arg_num = 1;
std::ostringstream os;
os << "{";
for (int i = 0; i < arg_shape.size(); ++i) {
arg_num *= arg_shape[i];
os << arg_shape[i] << ",";
}
os << "}";
float sum = 0.;
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << std::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
}
}
#endif
} // namespace lite_api
} // namespace paddle
// Entry point: parse flags, optionally generate the optimized naive-buffer
// model, then run the classification benchmark on it.
int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  if (FLAGS_model_dir == "") {
    LOG(INFO) << "usage: "
              << "--model_dir /path/to/your/model";
    exit(0);
  }
  // With --use_optimize_nb the model dir is already optimized; otherwise the
  // optimized copy is written next to it with an "opt2" suffix.
  std::string save_optimized_model_dir =
      FLAGS_use_optimize_nb ? FLAGS_model_dir : FLAGS_model_dir + "opt2";
  // Splits a colon-separated list, e.g. "1,3,224,224:1,3,32,32".
  auto split_string =
      [](const std::string& str_in) -> std::vector<std::string> {
    std::vector<std::string> pieces;
    std::string rest = str_in;
    while (!rest.empty()) {
      const size_t sep = rest.find(":");
      pieces.push_back(rest.substr(0, sep));
      if (sep == std::string::npos) {
        break;
      }
      rest = rest.substr(sep + 1);
    }
    return pieces;
  };
  // Parses one comma-separated dim list ("1,3,224,224") into int64 dims;
  // atoi reads the leading integer of the remaining string each round.
  auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
    std::vector<int64_t> dims;
    std::string rest = str_shape;
    while (!rest.empty()) {
      dims.push_back(atoi(rest.data()));
      const size_t sep = rest.find(",");
      if (sep == std::string::npos) {
        break;
      }
      rest = rest.substr(sep + 1);
    }
    return dims;
  };
  LOG(INFO) << "input shapes: " << FLAGS_input_shape;
  std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
  std::vector<std::vector<int64_t>> input_shapes;
  for (size_t i = 0; i < str_input_shapes.size(); ++i) {
    LOG(INFO) << "input shape: " << str_input_shapes[i];
    input_shapes.push_back(get_shape(str_input_shapes[i]));
  }
  if (!FLAGS_use_optimize_nb) {
    // Output optimized model
    paddle::lite_api::OutputOptModel(
        FLAGS_model_dir, save_optimized_model_dir, input_shapes);
  }
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
  // Run inference using optimized model
  paddle::lite_api::Run(
      input_shapes,
      save_optimized_model_dir,
      static_cast<paddle::lite_api::PowerMode>(FLAGS_power_mode),
      FLAGS_threads,
      FLAGS_repeats,
      FLAGS_warmup);
#endif
  return 0;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <sstream>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/core/profile/timer.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE
using paddle::lite::profile::Timer;
// Input tensor shapes, e.g. "1,3,224,224"; multiple inputs are separated by
// ':' and dims within one shape by ','.
DEFINE_string(input_shape,
              "1,3,224,224",
              "input shapes, separated by colon and comma");
// When true, --model_dir already points at an optimized naive-buffer model.
DEFINE_bool(use_optimize_nb,
            false,
            "optimized & naive buffer model for mobile devices");
// Name of an intermediate tensor to dump (requires memory_optimize_pass off).
DEFINE_string(arg_name, "", "the arg name");
// Minimum detection score; parsed with atof() in Run().
DEFINE_string(threshold, "0.5", "threshold value default 0.5f");
DEFINE_string(in_txt, "", "input text");  // input values file; empty = fill inputs with 1.f
DEFINE_string(out_txt, "", "output text");  // output dump path/prefix; empty = no dump
DEFINE_int32(orih, 1920, "input image height");
DEFINE_int32(oriw, 1080, "input image width");
namespace paddle {
namespace lite_api {
// One detection result, converted to original-image pixel coordinates.
struct Object {
  float x;         // top-left x of the bounding box
  float y;         // top-left y of the bounding box
  float width;     // bounding-box width
  float height;    // bounding-box height
  float class_id;  // detected class id (kept as float, as emitted by the net)
  float prob;      // confidence score
};
// Loads the fp32 model from |load_model_dir| via the CxxConfig API and saves
// a naive-buffer optimized copy to |save_optimized_model_dir|.
// |input_shapes| is accepted only to keep a uniform signature.
void OutputOptModel(const std::string& load_model_dir,
                    const std::string& save_optimized_model_dir,
                    const std::vector<std::vector<int64_t>>& input_shapes) {
  lite_api::CxxConfig config;
  config.set_model_dir(load_model_dir);
  config.set_valid_places({
      Place{TARGET(kARM), PRECISION(kFloat)},
  });
  auto predictor = lite_api::CreatePaddlePredictor(config);
  // delete old optimized model
  const std::string rm_cmd = paddle::lite::string_format(
      "rm -rf %s", save_optimized_model_dir.c_str());
  const int rm_status = system(rm_cmd.c_str());
  if (rm_status == 0) {
    LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
  }
  predictor->SaveOptimizedModel(save_optimized_model_dir,
                                LiteModelType::kNaiveBuffer);
  LOG(INFO) << "Load model from " << load_model_dir;
  LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
void detect_choose(const float* dout,
std::vector<int64_t> dims,
const float thresh) {
std::string name = FLAGS_out_txt + "_accu.txt";
FILE* fp = fopen(name.c_str(), "w");
for (int iw = 0; iw < dims[0]; iw++) {
const float* values = dout + iw * dims[1];
if (values[1] > thresh) { // pro > 0.01
fprintf(fp, "%f \n", values[0]);
fprintf(fp, "%f \n", values[1]);
fprintf(fp, "%f \n", values[2]);
fprintf(fp, "%f \n", values[3]);
fprintf(fp, "%f \n", values[4]);
fprintf(fp, "%f \n", values[5]);
}
}
fclose(fp);
}
void detect_object(const float* dout,
std::vector<int64_t> dims,
const float thresh,
int orih,
int oriw) {
std::vector<Object> objects;
for (int iw = 0; iw < dims[0]; iw++) {
Object object;
const float* values = dout + iw * dims[1];
object.class_id = values[0];
object.prob = values[1];
object.x = values[2] * oriw;
object.y = values[3] * orih;
object.width = values[4] * oriw - object.x;
object.height = values[5] * orih - object.y;
objects.push_back(object);
}
std::string name = FLAGS_out_txt + "_accu.txt";
FILE* fp = fopen(name.c_str(), "w");
for (int i = 0; i < objects.size(); ++i) {
Object object = objects.at(i);
if (object.prob > thresh && object.x > 0 && object.y > 0 &&
object.width > 0 && object.height > 0) {
if (object.x >= oriw || object.width >= oriw || object.y >= orih ||
object.height >= orih)
continue;
fprintf(fp, "%f \n", object.x);
fprintf(fp, "%f \n", object.y);
fprintf(fp, "%f \n", object.width);
fprintf(fp, "%f \n", object.height);
fprintf(fp, "%f \n", object.prob);
fprintf(fp, "%f \n", object.class_id);
LOG(INFO) << "object id: " << object.class_id << ", image size: " << oriw
<< ", " << orih << ", detect object: " << object.prob
<< ", location: x=" << object.x << ", y=" << object.y
<< ", width=" << object.width << ", height=" << object.height;
}
}
fclose(fp);
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const PowerMode power_mode,
const int thread_num,
const int repeat,
const int warmup_times = 0) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_power_mode(power_mode);
config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config);
bool flag_in = true;
bool flag_out = true;
if (FLAGS_in_txt == "") {
flag_in = false;
}
if (FLAGS_out_txt == "") {
flag_out = false;
}
printf("flag_in: %d, flag_out: %d \n", flag_in, flag_out);
for (int j = 0; j < input_shapes.size(); ++j) {
auto input_tensor = predictor->GetInput(j);
input_tensor->Resize(input_shapes[j]);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (int i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
FILE* fp_r = nullptr;
if (flag_in) {
fp_r = fopen(FLAGS_in_txt.c_str(), "r");
}
for (int i = 0; i < input_num; ++i) {
if (flag_in) {
fscanf(fp_r, "%f\n", &input_data[i]);
} else {
input_data[i] = 1.f;
}
}
if (flag_in) {
fclose(fp_r);
}
}
for (int i = 0; i < warmup_times; ++i) {
predictor->Run();
}
Timer ti;
for (int j = 0; j < repeat; ++j) {
ti.Start();
predictor->Run();
float t = ti.Stop();
LOG(INFO) << "iter: " << j << ", time: " << t << " ms";
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir
<< ", power_mode: " << static_cast<int>(power_mode)
<< ", threads num " << thread_num << ", warmup: " << warmup_times
<< ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg()
<< " ms"
<< ", min time: " << ti.LapTimes().Min() << " ms"
<< ", max time: " << ti.LapTimes().Max() << " ms.";
auto output = predictor->GetOutput(0);
auto out = output->data<float>();
auto output_shape = output->shape();
// detect
detect_object(
out, output_shape, atof(FLAGS_threshold.data()), FLAGS_orih, FLAGS_oriw);
// detect_choose(out, output_shape, atof(FLAGS_threshold.data()));
LOG(INFO) << "out " << out[0];
LOG(INFO) << "out " << out[1];
int output_num = 1;
for (int i = 0; i < output_shape.size(); ++i) {
output_num *= output_shape[i];
}
LOG(INFO) << "output_num: " << output_num;
FILE* fp = nullptr;
if (flag_out) {
fp = fopen(FLAGS_out_txt.c_str(), "w");
}
double sum1 = 0.f;
for (int i = 0; i < output_num; ++i) {
if (flag_out) {
fprintf(fp, "%f\n", out[i]);
}
sum1 += out[i];
}
if (flag_out) {
fclose(fp);
}
printf("out mean: %f \n", sum1 / output_num);
FILE* fp_w = fopen("time.txt", "a+");
if (!fp_w) {
printf("open file failed \n");
return;
}
fprintf(fp_w,
"model: %s, threads: %d, avg: %f ms, min: %f ms, max: %f ms \n",
model_dir.c_str(),
thread_num,
ti.LapTimes().Avg(),
ti.LapTimes().Min(),
ti.LapTimes().Max());
fclose(fp_w);
// please turn off memory_optimize_pass to use this feature.
if (FLAGS_arg_name != "") {
auto arg_tensor = predictor->GetTensor(FLAGS_arg_name);
auto arg_shape = arg_tensor->shape();
int arg_num = 1;
std::ostringstream os;
os << "{";
for (int i = 0; i < arg_shape.size(); ++i) {
arg_num *= arg_shape[i];
os << arg_shape[i] << ",";
}
os << "}";
float sum = 0.;
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << std::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
}
}
#endif
} // namespace lite_api
} // namespace paddle
// Entry point: parse flags, optionally generate the optimized naive-buffer
// model, then run the detection benchmark on it.
int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  if (FLAGS_model_dir == "") {
    LOG(INFO) << "usage: "
              << "--model_dir /path/to/your/model";
    exit(0);
  }
  // With --use_optimize_nb the model dir is already optimized; otherwise the
  // optimized copy is written next to it with an "opt2" suffix.
  std::string save_optimized_model_dir = "";
  if (FLAGS_use_optimize_nb) {
    save_optimized_model_dir = FLAGS_model_dir;
  } else {
    save_optimized_model_dir = FLAGS_model_dir + "opt2";
  }
  // Splits a colon-separated list, e.g. "1,3,224,224:1,3,32,32".
  auto split_string =
      [](const std::string& str_in) -> std::vector<std::string> {
    std::vector<std::string> str_out;
    std::string tmp_str = str_in;
    while (!tmp_str.empty()) {
      size_t next_offset = tmp_str.find(":");
      str_out.push_back(tmp_str.substr(0, next_offset));
      if (next_offset == std::string::npos) {
        break;
      } else {
        tmp_str = tmp_str.substr(next_offset + 1);
      }
    }
    return str_out;
  };
  // Parses one comma-separated dim list ("1,3,224,224") into int64 dims;
  // atoi reads the leading integer of the remaining string each round.
  auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
    std::vector<int64_t> shape;
    std::string tmp_str = str_shape;
    while (!tmp_str.empty()) {
      int dim = atoi(tmp_str.data());
      shape.push_back(dim);
      size_t next_offset = tmp_str.find(",");
      if (next_offset == std::string::npos) {
        break;
      } else {
        tmp_str = tmp_str.substr(next_offset + 1);
      }
    }
    return shape;
  };
  LOG(INFO) << "input shapes: " << FLAGS_input_shape;
  std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
  std::vector<std::vector<int64_t>> input_shapes;
  for (int i = 0; i < str_input_shapes.size(); ++i) {
    LOG(INFO) << "input shape: " << str_input_shapes[i];
    input_shapes.push_back(get_shape(str_input_shapes[i]));
  }
  if (!FLAGS_use_optimize_nb) {
    // Output optimized model
    paddle::lite_api::OutputOptModel(
        FLAGS_model_dir, save_optimized_model_dir, input_shapes);
  }
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
  // Run inference using optimized model
  paddle::lite_api::Run(
      input_shapes,
      save_optimized_model_dir,
      static_cast<paddle::lite_api::PowerMode>(FLAGS_power_mode),
      FLAGS_threads,
      FLAGS_repeats,
      FLAGS_warmup);
#endif
  return 0;
}
...@@ -572,6 +572,25 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, ...@@ -572,6 +572,25 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
#define GEMM_INT8_INT8_OUT \ #define GEMM_INT8_INT8_OUT \
GEMM_TRANS_INT32_TO_FP32 \ GEMM_TRANS_INT32_TO_FP32 \
GEMM_INT8_RELU \ GEMM_INT8_RELU \
"ld1 {v8.4s}, [%[vmax]] \n" /* v8 = -127 */ \
/* data >= -127 */ \
"fcmge v0.4s, v16.4s, v8.4s\n" \
"fcmge v1.4s, v17.4s, v8.4s\n" \
"fcmge v2.4s, v18.4s, v8.4s\n" \
"fcmge v3.4s, v19.4s, v8.4s\n" \
"fcmge v4.4s, v20.4s, v8.4s\n" \
"fcmge v5.4s, v21.4s, v8.4s\n" \
"fcmge v6.4s, v22.4s, v8.4s\n" \
"fcmge v7.4s, v23.4s, v8.4s\n" \
/* choose data */ \
"bif v16.16b, v8.16b, v0.16b \n" \
"bif v17.16b, v8.16b, v1.16b \n" \
"bif v18.16b, v8.16b, v2.16b \n" \
"bif v19.16b, v8.16b, v3.16b \n" \
"bif v20.16b, v8.16b, v4.16b \n" \
"bif v21.16b, v8.16b, v5.16b \n" \
"bif v22.16b, v8.16b, v6.16b \n" \
"bif v23.16b, v8.16b, v7.16b \n" \
"fcvtas v0.4s, v16.4s\n" /* 00, cvt to int */ \ "fcvtas v0.4s, v16.4s\n" /* 00, cvt to int */ \
"fcvtas v1.4s, v17.4s\n" /* 01, cvt to int */ \ "fcvtas v1.4s, v17.4s\n" /* 01, cvt to int */ \
"fcvtas v2.4s, v18.4s\n" /* 02, cvt to int */ \ "fcvtas v2.4s, v18.4s\n" /* 02, cvt to int */ \
...@@ -580,6 +599,24 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, ...@@ -580,6 +599,24 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
"fcvtas v5.4s, v21.4s\n" /* 11, cvt to int */ \ "fcvtas v5.4s, v21.4s\n" /* 11, cvt to int */ \
"fcvtas v6.4s, v22.4s\n" /* 12, cvt to int */ \ "fcvtas v6.4s, v22.4s\n" /* 12, cvt to int */ \
"fcvtas v7.4s, v23.4s\n" /* 13, cvt to int */ \ "fcvtas v7.4s, v23.4s\n" /* 13, cvt to int */ \
/* data >= -127 */ \
"fcmge v16.4s, v24.4s, v8.4s\n" \
"fcmge v17.4s, v25.4s, v8.4s\n" \
"fcmge v18.4s, v26.4s, v8.4s\n" \
"fcmge v19.4s, v27.4s, v8.4s\n" \
"fcmge v20.4s, v28.4s, v8.4s\n" \
"fcmge v21.4s, v29.4s, v8.4s\n" \
"fcmge v22.4s, v30.4s, v8.4s\n" \
"fcmge v23.4s, v31.4s, v8.4s\n" \
/* choose data */ \
"bif v24.16b, v8.16b, v16.16b\n" \
"bif v25.16b, v8.16b, v17.16b\n" \
"bif v26.16b, v8.16b, v18.16b\n" \
"bif v27.16b, v8.16b, v19.16b\n" \
"bif v28.16b, v8.16b, v20.16b\n" \
"bif v29.16b, v8.16b, v21.16b\n" \
"bif v30.16b, v8.16b, v22.16b\n" \
"bif v31.16b, v8.16b, v23.16b\n" \
"sqxtn v16.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \ "sqxtn v16.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \
"fcvtas v8.4s, v24.4s\n" /* 20, cvt to int */ \ "fcvtas v8.4s, v24.4s\n" /* 20, cvt to int */ \
"sqxtn2 v16.8h, v1.4s\n" /* 01, cvt int32 to int16 */ \ "sqxtn2 v16.8h, v1.4s\n" /* 01, cvt int32 to int16 */ \
...@@ -648,7 +685,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, ...@@ -648,7 +685,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
"v9","v10","v11","v12","v13","v14", "v9","v10","v11","v12","v13","v14",
"v15","v16","v17","v18","v19","v20", "v15","v16","v17","v18","v19","v20",
"v21","v22","v23","v24","v25","v26", "v21","v22","v23","v24","v25","v26",
"v27","v28","v29","v30","v31","cc"); "v27","v28","v29","v30","v31","cc", "memory");
// clang-format on // clang-format on
} }
...@@ -665,6 +702,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, ...@@ -665,6 +702,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
int k, int k,
int rem) { int rem) {
// clang-format off // clang-format off
float vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT
: [a_ptr] "+r"(a_ptr), : [a_ptr] "+r"(a_ptr),
[b_ptr] "+r"(b_ptr), [b_ptr] "+r"(b_ptr),
...@@ -676,13 +714,14 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, ...@@ -676,13 +714,14 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
: [is_relu] "r"(is_relu), : [is_relu] "r"(is_relu),
[bias] "r"(bias), [bias] "r"(bias),
[rem] "r"(rem), [rem] "r"(rem),
[scale] "r"(scale) [scale] "r"(scale),
[vmax] "r"(vmax)
: "v0","v1","v2","v3","v4","v5","v6","v7", : "v0","v1","v2","v3","v4","v5","v6","v7",
"v8","v9","v10","v11","v12", "v8","v9","v10","v11","v12",
"v13","v14","v15","v16","v17", "v13","v14","v15","v16","v17",
"v18","v19","v20","v21","v22", "v18","v19","v20","v21","v22",
"v23","v24","v25","v26","v27", "v23","v24","v25","v26","v27",
"v28","v29","v30","v31","cc"); "v28","v29","v30","v31","cc", "memory");
// clang-format on // clang-format on
} }
...@@ -1179,6 +1218,25 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, ...@@ -1179,6 +1218,25 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
#define GEMM_SDOT_INT8_OUT \ #define GEMM_SDOT_INT8_OUT \
GEMM_SDOT_CVT_INT32_TO_FP32 \ GEMM_SDOT_CVT_INT32_TO_FP32 \
GEMM_SDOT_RELU \ GEMM_SDOT_RELU \
"ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \
/* data >= -127 */ \
"fcmge v0.4s, v8.4s, v6.4s\n" \
"fcmge v1.4s, v9.4s, v6.4s\n" \
"fcmge v2.4s, v10.4s, v6.4s\n" \
"fcmge v3.4s, v11.4s, v6.4s\n" \
"fcmge v4.4s, v12.4s, v6.4s\n" \
"fcmge v5.4s, v13.4s, v6.4s\n" \
"fcmge v7.4s, v14.4s, v6.4s\n" \
/* choose data */ \
"bif v8.16b, v6.16b, v0.16b\n" \
"fcmge v0.4s, v15.4s, v6.4s\n" \
"bif v9.16b, v6.16b, v1.16b\n" \
"bif v10.16b, v6.16b, v2.16b\n" \
"bif v11.16b, v6.16b, v3.16b\n" \
"bif v12.16b, v6.16b, v4.16b\n" \
"bif v13.16b, v6.16b, v5.16b\n" \
"bif v14.16b, v6.16b, v7.16b\n" \
"bif v15.16b, v6.16b, v0.16b \n" \
"fcvtas v0.4s, v8.4s\n" /* 00, cvt to int */ \ "fcvtas v0.4s, v8.4s\n" /* 00, cvt to int */ \
"fcvtas v1.4s, v9.4s\n" /* 01, cvt to int */ \ "fcvtas v1.4s, v9.4s\n" /* 01, cvt to int */ \
"fcvtas v2.4s, v10.4s\n" /* 02, cvt to int */ \ "fcvtas v2.4s, v10.4s\n" /* 02, cvt to int */ \
...@@ -1194,7 +1252,30 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, ...@@ -1194,7 +1252,30 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
"sqxtn2 v12.8h, v4.4s\n" /* 11, cvt int32 to int16 */ \ "sqxtn2 v12.8h, v4.4s\n" /* 11, cvt int32 to int16 */ \
"sqxtn v13.4h, v5.4s\n" /* 12, cvt int32 to int16 */ \ "sqxtn v13.4h, v5.4s\n" /* 12, cvt int32 to int16 */ \
"sqxtn v14.4h, v6.4s\n" /* 20, cvt int32 to int16 */ \ "sqxtn v14.4h, v6.4s\n" /* 20, cvt int32 to int16 */ \
"ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \
"sqxtn2 v14.8h, v7.4s\n" /* 21, cvt int32 to int16 */ \ "sqxtn2 v14.8h, v7.4s\n" /* 21, cvt int32 to int16 */ \
/* data >= -127 */ \
"fcmge v0.4s, v16.4s, v6.4s\n" \
"fcmge v1.4s, v17.4s, v6.4s\n" \
"fcmge v2.4s, v18.4s, v6.4s\n" \
"fcmge v3.4s, v19.4s, v6.4s\n" \
"fcmge v4.4s, v20.4s, v6.4s\n" \
"fcmge v5.4s, v21.4s, v6.4s\n" \
"fcmge v7.4s, v22.4s, v6.4s\n" \
"fcmge v8.4s, v23.4s, v6.4s\n" \
"fcmge v9.4s, v24.4s, v6.4s\n" \
/* choose data */ \
"bif v16.16b, v6.16b, v0.16b\n" \
"fcmge v0.4s, v25.4s, v6.4s\n" \
"bif v17.16b, v6.16b, v1.16b\n" \
"bif v18.16b, v6.16b, v2.16b\n" \
"bif v19.16b, v6.16b, v3.16b\n" \
"bif v20.16b, v6.16b, v4.16b\n" \
"bif v21.16b, v6.16b, v5.16b\n" \
"bif v22.16b, v6.16b, v7.16b\n" \
"bif v23.16b, v6.16b, v8.16b\n" \
"bif v24.16b, v6.16b, v9.16b\n" \
"bif v25.16b, v6.16b, v0.16b\n" \
"fcvtas v0.4s, v16.4s\n" /* 22, cvt to int */ \ "fcvtas v0.4s, v16.4s\n" /* 22, cvt to int */ \
"fcvtas v1.4s, v17.4s\n" /* 30, cvt to int */ \ "fcvtas v1.4s, v17.4s\n" /* 30, cvt to int */ \
"fcvtas v2.4s, v18.4s\n" /* 31, cvt to int */ \ "fcvtas v2.4s, v18.4s\n" /* 31, cvt to int */ \
...@@ -1214,7 +1295,22 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, ...@@ -1214,7 +1295,22 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
"sqxtn v19.4h, v6.4s\n" /* 42, cvt int32 to int16 */ \ "sqxtn v19.4h, v6.4s\n" /* 42, cvt int32 to int16 */ \
"sqxtn v20.4h, v7.4s\n" /* 50, cvt int32 to int16 */ \ "sqxtn v20.4h, v7.4s\n" /* 50, cvt int32 to int16 */ \
"sqxtn2 v20.8h, v8.4s\n" /* 51, cvt int32 to int16 */ \ "sqxtn2 v20.8h, v8.4s\n" /* 51, cvt int32 to int16 */ \
"ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \
"sqxtn v21.4h, v9.4s\n" /* 52, cvt int32 to int16 */ \ "sqxtn v21.4h, v9.4s\n" /* 52, cvt int32 to int16 */ \
/* data >= -127 */ \
"fcmge v0.4s, v26.4s, v6.4s\n" \
"fcmge v1.4s, v27.4s, v6.4s\n" \
"fcmge v2.4s, v28.4s, v6.4s\n" \
"fcmge v3.4s, v29.4s, v6.4s\n" \
"fcmge v4.4s, v30.4s, v6.4s\n" \
"fcmge v5.4s, v31.4s, v6.4s\n" \
/* choose data */ \
"bif v26.16b, v6.16b, v0.16b\n" \
"bif v27.16b, v6.16b, v1.16b\n" \
"bif v28.16b, v6.16b, v2.16b\n" \
"bif v29.16b, v6.16b, v3.16b\n" \
"bif v30.16b, v6.16b, v4.16b\n" \
"bif v31.16b, v6.16b, v5.16b\n" \
"fcvtas v0.4s, v26.4s\n" /* 60, cvt to int */ \ "fcvtas v0.4s, v26.4s\n" /* 60, cvt to int */ \
"fcvtas v1.4s, v27.4s\n" /* 61, cvt to int */ \ "fcvtas v1.4s, v27.4s\n" /* 61, cvt to int */ \
"fcvtas v2.4s, v28.4s\n" /* 62, cvt to int */ \ "fcvtas v2.4s, v28.4s\n" /* 62, cvt to int */ \
...@@ -1318,6 +1414,7 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, ...@@ -1318,6 +1414,7 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
int k, int k,
int tail) { int tail) {
// clang-format off // clang-format off
float32_t vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile(GEMM_SDOT_INT8_KERNEL GEMM_SDOT_INT8_OUT asm volatile(GEMM_SDOT_INT8_KERNEL GEMM_SDOT_INT8_OUT
: [a_ptr] "+r"(a_ptr), : [a_ptr] "+r"(a_ptr),
[b_ptr] "+r"(b_ptr), [b_ptr] "+r"(b_ptr),
...@@ -1331,7 +1428,7 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, ...@@ -1331,7 +1428,7 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
[c_ptr5] "+r"(c_ptr5), [c_ptr5] "+r"(c_ptr5),
[c_ptr6] "+r"(c_ptr6), [c_ptr6] "+r"(c_ptr6),
[c_ptr7] "+r"(c_ptr7) [c_ptr7] "+r"(c_ptr7)
: [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu) : [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu), [vmax] "r"(vmax)
: "cc","memory","v0","v1","v2","v3", : "cc","memory","v0","v1","v2","v3",
"v4","v5","v6","v7","v8","v9","v10", "v4","v5","v6","v7","v8","v9","v10",
"v11","v12","v13","v14","v15","v16","v17", "v11","v12","v13","v14","v15","v16","v17",
...@@ -1614,6 +1711,24 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, ...@@ -1614,6 +1711,24 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr,
"vadd.f32 q3, q11, q3\n" /* r21, add offset */ \ "vadd.f32 q3, q11, q3\n" /* r21, add offset */ \
"vadd.f32 q4, q12, q4\n" /* r30, add offset */ \ "vadd.f32 q4, q12, q4\n" /* r30, add offset */ \
"vadd.f32 q5, q13, q5\n" /* r31, add offset */ \ "vadd.f32 q5, q13, q5\n" /* r31, add offset */ \
"vld1.32 {d12-d13}, [%[vmax]]\n" /* set q4 = -127 \n"*/ \
"vcge.f32 q7, q8, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q10, q9, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q11, q0, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q12, q1, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q13, q2, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q14, q3, q6\n" /* @ q8 >= -127 \n */ \
"vcge.f32 q15, q4, q6\n" /* @ q8 >= -127 \n */ \
/* choose data */ \
"vbif q8, q6, q7\n" /* @ choose */ \
"vcge.f32 q7, q5, q6\n" /* @ q8 >= -127 \n */ \
"vbif q9, q6, q10\n" /* @ choose */ \
"vbif q0, q6, q11\n" /* @ choose */ \
"vbif q1, q6, q12\n" /* @ choose */ \
"vbif q2, q6, q13\n" /* @ choose */ \
"vbif q3, q6, q14\n" /* @ choose */ \
"vbif q4, q6, q15\n" /* @ choose */ \
"vbif q5, q6, q7\n" /* @ choose */ \
"vcvt.s32.f32 q6, q8\n" /* r00, fp32->int32 */ \ "vcvt.s32.f32 q6, q8\n" /* r00, fp32->int32 */ \
"vcvt.s32.f32 q7, q9\n" /* r01, fp32->int32 */ \ "vcvt.s32.f32 q7, q9\n" /* r01, fp32->int32 */ \
"vcvt.s32.f32 q10, q0\n" /* r10, fp32->int32 */ \ "vcvt.s32.f32 q10, q0\n" /* r10, fp32->int32 */ \
...@@ -1682,7 +1797,8 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, ...@@ -1682,7 +1797,8 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
"q14", "q14",
"q15", "q15",
"r0", "r0",
"cc"); "cc",
"memory");
} }
template <> template <>
...@@ -1697,6 +1813,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, ...@@ -1697,6 +1813,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
bool is_relu, bool is_relu,
int k, int k,
int rem) { int rem) {
float vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT
: [a_ptr] "+r"(a_ptr), : [a_ptr] "+r"(a_ptr),
[b_ptr] "+r"(b_ptr), [b_ptr] "+r"(b_ptr),
...@@ -1708,6 +1825,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, ...@@ -1708,6 +1825,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
: [is_relu] "r"(is_relu), : [is_relu] "r"(is_relu),
[bias] "r"(bias), [bias] "r"(bias),
[rem] "r"(rem), [rem] "r"(rem),
[vmax] "r"(vmax),
[scale] "r"(scale) [scale] "r"(scale)
: "q0", : "q0",
"q1", "q1",
...@@ -1726,7 +1844,8 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, ...@@ -1726,7 +1844,8 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
"q14", "q14",
"q15", "q15",
"r0", "r0",
"cc"); "cc",
"memory");
} }
#endif // __aarch64__ // NOLINT #endif // __aarch64__ // NOLINT
......
...@@ -79,6 +79,7 @@ inline void write_gemv_out(const int* in, ...@@ -79,6 +79,7 @@ inline void write_gemv_out(const int* in,
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
out[0] = out[0] =
saturate_cast<signed char>(roundf(*(in++) * *(scale++) + *(bias++))); saturate_cast<signed char>(roundf(*(in++) * *(scale++) + *(bias++)));
out[0] = out[0] < -127 ? -127 : out[0]; // -127 - 127
if (flag_relu) { if (flag_relu) {
out[0] = out[0] > 0 ? out[0] : 0; out[0] = out[0] > 0 ? out[0] : 0;
} }
...@@ -87,6 +88,7 @@ inline void write_gemv_out(const int* in, ...@@ -87,6 +88,7 @@ inline void write_gemv_out(const int* in,
} else { } else {
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
out[0] = saturate_cast<signed char>(roundf(*(in++) * *(scale++))); out[0] = saturate_cast<signed char>(roundf(*(in++) * *(scale++)));
out[0] = out[0] < -127 ? -127 : out[0]; // -127 - 127
if (flag_relu) { if (flag_relu) {
out[0] = out[0] > 0 ? out[0] : 0; out[0] = out[0] > 0 ? out[0] : 0;
} }
......
...@@ -40,13 +40,11 @@ void fp32_to_int8(const float* din, ...@@ -40,13 +40,11 @@ void fp32_to_int8(const float* din,
int cnt = inner_size / 16; int cnt = inner_size / 16;
int remain = inner_size & 15; int remain = inner_size & 15;
int64_t loop_size = outer_size * axis_size; int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < loop_size; ++j) { for (int j = 0; j < loop_size; ++j) {
float inv_scale = 1.f / scale[j % axis_size]; float inv_scale = 1.f / scale[j % axis_size];
float32x4_t vzero = vdupq_n_f32(0.f); float32x4_t vzero = vdupq_n_f32(0.f);
float32x4_t vscale = vdupq_n_f32(inv_scale); float32x4_t vscale = vdupq_n_f32(inv_scale);
float32x4_t vmax = vdupq_n_f32(-127.f);
float32x4_t vpoff = vdupq_n_f32(0.5f); float32x4_t vpoff = vdupq_n_f32(0.5f);
float32x4_t vnoff = vdupq_n_f32(-0.5f); float32x4_t vnoff = vdupq_n_f32(-0.5f);
const float* din_c = din + j * inner_size; const float* din_c = din + j * inner_size;
...@@ -56,6 +54,7 @@ void fp32_to_int8(const float* din, ...@@ -56,6 +54,7 @@ void fp32_to_int8(const float* din,
const float* din_ptr = din_c; const float* din_ptr = din_c;
signed char* dout_ptr = dout_c; signed char* dout_ptr = dout_c;
#ifdef __aarch64__ #ifdef __aarch64__
float32x4_t vmax = vdupq_n_f32(-127.0);
asm volatile( asm volatile(
"ldp q0, q1, [%[in]], #32 \n" "ldp q0, q1, [%[in]], #32 \n"
"ldp q2, q3, [%[in]], #32 \n" "ldp q2, q3, [%[in]], #32 \n"
...@@ -64,16 +63,19 @@ void fp32_to_int8(const float* din, ...@@ -64,16 +63,19 @@ void fp32_to_int8(const float* din,
"fmul v5.4s, v1.4s, %[scale].4s \n" "fmul v5.4s, v1.4s, %[scale].4s \n"
"fmul v6.4s, v2.4s, %[scale].4s \n" "fmul v6.4s, v2.4s, %[scale].4s \n"
"fmul v7.4s, v3.4s, %[scale].4s \n" "fmul v7.4s, v3.4s, %[scale].4s \n"
/* data >= -127 */
"fcmge v8.4s, v4.4s, %[vmax].4s \n" "fcmge v8.4s, v4.4s, %[vmax].4s \n"
"fcmge v9.4s, v5.4s, %[vmax].4s \n" "fcmge v9.4s, v5.4s, %[vmax].4s \n"
"fcmge v10.4s, v6.4s, %[vmax].4s \n" "fcmge v10.4s, v6.4s, %[vmax].4s \n"
"fcmge v11.4s, v7.4s, %[vmax].4s \n" "fcmge v11.4s, v7.4s, %[vmax].4s \n"
/* choose data */
"bif v4.16b, %[vmax].16b, v8.16b \n" "bif v4.16b, %[vmax].16b, v8.16b \n"
"bif v5.16b, %[vmax].16b, v9.16b \n" "bif v5.16b, %[vmax].16b, v9.16b \n"
"bif v6.16b, %[vmax].16b, v10.16b \n" "bif v6.16b, %[vmax].16b, v10.16b \n"
"bif v7.16b, %[vmax].16b, v11.16b \n" "bif v7.16b, %[vmax].16b, v11.16b \n"
"ldp q0, q1, [%[in]], #32 \n" "ldp q0, q1, [%[in]], #32 \n"
"subs %[cnt], %[cnt], #1 \n" "subs %[cnt], %[cnt], #1 \n"
/* fp32 - int32 */
"FCVTAS v8.4s, v4.4s \n" "FCVTAS v8.4s, v4.4s \n"
"FCVTAS v9.4s, v5.4s \n" "FCVTAS v9.4s, v5.4s \n"
"FCVTAS v10.4s, v6.4s \n" "FCVTAS v10.4s, v6.4s \n"
...@@ -89,7 +91,9 @@ void fp32_to_int8(const float* din, ...@@ -89,7 +91,9 @@ void fp32_to_int8(const float* din,
"bne 0b \n" "bne 0b \n"
: [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w"(vscale), [vmax] "w"(vmax) : [scale] "w"(vscale), [vmax] "w"(vmax)
: "v0", : "cc",
"memory",
"v0",
"v1", "v1",
"v2", "v2",
"v3", "v3",
...@@ -102,6 +106,7 @@ void fp32_to_int8(const float* din, ...@@ -102,6 +106,7 @@ void fp32_to_int8(const float* din,
"v10", "v10",
"v11"); "v11");
#else #else
float vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile( asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
"vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n"
...@@ -113,23 +118,27 @@ void fp32_to_int8(const float* din, ...@@ -113,23 +118,27 @@ void fp32_to_int8(const float* din,
"vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n"
"vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n"
"vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n" "vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n"
"vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n"
"vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n"
"vcgt.f32 q8, q3, %q[vzero] @ get mask > 0, in3\n"
"vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n"
"vbif.f32 q6, %q[vnoff], q10 @ get right offset\n" "vbif.f32 q6, %q[vnoff], q10 @ get right offset\n"
"vbif.f32 q7, %q[vnoff], q8 @ get right offset\n" "vbif.f32 q7, %q[vnoff], q11 @ get right offset\n"
"vmla.f32 q4, q0, %q[vscale] @ mul scale\n" "vmla.f32 q4, q0, %q[vscale] @ mul scale\n"
"vld1.32 {d0-d1}, [%[vmax]] @ set q0 = -127 \n"
"vmla.f32 q5, q1, %q[vscale] @ mul scale\n" "vmla.f32 q5, q1, %q[vscale] @ mul scale\n"
"vmla.f32 q6, q2, %q[vscale] @ mul scale\n" "vmla.f32 q6, q2, %q[vscale] @ mul scale\n"
"vmla.f32 q7, q3, %q[vscale] @ mul scale\n" "vmla.f32 q7, q3, %q[vscale] @ mul scale\n"
"vcge.f32 q8, q4, %q[vmax] @ q4 >= vmax \n" /* data >= -127 */
"vcge.f32 q9, q5, %q[vmax] @ q4 >= vmax \n" "vcge.f32 q8, q4, q0 @ q4 >= -127 \n"
"vcge.f32 q10, q6, %q[vmax] @ q4 >= vmax \n" "vcge.f32 q9, q5, q0 @ q4 >= -127 \n"
"vbif q4, %q[vmax], q8 @ choose \n" "vcge.f32 q10, q6, q0 @ q4 >= -127 \n"
"vcge.f32 q8, q7, %q[vmax] @ q4 >= vmax \n" "vcge.f32 q11, q7, q0 @ q4 >= -127 \n"
"vbif q5, %q[vmax], q9 @ choose \n" /* choose data */
"vbif q6, %q[vmax], q10 @ choose \n" "vbif q4, q0, q8 @ choose \n"
"vbif q7, %q[vmax], q8 @ choose \n" "vbif q5, q0, q9 @ choose \n"
"vbif q6, q0, q10 @ choose \n"
"vbif q7, q0, q11 @ choose \n"
/* fp32 - int32 */
"vcvt.s32.f32 q0, q4 @ cvt to int32\n" "vcvt.s32.f32 q0, q4 @ cvt to int32\n"
"vcvt.s32.f32 q1, q5 @ cvt to int32\n" "vcvt.s32.f32 q1, q5 @ cvt to int32\n"
"vcvt.s32.f32 q2, q6 @ cvt to int32\n" "vcvt.s32.f32 q2, q6 @ cvt to int32\n"
...@@ -150,9 +159,22 @@ void fp32_to_int8(const float* din, ...@@ -150,9 +159,22 @@ void fp32_to_int8(const float* din,
: [vscale] "w"(vscale), : [vscale] "w"(vscale),
[vpoff] "w"(vpoff), [vpoff] "w"(vpoff),
[vnoff] "w"(vnoff), [vnoff] "w"(vnoff),
[vzero] "w"(vzero), [vmax] "r"(vmax),
[vmax] "w"(vmax) [vzero] "w"(vzero)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10"); : "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11");
#endif #endif
} }
const float* din_r = din_c + 16 * cnt; const float* din_r = din_c + 16 * cnt;
...@@ -203,7 +225,7 @@ void fp32_to_int16(const float* din, ...@@ -203,7 +225,7 @@ void fp32_to_int16(const float* din,
"bne 0b \n" "bne 0b \n"
: [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w"(vscale) : [scale] "w"(vscale)
: "v0", "v1", "v4", "v5", "v8", "v9"); : "cc", "memory", "v0", "v1", "v4", "v5", "v8", "v9");
#else #else
asm volatile( asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
...@@ -232,7 +254,7 @@ void fp32_to_int16(const float* din, ...@@ -232,7 +254,7 @@ void fp32_to_int16(const float* din,
[vpoff] "w"(vpoff), [vpoff] "w"(vpoff),
[vnoff] "w"(vnoff), [vnoff] "w"(vnoff),
[vzero] "w"(vzero) [vzero] "w"(vzero)
: "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9"); : "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
#endif #endif
} }
const float* din_r = din_c + 8 * cnt; const float* din_r = din_c + 8 * cnt;
...@@ -294,7 +316,9 @@ void int8_to_fp32(const int8_t* in, ...@@ -294,7 +316,9 @@ void int8_to_fp32(const int8_t* in,
"bne 0b \n" "bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale) : [scale] "w"(vscale)
: "v0", : "cc",
"memory",
"v0",
"v1", "v1",
"v2", "v2",
"v3", "v3",
...@@ -335,7 +359,7 @@ void int8_to_fp32(const int8_t* in, ...@@ -335,7 +359,7 @@ void int8_to_fp32(const int8_t* in,
"bne 0b \n" "bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale) : [scale] "w"(vscale)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__ #endif // __aarch64__
} }
const signed char* din_r = din_c + 16 * cnt; const signed char* din_r = din_c + 16 * cnt;
...@@ -394,7 +418,18 @@ void int16_to_fp32(const int16_t* in, ...@@ -394,7 +418,18 @@ void int16_to_fp32(const int16_t* in,
"bne 0b \n" "bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale) : [scale] "w"(vscale)
: "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); : "cc",
"memory",
"v0",
"v1",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11");
#else #else
asm volatile( asm volatile(
"vld1.32 {d0-d3}, [%[in]]! @ load 16 int16\n" "vld1.32 {d0-d3}, [%[in]]! @ load 16 int16\n"
...@@ -422,7 +457,7 @@ void int16_to_fp32(const int16_t* in, ...@@ -422,7 +457,7 @@ void int16_to_fp32(const int16_t* in,
"bne 0b \n" "bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale) : [scale] "w"(vscale)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__ #endif // __aarch64__
} }
const int16_t* din_r = din_c + 16 * cnt; const int16_t* din_r = din_c + 16 * cnt;
...@@ -473,7 +508,9 @@ void int32_to_fp32(const int* din, ...@@ -473,7 +508,9 @@ void int32_to_fp32(const int* din,
"bne 0b \n" "bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale) : [scale] "w"(vscale)
: "v0", : "cc",
"memory",
"v0",
"v1", "v1",
"v2", "v2",
"v3", "v3",
...@@ -506,7 +543,9 @@ void int32_to_fp32(const int* din, ...@@ -506,7 +543,9 @@ void int32_to_fp32(const int* din,
"bne 0b \n" "bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale) : [scale] "w"(vscale)
: "q0", : "cc",
"memory",
"q0",
"q1", "q1",
"q2", "q2",
"q3", "q3",
...@@ -551,41 +590,53 @@ void int32_to_int8(const int* din, ...@@ -551,41 +590,53 @@ void int32_to_int8(const int* din,
const int* din_ptr = din_c; const int* din_ptr = din_c;
int8_t* dout_ptr = dout_c; int8_t* dout_ptr = dout_c;
#ifdef __aarch64__ #ifdef __aarch64__
float32x4_t vmax = vdupq_n_f32(-127.0);
asm volatile( asm volatile(
"0: \n" "0: \n"
"ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n"
"ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n"
/* int32 - fp32 */
"scvtf v4.4s, v0.4s \n" "scvtf v4.4s, v0.4s \n"
"scvtf v5.4s, v1.4s \n" "scvtf v5.4s, v1.4s \n"
"scvtf v6.4s, v2.4s \n" "scvtf v6.4s, v2.4s \n"
"scvtf v7.4s, v3.4s \n" "scvtf v7.4s, v3.4s \n"
/* mul scale */
"fmul v0.4s, v4.4s, %[scale].4s \n" "fmul v0.4s, v4.4s, %[scale].4s \n"
"fmul v1.4s, v5.4s, %[scale].4s \n" "fmul v1.4s, v5.4s, %[scale].4s \n"
"fmul v2.4s, v6.4s, %[scale].4s \n" "fmul v2.4s, v6.4s, %[scale].4s \n"
"fmul v3.4s, v7.4s, %[scale].4s \n" "fmul v3.4s, v7.4s, %[scale].4s \n"
/* data >= -127 */
"fcmge v4.4s, v0.4s, %[vmax].4s \n"
"fcmge v5.4s, v1.4s, %[vmax].4s \n"
"fcmge v6.4s, v2.4s, %[vmax].4s \n"
"fcmge v7.4s, v3.4s, %[vmax].4s \n"
/* choose data */
"bif v0.16b, %[vmax].16b, v4.16b \n"
"bif v1.16b, %[vmax].16b, v5.16b \n"
"bif v2.16b, %[vmax].16b, v6.16b \n"
"bif v3.16b, %[vmax].16b, v7.16b \n"
/* fp32 - int32 */
"fcvtas v4.4s, v0.4s \n" "fcvtas v4.4s, v0.4s \n"
"fcvtas v5.4s, v1.4s \n" "fcvtas v5.4s, v1.4s \n"
"fcvtas v6.4s, v2.4s \n" "fcvtas v6.4s, v2.4s \n"
"fcvtas v7.4s, v3.4s \n" "fcvtas v7.4s, v3.4s \n"
/* int32 - int16 */
"sqxtn v0.4h, v4.4s \n" "sqxtn v0.4h, v4.4s \n"
"sqxtn2 v0.8h, v5.4s \n" "sqxtn2 v0.8h, v5.4s \n"
"sqxtn v1.4h, v6.4s \n" "sqxtn v1.4h, v6.4s \n"
"sqxtn2 v1.8h, v7.4s \n" "sqxtn2 v1.8h, v7.4s \n"
/* int16 - int8 */
"sqxtn v2.8b, v0.8h \n" "sqxtn v2.8b, v0.8h \n"
"sqxtn2 v2.16b, v1.8h \n" "sqxtn2 v2.16b, v1.8h \n"
/* store */
"st1 {v2.16b}, [%[out]], #16 \n" "st1 {v2.16b}, [%[out]], #16 \n"
"subs %[loop], %[loop], #1 \n" "subs %[loop], %[loop], #1 \n"
"bne 0b \n" "bne 0b \n"
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale) : [scale] "w"(vscale), [vmax] "w"(vmax)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else #else
float vmax[4] = {-127.0, -127.0, -127.0, -127.0};
asm volatile( asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
"vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n"
...@@ -607,9 +658,21 @@ void int32_to_int8(const int* din, ...@@ -607,9 +658,21 @@ void int32_to_int8(const int* din,
"vbif.f32 q2, %q[vnoff], q10 @ get right offset\n" "vbif.f32 q2, %q[vnoff], q10 @ get right offset\n"
"vbif.f32 q3, %q[vnoff], q11 @ get right offset\n" "vbif.f32 q3, %q[vnoff], q11 @ get right offset\n"
"vmla.f32 q0, q4, %q[vscale] @ mul scale\n" "vmla.f32 q0, q4, %q[vscale] @ mul scale\n"
"vld1.32 {d8-d9}, [%[vmax]] @ set q4 = -127 \n"
"vmla.f32 q1, q5, %q[vscale] @ mul scale\n" "vmla.f32 q1, q5, %q[vscale] @ mul scale\n"
"vmla.f32 q2, q6, %q[vscale] @ mul scale\n" "vmla.f32 q2, q6, %q[vscale] @ mul scale\n"
"vmla.f32 q3, q7, %q[vscale] @ mul scale\n" "vmla.f32 q3, q7, %q[vscale] @ mul scale\n"
/* data >= -127 */
"vcge.f32 q8, q0, q4 @ q0 >= -127 \n"
"vcge.f32 q9, q1, q4 @ q1 >= -127 \n"
"vcge.f32 q10, q2, q4 @ q2 >= -127 \n"
"vcge.f32 q11, q3, q4 @ q3 >= -127 \n"
/* choose data */
"vbif q0, q4, q8 @ choose \n"
"vbif q1, q4, q9 @ choose \n"
"vbif q2, q4, q10 @ choose \n"
"vbif q3, q4, q11 @ choose \n"
/* fp32 - int32 */
"vcvt.s32.f32 q4, q0 @ cvt to int32\n" "vcvt.s32.f32 q4, q0 @ cvt to int32\n"
"vcvt.s32.f32 q5, q1 @ cvt to int32\n" "vcvt.s32.f32 q5, q1 @ cvt to int32\n"
"vcvt.s32.f32 q6, q2 @ cvt to int32\n" "vcvt.s32.f32 q6, q2 @ cvt to int32\n"
...@@ -628,9 +691,12 @@ void int32_to_int8(const int* din, ...@@ -628,9 +691,12 @@ void int32_to_int8(const int* din,
: [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr) : [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr)
: [vscale] "w"(vscale), : [vscale] "w"(vscale),
[vzero] "w"(vzero), [vzero] "w"(vzero),
[vmax] "r"(vmax),
[vnoff] "w"(vnoff), [vnoff] "w"(vnoff),
[vpoff] "w"(vpoff) [vpoff] "w"(vpoff)
: "q0", : "cc",
"memory",
"q0",
"q1", "q1",
"q2", "q2",
"q3", "q3",
...@@ -648,6 +714,7 @@ void int32_to_int8(const int* din, ...@@ -648,6 +714,7 @@ void int32_to_int8(const int* din,
int8_t* dout_r = dout_c + 16 * cnt; int8_t* dout_r = dout_c + 16 * cnt;
for (int i = 0; i < remain; ++i) { for (int i = 0; i < remain; ++i) {
dout_r[i] = saturate_cast<int8_t>(roundf(in_scale * din_r[i])); dout_r[i] = saturate_cast<int8_t>(roundf(in_scale * din_r[i]));
dout_r[i] = dout_r[i] < -127 ? -127 : dout_r[i];
} }
} }
} }
...@@ -682,7 +749,7 @@ float compute_max_kernel(const float* din, int64_t size) { ...@@ -682,7 +749,7 @@ float compute_max_kernel(const float* din, int64_t size) {
"bne 0b \n" "bne 0b \n"
: [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val) : [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val)
: :
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else #else
asm volatile( asm volatile(
"vld1.32 {d0-d3}, [%[in]]! @ load 8 float\n" "vld1.32 {d0-d3}, [%[in]]! @ load 8 float\n"
...@@ -703,7 +770,7 @@ float compute_max_kernel(const float* din, int64_t size) { ...@@ -703,7 +770,7 @@ float compute_max_kernel(const float* din, int64_t size) {
: [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val) : [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val)
: :
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif #endif
float32x2_t vmax_p = float32x2_t vmax_p =
vpmax_f32(vget_high_f32(vmax_val), vget_low_f32(vmax_val)); vpmax_f32(vget_high_f32(vmax_val), vget_low_f32(vmax_val));
......
...@@ -34,7 +34,7 @@ DEFINE_int32(power_mode, ...@@ -34,7 +34,7 @@ DEFINE_int32(power_mode,
DEFINE_int32(threads, 1, "threads num"); DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(warmup, 0, "warmup times"); DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times"); DEFINE_int32(repeats, 1, "repeats times");
DEFINE_bool(basic_test, false, "do all tests"); DEFINE_bool(basic_test, true, "do all tests");
DEFINE_bool(check_result, true, "check the result"); DEFINE_bool(check_result, true, "check the result");
DEFINE_int32(batch, 1, "batch size"); DEFINE_int32(batch, 1, "batch size");
...@@ -457,7 +457,7 @@ void test_conv_int8(const std::vector<DDim>& input_dims, ...@@ -457,7 +457,7 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
const std::vector<int>& power_mode) {} const std::vector<int>& power_mode) {}
#endif // LITE_WITH_ARM #endif // LITE_WITH_ARM
#if 0 /// 3x3dw #if 1 /// 3x3dw
TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) { TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) {
if (FLAGS_basic_test) { if (FLAGS_basic_test) {
for (auto& stride : {1, 2}) { for (auto& stride : {1, 2}) {
...@@ -525,7 +525,7 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) { ...@@ -525,7 +525,7 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
} }
#endif /// 5x5dw #endif /// 5x5dw
#if 0 /// conv1x1s1 #if 1 /// conv1x1s1
TEST(TestConv1x1s1Int8, test_conv1x1s1) { TEST(TestConv1x1s1Int8, test_conv1x1s1) {
if (FLAGS_basic_test) { if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 32}) { for (auto& cin : {1, 3, 8, 32}) {
...@@ -562,7 +562,7 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) { ...@@ -562,7 +562,7 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) {
} }
#endif /// conv1x1s1 #endif /// conv1x1s1
#if 0 /// conv3x3s1 #if 1 /// conv3x3s1
TEST(TestConv3x3s1Int8, test_conv_3x3s1) { TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
if (FLAGS_basic_test) { if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 33}) { for (auto& cin : {1, 3, 8, 33}) {
...@@ -602,7 +602,7 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) { ...@@ -602,7 +602,7 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
} }
#endif /// conv3x3s1 #endif /// conv3x3s1
#if 0 /// conv3x3s2 #if 1 /// conv3x3s2
TEST(TestConv3x3s2Int8, test_conv_3x3s2) { TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
if (FLAGS_basic_test) { if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 31}) { for (auto& cin : {1, 3, 31}) {
......
...@@ -37,7 +37,7 @@ DEFINE_int32(power_mode, ...@@ -37,7 +37,7 @@ DEFINE_int32(power_mode,
DEFINE_int32(threads, 1, "threads num"); DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(warmup, 0, "warmup times"); DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times"); DEFINE_int32(repeats, 1, "repeats times");
DEFINE_bool(basic_test, false, "do all tests"); DEFINE_bool(basic_test, true, "do all tests");
DEFINE_bool(check_result, true, "check the result"); DEFINE_bool(check_result, true, "check the result");
DEFINE_int32(M, 512, "gemm: M"); DEFINE_int32(M, 512, "gemm: M");
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册