Commit 81ff3830 authored by chenhoujiang

Merge branch 'develop' of https://github.com/PaddlePaddle/paddle-mobile into dev-latest

......@@ -24,6 +24,7 @@ const char *G_OP_TYPE_CONCAT = "concat";
const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const char *G_OP_TYPE_FILL_CONSTANT = "fill_constant";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8 = "fusion_conv_add_relu_int8";
const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
......@@ -31,6 +32,7 @@ const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu";
const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
const char *G_OP_TYPE_FC = "fusion_fc";
const char *G_OP_TYPE_FC_INT8 = "fusion_fc_int8";
const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const char *G_OP_TYPE_LRN = "lrn";
const char *G_OP_TYPE_MUL = "mul";
......@@ -117,11 +119,13 @@ std::unordered_map<
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_FC_INT8, {{"X", "Y", "Z", "Scale"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
......
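The table above maps each op type to its declared input and output keys; this hunk adds entries for the two new int8 fused ops. A minimal standalone sketch (table reduced to just those two entries, names taken from the hunk) of how such a map is consulted:

#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using OpIOMap = std::unordered_map<
    std::string,
    std::pair<std::vector<std::string>, std::vector<std::string>>>;

int main() {
  // Reduced copy of the table above: op type -> ({inputs}, {outputs}).
  OpIOMap op_input_output_key = {
      {"fusion_fc_int8", {{"X", "Y", "Z", "Scale"}, {"Out"}}},
      {"fusion_conv_add_relu_int8", {{"Input", "Scale"}, {"Out"}}},
  };
  const auto &io = op_input_output_key.at("fusion_fc_int8");
  for (const auto &in : io.first) std::cout << in << " ";  // X Y Z Scale
  std::cout << "-> " << io.second[0] << std::endl;         // -> Out
  return 0;
}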
......@@ -108,9 +108,11 @@ extern const char *G_OP_TYPE_BOX_CODER;
extern const char *G_OP_TYPE_CONCAT;
extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU;
extern const char *G_OP_TYPE_FC;
extern const char *G_OP_TYPE_FC_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU;
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "fpga/V1/api.h"
#include "fpga/V1/bias_scale.h"
#include "fpga/V1/deconv_filter.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
......@@ -124,6 +125,32 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
max_value);
filter_tensor->reset_data_ptr(new_data);
}
void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
int group_num, int stride) {
filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT
filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT
auto dims = filter_tensor->dims();
auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
memcpy(new_data, data_ptr, memory_size);
int hw = height * width;
deconv_filter::deconv_NC_convert(&new_data, num, channel, hw);
num = dims[1];
channel = dims[0];
deconv_filter::deconv_format_filter(
&new_data, (int)num, (int)channel, // NOLINT
(int)height, // NOLINT
(int)width, group_num, max_value, stride); // NOLINT
framework::DDim dims_new =
framework::make_ddim({num, channel, height, width});
filter_tensor->Resize(dims_new);
filter_tensor->reset_data_ptr(new_data);
}
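format_deconv_filter stores the symmetric int8 quantization pair scale[0] = max/127 and scale[1] = 127/max before handing the weights to deconv_format_filter. A self-contained sketch of the arithmetic those two scales imply (not filter::quantize itself):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Maps a weight to [-127, 127]; the multiplier is filter_tensor->scale[1].
int8_t quantize_weight(float w, float max_value) {
  float q = std::round(w * (127.0f / max_value));
  if (q > 127.0f) q = 127.0f;
  if (q < -127.0f) q = -127.0f;
  return static_cast<int8_t>(q);
}

int main() {
  float max_value = 2.0f;
  int8_t q = quantize_weight(1.0f, max_value);
  // Dequantize with scale[0] = max_value / 127.
  std::printf("q = %d, dequantized = %f\n", q, q * (max_value / 127.0f));
  return 0;
}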
void format_bias_scale_array(float **bias_scale_array,
int element_num_per_division, int num) {
......@@ -240,6 +267,100 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
filter->reset_data_ptr(nullptr);
fpga_free(bs_ptr);
}
void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w,
float *bs_ptr) {
auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>();
auto out_ptr = out->data<float>();
arg->group_num = (uint32_t)group_num;
arg->sub_conv_num = stride_h;
arg->filter_num = (uint32_t)filter->dims()[0];
int sub_conv_num = arg->sub_conv_num;
int sub_stride = 1;
int sub_pad = deconv_filter::deconv_calc_sub_pad(filter->dims()[3], padding_w,
stride_w);
int sub_filter_width =
deconv_filter::deconv_get_sub_filter_axis(filter->dims()[3], stride_w);
int sub_output_width = deconv_filter::deconv_get_sub_out_axis(
input->dims()[3], sub_pad, sub_filter_width);
int sub_output_height = deconv_filter::deconv_get_sub_out_axis(
input->dims()[2], sub_pad, sub_filter_width);
arg->sub_output_width = sub_output_width;
arg->sub_output_height = sub_output_height;
arg->omit_size =
deconv_filter::deconv_get_omit(stride_w, filter->dims()[3], padding_w);
arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs));
int sub_channels = (int32_t)input->dims()[1];
int omit_size = arg->omit_size;
int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;
int sub_filter_num = sub_conv_num * (arg->filter_num);
int conv_output_size =
(align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) *
sub_output_height;
  int output_size = conv_output_size * sub_conv_num;
int align_sub_filter_num = align_to_x(sub_filter_num, FILTER_NUM_ALIGNMENT);
int align_sub_filter_count =
align_to_x(sub_filter_width * sub_filter_width * sub_channels,
FILTER_ELEMENT_ALIGNMENT);
int align_conv_sub_filter_count =
align_sub_filter_count * align_sub_filter_num;
for (int i = 0; i < sub_conv_num; ++i) {
arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num);
arg->conv_args[i].group_num = group_num;
arg->conv_args[i].filter_scale_address = filter->scale;
arg->conv_args[i].relu_enabled = relu_enabled;
arg->conv_args[i].kernel.width = sub_filter_width;
arg->conv_args[i].kernel.height = sub_filter_width;
arg->conv_args[i].kernel.stride_w = 1;
arg->conv_args[i].kernel.stride_h = 1;
// DeconvParam.conv_args[i].image.address = (void*)ptr_image;
arg->conv_args[i].image.scale_address = input->scale;
arg->conv_args[i].image.channels = sub_channels;
arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
arg->conv_args[i].image.pad_width = sub_pad;
arg->conv_args[i].image.pad_height = sub_pad;
arg->conv_args[i].image.address = input_ptr;
arg->conv_args[i].sb_address = (void *)bs_ptr;
char *filter_sub_space =
(char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char));
fpga_copy(filter_sub_space,
(char *)filter_ptr + i * align_conv_sub_filter_count,
align_conv_sub_filter_count);
arg->conv_args[i].filter_address = (void *)(filter_sub_space);
fpga_flush(filter_sub_space, align_conv_sub_filter_count);
if (sub_conv_num == 1) {
arg->conv_args[i].output.address = out_ptr;
arg->conv_args[i].output.scale_address = out->scale;
} else {
half *ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
arg->conv_args[i].output.address = (void *)((half *)ptr_output);
float *ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
arg->conv_args[i].output.scale_address = ptr_output_scale;
}
}
arg->output.address = out_ptr;
arg->output.scale_address = out->scale;
// fpga_free(filter_ptr);
}
} // namespace fpga
} // namespace paddle_mobile
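fill_deconv_arg above decomposes a stride-s deconvolution into s stride-1 sub-convolutions, using the deconv_filter helpers defined further down. A quick numeric check of those formulas for a hypothetical 4x4 kernel, stride 2, pad 1, 8-wide input axis:

#include <cstdio>

int main() {
  int k = 4, pad = 1, stride = 2, in = 8;
  int sub_pad = (k - pad - 1) / stride;             // deconv_calc_sub_pad -> 1
  int sub_filter = k / stride;                      // deconv_get_sub_filter_axis -> 2
  int sub_out = in + 2 * sub_pad - sub_filter + 1;  // deconv_get_sub_out_axis -> 9
  int omit = 1;                                     // deconv_get_omit(2, 4, 1) -> 1
  int real_out = sub_out * stride - 2 * omit;       // as in fill_deconv_arg -> 16
  // Matches the textbook deconv size: (in - 1) * stride - 2 * pad + k = 16.
  std::printf("sub_pad=%d sub_filter=%d sub_out=%d real_out=%d\n", sub_pad,
              sub_filter, sub_out, real_out);
  return 0;
}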
......@@ -43,6 +43,25 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float* bs_ptr);
void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float* bs_ptr);
void format_deconv_filter(framework::Tensor* filter_tensor, float max_value,
int group_num, int stride);
template <typename Dtype>
void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) {
float data;
std::ofstream out(filename.c_str());
for (int i = 0; i < dataSize; ++i) {
data = (((Dtype*)buffer)[i]);
out << data << std::endl;
}
out.close();
return;
}
} // namespace fpga
} // namespace paddle_mobile
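A usage sketch for the savefile helper declared above; the fourth argument is only a type tag that fixes Dtype, its value is ignored:

#include "fpga/V1/api.h"

int main() {
  float buf[4] = {0.5f, -1.25f, 2.0f, 3.5f};
  // Writes one value per line to filter_dump.txt; 0.f only fixes Dtype = float.
  paddle_mobile::fpga::savefile<float>("filter_dump.txt", buf, 4, 0.f);
  return 0;
}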
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V1/deconv_bias_scale.h"
// #include "deconv_bias_scale.h"
#include "fpga/V1/bias_scale.h"
// #include "bias_scale.h"
#include <memory.h>
#include "fpga/V1/api.h"
// #include "fpga_api.h"
namespace paddle_mobile {
namespace fpga {
namespace deconv_bias_scale {
void deconv_bias_scale_expand(float** bias_scale_array, int num,
int sub_conv_n) {
int sub_num = num * sub_conv_n;
float* ptr_tmp = *bias_scale_array;
float* ptr_bias_scale_expand =
(float*)fpga_malloc(sizeof(float) * sub_num * 2);
int scale_base_offset = sub_num;
for (int i = 0; i < sub_conv_n; ++i) {
int offset = num * i;
// copy bias
fpga_copy(ptr_bias_scale_expand + offset, ptr_tmp, num * sizeof(float));
// copy scale
fpga_copy(ptr_bias_scale_expand + scale_base_offset + offset, ptr_tmp + num,
num * sizeof(float));
}
*bias_scale_array = ptr_bias_scale_expand;
fpga_free(ptr_tmp);
}
} // namespace deconv_bias_scale
} // namespace fpga
} // namespace paddle_mobile
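deconv_bias_scale_expand replicates both halves of the packed [bias | scale] array once per sub-convolution. A standalone check of the resulting layout with num = 2 and sub_conv_n = 2 (plain memcpy standing in for fpga_copy):

#include <cstdio>
#include <cstring>

int main() {
  const int num = 2, sub_conv_n = 2, sub_num = num * sub_conv_n;
  float in[2 * num] = {10.f, 11.f, 0.5f, 0.6f};  // [bias(num) | scale(num)]
  float out[2 * sub_num];
  for (int i = 0; i < sub_conv_n; ++i) {
    std::memcpy(out + num * i, in, num * sizeof(float));                  // bias
    std::memcpy(out + sub_num + num * i, in + num, num * sizeof(float));  // scale
  }
  for (int i = 0; i < 2 * sub_num; ++i) std::printf("%g ", out[i]);
  std::printf("\n");  // 10 11 10 11 0.5 0.6 0.5 0.6
  return 0;
}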
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define BS_NUM_ALIGNMENT 8
namespace paddle_mobile {
namespace fpga {
namespace deconv_bias_scale {
void deconv_bias_scale_expand(float** bias_scale_array, int num,
int sub_conv_n);
} // namespace deconv_bias_scale
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V1/deconv_filter.h"
#include <memory.h>
#include <algorithm>
// #include "deconv_filter.h"
#include "fpga/V1/filter.h"
// #include "filter.h"
#include "fpga/V1/api.h"
// #include "fpga_api.h"
// just for test
//#include <string>
//#include "deconv.h"
//#include "deconv_api.h"
// using namespace std;
// using namespace paddle_mobile::fpga;
// using namespace baidu::fpga::deconv::api;
// namespace api = baidu::fpga::deconv::api;
namespace paddle_mobile {
namespace fpga {
namespace deconv_filter {
/*
inverse kernel weights of each channel for every filter
*/
void deconv_inverse_filter(float** data_in, int num, int channel, int width,
int height) {
float* tmp = *data_in;
// float fix_range = 127;// float scale = fix_range / max;
int data_size = num * channel * width * height;
int hw_len = height * width;
float* tmp_data = (float*)fpga_malloc(data_size * sizeof(float));
for (int i = 0; i < num; ++i) {
for (int j = 0; j < channel; ++j) {
for (int k = 0; k < hw_len; ++k) {
tmp_data[i * channel * hw_len + j * hw_len + k] =
(*data_in)[i * channel * hw_len + j * hw_len + hw_len - k - 1];
}
}
}
*data_in = (float*)tmp_data; //
fpga_free(tmp);
}
/*
calculate sub padding number
*/
int deconv_calc_sub_pad(int filter_axis, int pad, int stride) {
if (stride == 0 || ((filter_axis - pad - 1) < 0)) {
// error
return 0;
}
return (filter_axis - pad - 1) / stride;
}
int deconv_get_sub_filter_axis(int filter_axis, int stride) {
return (filter_axis / stride);
}
int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) {
return ((image_axis + 2 * sub_pad - sub_filter_axis) + 1);
}
/*
(filter_width - pad, filter_width - pad) is the position of the first pixel
of the sub-pixel image, so the number of omitted rows or columns is
(stride - idx).
*/
int deconv_get_omit(int stride, int filter_width, int pad) {
if (((filter_width - pad) <= 0)) { // ((filter_width-pad) > stride) ||
// error
return 0;
}
int idx = 1;
bool flag = false;
for (idx = 1; idx <= stride; ++idx) {
int j = idx;
for (; j <= filter_width;) {
if (j == filter_width - pad) {
flag = true;
break;
}
j = j + stride;
}
if (flag) {
break;
}
}
return (stride - idx);
}
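A local copy of deconv_get_omit to trace the loop above: it finds the smallest idx whose stride-spaced grid hits column filter_width - pad, and stride - idx rows/columns are then omitted:

#include <cstdio>

// Re-implementation of deconv_get_omit, for tracing only.
int omit(int stride, int filter_width, int pad) {
  if ((filter_width - pad) <= 0) return 0;
  int idx = 1;
  bool flag = false;
  for (idx = 1; idx <= stride; ++idx) {
    for (int j = idx; j <= filter_width; j += stride) {
      if (j == filter_width - pad) {
        flag = true;
        break;
      }
    }
    if (flag) break;
  }
  return stride - idx;
}

int main() {
  std::printf("%d\n", omit(2, 3, 1));  // 0: the grid starting at idx = 2 hits column 2
  std::printf("%d\n", omit(2, 4, 1));  // 1: the grid starting at idx = 1 hits column 3
  return 0;
}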
int deconv_get_sub_filter_num(int filter_num, int stride) {
return filter_num * stride;
}
void deconv_get_sub_filter(char** data_in, int height, int width,
int sub_conv_n, int kernel_num, int channel) {
char* ptr_tmp = *data_in;
int sub_num = kernel_num * sub_conv_n;
int sub_h = height / sub_conv_n;
int sub_w = width / sub_conv_n;
int sub_filter_size =
kernel_num * sub_h * sub_w * channel * sub_conv_n * sub_conv_n;
char* ptr_sub_filter = (char*)fpga_malloc(sub_filter_size * sizeof(char));
for (int idx = 0; idx < sub_conv_n; ++idx) {
for (int nn = 0; nn < sub_num; ++nn) {
int ni = nn % kernel_num;
int woff = sub_conv_n - 1 - (nn / kernel_num); //
for (int hh = 0; hh < sub_h; ++hh) {
int hi = hh * sub_conv_n + idx % sub_conv_n;
for (int ww = 0; ww < sub_w; ++ww) {
int wi = ww * sub_conv_n + woff; // 1 0
int sidx = ((nn * sub_h + hh) * sub_w + ww) * channel; //
int kidx = ((ni * height + hi) * width + wi) * channel; //
fpga_copy(
ptr_sub_filter + idx * sub_h * sub_w * channel * sub_num + sidx,
(*data_in) + kidx, channel * sizeof(char));
// for (int cc =0; cc < channel; ++cc) {
// ptr_sub_filter[idx*sub_h*sub_w*channel*sub_num + sidx + cc] =
// (*data_in)[kidx + cc];
// }
}
}
}
}
*data_in = ptr_sub_filter;
fpga_free(ptr_tmp);
}
void deconv_NC_convert(float** filter_in, int kernel_num, int channels,
int hw) {
float* tmp = *filter_in;
float* ptr_filter = (float*)(paddle_mobile::fpga::fpga_malloc(
hw * kernel_num * channels * sizeof(float)));
for (int c = 0; c < channels; ++c) {
for (int n = 0; n < kernel_num; ++n) {
paddle_mobile::fpga::fpga_copy(ptr_filter + n * hw + kernel_num * hw * c,
tmp + n * channels * hw + c * hw,
hw * sizeof(float));
}
}
*filter_in = ptr_filter;
paddle_mobile::fpga::fpga_free(tmp);
}
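deconv_NC_convert swaps the N and C axes of the filter (NCHW -> CNHW) while keeping each hw block contiguous. A tiny index check with kernel_num = 2, channels = 3, hw = 1:

#include <cstdio>

int main() {
  const int kernel_num = 2, channels = 3, hw = 1;
  int in[6] = {0, 1, 2, 10, 11, 12};  // N-major: in[n * channels + c]
  int out[6];
  for (int c = 0; c < channels; ++c)
    for (int n = 0; n < kernel_num; ++n)
      out[n * hw + kernel_num * hw * c] = in[n * channels * hw + c * hw];
  for (int i = 0; i < 6; ++i) std::printf("%d ", out[i]);  // 0 10 1 11 2 12
  std::printf("\n");                                       // i.e. C-major
  return 0;
}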
void deconv_format_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max, int stride) {
int data_size = channel * height * width * num;
/*{
float result2 = (float)0;
string filename = "origin_filter_data";
api::savefile<float>(filename, (void *)*data_in, data_size, result2);
}*/
deconv_inverse_filter(data_in, num, channel, width, height);
/* {
float result2 = (float)0;
string filename = "inverse_filter_data";
api::savefile<float>(filename, (void *)*data_in, data_size, result2);
}*/
filter::quantize(data_in, data_size, max);
/* {
char result2 = (char)0;
string filename = "quantize_filter_data";
api::savefile<char>(filename, (void *)*data_in, data_size, result2);
}*/
char** quantize_data = (char**)data_in; // NOLINT
filter::convert_to_hwc(quantize_data, num, channel, height, width);
/*{
char result2 = (char)0;
string filename = "convert_to_hwc_filter_data";
api::savefile<char>(filename, (void *)*quantize_data, data_size,
result2);
}*/
deconv_get_sub_filter(quantize_data, height, width, stride, num, channel);
/*{
char result2 = (char)0;
string filename = "sub_filter_filter_data";
api::savefile<char>(filename, (void *)*quantize_data, data_size, result2);
}*/
int sub_conv_n = stride;
int sub_h = height / sub_conv_n;
int sub_w = width / sub_conv_n;
int sub_chw = sub_h * sub_w * channel;
int sub_num = sub_conv_n * num;
int division_capacity = filter::calc_division_capacity(sub_chw);
int num_per_div_before_alignment =
filter::calc_num_per_div(sub_num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num = (sub_num + num_per_div_before_alignment - 1) /
num_per_div_before_alignment;
int residual = (sub_num) % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
char** ptr_ptr_data = (char**)fpga_malloc(sub_conv_n * sizeof(char*));
int origin_offset = sub_chw * sub_num;
for (int i = 0; i < sub_conv_n; ++i) {
(ptr_ptr_data)[i] = (char*)fpga_malloc(origin_offset * sizeof(char));
fpga_copy((ptr_ptr_data)[i], (*quantize_data) + origin_offset * i,
origin_offset * sizeof(char));
/* char result2 = (char)0;
string filename = "ptr_ptr_data" + to_string(i);
api::savefile<char>(filename, (void *)(ptr_ptr_data[i]), origin_offset,
result2);
*/
}
// char result2 = (char)0;
// string filename = "interleave";
// api::savefile<char>(filename, (void *)*ptr_ptr_data, origin_offset,
// result2);
fpga_free(*quantize_data);
int align_offset =
align_to_x(sub_chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment;
char* ptr_space = (char*)fpga_malloc(sub_conv_n * align_offset *
sizeof(char)); // continuous space
for (int i = 0; i < sub_conv_n; ++i) {
int offset = i * origin_offset;
char* ptr_tmp = (ptr_ptr_data)[i];
filter::align_element(&ptr_tmp, sub_num, sub_chw);
filter::align_num(&ptr_tmp, num_per_div_before_alignment, sub_num, sub_chw);
filter::reorder(&ptr_tmp, num_after_alignment, sub_chw);
filter::interleave(&ptr_tmp, num_after_alignment, sub_chw);
/* char result2 = (char)0;
string filename = "interleave" + to_string(i);
api::savefile<char>(filename, (void *)ptr_tmp, align_offset, result2);
*/
fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset);
fpga_free(ptr_tmp);
}
*data_in = (float*)ptr_space;
/* {
char result2 = (char)0;
string filename = "ptr_space";
api::savefile<char>(filename, (void *)ptr_space, sub_conv_n *
align_offset, result2);
}*/
fpga_flush(ptr_space, sub_conv_n * align_offset * sizeof(char));
}
} // namespace deconv_filter
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle_mobile {
namespace fpga {
namespace deconv_filter {
void deconv_inverse_filter(float** data_in, int num, int channel, int width,
int height);
int deconv_calc_sub_pad(int filter_axis, int pad, int stride);
int deconv_get_sub_filter_num(int filter_num, int stride);
int deconv_get_sub_filter_axis(int filter_axis, int stride);
int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis);
int deconv_get_omit(int stride, int filter_width, int pad);
void deconv_get_sub_filter(char** data_in, int height, int width,
int sub_conv_n, int kernel_num, int channel);
void deconv_format_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max, int stride);
void deconv_NC_convert(float** filter_in, int kernel_num, int channels, int hw);
} // namespace deconv_filter
} // namespace fpga
} // namespace paddle_mobile
......@@ -146,12 +146,16 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num,
memset(data_tmp, 0, num_element * sizeof(char));
-  for (i = 0; i < div_num; i++) {
+  for (i = 0; i < div_num - 1; i++) {
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
num_per_div_before_alignment * align_chw);
}
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
(num - (div_num - 1) * num_per_div_before_alignment) * align_chw);
*data_in = data_tmp;
fpga_free(tmp);
}
......
......@@ -29,11 +29,11 @@ void convert_to_hwc(char** data_in, int num, int channel, int height,
int width);
float find_max(float* data_in, int data_size);
void quantize(float** data_in, int data_size, float max);
-void align_element(float** data_in, int num, int chw);
+void align_element(char** data_in, int num, int chw);
void align_num(char** data_in, int num_per_div_before_alignment, int num,
int chw);
-void reorder(float** data_in, int num_after_alignment, int chw);
-void interleave(float** data_in, int num_after_alignment, int chw);
+void reorder(char** data_in, int num_after_alignment, int chw);
+void interleave(char** data_in, int num_after_alignment, int chw);
void format_filter(float** data_in, int num, int channel, int height, int width,
int group_num, float max);
......
......@@ -146,11 +146,11 @@ int format_conv_data(framework::Tensor *filter_tensor,
}
int format_fc_data(framework::Tensor *filter_tensor,
-                   framework::Tensor *ofm_tensor, float *bs_ptr) {
+                   framework::Tensor *ofm_tensor, float **bs_ptr) {
float max_value = fpga::filter_find_max(filter_tensor);
fpga::format_fc_filter(filter_tensor, max_value);
int aligned_num = get_aligned_filter_num(filter_tensor);
-  fpga::format_bias_scale_array(&bs_ptr,
+  fpga::format_bias_scale_array(bs_ptr,
(int)filter_tensor->dims()[0], // NOLINT
aligned_num);
int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
......@@ -214,7 +214,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->conv_arg[i].output.scale_address = out->scale;
int num_after_alignment = filter::calc_aligned_num(
-        (int)input->dims()[1], arg->filter_num);  // NOLINT
+        arg->filter_num, (int)input->dims()[1]);  // NOLINT
arg->conv_arg[i].free_space =
fpga_malloc(num_after_alignment * 2 * sizeof(half));
}
......
......@@ -41,7 +41,7 @@ void format_concat_output(framework::Tensor* out, int height, int width,
int format_conv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr, int group);
int format_fc_data(framework::Tensor* filter_tensor,
-                   framework::Tensor* ofm_tensor, float* bs_ptr);
+                   framework::Tensor* ofm_tensor, float** bs_ptr);
void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int group_num, int stride_h,
......
......@@ -92,7 +92,8 @@ void fpga_free(void *ptr) {
}
void fpga_copy(void *dest, const void *src, size_t num) {
#ifdef PADDLE_MOBILE_ZU5
-  driver::fpga_copy_driver(dest, src, num);
+  // driver::fpga_copy_driver(dest, src, num);
+  memcpy(dest, src, num);
#else
memcpy(dest, src, num);
#endif
......
......@@ -26,6 +26,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs& args);
int ComputeFpgaConv(const struct SplitConvArgs& args);
int ComputeFPGAConcat(const struct ConcatArgs& args);
int ComputeFPGASplit(const struct SplitArgs& args);
int ComputeFpgaDeconv(const struct DeconvArgs& args);
} // namespace fpga
} // namespace paddle_mobile
......@@ -56,6 +56,7 @@ class CLImage {
tensor_dims_ = dim;
}
bool isInit() { return initialized_; }
/*
* need call SetTensorData first
*
......
......@@ -98,6 +98,24 @@ class OpRegistry {
}
};
#define REGISTER_OPERATOR_INT8(op_type, op_class, device_name, device_type) \
template class op_class<device_type, int8_t>; \
template <typename Dtype, typename T> \
class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> { \
public: \
DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_##device_name, op_class); \
}; \
static paddle_mobile::framework::OperatorRegistrar< \
device_type, _OpClass_##op_type##_##device_name<device_type, int8_t>> \
__op_registrar_##op_type##_##device_name(#op_type); \
int TouchOpRegistrar_##op_type##_##device_name() { \
__op_registrar_##op_type##_##device_name.Touch(); \
return 0; \
}
#define REGISTER_OPERATOR_CPU_INT8(op_type, op_class) \
REGISTER_OPERATOR_INT8(op_type, op_class, cpu, paddle_mobile::CPU);
#define REGISTER_OPERATOR(op_type, op_class, device_name, device_type) \
template class op_class<device_type, float>; \
template <typename Dtype, typename T> \
......
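For reference, here is roughly what REGISTER_OPERATOR_CPU_INT8(fusion_fc_int8, ops::FusionFcInt8Op), used later in this commit, expands to; a hand-expanded sketch with formatting loosened:

template class ops::FusionFcInt8Op<paddle_mobile::CPU, int8_t>;
template <typename Dtype, typename T>
class _OpClass_fusion_fc_int8_cpu : public ops::FusionFcInt8Op<Dtype, T> {
 public:
  DEFINE_OP_CONSTRUCTOR(_OpClass_fusion_fc_int8_cpu, ops::FusionFcInt8Op);
};
static paddle_mobile::framework::OperatorRegistrar<
    paddle_mobile::CPU,
    _OpClass_fusion_fc_int8_cpu<paddle_mobile::CPU, int8_t>>
    __op_registrar_fusion_fc_int8_cpu("fusion_fc_int8");
int TouchOpRegistrar_fusion_fc_int8_cpu() {
  __op_registrar_fusion_fc_int8_cpu.Touch();
  return 0;
}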
......@@ -153,7 +153,8 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
paddle_mobile::operators::math::Gemm gemm;
auto time1 = paddle_mobile::time();
gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
-             static_cast<float>(0), c, ldc, false, nullptr);
+             static_cast<float>(0), c, ldc, false,
+             static_cast<float *>(nullptr));
auto time2 = paddle_mobile::time();
double cost = paddle_mobile::time_diff(time1, time2);
paddle_mobile::memory::Free(a);
......
......@@ -30,6 +30,9 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(dropout, ops::DropoutOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(dropout, ops::DropoutOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(dropout, ops::DropoutOp);
#endif
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#include "operators/fusion_conv_add_relu_int8_op.h"
#include <vector>
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddReluInt8Op<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8(fusion_conv_add_relu_int8,
ops::FusionConvAddReluInt8Op);
#endif
#endif // FUSION_CONVADDRELU_INT8_OP
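The InferShape above relies on math::ConvOutputSize per spatial axis. Assuming the usual definition, out = (in + 2*pad - (dilation*(k - 1) + 1)) / stride + 1; a quick check:

#include <cstdio>

int conv_output_size(int in, int k, int dilation, int pad, int stride) {
  int dkernel = dilation * (k - 1) + 1;
  return (in + 2 * pad - dkernel) / stride + 1;
}

int main() {
  // 224-wide input, 3x3 filter, dilation 1, pad 1, stride 2 -> 112.
  std::printf("%d\n", conv_output_size(224, 3, 1, 1, 2));
  return 0;
}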
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionConvAddReluInt8Op
: public framework::OperatorWithKernel<DeviceType,
FusionConvAddReluParam<DeviceType>,
ConvAddReluKernel<DeviceType, T>> {
public:
FusionConvAddReluInt8Op(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType,
FusionConvAddReluParam<DeviceType>,
ConvAddReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_CONVADDRELU_INT8_OP
......@@ -55,6 +55,9 @@ REGISTER_FUSION_MATCHER(fusion_conv_bn_add_relu,
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#include "operators/fusion_fc_int8_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionFcInt8Op<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
auto y_dims = this->param_.InputY()->dims();
int x_num_col_dims = this->param_.XNumColDims();
int y_num_col_dims = this->param_.YNumColDims();
assert(x_dims.size() > x_num_col_dims);
assert(y_dims.size() > y_num_col_dims);
/// (1,2,3,4) , x_num_col_dims = 2 -> (2,12)
auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
assert(x_mat_dims[1] == y_mat_dims[0]);
std::vector<int64_t> output_dims;
output_dims.reserve(
static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
for (int i = 0; i < x_num_col_dims; ++i) {
output_dims.push_back(x_dims[i]);
}
for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
output_dims.push_back(y_dims[i]);
}
framework::DDim ddim = framework::make_ddim(output_dims);
this->param_.Out()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8(fusion_fc_int8, ops::FusionFcInt8Op);
#endif
#endif // FUSION_FC_INT8_OP
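The flattening used by FusionFcInt8Op::InferShape, checked against the inline comment above ((1,2,3,4) with x_num_col_dims = 2 -> (2,12)); a minimal re-implementation of flatten_to_2d for illustration:

#include <cstdio>
#include <utility>
#include <vector>

std::pair<long, long> flatten_to_2d(const std::vector<long> &dims,
                                    int num_col_dims) {
  long rows = 1, cols = 1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i)
    (i < num_col_dims ? rows : cols) *= dims[i];
  return {rows, cols};
}

int main() {
  auto m = flatten_to_2d({1, 2, 3, 4}, 2);
  std::printf("(%ld, %ld)\n", m.first, m.second);  // (2, 12)
  return 0;
}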
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/fusion_fc_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionFcInt8Op
: public framework::OperatorWithKernel<DeviceType,
FusionFcParam<DeviceType>,
FusionFcKernel<DeviceType, T>> {
public:
FusionFcInt8Op(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, FusionFcParam<DeviceType>,
FusionFcKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_FC_INT8_OP
......@@ -28,10 +28,24 @@ bool ConvAddReluKernel<CPU, float>::Init(FusionConvAddReluParam<CPU> *param) {
template <>
void ConvAddReluKernel<CPU, float>::Compute(
const FusionConvAddReluParam<CPU> &param) {
-  ConvAddReluCompute<float>(param);
+  ConvAddReluCompute<float, float>(param);
}
template class ConvAddReluKernel<CPU, float>;
#ifdef FUSION_CONVADDRELU_INT8_OP
template <>
bool ConvAddReluKernel<CPU, int8_t>::Init(FusionConvAddReluParam<CPU> *param) {
return true;
}
template <>
void ConvAddReluKernel<CPU, int8_t>::Compute(
const FusionConvAddReluParam<CPU> &param) {
ConvAddReluCompute<int8_t, int32_t>(param);
}
template class ConvAddReluKernel<CPU, int8_t>;
#endif
} // namespace operators
} // namespace paddle_mobile
......
......@@ -27,10 +27,27 @@ bool FusionFcKernel<CPU, float>::Init(FusionFcParam<CPU> *param) {
template <>
void FusionFcKernel<CPU, float>::Compute(const FusionFcParam<CPU> &param) {
-  FusionFcCompute<float>(param);
+  FusionFcCompute<float, float>(param);
param.Out()->set_lod(param.InputX()->lod());
}
template class FusionFcKernel<CPU, float>;
#ifdef FUSION_FC_INT8_OP
template <>
bool FusionFcKernel<CPU, int8_t>::Init(FusionFcParam<CPU> *param) {
return true;
}
template <>
void FusionFcKernel<CPU, int8_t>::Compute(const FusionFcParam<CPU> &param) {
FusionFcCompute<int8_t, int32_t>(param);
param.Out()->set_lod(param.InputX()->lod());
}
template class FusionFcKernel<CPU, int8_t>;
#endif
} // namespace operators
} // namespace paddle_mobile
......
......@@ -20,10 +20,12 @@ limitations under the License. */
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#ifndef __aarch64__
inline float32_t vmaxvq_f32(float32x4_t r) {
float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r));
......
......@@ -25,21 +25,30 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
-template <typename P>
+template <typename P, typename S>
void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
-  int axis = param.Axis();
+  int32_t axis = param.Axis();
+  S *bias_data = bias.data<S>();
  Tensor *output = param.Output();
-  float *biase_data = bias.data<float>();
output->mutable_data<P>();
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-  const int batch_size = static_cast<int>(input->dims()[0]);
+  float alpha = 1.0f;
+  float beta = 1.0f;
+#ifdef FUSION_CONVADDRELU_INT8_OP
+  alpha = param.InputScale()->data<float>()[0];
+  beta = 0.0f;
+#endif
+  int32_t groups = param.Groups();
+  std::vector<int32_t> strides = param.Strides();
+  std::vector<int32_t> paddings = param.Paddings();
+  std::vector<int32_t> dilations = param.Dilations();
+  const int32_t batch_size = static_cast<int32_t>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
......@@ -61,13 +70,13 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
Tensor col;
Tensor col_matrix;
if (is_expand) {
-    col.mutable_data<float>(col_shape);
+    col.mutable_data<P>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
+      input->dims(), 1, static_cast<int32_t>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
......@@ -77,17 +86,17 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
+  int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
+  int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+  math::Vol2ColFunctor<CPU, P> vol2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, P> im2col;
-  for (int i = 0; i < batch_size; i++) {
+  for (int32_t i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    for (int g = 0; g < groups; g++) {
+    for (int32_t g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
......@@ -97,8 +106,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
+               std::vector<int32_t>{paddings[0], paddings[1], paddings[0],
+                                    paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
......@@ -108,9 +117,9 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float>(filter_slice, false, col_matrix, false,
-                          static_cast<float>(1), &out_slice,
-                          static_cast<float>(1), true, biase_data);
+      math::matmul(filter_slice, false, col_matrix, false, alpha, &out_slice,
+                   beta, true, bias_data);
}
}
}
......
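The matmul call above follows the usual GEMM contract C = alpha*A*B + beta*C (plus the fused relu/bias arguments). In the int8 path alpha is the input scale and beta is 0, so the int32 accumulator is rescaled to float and the previous contents of C are discarded; a scalar-sized sketch (the 0.05f scale is a made-up value):

#include <cstdint>
#include <cstdio>

int main() {
  const float alpha = 0.05f;  // stand-in for param.InputScale()->data<float>()[0]
  const float beta = 0.0f;
  int32_t a = 57, b = 21;     // stand-ins for int8 operands widened by the GEMM
  int32_t acc = a * b;        // int32 accumulation of the int8 products
  float c_old = 3.0f;         // previous contents of C, discarded when beta = 0
  float c = alpha * static_cast<float>(acc) + beta * c_old;
  std::printf("c = %f\n", c);
  return 0;
}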
......@@ -106,10 +106,9 @@ inline void GemmConv(const ConvParam<CPU> &param) {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<Itype>(filter_slice, false, col_matrix, false,
-                          static_cast<float>(1), &out_slice,
-                          static_cast<float>(0));
+      math::matmul(filter_slice, false, col_matrix, false,
+                   static_cast<float>(1), &out_slice, static_cast<float>(0),
+                   false, static_cast<Otype *>(nullptr));
}
}
}
......
......@@ -15,23 +15,29 @@ limitations under the License. */
#ifdef FUSION_FC_OP
#pragma once
#include <type_traits>
#include "operators/math/math_function.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
-template <typename P>
+template <typename P, typename S>
void FusionFcCompute(const FusionFcParam<CPU> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
-  const Tensor *input_z = param.InputZ();
-  auto *input_z_data = input_z->data<float>();
+  Tensor *input_z = param.InputZ();
+  S *input_z_data = input_z->data<S>();
int axis = param.Axis();
Tensor *out = param.Out();
// int m = out->dims()[0];
// int n = out->dims()[1];
-  auto *out_data = out->mutable_data<float>();
+  auto *out_data = out->mutable_data<P>();
float alpha = 1.0f;
float beta = 1.0f;
const Tensor x_matrix =
input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
......@@ -51,21 +57,28 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) {
axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");
-  int64_t classes = input_z->numel();
-  for (int i = 0; i < out_dim[0]; i++) {
-    memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
-  }
if (std::is_same<P, int8_t>::value) {
#ifdef FUSION_FC_INT8_OP
alpha = param.InputScale()->data<float>()[0];
beta = 0.0f;
math::matmul(x_matrix, false, y_matrix, false, alpha, out, beta, false,
input_z_data, true);
#endif
} else {
    // bias_data has the same dimension as the second dimension of out
int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data,
sizeof(float) * classes);
}
// for (int i = 0; i < out->numel(); i++) {
// DLOG << out_data[i];
// }
    // bias_data has the same dimension as out
-    math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
-                        out, static_cast<float>(1), false);
+    math::matmul<float>(x_matrix, false, y_matrix, false, alpha, out, beta,
+                        false);
}
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
  // if (out_dim.size() != 2) {
  //   out->Resize(out_dim);
  // }
}
} // namespace operators
......
......@@ -73,8 +73,9 @@ void MulCompute(const MulParam<CPU> &param) {
}
if (param.InputX()->type() == typeid(int8_t)) {
out->mutable_data<int32_t>();
-    math::matmul<int8_t>(x_matrix, false, y_matrix, false,
-                         static_cast<int8_t>(1), out, static_cast<int8_t>(0));
+    math::matmul<float, int32_t>(x_matrix, false, y_matrix, false,
+                                 static_cast<float>(1), out,
+                                 static_cast<float>(0));
} else {
out->mutable_data<float>();
......
......@@ -23,20 +23,22 @@ namespace paddle_mobile {
namespace operators {
using framework::Tensor;
-inline void PoolBasic(std::string pooling_type, std::vector<int> ksize,
-                      std::vector<int> strides, std::vector<int> paddings,
-                      const Tensor *in_x, Tensor *out) {
+template <typename T, typename S>
+void PoolBasic(std::string pooling_type, std::vector<int> ksize,
+               std::vector<int> strides, std::vector<int> paddings,
+               const Tensor *in_x, Tensor *out) {
if (pooling_type == "max") {
-    math::PoolFunctor<CPU, math::MaxPool<float>, float> pool2d_forward;
-    math::MaxPool<float> pool_process;
+    math::PoolFunctor<CPU, math::MaxPool<T>, T> pool2d_forward;
+    math::MaxPool<T> pool_process;
pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
} else if (pooling_type == "avg") {
-    math::PoolFunctor<CPU, math::AvgPool<float>, float> pool2d_forward;
-    math::AvgPool<float> pool_process;
+    math::PoolFunctor<CPU, math::AvgPool<T, S>, T> pool2d_forward;
+    math::AvgPool<T, S> pool_process;
pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
}
}
template <typename P>
void PoolCompute(const PoolParam<CPU> &param) {
const Tensor *in_x = param.Input();
......@@ -52,50 +54,67 @@ void PoolCompute(const PoolParam<CPU> &param) {
LOG(paddle_mobile::LogLevel::kLOG_ERROR)
<< "Pool op only supports 2D and 3D input.";
}
if (param.isGlobalPooling()) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
}
}
-  if (ksize[0] == 3 && ksize[0] == ksize[1]) {
-    if (pooling_type == "max") {
-      if (strides[0] == strides[1] && strides[0] == 1 &&
-          paddings[0] == paddings[1] && paddings[1] == 1) {
-        math::Pool3x3Maxs1p1(in_x, out);
-      } else {
-        math::Pool3x3Max(strides, paddings, in_x, out);
-      }
-    } else if (pooling_type == "avg") {
-      if (strides[0] == strides[1] && strides[0] == 1 &&
-          paddings[0] == paddings[1] && paddings[1] == 1) {
-        math::Pool3x3Avgs1p1(in_x, out);
-      } else {
-        math::Pool3x3Avg(strides, paddings, in_x, out);
-      }
-    }
-  } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 &&
-             strides[0] == strides[1] && paddings[0] == paddings[1] &&
-             paddings[1] == 0) {
-#if __ARM_NEON
-#if __aarch64__
-    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
-#else
-    /// todo: fix bug in Pool2x2
-    if (pooling_type == "max") {
-      math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
-    } else if (pooling_type == "avg") {
-      math::Pool2x2Avgs2p0(strides, paddings, in_x, out);
-    }
-#endif
-#else
-    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
-#endif  // __ARM_NEON
-  } else {
-    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+  if (in_x->type() == typeid(int8_t)) {
+    if (pooling_type == "max" && ksize[0] == 3 && ksize[0] == ksize[1]) {
+      if (strides[0] == strides[1] && strides[0] == 1) {
+        math::Pool3x3Maxs1_int8(in_x, out, paddings[0], paddings[1]);
+      } else if (strides[0] == strides[1] && strides[0] == 2) {
+        math::Pool3x3Maxs2_int8(in_x, out, paddings[0], paddings[1]);
+      } else {
+        math::Pool3x3Max_int8(strides, paddings, in_x, out);
+      }
+    } else {
+      PoolBasic<int8_t, int32_t>(pooling_type, ksize, strides, paddings, in_x,
+                                 out);
+    }
+  } else {
+    if (ksize[0] == 3 && ksize[0] == ksize[1]) {
+      if (pooling_type == "max") {
+        if (strides[0] == strides[1] && strides[0] == 1 &&
+            paddings[0] == paddings[1] && paddings[1] == 1) {
+          math::Pool3x3Maxs1p1(in_x, out);
+        } else {
+          math::Pool3x3Max(strides, paddings, in_x, out);
+        }
+      } else if (pooling_type == "avg") {
+        if (strides[0] == strides[1] && strides[0] == 1 &&
+            paddings[0] == paddings[1] && paddings[1] == 1) {
+          math::Pool3x3Avgs1p1(in_x, out);
+        } else {
+          math::Pool3x3Avg(strides, paddings, in_x, out);
+        }
+      }
+    } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 &&
+               strides[0] == strides[1] && paddings[0] == paddings[1] &&
+               paddings[1] == 0) {
+#if __ARM_NEON
+#if __aarch64__
+      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
+                              out);
+#else
+      /// todo: fix bug in Pool2x2
+      if (pooling_type == "max") {
+        math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
+      } else if (pooling_type == "avg") {
+        math::Pool2x2Avgs2p0(strides, paddings, in_x, out);
+      }
+#endif
+#else
+      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
+                              out);
+#endif  // __ARM_NEON
+    } else {
+      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
+                              out);
+    }
}
}
......
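The int8 branch above instantiates PoolBasic<int8_t, int32_t>, i.e. AvgPool with a widened sum type S, presumably because summing a pooling window in an int8 accumulator overflows almost immediately; a minimal illustration:

#include <cstdint>
#include <cstdio>

int main() {
  int8_t window[9];
  for (int i = 0; i < 9; ++i) window[i] = 120;
  int32_t sum = 0;  // S = int32_t accumulator
  for (int i = 0; i < 9; ++i) sum += window[i];  // 1080: far outside int8 range
  std::printf("avg = %d\n", static_cast<int>(sum / 9));
  return 0;
}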
......@@ -77,15 +77,25 @@ void BatchNormKernel<GPU_CL, float>::Compute(
auto new_scale = param.NewScale()->GetCLImage();
auto new_bias = param.NewBias()->GetCLImage();
const int out_width = default_work_size[1];
-  clSetKernelArg(kernel, 1, sizeof(int), &out_width);
-  clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
-  clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_scale);
-  clSetKernelArg(kernel, 4, sizeof(cl_mem), &new_bias);
-  clSetKernelArg(kernel, 5, sizeof(cl_mem), &out);
  // cl_event out_event = param.OutputY()->GetClEvent();
  // cl_event wait_event = param.InputX()->GetClEvent();
+  DLOG << *param.InputX();
+  DLOG << *param.NewBias();
+  DLOG << *param.NewScale();
+  DLOG << default_work_size[0];
+  DLOG << default_work_size[1];
+  DLOG << default_work_size[2];
+  DLOG << out_width;
+  DLOG << *param.OutputY();
+  cl_int status;
+  status = clSetKernelArg(kernel, 0, sizeof(cl_int), &out_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &new_scale);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_bias);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &out);
+  CL_CHECK_ERRORS(status);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
}
......
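The new_scale/new_bias images bound above are, assuming the standard batch-norm folding, precomputed per channel so the kernel only has to evaluate x * new_scale + new_bias:

#include <cmath>
#include <cstdio>

int main() {
  // Per-channel batch-norm parameters (made-up values).
  float scale = 1.5f, bias = 0.2f, mean = 0.4f, variance = 0.25f, eps = 1e-5f;
  float new_scale = scale / std::sqrt(variance + eps);
  float new_bias = bias - mean * new_scale;
  float x = 0.9f;  // one input activation
  std::printf("y = %f\n", x * new_scale + new_bias);
  return 0;
}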
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define BATCH_NORM
#define BIASE
#define RELU
#include "conv_kernel.inc.cl"
......@@ -924,6 +924,387 @@ __kernel void conv_5x5(__private const int global_size_dim0,
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
}
__kernel void convBNAdd_3x3(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
    int2 output_pos_in_one_block;
    output_pos_in_one_block.x = out_w;
    output_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
    in_pos_in_one_block.x = output_pos_in_one_block.x * stride + offset;
    in_pos_in_one_block.y = output_pos_in_one_block.y * stride + offset;
half4 output = (half4)0.0f;
half4 input[9];
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
input[0] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[1] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[2] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[3] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[4] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[5] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[6] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[7] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
/*
for (int j = 0; j < 9; ++j) {
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
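// Epilogue: fold batch norm as output * new_scale + new_bias, then apply the
// optional bias add and ReLU activation.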
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef BIASE
output += read_imageh(bias, sampler, (int2)(out_c * global_size_dim1 + out_w, out_nh));
#endif
#ifdef RELU
output = activation(output);
#endif
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
}
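// Weight-image addressing used by the unrolled 3x3 loop above: for input
// channel block i, filter tap j (0..8) and output channel block out_c, the
// half4 row r (0..3) of the 4x4 weight sub-matrix is read from
//   pos.x = i * 3 + j % 3
//   pos.y = out_c * 4 * 3 + r * 3 + j / 3
// i.e. taps are laid out 3 per image column, 3 image rows per weight row.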
__kernel void convBNAdd_1x1(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const uint kernelHXW = 1;
int2 stride_xy = (int2)(stride, stride);
int2 output_pos_in_one_block = (int2)(out_w, out_nh);
int2 in_pos_in_one_block = output_pos_in_one_block * stride_xy + (int2)(offset, offset);
half4 output = 0.0f;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
half4 input = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/*
output.x = dot(input, weight0);
output.y = dot(input, weight1);
output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
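// input packs 4 consecutive input channels; weight0..weight3 hold their
// filter coefficients for the 4 packed output channels, so each mad adds one
// input channel's contribution to all 4 outputs.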
output = mad(input.x, weight0, output);
output = mad(input.y, weight1, output);
output = mad(input.z, weight2, output);
output = mad(input.w, weight3, output);
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef BIASE
output += read_imageh(bias, sampler, (int2)(out_c * global_size_dim1 + out_w, out_nh));
#endif
#ifdef RELU
output = activation(output);
#endif
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
write_imageh(output_image, output_pos, output);
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void dropout(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_W,
__private const float dropoutPro) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input;
half4 output;
input = read_imageh(input_image, sampler, output_pos);
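// At inference time dropout just rescales activations by (1 - p), matching
// the expected value of train-time dropout.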
half4 dropout = (half4)(1 - dropoutPro);
output = dropout * input;
write_imageh(output_image, output_pos, output);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNADDRELU_OP
#include "operators/kernel/conv_bn_add_relu_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvBNAddReluKernel<GPU_CL, float>::Init(
FusionConvBNAddReluParam<GPU_CL> *param) {
PADDLE_MOBILE_ENFORCE(
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
const framework::CLImage *mean = param->InputMean();
const framework::CLImage *variance = param->InputVariance();
const framework::CLImage *scale = param->InputScale();
const framework::CLImage *bias = param->InputBias();
const float epsilon = param->Epsilon();
const int C = mean->numel();
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
float inv_std_ptr[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
float *new_scale_ptr = new float[C];
float *new_bias_ptr = new float[C];
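  // Fold BN into a per-channel affine transform:
  //   y = scale * (x - mean) / sqrt(var + eps)  + bias
  //     = new_scale * x + new_bias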
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
}
framework::CLImage *new_scale = new framework::CLImage();
// for (int j = 0; j < C; ++j) {
// DLOG << " new scale - " << j << new_scale_ptr[j];
// }
//
// for (int j = 0; j < C; ++j) {
// DLOG << " new bias - " << j << new_bias_ptr[j];
// }
new_scale->SetTensorData(new_scale_ptr, variance->dims());
new_scale->InitCLImage(this->cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
// DLOG << " climage - y bias: " << *(param->Bias());
//
// DLOG << " climage - new scale: " << *new_scale;
framework::CLImage *new_bias = new framework::CLImage();
new_bias->SetTensorData(new_bias_ptr, variance->dims());
new_bias->InitCLImage(this->cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
// DLOG << " climage - new bias: " << *new_bias;
//
// DLOG << " climage - filter: " << *(param->Filter());
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
delete[](new_scale_ptr);
delete[](new_bias_ptr);
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
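  // Choose the kernel variant from the filter shape: 1x1 pointwise, 3x3
  // depthwise (filter has one input channel and C_in == C_out), or the
  // generic 3x3 convolution.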
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->Filter()->InitNImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("convBNAdd_1x1", "conv_bn_add_relu_kernel.cl");
DLOG << " conv bn add relu conv 1x1";
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("depth_convBNAdd_3x3",
"conv_bn_add_relu_kernel.cl");
DLOG << " conv bn add relu depth_conv_3x3";
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("convBNAdd_3x3", "conv_bn_add_relu_kernel.cl");
DLOG << " conv bn add relu conv_3x3";
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
}
return true;
}
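// A standalone sketch checking the folding algebra above; the constants and
// `main` are illustrative only and not part of paddle-mobile.
#include <cassert>
#include <cmath>

int main() {
  const float x = 0.5f, mean = 0.1f, var = 0.04f, scale = 2.0f, bias = 0.3f,
              eps = 1e-5f;
  const float inv_std = 1.0f / std::sqrt(var + eps);
  const float new_scale = inv_std * scale;
  const float new_bias = bias - mean * inv_std * scale;
  // Original batch norm vs. the folded per-channel affine form.
  const float bn = scale * (x - mean) * inv_std + bias;
  const float folded = new_scale * x + new_bias;
  assert(std::fabs(bn - folded) < 1e-6f);
  return 0;
}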
template <>
void ConvBNAddReluKernel<GPU_CL, float>::Compute(
const FusionConvBNAddReluParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto biase = param.Bias()->GetCLImage();
auto new_scale = param.NewScale()->GetCLImage();
auto new_bias = param.NewBias()->GetCLImage();
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
// DLOG << " c block " << c_block;
// DLOG << " w " << w;
// DLOG << " nh " << nh;
// DLOG << " stride " << stride;
// DLOG << " offset " << offset;
// DLOG << " input_c " << input_c;
// DLOG << " dilation " << dilation;
// DLOG << " input width " << input_width;
// DLOG << " input height " << input_height;
// DLOG << " output width " << output_width;
// DLOG << " output height " << output_height;
// DLOG << " input dim " << *param.Input();
// DLOG << " output dim " <<* param.Output();
// DLOG << " filter dim " << *param.Filter();
// DLOG<<*param.Bias();
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class ConvBNAddReluKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DEPTHWISECONV_OP
#include "operators/kernel/depthwise_conv_kernel.h"
#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
DLOG << " depthwise conv kernel init begin ";
PADDLE_MOBILE_ENFORCE(
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
DLOG << " depthwise conv kernel init end ";
return true;
}
template <>
void DepthwiseConvKernel<GPU_CL, float>::Compute(
const ConvParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
// cl_event out_event = param.Output()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent();
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class DepthwiseConvKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License. */
//
//#ifdef DEQUANT_OP
//
//#include "operators/kernel/dequantize_kernel.h"
//
// namespace paddle_mobile {
// namespace operators {
//
// template <>
// bool DequantizeKernel<GPU_CL, float>::Init(DequantizeParam<GPU_CL> *param) {
// DLOG << " depthwise conv kernel init begin ";
// PADDLE_MOBILE_ENFORCE(
// param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
// param->Paddings()[0] == param->Paddings()[1],
// "need equal");
// param->Filter()->InitCLImage(cl_helper_.CLContext(),
// this->cl_helper_.CLCommandQueue());
// int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
// static_cast<int>(param->Paddings()[1]);
// param->SetOffset(offset);
// this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
// DLOG << " depthwise conv kernel init end ";
// return true;
//}
//
// template <>
// void DequantizeKernel<GPU_CL, float>::Compute(
// const DequantizeParam<GPU_CL> &param) {
// auto kernel = this->cl_helper_.KernelAt(0);
// auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
// int c_block = default_work_size[0];
// int w = default_work_size[1];
// int nh = default_work_size[2];
// auto input = param.Input()->GetCLImage();
// auto filter = param.Filter()->GetCLImage();
// auto output = param.Output()->GetCLImage();
// int stride = param.Strides()[0];
// int offset = param.Offset();
// int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
// param.Input()->Converter())
// ->GetCBlock();
// int dilation = param.Dilations()[0];
//
// int input_width = param.Input()->dims()[3];
// int input_height = param.Input()->dims()[2];
// int output_width = param.Output()->dims()[3];
// int output_height = param.Output()->dims()[2];
//
// cl_int status;
//
// status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
// status = clSetKernelArg(kernel, 1, sizeof(int), &w);
// status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
// status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
// status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
// status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
// status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
// status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
// status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
// status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
// status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
// status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
// status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
// status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
//
// CL_CHECK_ERRORS(status);
//
// // cl_event out_event = param.Output()->GetClEvent();
// // cl_event wait_event = param.Input()->GetClEvent();
//
// status = clEnqueueNDRangeKernel(
// this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
// NULL, default_work_size.data(), NULL, 0, NULL, NULL);
//
// CL_CHECK_ERRORS(status);
//}
//
// template class DepthwiseConvKernel<GPU_CL, float>;
//
//} // namespace operators
//} // namespace paddle_mobile
//
//#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DROPOUT_OP
#include "operators/kernel/dropout_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DropoutKernel<GPU_CL, float>::Init(DropoutParam<GPU_CL> *param) {
this->cl_helper_.AddKernel("dropout", "dropout_kernel.cl");
return true;
}
template <>
void DropoutKernel<GPU_CL, float>::Compute(const DropoutParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
auto *input_image = param.InputX()->GetCLImage();
auto *output_image = param.Out()->GetCLImage();
const float dropoutProb = param.DropoutProb();
const auto &inputDim = param.InputX()->dims();
int input_dims[4] = {1, 1, 1, 1};
// e.g. a (1, 1000, 1, 1) input gives input_dims = {1, 1000, 1, 1}
for (int i = 0; i < inputDim.size(); i++) {
input_dims[4 - inputDim.size() + i] = inputDim[i];
}
int out_W = input_dims[1];
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(float), &dropoutProb);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -24,7 +24,11 @@ bool ElementwiseAddKernel<GPU_CL, float>::Init(
ElementwiseAddParam<GPU_CL> *param) {
DLOG << "-----init add-----";
CLImage *bias = (CLImage *)(param->InputY());
bias->InitCLImage(cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue());
if (!bias->isInit()) {
bias->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
}
DLOG << " bias: " << *bias;
if (bias->dims().size() == 4) {
this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#include "operators/kernel/mul_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool MulKernel<GPU_CL, float>::Init(MulParam<GPU_CL> *param) {
return true;
}
template <>
void MulKernel<GPU_CL, float>::Compute(const MulParam<GPU_CL> &param) {}
template class MulKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -23,12 +23,61 @@ namespace operators {
template <>
bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
bool relu_enabled = false;
auto input = const_cast<Tensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *
sizeof(float)); // NOLINT
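  // bs_ptr holds 2 * channel * sub_conv_n floats: the first half is the bias
  // replicated once per sub-convolution, the second half the scale factors
  // (all 1 here).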
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
float max_value = fpga::filter_find_max(filter);
fpga::format_deconv_filter(filter, max_value, param->Groups(),
param->Strides()[0]);
// int element_num_per_div =
// fpga::get_filter_num_per_div(filter, param->Groups());
// deconv only supports group = 1 and no split
fpga::format_bias_scale_array(&bs_ptr, channel * sub_conv_n,
channel * sub_conv_n);
fpga::format_fp16_ofm(out);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
return true;
}
template <>
void DeconvAddKernel<FPGA, float>::Compute(
const FusionDeconvAddParam<FPGA> &param) {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -24,12 +24,60 @@ namespace operators {
template <>
bool DeconvAddReluKernel<FPGA, float>::Init(
FusionDeconvAddReluParam<FPGA> *param) {
bool relu_enabled = true;
auto input = const_cast<Tensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
float max_value = fpga::filter_find_max(filter);
fpga::format_deconv_filter(filter, max_value, param->Groups(),
param->Strides()[0]);
// int element_num_per_div =
// fpga::get_filter_num_per_div(filter, param->Groups());
// deconv only supports group = 1 and no split
fpga::format_bias_scale_array(&bs_ptr, channel * sub_conv_n,
channel * sub_conv_n);
fpga::format_fp16_ofm(out);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
return true;
}
template <>
void DeconvAddReluKernel<FPGA, float>::Compute(
const FusionDeconvAddReluParam<FPGA> &param) {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -21,7 +21,7 @@ namespace operators {
template <>
bool ElementwiseAddReluKernel<FPGA, float>::Init(
ElementwiseAddReluParam<FPGA> *param) {
bool relu_enabled = true;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
......
......@@ -47,7 +47,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
out->Resize(framework::make_ddim({1, channel, 1, 1}));
filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
fpga::format_fc_data(filter, out, &bs_ptr);
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(__ARM_NEON__) && defined(__aarch64__)
#include "operators/math/depthwise_conv3x3.h"
#ifdef __ARM_NEON__
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
namespace math {
// template<>
// void DepthwiseConv3x3<int8_t, int32_t>(
// const framework::Tensor *input, const framework::Tensor *filter,
// const std::vector<int> &strides, framework::Tensor *output) {
// PADDLE_MOBILE_THROW_EXCEPTION(
// "Depthwise conv with generic strides has not been implemented.");
// }
template <>
void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
const framework::Tensor &filter,
const std::vector<int> &paddings,
framework::Tensor *output) {
PADDLE_MOBILE_THROW_EXCEPTION(
"Depthwise conv3x3 with stride 1 for arm v8 has not been implemented.");
}
template <>
void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
const framework::Tensor &filter,
const std::vector<int> &paddings,
framework::Tensor *output) {
PADDLE_MOBILE_THROW_EXCEPTION(
"Depthwise conv3x3 with stride 2 for arm v8 has not been implemented.");
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -15,6 +15,10 @@ limitations under the License. */
#pragma once
#include <string>
#include "common/log.h"
#include "memory/t_malloc.h"
#ifdef _OPENMP
#include <omp.h>
#endif
// Element-access macros for matrices, assuming row-major storage
#define A(i, j) A[(i)*lda + (j)]
......@@ -23,10 +27,12 @@ limitations under the License. */
#if __aarch64__
#define MR_INT8 4
#define NR_INT8 2
#define MR 6
#define NR 16
#else
#define MR_INT8 4
#define NR_INT8 2
#define MR 6
#define NR 8
#endif
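// MR_INT8 x NR_INT8 (4x2) is the int32 block produced by the int8
// micro-kernel AddDot4x2; MR x NR is the float micro-kernel tile.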
......@@ -170,6 +176,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias, float *bias);
void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
......@@ -193,52 +200,72 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
// 8 bits int small block inner product
void AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
void AddDot4x2(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
// 8 bits int inner product
template <typename Otype>
void InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
const int8_t *b, float beta, int32_t *c, Otype *C,
int32_t ldc, bool relu);
template <typename Otype>
void InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, const int8_t *a,
const int8_t *b, float beta, int32_t *c, Otype *C,
int32_t ldc, bool relu, int32_t *bias,
bool addOnRow = false);
// 8 bits int pack function
void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixB_2c_16(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer);
void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer);
void PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer);
void PackMatrixA_omp_4r_16(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer);
void PackMatrixB_omp_2c_16(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer);
// 8 bits int matrix product
template <typename Itype, typename Btype, typename Otype>
void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A,
int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, Btype *bias, bool addOnRow = false);
template <typename Otype>
void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
Otype *C, int32_t ldc, bool relu, int32_t *bias,
bool addOnRow = false);
template <typename Itype, typename Btype, typename Otype>
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A,
int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, Btype *bias, bool addOnRow = false);
template <typename Otype>
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, int32_t *bias, bool addOnRow = false);
// 8 bits int write back
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B
void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc);
// C = A * B + C
void WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias
void WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
// C = A * B + C, relu(C)
void WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias, relu(C)
void WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
// C = A * B + bias, scale * relu(C)
void WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale);
// C = A * B + bias, scale * C, bias is added on column
void WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale);
// C = A * B + bias, scale * C, bias is added on row
void WriteWithAddScaleT(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale);
private:
int MC = 0;
......@@ -254,10 +281,218 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
// 8 bits int
int8_t *packedA_int8;
int8_t *packedB_int8;
int32_t *packedC_int32;
int8_t *zero_int8;
};
// 8 bits int matrix product (m*k x k*n)
template <typename Otype>
void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
Otype *C, int32_t ldc, bool relu, int32_t *bias,
bool addOnRow) {
// L1 data cache is 32 KiB (per core on Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (per Cortex-A72 cluster)
int32_t L1 = 32 * 1024;
int32_t L2 = 512 * 1024;
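  // Round k up to the next multiple of 16 so the packed panels match the
  // 16-deep int8 micro-kernel.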
const int32_t k_complete = (k + 15) - ((k + 15) & 15);
KC = k_complete;
MC = L1 / (KC * sizeof(int8_t));
NC = L2 / (KC * sizeof(int8_t));
// make sure MC is multiple of MR_INT8, and NC is multiple of NR_INT8
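  // Size MC from L1, even out the m blocks, then round up to MR_INT8.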
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
if (NC == 0) {
NC = NR_INT8;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
}
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
packedC_int32 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC));
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
int32_t mc, nc;
for (int32_t j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8);
for (int32_t i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
if (bias == nullptr) {
InnerKernel(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu);
} else {
if (addOnRow) {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu, bias + j,
addOnRow);
} else {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu, bias + i,
addOnRow);
}
}
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int32);
paddle_mobile::memory::Free(zero_int8);
}
// 8 bits int matrix product (m*k x k*n), omp version
template <typename Otype>
void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
float beta, Otype *C, int32_t ldc, bool relu,
int32_t *bias, bool addOnRow) {
#ifdef _OPENMP
int32_t max_threads = omp_get_max_threads();
#else
int32_t max_threads = 1;
#endif
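  // Split the L1 budget across threads, since each thread packs and consumes
  // its own panel.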
int32_t L1 = 64 / max_threads * 1024;
const int32_t k_complete = (k + 15) - ((k + 15) & 15);
KC = k_complete;
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
if (m > n) {
// partition A into blocks
MC = L1 / (KC * sizeof(int8_t));
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// pad B up to a multiple of NR_INT8
NC = (n + NR_INT8 - 1) / NR_INT8 * NR_INT8;
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
#if __aarch64__
// TODO()
#else
PackMatrixB_omp_2c_16(k, n, n % NR_INT8, B, ldb, packedB_int8);
#endif
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads));
} else {
// partition B into blocks
NC = L1 / (KC * sizeof(int8_t));
if (NC == 0) {
NC = NR_INT8;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
}
// pad A up to a multiple of MR_INT8
MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8;
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
#if __aarch64__
// TODO()
#else
PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8);
#endif
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads));
}
packedC_int32 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads));
if (m > n) {
#pragma omp parallel for
for (int32_t i = 0; i < m; i += MC) {
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
#else
int32_t local_threads = 0;
#endif
int32_t mc;
mc = s_min(m - i, MC);
int8_t *local_A = packedA_int8 + MC * KC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__
// TODO()
#else
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A);
#endif
if (bias == nullptr) {
InnerKernel(mc, n, alpha, local_A, packedB_int8, beta, local_C,
&C(i, 0), ldc, relu);
} else {
if (addOnRow) {
InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta,
local_C, &C(i, 0), ldc, relu, bias, addOnRow);
} else {
InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta,
local_C, &C(i, 0), ldc, relu, bias + i, addOnRow);
}
}
}
} else {
#pragma omp parallel for
for (int32_t j = 0; j < n; j += NC) {
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
#else
int32_t local_threads = 0;
#endif
int32_t nc;
nc = s_min(n - j, NC);
int8_t *local_B = packedB_int8 + KC * NC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__
// TODO()
#else
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B);
#endif
if (bias == nullptr) {
InnerKernel(m, nc, alpha, packedA_int8, local_B, beta, local_C,
&C(0, j), ldc, relu);
} else {
if (addOnRow) {
InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta,
local_C, &C(0, j), ldc, relu, bias + j, addOnRow);
} else {
InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta,
local_C, &C(0, j), ldc, relu, bias, addOnRow);
}
}
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int32);
paddle_mobile::memory::Free(zero_int8);
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -14,10 +14,11 @@ limitations under the License. */
#include <string.h>
#include "common/log.h"
#include "memory/t_malloc.h"
#include "operators/math/gemm.h"
#if __ARM_NEON
#include <arm_neon.h>
#include <iostream>
#endif
#ifdef _OPENMP
#include <omp.h>
......@@ -30,7 +31,7 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO()
#else
const int8_t *a_ptr, *b_ptr;
a_ptr = a;
......@@ -62,7 +63,7 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"pld [%[b_ptr], #128] \n\t"
"vld1.s8 {d0-d3}, [%[a_ptr]]! \n\t" // load A 8 cols
"vld1.s8 {d8-d11}, [%[b_ptr]]! \n\t" // load B first 4 rows
"vmovl.s8 q2, d0 \n\t" // process B first 4
"vmovl.s8 q2, d0 \n\t" // process B first
// rows
"vmovl.s8 q3, d8 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t"
......@@ -241,12 +242,141 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
#endif // __ARM_NEON
}
// The core idea of the AddDot4x2 function is borrowed from Google's gemmlowp
// open-source library: https://github.com/google/gemmlowp.
void Gemm::AddDot4x2(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
#define PADDLE_LABEL_LOOP "1"
#define PADDLE_LABEL_AFTER_LOOP "2"
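  // 16-deep int8 dot products: vmull.s8 widens to 16-bit products and
  // vpadal.s16 pairwise-accumulates them into 32-bit lanes, yielding a 4x2
  // int32 block after the final vpadd reductions.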
asm volatile(
"lsl %[ldc], %[ldc], #2 \n\t" // sizeof(int32) == 4
"vldr d0, [%[b], #0] \n\t"
"vmov.s32 q8, #0 \n\t"
"vldr d4, [%[a], #0] \n\t"
"vmov.s32 q9, q8 \n\t"
"vldr d2, [%[b], #16] \n\t"
"vmov.s32 q10, q8 \n\t"
"vldr d6, [%[a], #16] \n\t"
"vmov.s32 q11, q8 \n\t"
"vldr d1, [%[b], #8]\n\t"
"vmov.s32 q12, q8 \n\t"
"vldr d5, [%[a], #8]\n"
"vmov.s32 q13, q8 \n\t"
"vldr d3, [%[b], #24]\n\t"
"vmov.s32 q14, q8 \n\t"
"vldr d7, [%[a], #24]\n"
"vmov.s32 q15, q8 \n\t"
PADDLE_LABEL_LOOP
": \n\t"
"vmull.s8 q4, d0, d4 \n\t" // first half
"add %[b], %[b], #32 \n\t"
"vmull.s8 q5, d2, d4 \n\t"
"vldr d4, [%[a], #32] \n\t"
"vmull.s8 q6, d0, d6 \n\t"
"vmull.s8 q7, d2, d6 \n\t"
"vldr d6, [%[a], #48] \n\t"
"vmlal.s8 q4, d1, d5 \n\t" // second half
"vmlal.s8 q5, d3, d5 \n\t"
"vldr d5, [%[a], #40] \n\t"
"vmlal.s8 q6, d1, d7 \n\t"
"vmlal.s8 q7, d3, d7 \n\t"
"vldr d7, [%[a], #56] \n\t"
"vpadal.s16 q8, q4 \n\t" // pairwise-add
"add %[a], %[a], #64 \n\t"
"vpadal.s16 q9, q5 \n\t"
"subs %[k], %[k], #16 \n\t"
"vpadal.s16 q10, q6 \n\t"
"vpadal.s16 q11, q7 \n\t"
"beq " PADDLE_LABEL_AFTER_LOOP
"f \n\t"
"vmull.s8 q4, d0, d4 \n\t" // first half
"vmull.s8 q5, d2, d4 \n\t"
"vldr d4, [%[a], #0] \n\t"
"vmull.s8 q6, d0, d6 \n\t"
"vldr d0, [%[b], #0] \n\t"
"vmull.s8 q7, d2, d6 \n\t"
"vldr d2, [%[b], #16] \n\t"
"vmlal.s8 q4, d1, d5 \n\t" // second half
"vldr d6, [%[a], #16] \n\t"
"vmlal.s8 q5, d3, d5 \n\t"
"vldr d5, [%[a], #8] \n\t"
"vmlal.s8 q6, d1, d7 \n\t"
"vldr d1, [%[b], #8] \n\t"
"vmlal.s8 q7, d3, d7 \n\t"
"vldr d3, [%[b], #24] \n\t"
"vpadal.s16 q12, q4 \n\t" // pairwise-add
"vldr d7, [%[a], #24] \n\t"
"vpadal.s16 q13, q5 \n\t"
"vpadal.s16 q14, q6 \n\t"
"vpadal.s16 q15, q7 \n\t"
"b " PADDLE_LABEL_LOOP "b \n\t"
PADDLE_LABEL_AFTER_LOOP
": \n\t"
"vmull.s8 q4, d0, d4 \n\t" // first half
"vmull.s8 q5, d2, d4 \n\t"
"vmull.s8 q6, d0, d6 \n\t"
"vmull.s8 q7, d2, d6 \n\t"
"vmlal.s8 q4, d1, d5 \n\t" // second half
"vmlal.s8 q5, d3, d5 \n\t"
"vmlal.s8 q6, d1, d7 \n\t"
"vmlal.s8 q7, d3, d7 \n\t"
"vpadal.s16 q12, q4 \n\t" // pairwise-add
"vpadal.s16 q13, q5 \n\t"
"vpadal.s16 q14, q6 \n\t"
"vpadal.s16 q15, q7 \n\t"
"vpadd.s32 d0, d16, d17 \n\t" // reduce to int32
"vpadd.s32 d1, d18, d19 \n\t"
"vpadd.s32 d2, d20, d21 \n\t"
"vpadd.s32 d3, d22, d23 \n\t"
"vpadd.s32 d4, d24, d25 \n\t"
"vpadd.s32 d5, d26, d27 \n\t"
"vpadd.s32 d6, d28, d29 \n\t"
"vpadd.s32 d7, d30, d31 \n\t"
"vpadd.s32 d8, d0, d1 \n\t" // reduce to int32 again
"vpadd.s32 d9, d2, d3 \n\t"
"vpadd.s32 d10, d4, d5 \n\t"
"vpadd.s32 d11, d6, d7 \n\t"
"vst1.32 {d8}, [%[c]], %[ldc] \n\t"
"vst1.32 {d9}, [%[c]], %[ldc] \n\t"
"vst1.32 {d10}, [%[c]], %[ldc] \n\t"
"vst1.32 {d11}, [%[c]] \n\t"
: [k] "+r"(k), [a] "+r"(a), [b] "+r"(b), [c] "+r"(c)
: [ldc] "r"(ldc)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9", "q10", "q11", "q12", "q13", "q14", "q15");
#undef PADDLE_LABEL_AFTER_LOOP
#undef PADDLE_LABEL_LOOP
#endif // __aarch64__
#endif // __ARM_NEON
}
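// A scalar sketch of what AddDot4x2 computes, assuming the 16-deep packed
// layouts written by PackMatrixA_4r_16 and PackMatrixB_2c_16 below;
// AddDot4x2_reference is a hypothetical name, not part of this file.
#include <cstdint>

void AddDot4x2_reference(int32_t k, const int8_t *a, const int8_t *b,
                         int32_t *c, int32_t ldc) {
  // k is already padded to a multiple of 16 by the packing routines.
  for (int32_t i = 0; i < 4; ++i) {    // 4 rows of A
    for (int32_t j = 0; j < 2; ++j) {  // 2 columns of B
      int32_t acc = 0;
      for (int32_t p = 0; p < k; p += 16) {
        const int8_t *pa = a + (p / 16) * 64 + i * 16;  // 4 rows * 16 deep
        const int8_t *pb = b + (p / 16) * 32 + j * 16;  // 2 cols * 16 deep
        for (int32_t q = 0; q < 16; ++q) {
          acc += static_cast<int32_t>(pa[q]) * static_cast<int32_t>(pb[q]);
        }
      }
      c[i * ldc + j] = acc;
    }
  }
}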
// 8 bits int small block inner product
void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
const int8_t *a_ptr, *b_ptr;
a_ptr = a;
......@@ -539,51 +669,229 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
}
// 8 bits int inner product
template <>
void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
                       const int8_t *b, float beta, int32_t *c, int8_t *C,
                       int32_t ldc, bool relu) {}
template <>
void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
                       const int8_t *b, float beta, int32_t *c, int32_t *C,
                       int32_t ldc, bool relu) {
#pragma omp parallel for
  for (int32_t j = 0; j < nc; j += NR_INT8) {
    for (int32_t i = 0; i < mc; i += MR_INT8) {
#if __aarch64__
      // TODO
#else
      // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      AddDot4x2(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#endif  // __aarch64__
    }
  }
  if (!relu) {
    WriteBasic(mc, nc, c, C, ldc);
    return;
  }
}
template <>
void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
                               const int8_t *a, const int8_t *b, float beta,
                               int32_t *c, int8_t *C, int32_t ldc, bool relu,
                               int32_t *bias, bool addOnRow) {
#pragma omp parallel for
  for (int32_t j = 0; j < nc; j += NR_INT8) {
    for (int32_t i = 0; i < mc; i += MR_INT8) {
#if __aarch64__
      // TODO
#else
      // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      AddDot4x2(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#endif  // __aarch64__
    }
  }
  // Dispatch the quantized write-back: ReLU fuses with the scale, and the
  // bias can be added per-row or per-column.
  if (relu) {
    WriteWithAddReluScale(mc, nc, c, C, ldc, bias, alpha);
    return;
  } else {
    if (addOnRow) {
      WriteWithAddScaleT(mc, nc, c, C, ldc, bias, alpha);
    } else {
      WriteWithAddScale(mc, nc, c, C, ldc, bias, alpha);
    }
    return;
  }
}
template <>
void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
const int8_t *a, const int8_t *b, float beta,
int32_t *c, int32_t *C, int32_t ldc, bool relu,
int32_t *bias, bool addOnRow) {}
// 8 bits int PackMatrixA_4r_16
void Gemm::PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer) {
const int32_t i_length = m - m_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
for (int32_t i = 0; i < i_length; i += 4) {
const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda;
const int8_t *a3 = A + (i + 3) * lda;
int8_t *local_buffer = buffer + i * KC;
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (m_tail != 0) {
const int8_t *a0 = &A(i_length, 0);
const int8_t *a1 = a0 + lda;
const int8_t *a2 = a0 + 2 * lda;
const int8_t *a3 = a0 + 3 * lda;
int8_t *local_buffer = buffer + i_length * KC;
switch (m_tail) {
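      // Intentional fall-through: every row past m_tail reads from zero_int8
      // so the packed block is zero-padded.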
case 1:
a1 = zero_int8;
case 2:
a2 = zero_int8;
case 3:
a3 = zero_int8;
break;
default:
break;
}
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
}
// 8 bits int PackMatrixA_4r
void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer) {
const int8_t *a0, *a1, *a2, *a3;
for (int32_t i = 0; i < m - m_tail; i += 4) {
a0 = A + i * lda;
a1 = A + (i + 1) * lda;
a2 = A + (i + 2) * lda;
......@@ -625,7 +933,7 @@ void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer) {
const int32_t i_length = m - m_tail;
for (int32_t i = 0; i < i_length; i += 6) {
const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda;
......@@ -676,17 +984,85 @@ void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
}
}
// 8 bits int PackMatrixB
void Gemm::PackMatrixB_2c_16(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
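  // B is packed two columns at a time, 16 rows deep, matching the panel
  // order AddDot4x2 streams through; tails are zero-padded up to KC.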
for (int32_t j = 0; j < j_length; j += 2) {
int8_t *local_buffer = buffer + j * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j);
const int8_t *b1 = &B((i << 4), j + 1);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b1;
b1 += ldb;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j);
const int8_t *b1 = &B((k_count << 4), j + 1);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b1;
b1 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (n_tail != 0) {
int8_t *local_buffer = buffer + j_length * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j_length);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = 0;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j_length);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
}
// 8 bits int PackMatrixB
void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail;
for (int32_t j = 0; j < j_length; j += 8) {
int8_t *local_buffer = buffer + j * k;
for (int32_t i = 0; i < k; ++i) {
const int8_t *b0 = &B(i, j);
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
// "pld [%[b0]] \n\t"
......@@ -715,94 +1091,27 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
for (int32_t j = j_length; j < n; ++j) {
*local_buffer++ = *b0++;
}
for (int32_t j = n; j < j_length + 8; ++j) {
*local_buffer++ = 0;
}
}
}
}
// 8 bits int matrix product (m*k x k*n)
void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, int8_t beta,
int32_t *C, int32_t ldc, bool relu, int8_t *bias) {
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
// L2 cache is 0.5~4 Mib (Contex-A72 cluster)
int32_t L1 = 32 * 1024;
int32_t L2 = 512 * 1024;
KC = k;
MC = L1 / (KC * sizeof(int8_t));
NC = L2 / (KC * sizeof(int8_t));
// make sure MC is multiple of MR_INT8, and NC is multiple of NR
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
if (NC == 0) {
NC = NR;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR - 1) / NR * NR;
}
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
packedC_int8 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC));
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * KC);
int32_t mc, nc;
for (int32_t j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB_int8);
for (int32_t i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
// PackMatrixA_6r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
if (bias == nullptr) {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int8, &C(i, j), ldc, relu, nullptr);
} else {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int8, &C(i, j), ldc, relu, bias + i);
}
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int8);
paddle_mobile::memory::Free(zero_int8);
}
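// The block sizes above follow the usual cache-blocking rule: an MC x KC
// panel of A should fit in L1 and a KC x NC panel of B in L2, evened out
// across the matrix and rounded up to the register tile. A self-contained
// sketch of that computation (the function name is illustrative):
static inline int32_t compute_block_size(int32_t total, int32_t cache_bytes,
                                         int32_t kc, int32_t tile) {
  int32_t block = cache_bytes / (kc * static_cast<int32_t>(sizeof(int8_t)));
  if (block == 0) return tile;                    // cache smaller than a panel
  int32_t nblocks = (total + block - 1) / block;  // blocks needed to cover
  block = (total + nblocks - 1) / nblocks;        // spread the work evenly
  return (block + tile - 1) / tile * tile;        // round up to the tile
}
// e.g. MC = compute_block_size(m, 32 * 1024, KC, MR_INT8);
//      NC = compute_block_size(n, 512 * 1024, KC, NR);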
// 8 bits int write back
// C = alpha * A * B + beta * C
void Gemm::WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc) {}
// C = A * B, 8-bit int (int32_t output)
// C = A * B
void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO(wzzju)
// TODO
#else
int32_t nc1 = nc >> 4;
int32_t _nc1 = nc & 15;
int32_t step = sizeof(int32_t) * ldc;
int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 4));
int32_t volatile m = mc;
int32_t volatile n = nc1;
int32_t *volatile c_ptr, *volatile C_ptr;
int32_t *C0, *c0;
c_ptr = c;
......@@ -836,7 +1145,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
"end_mc_%=: \n\t"
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1),
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
[step] "r"(step), [step1] "r"(step1)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3");
}
......@@ -854,20 +1163,372 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
#endif // __ARM_NEON
}
// C = A * B + C
void Gemm::WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc) {}
// C = A * B + bias, scale * C, bias is added on column
void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
int8_t narrow = -128;
int32_t nc1 = nc >> 3;
int32_t _nc1 = nc & 7;
int32_t step = sizeof(int8_t) * ldc;
int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3));
int32_t volatile m = mc;
int32_t volatile n = nc1;
int32_t *volatile c_ptr, *volatile bias_ptr;
int8_t *volatile C_ptr;
c_ptr = c;
C_ptr = C;
bias_ptr = bias;
if (nc1 > 0) {
asm volatile(
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"vdup.32 q15, %[scale] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"loop_mc_%=: \n\t"
"vld1.32 {d26[0]}, [%[bias_ptr]]!\n\t"
"vdup.32 q13, d26[0] \n\t"
"mov r6, %[C_ptr] \n\t"
"mov r5, %[nc1] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c_ptr]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vqadd.s32 q1, q1, q13 \n\t"
"vcvt.f32.s32 q2, q0 \n\t"
"vcvt.f32.s32 q3, q1 \n\t"
"vmul.f32 q2, q2, q15 \n\t"
"vmul.f32 q3, q3, q15 \n\t"
"vcvt.s32.f32 q4, q2 \n\t"
"vcvt.s32.f32 q5, q3 \n\t"
"vqmovn.s32 d12, q4 \n\t"
"vqmovn.s32 d13, q5 \n\t"
"vqmovn.s16 d14, q6 \n\t"
"vceq.s8 d15, d14, d24 \n\t"
"vsub.s8 d14, d14, d15 \n\t"
"vst1.8 {d14}, [r6]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"add %[C_ptr], %[C_ptr], %[step] \n\t"
"add %[c_ptr], %[c_ptr], %[step1] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
// C = A * B + bias
void Gemm::WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias) {}
// C = A * B + C, relu(C)
void Gemm::WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc) {}
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
[step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr),
[scale] "r"(scale), [narrow] "r"(narrow)
: "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
"q7", "q12", "q13", "q15");
}
// C = A * B + bias, relu(C)
void Gemm::WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias) {}
int32_t nc_left;
int32_t *c0;
int8_t *C0;
int32_t bias_v;
if (_nc1 != 0) {
for (int32_t i = 0; i < mc; i++) {
C0 = C_ptr + nc1 * 8 + i * ldc;
c0 = c_ptr + nc1 * 8 + i * NC;
bias_v = *(bias_ptr + i);
nc_left = _nc1;
asm volatile(
"vdup.32 q15, %[scale] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"vdup.32 q13, %[bias_v] \n\t"
"cmp %[_nc1], #4 \n\t"
"blt less_four_%= \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vceq.s8 d9, d8, d24 \n\t"
"vsub.s8 d8, d8, d9 \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vst1.8 {d8[1]}, [%[C0]]! \n\t"
"vst1.8 {d8[2]}, [%[C0]]! \n\t"
"vst1.8 {d8[3]}, [%[C0]]! \n\t"
"subs %[_nc1], %[_nc1], #4 \n\t"
"beq process_over_%= \n\t"
"less_four_%=: \n\t"
"vld1.32 {q0}, [%[c0]] \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vceq.s8 d9, d8, d24 \n\t"
"vsub.s8 d8, d8, d9 \n\t"
"loop_save_%=: \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vext.8 d8, d8, d8, #1 \n\t"
"subs %[_nc1], %[_nc1], #1 \n\t"
"bgt loop_save_%= \n\t"
"process_over_%=: \n\t"
:
: [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0),
[bias_v] "r"(bias_v), [scale] "r"(scale), [narrow] "r"(narrow)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q15");
}
}
#endif // __aarch64__
#endif // __ARM_NEON
}
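// A scalar reference for the NEON write-back above (a sketch, not library
// API). It mirrors the instruction sequence: saturating add of the per-row
// bias (vqadd.s32), float scaling, conversion back to integer
// (vcvt.s32.f32 rounds toward zero), saturating narrow to int8 (vqmovn),
// then the vceq/vsub pair, which bumps -128 up to -127 so the output range
// stays symmetric.
static inline int8_t write_with_add_scale_ref(int32_t acc, int32_t bias,
                                              float scale) {
  int64_t sum = static_cast<int64_t>(acc) + static_cast<int64_t>(bias);
  if (sum > INT32_MAX) sum = INT32_MAX;
  if (sum < INT32_MIN) sum = INT32_MIN;
  int32_t q = static_cast<int32_t>(static_cast<float>(sum) * scale);
  if (q > 127) q = 127;
  if (q < -128) q = -128;
  return static_cast<int8_t>(q == -128 ? -127 : q);
}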
// C = A * B + bias, scale * C, bias is added on row
void Gemm::WriteWithAddScaleT(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
int8_t narrow = -128;
int32_t nc1 = nc >> 3;
int32_t _nc1 = nc & 7;
int32_t step = sizeof(int8_t) * ldc;
int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3));
int32_t volatile m = mc;
int32_t volatile n = nc1;
int32_t *volatile c_ptr, *volatile bias_ptr;
int8_t *volatile C_ptr;
c_ptr = c;
C_ptr = C;
bias_ptr = bias;
if (nc1 > 0) {
asm volatile(
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"vdup.32 q15, %[scale] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"loop_mc_%=: \n\t"
"mov r4, %[bias_ptr] \n\t"
"mov r6, %[C_ptr] \n\t"
"mov r5, %[nc1] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q13, q14}, [r4]! \n\t"
"vld1.32 {q0, q1}, [%[c_ptr]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vqadd.s32 q1, q1, q14 \n\t"
"vcvt.f32.s32 q2, q0 \n\t"
"vcvt.f32.s32 q3, q1 \n\t"
"vmul.f32 q2, q2, q15 \n\t"
"vmul.f32 q3, q3, q15 \n\t"
"vcvt.s32.f32 q4, q2 \n\t"
"vcvt.s32.f32 q5, q3 \n\t"
"vqmovn.s32 d12, q4 \n\t"
"vqmovn.s32 d13, q5 \n\t"
"vqmovn.s16 d14, q6 \n\t"
"vceq.s8 d15, d14, d24 \n\t"
"vsub.s8 d14, d14, d15 \n\t"
"vst1.8 {d14}, [r6]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"add %[C_ptr], %[C_ptr], %[step] \n\t"
"add %[c_ptr], %[c_ptr], %[step1] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
[step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr),
[scale] "r"(scale), [narrow] "r"(narrow)
: "cc", "memory", "r4", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5",
"q6", "q7", "q12", "q13", "q15");
}
int32_t nc_left;
int32_t *c0;
int8_t *C0;
int32_t *volatile bias0 = bias_ptr + nc1 * 8;
if (_nc1 != 0) {
for (int32_t i = 0; i < mc; i++) {
C0 = C_ptr + nc1 * 8 + i * ldc;
c0 = c_ptr + nc1 * 8 + i * NC;
nc_left = _nc1;
asm volatile(
"vdup.32 q15, %[scale] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"cmp %[_nc1], #4 \n\t"
"blt less_four_%= \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vld1.32 {q13}, [%[bias0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vceq.s8 d9, d8, d24 \n\t"
"vsub.s8 d8, d8, d9 \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vst1.8 {d8[1]}, [%[C0]]! \n\t"
"vst1.8 {d8[2]}, [%[C0]]! \n\t"
"vst1.8 {d8[3]}, [%[C0]]! \n\t"
"subs %[_nc1], %[_nc1], #4 \n\t"
"beq process_over_%= \n\t"
"less_four_%=: \n\t"
"vld1.32 {q0}, [%[c0]] \n\t"
"vld1.32 {q13}, [%[bias0]] \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vceq.s8 d9, d8, d24 \n\t"
"vsub.s8 d8, d8, d9 \n\t"
"loop_save_%=: \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vext.8 d8, d8, d8, #1 \n\t"
"subs %[_nc1], %[_nc1], #1 \n\t"
"bgt loop_save_%= \n\t"
"process_over_%=: \n\t"
:
: [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0), [bias0] "r"(bias0),
[scale] "r"(scale), [narrow] "r"(narrow)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q15");
}
}
#endif // __aarch64__
#endif // __ARM_NEON
}
// C = A * B + bias, scale * relu(C), bias is added on column
void Gemm::WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
int32_t zero = 0;
int32_t nc1 = nc >> 3;
int32_t _nc1 = nc & 7;
int32_t step = sizeof(int8_t) * ldc;
int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3));
int32_t volatile m = mc;
int32_t volatile n = nc1;
int32_t *volatile c_ptr, *volatile bias_ptr;
int8_t *volatile C_ptr;
c_ptr = c;
C_ptr = C;
bias_ptr = bias;
if (nc1 > 0) {
asm volatile(
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"vdup.32 q15, %[scale] \n\t"
"vdup.32 q14, %[zero] \n\t"
"loop_mc_%=: \n\t"
"vld1.32 {d26[0]}, [%[bias_ptr]]!\n\t"
"vdup.32 q13, d26[0] \n\t"
"mov r6, %[C_ptr] \n\t"
"mov r5, %[nc1] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c_ptr]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vqadd.s32 q1, q1, q13 \n\t"
"vmax.s32 q0, q0, q14 \n\t"
"vmax.s32 q1, q1, q14 \n\t"
"vcvt.f32.s32 q2, q0 \n\t"
"vcvt.f32.s32 q3, q1 \n\t"
"vmul.f32 q2, q2, q15 \n\t"
"vmul.f32 q3, q3, q15 \n\t"
"vcvt.s32.f32 q4, q2 \n\t"
"vcvt.s32.f32 q5, q3 \n\t"
"vqmovn.s32 d12, q4 \n\t"
"vqmovn.s32 d13, q5 \n\t"
"vqmovn.s16 d14, q6 \n\t"
"vst1.8 {d14}, [r6]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"add %[C_ptr], %[C_ptr], %[step] \n\t"
"add %[c_ptr], %[c_ptr], %[step1] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
[step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr),
[scale] "r"(scale), [zero] "r"(zero)
: "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
"q7", "q13", "q14", "q15");
}
int32_t nc_left;
int32_t *c0;
int8_t *C0;
int32_t bias_v;
if (_nc1 != 0) {
for (int32_t i = 0; i < mc; i++) {
C0 = C_ptr + nc1 * 8 + i * ldc;
c0 = c_ptr + nc1 * 8 + i * NC;
bias_v = *(bias_ptr + i);
nc_left = _nc1;
asm volatile(
"vdup.32 q15, %[scale] \n\t"
"vdup.32 q14, %[zero] \n\t"
"vdup.32 q13, %[bias_v] \n\t"
"cmp %[_nc1], #4 \n\t"
"blt less_four_%= \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vmax.s32 q0, q0, q14 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vst1.8 {d8[1]}, [%[C0]]! \n\t"
"vst1.8 {d8[2]}, [%[C0]]! \n\t"
"vst1.8 {d8[3]}, [%[C0]]! \n\t"
"subs %[_nc1], %[_nc1], #4 \n\t"
"beq process_over_%= \n\t"
"less_four_%=: \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vmax.s32 q0, q0, q14 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"loop_save_%=: \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vext.8 d8, d8, d8, #1 \n\t"
"subs %[_nc1], %[_nc1], #1 \n\t"
"bgt loop_save_%= \n\t"
"process_over_%=: \n\t"
:
: [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0),
[bias_v] "r"(bias_v), [scale] "r"(scale), [zero] "r"(zero)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q13", "q14", "q15");
}
}
#endif // __aarch64__
#endif // __ARM_NEON
}
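// The relu variant above differs from WriteWithAddScale only in clamping the
// int32 accumulator at zero before scaling (vmax.s32 against q14 == 0); the
// result is then non-negative, so the -128 -> -127 fixup is not needed.
// A scalar sketch:
static inline int8_t write_with_add_relu_scale_ref(int32_t acc, int32_t bias,
                                                   float scale) {
  int64_t sum = static_cast<int64_t>(acc) + static_cast<int64_t>(bias);
  if (sum > INT32_MAX) sum = INT32_MAX;  // vqadd.s32
  if (sum < 0) sum = 0;                  // vmax.s32 with zero: the relu
  int32_t q = static_cast<int32_t>(static_cast<float>(sum) * scale);
  return static_cast<int8_t>(q > 127 ? 127 : q);  // vqmovn saturation
}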
} // namespace math
} // namespace operators
......
......@@ -27,130 +27,17 @@ namespace paddle_mobile {
namespace operators {
namespace math {
// 8 bits int matrix product (m*k x k*n)
void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha,
const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
int8_t beta, int32_t *C, int32_t ldc, bool relu,
int8_t *bias) {
#ifdef _OPENMP
int32_t max_threads = omp_get_max_threads();
#else
int32_t max_threads = 1;
#endif
int32_t L1 = 64 / max_threads * 1024;
KC = k;
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * KC);
if (m > n) {
// block matrix A
MC = L1 / (KC * sizeof(int8_t));
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// pad B to a multiple of NR
NC = (n + NR - 1) / NR * NR;
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
#if __aarch64__
// TODO(wzzju)
#else
PackMatrixB_omp_8c(KC, n, n % NR, B, ldb, packedB_int8);
#endif
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads));
} else {
// block matrix B
NC = L1 / (KC * sizeof(int8_t));
if (NC == 0) {
NC = NR;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR - 1) / NR * NR;
}
// pad A to a multiple of MR_INT8
MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8;
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
#if __aarch64__
// TODO(wzzju)
#else
PackMatrixA_omp_4r(m, KC, m % MR_INT8, A, lda, packedA_int8);
#endif
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads));
}
packedC_int8 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads));
if (m > n) {
#pragma omp parallel for
for (int32_t i = 0; i < m; i += MC) {
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
#else
int32_t local_threads = 0;
#endif
int32_t mc;
mc = s_min(m - i, MC);
int8_t *local_A = packedA_int8 + MC * KC * local_threads;
int32_t *local_C = packedC_int8 + MC * NC * local_threads;
#if __aarch64__
// TODO(wzzju)
#else
PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, local_A);
#endif
InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C,
&C(i, 0), ldc, relu, bias + i);
}
} else {
#pragma omp parallel for
for (int32_t j = 0; j < n; j += NC) {
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
#else
int32_t local_threads = 0;
#endif
int32_t nc;
nc = s_min(n - j, NC);
int8_t *local_B = packedB_int8 + KC * NC * local_threads;
int32_t *local_C = packedC_int8 + MC * NC * local_threads;
#if __aarch64__
// TODO(wzzju)
#else
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, local_B);
#endif
InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C,
&C(0, j), ldc, relu, bias);
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int8);
paddle_mobile::memory::Free(zero_int8);
}
void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail;
#pragma omp parallel for
for (int32_t j = 0; j < j_length; j += NR) {
for (int32_t j = 0; j < j_length; j += 8) {
int8_t *local_buffer = buffer + j * k;
for (int32_t i = 0; i < k; ++i) {
const int8_t *b0 = &B(i, j);
#if __ARM_NEON
#if __aarch64__
// TODO(wzzju)
// TODO
#else
asm volatile(
// "pld [%[b0]] \n\t"
......@@ -179,7 +66,7 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
for (int32_t j = j_length; j < n; ++j) {
*local_buffer++ = *b0++;
}
for (int32_t j = n; j < j_length + NR; ++j) {
for (int32_t j = n; j < j_length + 8; ++j) {
*local_buffer++ = 0;
}
}
......@@ -188,9 +75,9 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer) {
const int i_length = m - m_tail;
const int32_t i_length = m - m_tail;
#pragma omp parallel for
for (int32_t i = 0; i < i_length; i += MR_INT8) {
for (int32_t i = 0; i < i_length; i += 4) {
const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda;
......@@ -221,7 +108,7 @@ void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
default:
break;
}
for (int j = 0; j < k; ++j) {
for (int32_t j = 0; j < k; ++j) {
*local_buffer++ = *a0++;
*local_buffer++ = *a1++;
*local_buffer++ = *a2++;
......@@ -230,6 +117,232 @@ void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
}
}
// 8 bits int PackMatrixA_omp_4r_16
void Gemm::PackMatrixA_omp_4r_16(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer) {
const int32_t i_length = m - m_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
#pragma omp parallel for
for (int32_t i = 0; i < i_length; i += 4) {
const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda;
const int8_t *a3 = A + (i + 3) * lda;
int8_t *local_buffer = buffer + i * KC;
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (m_tail != 0) {
const int8_t *a0 = &A(i_length, 0);
const int8_t *a1 = a0 + lda;
const int8_t *a2 = a0 + 2 * lda;
const int8_t *a3 = a0 + 3 * lda;
int8_t *local_buffer = buffer + i_length * KC;
switch (m_tail) {
case 1:
a1 = zero_int8;
case 2:
a2 = zero_int8;
case 3:
a3 = zero_int8;
break;
default:
break;
}
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
}
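// The packed-A layout mirrors the B packing: rows in groups of four, k in
// 16-column chunks, tails zero-padded up to KC. Inside a full chunk the
// packed offset of A(i, j) is (i/4) * 4 * KC + (j/16) * 64 +
// (i%4) * 16 + (j%16); a sketch with a hypothetical helper name:
static inline int32_t packed_a_4r_16_offset(int32_t i, int32_t j, int32_t kc) {
  return (i >> 2) * 4 * kc + (j >> 4) * 64 + (i & 3) * 16 + (j & 15);
}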
// 8 bits int PackMatrixB
void Gemm::PackMatrixB_omp_2c_16(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
#pragma omp parallel for
for (int32_t j = 0; j < j_length; j += 2) {
int8_t *local_buffer = buffer + j * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j);
const int8_t *b1 = &B((i << 4), j + 1);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b1;
b1 += ldb;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j);
const int8_t *b1 = &B((k_count << 4), j + 1);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b1;
b1 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (n_tail != 0) {
int8_t *local_buffer = buffer + j_length * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j_length);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = 0;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j_length);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -34,12 +34,12 @@ struct GRUUnitFunctor<CPU, T> {
gemm.Sgemm_omp(batch_size, frame_size * 2, frame_size, 1,
value.prev_out_value, frame_size, value.gate_weight,
frame_size * 2, 1, value.gate_value, frame_size * 3, false,
nullptr);
static_cast<float *>(nullptr));
#else
gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1,
value.prev_out_value, frame_size, value.gate_weight,
frame_size * 2, 1, value.gate_value, frame_size * 3, false,
nullptr);
static_cast<float *>(nullptr));
#endif
}
......@@ -51,12 +51,12 @@ struct GRUUnitFunctor<CPU, T> {
gemm.Sgemm_omp(batch_size, frame_size, frame_size, 1,
value.reset_output_value, frame_size, value.state_weight,
frame_size, 1, value.gate_value + frame_size * 2,
frame_size * 3, false, nullptr);
frame_size * 3, false, static_cast<float *>(nullptr));
#else
gemm.Sgemm(batch_size, frame_size, frame_size, 1,
value.reset_output_value, frame_size, value.state_weight,
frame_size, 1, value.gate_value + frame_size * 2,
frame_size * 3, false, nullptr);
frame_size * 3, false, static_cast<float *>(nullptr));
#endif
}
......
......@@ -28,7 +28,13 @@ template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false,
T *bias = nullptr);
float *bias = nullptr);
template <typename T, typename S>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false,
S *bias = nullptr, bool addOnRow = false);
template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
......
......@@ -20,11 +20,12 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
namespace math {
template <>
void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
int8_t alpha, framework::Tensor *matrix_out, int8_t beta,
bool relu, int8_t *bias) {
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu, int32_t *bias,
bool addOnRow) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
......@@ -52,21 +53,43 @@ void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a,
}
#ifdef _OPENMP
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
if (bias != nullptr) {
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias, addOnRow);
} else {
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias, addOnRow);
}
#else
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
if (bias != nullptr) {
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias, addOnRow);
} else {
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias, addOnRow);
}
#endif
} else {
#ifdef _OPENMP
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
if (bias != nullptr) {
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias, addOnRow);
} else {
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias, addOnRow);
}
#else
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(), N,
relu, bias);
if (bias != nullptr) {
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int8_t>(),
N, relu, bias, addOnRow);
} else {
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(),
N, relu, bias, addOnRow);
}
#endif
}
}
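// A hedged usage sketch of the dispatch above (illustrative, not part of the
// commit): a non-null int32 bias selects the requantized int8 output path,
// with alpha acting as the output scale; a null bias yields raw int32
// accumulators and, as in the GRU code above, must be cast so the template
// argument can be deduced.
static void matmul_int8_example(const framework::Tensor &a,
                                const framework::Tensor &b,
                                framework::Tensor *out, float scale,
                                int32_t *bias, bool add_on_row) {
  if (bias != nullptr) {
    out->mutable_data<int8_t>();  // requantized output, scaled by 'scale'
    matmul(a, false, b, false, scale, out, 0.f, false, bias, add_on_row);
  } else {
    out->mutable_data<int32_t>();  // raw accumulators
    matmul(a, false, b, false, 1.f, out, 0.f, false,
           static_cast<int32_t *>(nullptr), false);
  }
}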
......
......@@ -38,6 +38,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
const int input_width = static_cast<int>(input->dims()[3]);
const int output_height = static_cast<int>(output->dims()[2]);
const int output_width = static_cast<int>(output->dims()[3]);
output->mutable_data<float>();
const int hxw = input_height * input_width;
......@@ -472,7 +473,7 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
const int inputdata_channel_stride = h_in * w_in;
const int input_batch_stride = output_channels * inputdata_channel_stride;
const int output_batch_stride = output_channels * outputdata_channel_stride;
float *out_data = output->data<float>();
float *out_data = output->mutable_data<float>();
const float *input_data = input->data<float>();
for (int k = 0; k < batch_size; ++k) {
#pragma omp parallel for
......
......@@ -28,15 +28,21 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::vector;
void Pool3x3Avgs1p1(const Tensor *input, Tensor *output);
void Pool3x3Maxs1p1(const Tensor *input, Tensor *output);
void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output);
void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x,
Tensor *out);
void Pool3x3Avgs1p1(const framework::Tensor *input, framework::Tensor *output);
void Pool3x3Maxs1p1(const framework::Tensor *input, framework::Tensor *output);
void Pool3x3Max(std::vector<int> strides, std::vector<int> paddings,
const framework::Tensor *input, framework::Tensor *output);
void Pool3x3Avg(std::vector<int> strides, std::vector<int> paddings,
const framework::Tensor *in_x, framework::Tensor *out);
void Pool3x3Maxs1_int8(const framework::Tensor *input,
framework::Tensor *output, int32_t pad_h, int32_t pad_w);
void Pool3x3Maxs2_int8(const framework::Tensor *input,
framework::Tensor *output, int32_t pad_h, int32_t pad_w);
void Pool3x3Max_int8(const std::vector<int> &strides,
const std::vector<int> &paddings,
const framework::Tensor *input, framework::Tensor *output);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#ifdef _OPENMP
#include <omp.h>
#endif
#include "framework/tensor.h"
#include "operators/math/pool_3x3.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
#include <climits>
#include <iostream>
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::max;
using std::min;
using std::vector;
template <typename T>
static void make_paddings(const Tensor *input, Tensor *padded_input,
int32_t top, int32_t bottom, int32_t left,
int32_t right, T value) {
const int32_t batch_size = input->dims()[0];
const int32_t c_in = input->dims()[1];
const int32_t h_in = input->dims()[2];
const int32_t w_in = input->dims()[3];
const int32_t h_padded = h_in + top + bottom;
const int32_t w_padded = w_in + left + right;
padded_input->Resize({batch_size, c_in, h_padded, w_padded});
T *padded_input_data = padded_input->mutable_data<T>();
const T *input_data = input->data<T>();
const int32_t input_channel_stride = h_in * w_in;
const int32_t input_batch_stride = c_in * input_channel_stride;
const int32_t padded_channel_stride = h_padded * w_padded;
const int32_t padded_batch_stride = c_in * padded_channel_stride;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int j = 0; j < c_in; ++j) {
const T *img_in = input_data + j * input_channel_stride;
T *img_padded = padded_input_data + j * padded_channel_stride;
int k = 0;
for (; k < top; ++k) {
for (int l = 0; l < w_padded; ++l) {
img_padded[l] = value;
}
img_padded += w_padded;
}
for (; k < top + h_in; ++k) {
int l = 0;
for (; l < left; ++l) {
img_padded[l] = value;
}
memcpy(img_padded + left, img_in, w_in * sizeof(T));
l += w_in;
img_in += w_in;
for (; l < w_padded; ++l) {
img_padded[l] = value;
}
img_padded += w_padded;
}
for (; k < h_padded; ++k) {
for (int l = 0; l < w_padded; ++l) {
img_padded[l] = value;
}
img_padded += w_padded;
}
}
input_data += input_batch_stride;
padded_input_data += padded_batch_stride;
}
// input_data = input->data<T>();
// std::cout << "+++++++++++++++++++Origin begin++++++++++++++++++++"
// << std::endl;
// for (int i = 0; i < 1; ++i) {
// for (int j = 0; j < 1; ++j) {
// const T *img_in = input_data + j * input_channel_stride;
// for (int k = 0; k < h_in; ++k) {
// for (int l = 0; l < w_in; ++l) {
// std::cout << (int32_t)*img_in << "\t";
// img_in++;
// }
// std::cout << std::endl;
// }
// }
// input_data += input_batch_stride;
// }
// std::cout << "+++++++++++++++++++Origin end++++++++++++++++++++" <<
// std::endl;
//
// padded_input_data = padded_input->data<T>();
// std::cout << "******************Padding begin**********************"
// << std::endl;
// for (int i = 0; i < 1; ++i) {
// for (int j = 0; j < 1; ++j) {
// T *img_padded = padded_input_data + j * padded_channel_stride;
// for (int k = 0; k < h_padded; ++k) {
// for (int l = 0; l < w_padded; ++l) {
// std::cout << (int32_t)*img_padded << "\t";
// img_padded++;
// }
// std::cout << std::endl;
// }
// }
// padded_input_data += padded_batch_stride;
// }
// std::cout << "******************Padding end**********************"
// << std::endl;
}
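// A minimal shape sketch for make_paddings above (illustrative, not in the
// commit): a 1x1x3x3 int8 input padded by one on every side with -SCHAR_MAX
// becomes 1x1x5x5 with the original block centered.
static void make_paddings_example() {
  Tensor in, padded;
  in.Resize({1, 1, 3, 3});
  int8_t *p = in.mutable_data<int8_t>();
  for (int i = 0; i < 9; ++i) p[i] = static_cast<int8_t>(i + 1);
  make_paddings<int8_t>(&in, &padded, 1, 1, 1, 1, int8_t(-SCHAR_MAX));
  // padded.dims() == {1, 1, 5, 5}; padded row 1, cols 1..3 hold {1, 2, 3}.
}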
void Pool3x3Maxs1_int8(const Tensor *input, Tensor *output, int32_t pad_h,
int32_t pad_w) {
Tensor padded_input;
if (pad_h != 0 && pad_w != 0) {
int8_t value = -SCHAR_MAX;
make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value);
input = &padded_input;
}
const int32_t batch_size = input->dims()[0];
const int32_t h_in = input->dims()[2];
const int32_t w_in = input->dims()[3];
const int8_t *input_data = input->data<int8_t>();
const int32_t output_channels = output->dims()[1];
const int32_t h_out = output->dims()[2];
const int32_t w_out = output->dims()[3];
int8_t *output_data = output->mutable_data<int8_t>();
const int32_t outputdata_channel_stride = h_out * w_out;
const int32_t inputdata_channel_stride = h_in * w_in;
const int32_t input_batch_stride = output_channels * inputdata_channel_stride;
const int32_t output_batch_stride =
output_channels * outputdata_channel_stride;
// std::cout << "h_out = " << h_out << ", w_out=" << w_out << std::endl;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int j = 0; j < output_channels; ++j) {
const int8_t *img_in = input_data + j * inputdata_channel_stride;
int8_t *img_out = output_data + j * outputdata_channel_stride;
for (int k = 0; k < h_out; ++k) {
const int8_t *row0 = img_in + k * w_in;
const int8_t *row1 = img_in + (k + 1) * w_in;
const int8_t *row2 = img_in + (k + 2) * w_in;
#if __ARM_NEON
int32_t nw = w_out >> 4;
int32_t left_w = w_out & 0xf;
int32_t nw1 = left_w >> 3;
int32_t left_w1 = left_w & 0x7;
#if __aarch64__
// TODO
#else
if (nw > 0) {
#define LOOP_LABEL "1"
// result: q15
asm volatile(
"vld1.8 {q0}, [%[row0]]! \n\t" // q0=0-15
"vld1.8 {q2}, [%[row1]]! \n\t"
"vld1.8 {q4}, [%[row2]]! \n\t"
LOOP_LABEL
": \n\t"
"vld1.8 {q1}, [%[row0]]! \n\t" // q1=16-31
"vext.8 q6, q0, q1, #1 \n\t"
"vext.8 q7, q0, q1, #2 \n\t"
"vld1.8 {q3}, [%[row1]]! \n\t"
"vmax.s8 q15, q0, q6 \n\t"
"vmax.s8 q15, q15, q7 \n\t"
"vext.8 q6, q2, q3, #1 \n\t"
"vext.8 q7, q2, q3, #2 \n\t"
"vld1.8 {q5}, [%[row2]]! \n\t"
"vmax.s8 q14, q2, q6 \n\t"
"vmax.s8 q14, q14, q7 \n\t"
"vext.8 q6, q4, q5, #1 \n\t"
"vext.8 q7, q4, q5, #2 \n\t"
"vmax.s8 q13, q4, q6 \n\t"
"vmax.s8 q13, q13, q7 \n\t"
"vmax.s8 q15, q15, q14 \n\t"
"vmax.s8 q15, q15, q13 \n\t"
"vmov.s8 q0, q1 \n\t"
"vmov.s8 q2, q3 \n\t"
"vmov.s8 q4, q5 \n\t"
"vst1.8 {q15}, [%[img_out]]! \n\t"
"subs %[nw], #1 \n\t"
"bne " LOOP_LABEL
"b \n\t"
"sub %[row0], #16 \n\t"
"sub %[row1], #16 \n\t"
"sub %[row2], #16 \n\t"
: [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1),
[row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q13", "q14", "q15");
#undef LOOP_LABEL
}
if (nw1 > 0 || left_w1 > 0) {
#define PADDLE_LABEL_LESS8 "1"
#define PADDLE_LABEL_LESS8_SAVE "2"
#define PADDLE_LABEL_OVER "3"
// result: d15
asm volatile(
"vld1.8 {d0}, [%[row0]]! \n\t" // d0=0-8
"vld1.8 {d2}, [%[row1]]! \n\t"
"vld1.8 {d4}, [%[row2]]! \n\t"
"mov r0, #1 \n\t"
"cmp %[nw1], #0 \n\t"
"beq " PADDLE_LABEL_LESS8
"f\n\t"
"vld1.8 {d1}, [%[row0]]! \n\t" // d1=9-15
"vext.8 d6, d0, d1, #1 \n\t"
"vext.8 d7, d0, d1, #2 \n\t"
"vld1.8 {d3}, [%[row1]]! \n\t"
"vmax.s8 d15, d0, d6 \n\t"
"vmax.s8 d15, d15, d7 \n\t"
"vext.8 d6, d2, d3, #1 \n\t"
"vext.8 d7, d2, d3, #2 \n\t"
"vld1.8 {d5}, [%[row2]]! \n\t"
"vmax.s8 d14, d2, d6 \n\t"
"vmax.s8 d14, d14, d7 \n\t"
"vext.8 d6, d4, d5, #1 \n\t"
"vext.8 d7, d4, d5, #2 \n\t"
"vmax.s8 d13, d4, d6 \n\t"
"vmax.s8 d13, d13, d7 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
"vmov.s8 d0, d1 \n\t"
"vmov.s8 d2, d3 \n\t"
"vmov.s8 d4, d5 \n\t"
"vst1.8 {d15}, [%[img_out]]! \n\t"
PADDLE_LABEL_LESS8
": \n\t"
"cmp %[left_w1], #0 \n\t"
"beq " PADDLE_LABEL_OVER
"f\n\t"
"vld1.8 {d1}, [%[row0]] \n\t" // d1=9-15
"vext.8 d6, d0, d1, #1 \n\t"
"vext.8 d7, d0, d1, #2 \n\t"
"vld1.8 {d3}, [%[row1]] \n\t"
"vmax.s8 d15, d0, d6 \n\t"
"vmax.s8 d15, d15, d7 \n\t"
"vext.8 d6, d2, d3, #1 \n\t"
"vext.8 d7, d2, d3, #2 \n\t"
"vld1.8 {d5}, [%[row2]] \n\t"
"vmax.s8 d14, d2, d6 \n\t"
"vmax.s8 d14, d14, d7 \n\t"
"vext.8 d6, d4, d5, #1 \n\t"
"vext.8 d7, d4, d5, #2 \n\t"
"vmax.s8 d13, d4, d6 \n\t"
"vmax.s8 d13, d13, d7 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
PADDLE_LABEL_LESS8_SAVE
": \n\t"
"vst1.8 {d15[0]}, [%[img_out]], r0\n\t"
"add %[row0], %[row0], #1 \n\t"
"add %[row1], %[row1], #1 \n\t"
"add %[row2], %[row2], #1 \n\t"
"vext.8 d15, d15, d15, #1 \n\t"
"subs %[left_w1], #1 \n\t"
"bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t"
PADDLE_LABEL_OVER ": \n\t"
: [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0),
[row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7", "d13", "d14", "d15");
#undef PADDLE_LABEL_OVER
#undef PADDLE_LABEL_LESS8_SAVE
#undef PADDLE_LABEL_LESS8
}
#endif // __aarch64__
#else
int32_t left = w_out;
while (left > 0) {
const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]);
const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]);
const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]);
*img_out = std::max(std::max(max0, max1), max2);
row0 += 1;
row1 += 1;
row2 += 1;
img_out++;
left--;
}
#endif // __ARM_NEON
}
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
}
void Pool3x3Maxs2_int8(const Tensor *input, Tensor *output, int32_t pad_h,
int32_t pad_w) {
Tensor padded_input;
if (pad_h != 0 && pad_w != 0) {
int8_t value = -SCHAR_MAX;
make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value);
input = &padded_input;
}
const int32_t batch_size = input->dims()[0];
const int32_t h_in = input->dims()[2];
const int32_t w_in = input->dims()[3];
const int32_t output_channels = output->dims()[1];
const int32_t h_out = output->dims()[2];
const int32_t w_out = output->dims()[3];
const int32_t outputdata_channel_stride = h_out * w_out;
const int32_t inputdata_channel_stride = h_in * w_in;
const int32_t output_batch_stride =
output_channels * outputdata_channel_stride;
const int32_t input_batch_stride = output_channels * inputdata_channel_stride;
const int8_t *input_data = input->data<int8_t>();
int8_t *output_data = output->mutable_data<int8_t>();
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int j = 0; j < output_channels; ++j) {
const int8_t *img_in = input_data + j * inputdata_channel_stride;
int8_t *img_out = output_data + j * outputdata_channel_stride;
for (int k = 0; k < h_out; ++k) {
const int8_t *row0 = img_in + 2 * k * w_in;
const int8_t *row1 = img_in + (2 * k + 1) * w_in;
const int8_t *row2 = img_in + (2 * k + 2) * w_in;
#if __ARM_NEON
int32_t nw = w_out >> 4;
int32_t left_w = w_out & 0xf;
int32_t nw1 = left_w >> 3;
int32_t left_w1 = left_w & 0x7;
#if __aarch64__
// TODO
#else
if (nw > 0) {
#define LOOP_LABEL "1"
// result: q15
asm volatile(
"vld2.8 {q0, q1}, [%[row0]]! \n\t" // q0=0-30, q1=1-31
"vld2.8 {q2, q3}, [%[row1]]! \n\t"
"vld2.8 {q4, q5}, [%[row2]]! \n\t"
LOOP_LABEL
": \n\t"
"vmax.s8 q15, q0, q1 \n\t"
"vld2.8 {q6, q7}, [%[row0]]! \n\t" // q0=32-62, q1=33-63
"vmax.s8 q14, q2, q3 \n\t"
"vmax.s8 q13, q4, q5 \n\t"
"vld2.8 {q8, q9}, [%[row1]]! \n\t"
"vext.8 q0, q0, q6, #1 \n\t"
"vmax.s8 q15, q15, q0 \n\t"
"vld2.8 {q10, q11}, [%[row2]]! \n\t"
"vext.8 q2, q2, q8, #1 \n\t"
"vmax.s8 q14, q14, q2 \n\t"
"vext.8 q4, q4, q10, #1 \n\t"
"vmax.s8 q13, q13, q4 \n\t"
"vmax.s8 q15, q15, q14 \n\t"
"vmax.s8 q15, q15, q13 \n\t"
"vmov.s8 q0, q6 \n\t"
"vmov.s8 q1, q7 \n\t"
"vmov.s8 q2, q8 \n\t"
"vmov.s8 q3, q9 \n\t"
"vmov.s8 q4, q10 \n\t"
"vmov.s8 q5, q11 \n\t"
"vst1.8 {q15}, [%[img_out]]! \n\t"
"subs %[nw], #1 \n\t"
"bne " LOOP_LABEL
"b \n\t"
"sub %[row0], #32 \n\t"
"sub %[row1], #32 \n\t"
"sub %[row2], #32 \n\t"
: [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1),
[row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q13", "q14", "q15");
#undef LOOP_LABEL
}
if (nw1 > 0 || left_w1 > 0) {
#define PADDLE_LABEL_LESS8 "1"
#define PADDLE_LABEL_LESS8_SAVE "2"
#define PADDLE_LABEL_OVER "3"
// result: d15
asm volatile(
"vld2.8 {d0, d1}, [%[row0]]! \n\t" // d0=0-14, d1=1-15
"vld2.8 {d2, d3}, [%[row1]]! \n\t"
"vld2.8 {d4, d5}, [%[row2]]! \n\t"
"mov r0, #1 \n\t"
"cmp %[nw1], #0 \n\t"
"beq " PADDLE_LABEL_LESS8
"f\n\t"
"vmax.s8 d15, d0, d1 \n\t"
"vld2.8 {d6, d7}, [%[row0]]! \n\t" // d0=32-62, d1=33-63
"vmax.s8 d14, d2, d3 \n\t"
"vmax.s8 d13, d4, d5 \n\t"
"vld2.8 {d8, d9}, [%[row1]]! \n\t"
"vext.8 d0, d0, d6, #1 \n\t"
"vmax.s8 d15, d15, d0 \n\t"
"vld2.8 {d10, d11}, [%[row2]]! \n\t"
"vext.8 d2, d2, d8, #1 \n\t"
"vmax.s8 d14, d14, d2 \n\t"
"vext.8 d4, d4, d10, #1 \n\t"
"vmax.s8 d13, d13, d4 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
"vmov.s8 d0, d6 \n\t"
"vmov.s8 d1, d7 \n\t"
"vmov.s8 d2, d8 \n\t"
"vmov.s8 d3, d9 \n\t"
"vmov.s8 d4, d10 \n\t"
"vmov.s8 d5, d11 \n\t"
"vst1.8 {d15}, [%[img_out]]! \n\t"
PADDLE_LABEL_LESS8
": \n\t"
"cmp %[left_w1], #0 \n\t"
"beq " PADDLE_LABEL_OVER
"f\n\t"
"vmax.s8 d15, d0, d1 \n\t"
"vld2.8 {d6, d7}, [%[row0]] \n\t" // d0=32-62, d1=33-63
"vmax.s8 d14, d2, d3 \n\t"
"vmax.s8 d13, d4, d5 \n\t"
"vld2.8 {d8, d9}, [%[row1]] \n\t"
"vext.8 d0, d0, d6, #1 \n\t"
"vmax.s8 d15, d15, d0 \n\t"
"vld2.8 {d10, d11}, [%[row2]] \n\t"
"vext.8 d2, d2, d8, #1 \n\t"
"vmax.s8 d14, d14, d2 \n\t"
"vext.8 d4, d4, d10, #1 \n\t"
"vmax.s8 d13, d13, d4 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
PADDLE_LABEL_LESS8_SAVE
": \n\t"
"vst1.8 {d15[0]}, [%[img_out]], r0\n\t"
"add %[row0], %[row0], #2 \n\t"
"add %[row1], %[row1], #2 \n\t"
"add %[row2], %[row2], #2 \n\t"
"vext.8 d15, d15, d15, #1 \n\t"
"subs %[left_w1], #1 \n\t"
"bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t"
PADDLE_LABEL_OVER ": \n\t"
: [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0),
[row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7", "d8", "d9", "d10", "d11", "d13", "d14", "d15");
#undef PADDLE_LABEL_OVER
#undef PADDLE_LABEL_LESS8_SAVE
#undef PADDLE_LABEL_LESS8
}
#endif // __aarch64__
#else
int32_t left = w_out;
while (left > 0) {
const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]);
const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]);
const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]);
*img_out = std::max(std::max(max0, max1), max2);
row0 += 2;
row1 += 2;
row2 += 2;
img_out++;
left--;
}
#endif // __ARM_NEON
}
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
}
void Pool3x3Max_int8(const vector<int> &strides, const vector<int> &paddings,
const Tensor *input, Tensor *output) {
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
// const int _kernel_size = 3;
const int stride = strides[0];
// const int stride_width = strides[1];
const int padding = paddings[0];
// const int padding_width = paddings[1];
const int8_t negative_max = -SCHAR_MAX;
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const int8_t *input_data = input->data<int8_t>();
int8_t *output_data = output->mutable_data<int8_t>();
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
const int8_t *input_seg = input_data + c * input_channel_stride;
int8_t *output_seg = output_data + c * output_channel_stride;
for (int ph = 0; ph < output_height; ph++) {
int hstart = ph * stride - padding;
int hend = min(hstart + 3, input_height);
hstart = max(hstart, 0);
for (int pw = 0; pw < output_width; pw++) {
int wstart = pw * stride - padding;
int wend = min(wstart + 3, input_width);
wstart = max(wstart, 0);
const int8_t *pos1 = input_seg + hstart * input_width + wstart;
const int8_t *pos2 = input_seg + (hstart + 1) * input_width + wstart;
const int8_t *pos3 = input_seg + (hstart + 2) * input_width + wstart;
int8_t *output_ptr = output_seg + ph * output_width + pw;
if (hend - hstart != 3 || wend - wstart != 3) {
int8_t max_value = -SCHAR_MAX;
for (int h = hstart; h < hend; h++) {
for (int w = wstart; w < wend; w++) {
int8_t value = input_seg[h * input_width + w];
if (value > max_value) {
max_value = value;
}
}
}
output_seg[ph * output_width + pw] = max_value;
} else {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.8 {d0}, [%[pos1]] \n\t"
"vld1.8 {d1}, [%[pos2]] \n\t"
"vld1.8 {d2}, [%[pos3]] \n\t"
"vmax.s8 d3, d0, d1 \n\t"
"vmax.s8 d4, d2, d3 \n\t"
"vmov.s8 d4[3], %[negative_max] \n\t"
"vpmax.s8 d5, d4, d4 \n\t"
"vpmax.s8 d6, d5, d5 \n\t"
"vst1.8 {d6[0]},[%[output_ptr]] \n\t"
:
: [pos1] "r"(pos1), [pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
: "memory", "q0", "q1", "q2", "q3");
#endif
#else
const int8_t max0 = std::max(std::max(pos1[0], pos1[1]), pos1[2]);
const int8_t max1 = std::max(std::max(pos2[0], pos2[1]), pos2[2]);
const int8_t max2 = std::max(std::max(pos3[0], pos3[1]), pos3[2]);
*output_ptr = std::max(std::max(max0, max1), max2);
#endif // __ARM_NEON
}
}
}
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -70,15 +70,15 @@ class PoolFunctor<CPU, PoolProcess, T> {
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
T ele = pool_process.initial();
auto ele = pool_process.initial();
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_process.compute(input_data[h * input_width + w], &ele);
}
}
int pool_size = (hend - hstart) * (wend - wstart);
pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[ph * output_width + pw] = ele;
pool_process.finalize(static_cast<float>(pool_size), &ele);
output_data[ph * output_width + pw] = static_cast<T>(ele);
}
}
input_data += input_stride;
......@@ -88,8 +88,10 @@ class PoolFunctor<CPU, PoolProcess, T> {
}
};
template class PoolFunctor<CPU, math::AvgPool<float>, float>;
template class PoolFunctor<CPU, math::AvgPool<float, float>, float>;
template class PoolFunctor<CPU, math::MaxPool<float>, float>;
template class PoolFunctor<CPU, math::AvgPool<int8_t, int32_t>, int8_t>;
template class PoolFunctor<CPU, math::MaxPool<int8_t>, int8_t>;
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......
......@@ -16,6 +16,8 @@ limitations under the License. */
#pragma once
#include <climits>
#include <cmath>
#include "common/log.h"
#include "framework/tensor.h"
#include "pool_2x2.h"
......@@ -37,24 +39,42 @@ namespace math {
* in the pooling window, and finally takes the average.
* MaxPoolGrad and AvgPoolGrad are gradient operations respectively.
*/
template <class T>
template <typename T>
class MaxPool {
public:
inline T initial() { return static_cast<T>(-FLT_MAX); }
inline T initial() {
if (typeid(T) == typeid(int8_t)) {
return static_cast<T>(-SCHAR_MAX);
}
return static_cast<T>(-FLT_MAX);
}
inline void compute(const T &x, T *y) { *y = *y > x ? *y : x; }
inline void finalize(const T &pool_field, T *y) {}
};
template <class T>
template <typename Itype, typename Otype>
class AvgPool {
public:
inline T initial() { return static_cast<T>(0); }
inline void compute(const T &x, T *y) { *y += x; }
inline void finalize(const T &pool_field, T *y) { *y /= pool_field; }
inline Otype initial() { return static_cast<Otype>(0); }
inline void compute(const Itype &x, Otype *y) { *y += x; }
inline void finalize(const float &pool_field, Otype *y) {
if (typeid(Itype) == typeid(int8_t)) {
float tmp = *y / pool_field;
if (tmp > SCHAR_MAX) {
*y = SCHAR_MAX;
} else if (tmp < -SCHAR_MAX) {
*y = -SCHAR_MAX;
} else {
*y = static_cast<Otype>(std::round(tmp));
}
} else {
*y /= pool_field;
}
}
};
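// A small worked example of the int8 average path above (values are
// illustrative): the accumulator is int32, the mean is rounded to nearest,
// and the result is clamped to [-SCHAR_MAX, SCHAR_MAX].
inline int8_t avg_pool_int8_example() {
  AvgPool<int8_t, int32_t> pool;
  int32_t acc = pool.initial();   // 0
  pool.compute(int8_t(3), &acc);  // acc = 3
  pool.compute(int8_t(4), &acc);  // acc = 7
  pool.finalize(2.0f, &acc);      // acc = round(7 / 2.0f) = 4
  return static_cast<int8_t>(acc);
}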
template <typename DeviceType, typename PoolProcess, typename T>
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Inspired by https://arxiv.org/abs/1509.09308 and adapted from the nnpack
// and ncnn projects.
#ifdef CONV_OP
#ifdef __aarch64__
#include "operators/math/pad.h"
#include "operators/math/winograd/winograd_transform.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <>
void winograd_transform_weight<8, 3>(const framework::Tensor &weight,
framework::Tensor *output) {
/*
* w0 = g0
* w1 = ((g0 + g2) + g1) * (-2.0 / 9)
* w2 = ((g0 + g2) - g1) * (-2.0 / 9)
* w3 = ((g0 + 4 * g2) + 2 * g1) * (1.0 / 90)
* w4 = ((g0 + 4 * g2) - 2 * g1) * (1.0 / 90)
* w5 = ((g2 + 4 * g0) + 2 * g1) * (1.0 / 180)
* w6 = ((g2 + 4 * g0) - 2 * g1) * (1.0 / 180)
* w7 = g2
*/
// TODO(hjchen2)
PADDLE_MOBILE_THROW_EXCEPTION(
"Winograd for arm v8 has not been implemented.");
}
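// A scalar transcription of the 1-D weight transform in the comment above
// (the 2-D transform applies it along rows, then columns; illustrative only):
static inline void winograd_f6k3_weight_transform_1d(const float g[3],
                                                     float w[8]) {
  w[0] = g[0];
  w[1] = ((g[0] + g[2]) + g[1]) * (-2.f / 9);
  w[2] = ((g[0] + g[2]) - g[1]) * (-2.f / 9);
  w[3] = ((g[0] + 4 * g[2]) + 2 * g[1]) * (1.f / 90);
  w[4] = ((g[0] + 4 * g[2]) - 2 * g[1]) * (1.f / 90);
  w[5] = ((g[2] + 4 * g[0]) + 2 * g[1]) * (1.f / 180);
  w[6] = ((g[2] + 4 * g[0]) - 2 * g[1]) * (1.f / 180);
  w[7] = g[2];
}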
template <>
void winograd_transform_input<8, 3>(const framework::Tensor &input,
framework::Tensor *output) {
/*
* x0 = (d0 - d6) + (d4 - d2) * 5.25
* x1 = (d2 + d6) - 4.25 * (d4 + d3) + (d1 + d5)
* x2 = (d2 + d6) - 4.25 * (d4 - d3) - (d1 + d5)
* x3 = (0.25 * d2 - 1.25 * d4 + d6) + (0.5 * d1 - 2.5 * d3 + 2 * d5)
* x4 = (0.25 * d2 - 1.25 * d4 + d6) - (0.5 * d1 - 2.5 * d3 + 2 * d5)
* x5 = (4 * d2 - 5 * d4 + d6) + (2 * d1 - 2.5 * d3 + 0.5 * d5)
* x6 = (4 * d2 - 5 * d4 + d6) - (2 * d1 - 2.5 * d3 + 0.5 * d5)
* x7 = (d7 - d1) + (d3 - d5) * 5.25
*/
// TODO(hjchen2)
PADDLE_MOBILE_THROW_EXCEPTION(
"Winograd for arm v8 has not been implemented.");
}
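// Likewise, a scalar transcription of the 1-D input transform from the
// comment above (illustrative only):
static inline void winograd_f6k3_input_transform_1d(const float d[8],
                                                    float x[8]) {
  x[0] = (d[0] - d[6]) + (d[4] - d[2]) * 5.25f;
  x[1] = (d[2] + d[6]) - 4.25f * (d[4] + d[3]) + (d[1] + d[5]);
  x[2] = (d[2] + d[6]) - 4.25f * (d[4] - d[3]) - (d[1] + d[5]);
  x[3] = (0.25f * d[2] - 1.25f * d[4] + d[6]) +
         (0.5f * d[1] - 2.5f * d[3] + 2 * d[5]);
  x[4] = (0.25f * d[2] - 1.25f * d[4] + d[6]) -
         (0.5f * d[1] - 2.5f * d[3] + 2 * d[5]);
  x[5] = (4 * d[2] - 5 * d[4] + d[6]) + (2 * d[1] - 2.5f * d[3] + 0.5f * d[5]);
  x[6] = (4 * d[2] - 5 * d[4] + d[6]) - (2 * d[1] - 2.5f * d[3] + 0.5f * d[5]);
  x[7] = (d[7] - d[1]) + (d[3] - d[5]) * 5.25f;
}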
template <>
void winograd_transform_output<8, 3>(const framework::Tensor &input,
const framework::Tensor &weight,
framework::Tensor *output) {
// TODO(hjchen2)
PADDLE_MOBILE_THROW_EXCEPTION(
"Winograd for arm v8 has not been implemented.");
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif // __aarch64__
#endif // CONV_OP
......@@ -58,6 +58,9 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(mul, ops::MulOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(mul, ops::MulOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
#endif
......
......@@ -439,7 +439,7 @@ class ConvParam : public OpParam {
#endif
private:
protected:
RType *input_;
RType *output_;
RType *filter_;
......@@ -1632,6 +1632,10 @@ class FusionFcParam : public OpParam {
x_num_col_dims_ = GetAttr<int>("x_num_col_dims", attrs);
y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
axis_ = GetAttr<int>("axis", attrs);
#ifdef FUSION_FC_INT8_OP
scale_ = InputScaleFrom<GType>(inputs, scope);
#endif
}
GType *InputX() const { return input_x_; }
......@@ -1655,8 +1659,16 @@ class FusionFcParam : public OpParam {
int x_num_col_dims_;
int y_num_col_dims_;
int axis_;
#ifdef PADDLE_MOBILE_FPGA
#ifdef FUSION_FC_INT8_OP
public:
const RType *InputScale() const { return scale_; }
private:
RType *scale_;
#endif
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SplitConvArgs fpga_conv_args;
......@@ -1707,7 +1719,19 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
FusionConvAddReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: FusionConvAddParam<DeviceType>(inputs, outputs, attrs, scope) {}
: FusionConvAddParam<DeviceType>(inputs, outputs, attrs, scope) {
#ifdef FUSION_CONVADDRELU_INT8_OP
scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
#endif
}
#ifdef FUSION_CONVADDRELU_INT8_OP
typedef typename DtypeTensorTrait<DeviceType>::gtype GType;
typedef typename DtypeTensorTrait<DeviceType>::rtype RType;
const RType *InputScale() const { return scale_; }
private:
RType *scale_;
#endif
};
#endif
......
......@@ -269,8 +269,8 @@ if (NOT FOUND_MATCH)
#gen test
ADD_EXECUTABLE(test-pool operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-pool paddle-mobile)
ADD_EXECUTABLE(test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-pool-op paddle-mobile)
#gen test
ADD_EXECUTABLE(test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h)
......@@ -324,6 +324,10 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-relu-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-relu-int8-op operators/test_fusion_conv_add_relu_int8_op.cpp test_helper.h test_include.h)
target_link_libraries(test-conv-add-relu-int8-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
......
......@@ -25,7 +25,7 @@ limitations under the License. */
#define c(i, j) c[(i)*ldc + (j)]
#define c1(i, j) c1[(i)*ldc + (j)]
void print_matirx(int m, int n, int ldc, float *c) {
void print_matrix(int m, int n, int ldc, float *c) {
for (int i = 0; i < m; ++i) {
std::cout << c(i, 0);
for (int j = 1; j < n; ++j) {
......@@ -98,18 +98,20 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matirx(m, k, lda, a);
print_matrix(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matirx(k, n, ldb, b);
print_matrix(k, n, ldb, b);
std::cout << "C:" << std::endl;
print_matirx(m, n, ldc, c);
print_matrix(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matirx(m, n, ldc, c1);
print_matrix(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
<< " eq=" << eq << " neq=" << neq << std::endl;
PADDLE_MOBILE_ENFORCE(neq == 0, "The execution of do_sgemm is failed!");
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
......
......@@ -15,7 +15,9 @@ limitations under the License. */
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <limits>
#include <random>
#include <type_traits>
#include "../test_helper.h"
#include "common/log.h"
#include "memory/t_malloc.h"
......@@ -32,26 +34,65 @@ limitations under the License. */
using std::default_random_engine;
using std::uniform_int_distribution;
void print_matirx(int m, int n, int ldc, int32_t *c) {
template <typename T>
void print_matrix(int m, int n, int ldc, T *c) {
for (int i = 0; i < m; ++i) {
std::cout << c(i, 0);
if (std::is_same<T, int8_t>::value) {
std::cout.setf(std::ios::left);
std::cout.width(4);
std::cout << static_cast<int32_t>(c(i, 0));
} else {
std::cout.setf(std::ios::left);
std::cout.width(6);
std::cout << c(i, 0);
}
for (int j = 1; j < n; ++j) {
std::cout << " | " << c(i, j);
if (std::is_same<T, int8_t>::value) {
std::cout << " | ";
std::cout.setf(std::ios::left);
std::cout.width(4);
std::cout << static_cast<int32_t>(c(i, j));
} else {
std::cout << " | ";
std::cout.setf(std::ios::left);
std::cout.width(6);
std::cout << c(i, j);
}
}
std::cout << std::endl;
std::cout << "\n";
}
std::cout << std::endl;
}
void print_matirx(int m, int n, int ldc, int8_t *c) {
for (int i = 0; i < m; ++i) {
std::cout << static_cast<int32_t>(c(i, 0));
for (int j = 1; j < n; ++j) {
std::cout << " | " << static_cast<int32_t>(c(i, j));
}
std::cout << std::endl;
}
std::cout << std::endl;
int32_t qadd_int32(int32_t l, int32_t r) {
int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
if (res > std::numeric_limits<int32_t>::max())
return std::numeric_limits<int32_t>::max();
else if (res < std::numeric_limits<int32_t>::min())
return std::numeric_limits<int32_t>::min();
else
return static_cast<int32_t>(res);
}
// round to zero
float round2zero(float v) {
float res = 0.f;  // v == 0 takes neither branch below
if (v > 0)
res = std::floor(v);
else if (v < 0)
res = std::ceil(v);
return res;
}
int8_t qscale_int32(int32_t v, float scale) {
float res = static_cast<float>(v) * scale;
res = round2zero(res);
if (res > 127)
return static_cast<int8_t>(127);
else if (res < -127)
return static_cast<int8_t>(-127);
else
return static_cast<int8_t>(res);
}
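// Worked examples for the helpers above (illustrative values, not part of the
// original test):
//   qadd_int32(INT32_MAX, 1)     -> INT32_MAX (saturates instead of wrapping)
//   round2zero(2.7f) -> 2.0f, round2zero(-2.7f) -> -2.0f (truncation toward 0)
//   qscale_int32(1000, 0.0628f)  ->  62  (62.8 rounded toward zero)
//   qscale_int32(30000, 0.0628f) -> 127  (1884 clamped to the int8 range)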
int do_sgemm(int m, int n, int k, bool relu, int pr) {
......@@ -106,30 +147,152 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matirx(m, k, lda, a);
print_matrix(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matrix(k, n, ldb, b);
std::cout << "C:" << std::endl;
print_matrix(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matrix(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
<< " eq=" << eq << " neq=" << neq << std::endl;
PADDLE_MOBILE_ENFORCE(neq == 0, "The execution of do_sgemm failed!");
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
paddle_mobile::memory::Free(c1);
return 0;
}
int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr,
bool addOnRow = false) {
int lda = k;
int ldb = n;
int ldc = n;
float scale = 0.00628f;
default_random_engine e;
uniform_int_distribution<int> pixel(-127, 127);  // int8_t is not a valid IntType for uniform_int_distribution
int8_t *a = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * m * k));
int8_t *b = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * k * n));
int8_t *c = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * m * n));
int8_t *c1 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * m * n));
int32_t *bias = nullptr;
if (addOnRow) {
bias = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * n));
} else {
bias = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * m));
}
for (int i = 0; i < m * k; ++i) {
a[i] = pixel(e);
}
for (int i = 0; i < k * n; ++i) {
b[i] = pixel(e);
}
if (addOnRow) {
for (int i = 0; i < n; ++i) {
bias[i] = static_cast<int32_t>(pixel(e));
}
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
int32_t bias_v = bias[j];
int32_t r = 0;
for (int p = 0; p < k; p++) {
r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
}
r = qadd_int32(r, bias_v);
if (relu) r = std::max(0, r);
c1(i, j) = qscale_int32(r, scale);
}
}
} else {
for (int i = 0; i < m; ++i) {
bias[i] = static_cast<int32_t>(pixel(e));
}
for (int i = 0; i < m; ++i) {
int32_t bias_v = bias[i];
for (int j = 0; j < n; ++j) {
int32_t r = 0;
for (int p = 0; p < k; p++) {
r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
}
r = qadd_int32(r, bias_v);
if (relu) r = std::max(0, r);
c1(i, j) = qscale_int32(r, scale);
}
}
}
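// Reference result per element: r = sum_p a(i, p) * b(p, j), then a saturating
// bias add (qadd_int32), an optional relu, and a round-to-zero rescale back to
// int8 (qscale_int32). addOnRow selects whether bias is indexed by the output
// column j (size n) or by the output row i (size m).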
paddle_mobile::operators::math::Gemm gemm;
#ifdef _OPENMP
gemm.Sgemm_omp(m, n, k, scale, a, lda, b, ldb, static_cast<float>(0), c, ldc,
relu, bias, addOnRow);
#else
gemm.Sgemm(m, n, k, scale, a, lda, b, ldb, static_cast<float>(0), c, ldc,
relu, bias, addOnRow);
#endif
int eq = 0;
int neq = 0;
for (int i = 0; i < m * n; ++i) {
if (c[i] == c1[i]) {
++eq;
} else {
++neq;
}
}
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matrix(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matirx(k, n, ldb, b);
print_matrix(k, n, ldb, b);
std::cout << "Bias:" << std::endl;
if (addOnRow) {
print_matrix(1, n, n, bias);
} else {
print_matrix(m, 1, 1, bias);
}
std::cout << "C:" << std::endl;
print_matirx(m, n, ldc, c);
print_matrix(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matirx(m, n, ldc, c1);
print_matrix(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
<< " eq=" << eq << " neq=" << neq << std::endl;
PADDLE_MOBILE_ENFORCE(neq == 0,
"The execution of do_sgemm_with_bias is failed!");
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
paddle_mobile::memory::Free(c1);
paddle_mobile::memory::Free(bias);
return 0;
}
int main() {
#ifdef _OPENMP
omp_set_num_threads(8);
omp_set_num_threads(4);
#endif
std::cout << "\n\n******************************************************\n\n"
<< std::endl;
std::cout << "Test gemm without bias:" << std::endl;
do_sgemm(9, 9, 9, false, 1);
do_sgemm(10, 6, 12, false, 0);
do_sgemm(512, 256, 384, false, 0);
......@@ -140,5 +303,44 @@ int main() {
do_sgemm(333, 797, 939, false, 0);
do_sgemm(1024, 1024, 1024, false, 0);
std::cout << "\n\n******************************************************\n\n"
<< std::endl;
std::cout << "Test gemm with bias(bias is added on column):" << std::endl;
do_sgemm_with_bias(9, 9, 9, false, 1);
do_sgemm_with_bias(10, 6, 12, false, 0);
do_sgemm_with_bias(512, 256, 384, false, 0);
do_sgemm_with_bias(1366, 768, 256, false, 0);
do_sgemm_with_bias(1255, 755, 333, false, 0);
do_sgemm_with_bias(599, 1133, 393, false, 0);
do_sgemm_with_bias(777, 555, 999, false, 0);
do_sgemm_with_bias(333, 797, 939, false, 0);
do_sgemm_with_bias(1024, 1024, 1024, false, 0);
std::cout << "\n\n******************************************************\n\n"
<< std::endl;
std::cout << "Test gemm with bias(bias is added on row):" << std::endl;
do_sgemm_with_bias(9, 9, 9, false, 1, true);
do_sgemm_with_bias(10, 6, 12, false, 0, true);
do_sgemm_with_bias(512, 256, 384, false, 0, true);
do_sgemm_with_bias(1366, 768, 256, false, 0, true);
do_sgemm_with_bias(1255, 755, 333, false, 0, true);
do_sgemm_with_bias(599, 1133, 393, false, 0, true);
do_sgemm_with_bias(777, 555, 999, false, 0, true);
do_sgemm_with_bias(333, 797, 939, false, 0, true);
do_sgemm_with_bias(1024, 1024, 1024, false, 0, true);
std::cout << "\n\n******************************************************\n\n"
<< std::endl;
std::cout << "Test gemm with relu and bias:" << std::endl;
do_sgemm_with_bias(9, 9, 9, true, 1);
do_sgemm_with_bias(10, 6, 12, true, 0);
do_sgemm_with_bias(512, 256, 384, true, 0);
do_sgemm_with_bias(1366, 768, 256, true, 0);
do_sgemm_with_bias(1255, 755, 333, true, 0);
do_sgemm_with_bias(599, 1133, 393, true, 0);
do_sgemm_with_bias(777, 555, 999, true, 0);
do_sgemm_with_bias(333, 797, 939, true, 0);
do_sgemm_with_bias(1024, 1024, 1024, true, 0);
return 0;
}
......@@ -28,7 +28,7 @@ limitations under the License. */
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(8);
paddle_mobile.SetThreadNum(4);
Tensor aa, bb, cc;
auto aaptr = aa.mutable_data<float>({m, k});
auto bbptr = bb.mutable_data<float>({k, n});
......@@ -44,10 +44,13 @@ int main() {
ccptr[i] = 2;
}
Tensor aa_int8, bb_int8, cc_int8;
Tensor aa_int8, bb_int8, cc_int32, cc_int8;
auto aaptr_int8 = aa_int8.mutable_data<int8_t>({m, k});
auto bbptr_int8 = bb_int8.mutable_data<int8_t>({k, n});
auto ccptr_int8 = cc_int8.mutable_data<int32_t>({m, n});
auto ccptr_int32 = cc_int32.mutable_data<int32_t>({m, n});
auto ccptr_int8 = cc_int8.mutable_data<int8_t>({m, n});
int32_t* bias_data_col = new int32_t[m];
int32_t* bias_data_row = new int32_t[n];
for (int i = 0; i < m * k; ++i) {
aaptr_int8[i] = static_cast<int8_t>(2);
......@@ -56,7 +59,15 @@ int main() {
bbptr_int8[i] = static_cast<int8_t>(2);
}
for (int i = 0; i < m * n; ++i) {
ccptr_int8[i] = static_cast<int32_t>(2);
ccptr_int32[i] = static_cast<int32_t>(2);
}
for (int i = 0; i < m; ++i) {
bias_data_col[i] = 2;
}
for (int i = 0; i < n; ++i) {
bias_data_row[i] = 2;
}
// float
......@@ -67,31 +78,87 @@ int main() {
false, nullptr);
}
auto time1 = time();
auto time_start0 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, nullptr);
}
auto time2 = time();
std::cout << "float gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
auto time_end0 = time();
std::cout << "float gemm cost :" << time_diff(time_start0, time_end0) / 10
<< "ms\n";
// int8_t
// int8_t without bias
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t>(
aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
static_cast<int8_t>(0), false, nullptr);
paddle_mobile::operators::math::matmul<float, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
static_cast<float>(0));
}
auto time_start1 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
static_cast<float>(0));
}
auto time_end1 = time();
std::cout << "int8_t gemm cost :" << time_diff(time_start1, time_end1) / 10
<< "ms\n";
auto time3 = time();
// int8_t with bias, column element wise add
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_col, false);
}
auto time_start2 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t>(
aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
static_cast<int8_t>(0), false, nullptr);
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_col, false);
}
auto time4 = time();
std::cout << "int8_t gemm cost :" << time_diff(time3, time4) / 10 << "ms\n";
auto time_end2 = time();
std::cout << "int8_t gemm_with_bias(column add) cost :"
<< time_diff(time_start2, time_end2) / 10 << "ms\n";
// int8_t with bias, row element wise add
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_row, true);
}
auto time_start3 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_row, true);
}
auto time_end3 = time();
std::cout << "int8_t gemm_with_bias(row add) cost :"
<< time_diff(time_start3, time_end3) / 10 << "ms\n";
// int8_t with bias&relu
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), true, bias_data_col, false);
}
auto time_start4 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), true, bias_data_col, false);
}
auto time_end4 = time();
std::cout << "int8_t gemm_with_bias_relu cost :"
<< time_diff(time_start4, time_end4) / 10 << "ms\n";
delete[] bias_data_row;
delete[] bias_data_col;
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#ifdef FUSION_CONVADDRELU_INT8_OP
#include <limits>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/fusion_conv_add_relu_int8_op.h"
namespace paddle_mobile {
int32_t qadd_int32(int32_t l, int32_t r) {
int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
if (res > std::numeric_limits<int32_t>::max())
return std::numeric_limits<int32_t>::max();
else if (res < std::numeric_limits<int32_t>::min())
return std::numeric_limits<int32_t>::min();
else
return static_cast<int32_t>(res);
}
// round to zero
float round2zero(float v) {
float res = 0;  // v == 0 takes neither branch; avoid returning an uninitialized value
if (v > 0)
res = std::floor(v);
else if (v < 0)
res = std::ceil(v);
return res;
}
int8_t qscale_int32(int32_t v, float scale) {
float res = static_cast<float>(v) * scale;
res = round2zero(res);
if (res > 127)
return static_cast<int8_t>(127);
else if (res < -127)
return static_cast<int8_t>(-127);
else
return static_cast<int8_t>(res);
}
// Reference convolution from Caffe for checking results.
// Accumulates through explicit loops over input, output, and filters.
template <typename T>
void conv2d(const framework::Tensor *input, const framework::Tensor *filter,
const framework::Tensor *bias, const framework::AttributeMap &attrs,
framework::Tensor *output, float scale) {
framework::AttrReader attr_reader(attrs);
std::vector<int> paddings = attr_reader.Get<std::vector<int>>("paddings");
std::vector<int> strides = attr_reader.Get<std::vector<int>>("strides");
std::vector<int> dilations = attr_reader.Get<std::vector<int>>("dilations");
int groups = attr_reader.Get<int>("groups");
int kernel_h = filter->dims()[2];
int kernel_w = filter->dims()[3];
int pad_h = paddings[0];
int pad_w = paddings[1];
int stride_h = strides[0];
int stride_w = strides[1];
int dilation_h = dilations[0];
int dilation_w = dilations[1];
auto in_shape = input->dims();
auto out_shape = output->dims();
const bool has_depth = false;
int kernel_d, pad_d, stride_d, dilation_d;
if (has_depth) {
kernel_d = kernel_h;
stride_d = stride_h;
pad_d = pad_h;
dilation_d = dilation_h;
} else {
kernel_d = stride_d = dilation_d = 1;
pad_d = 0;
}
// Groups
int o_g = out_shape[1] / groups;
int k_g = in_shape[1] / groups;
int o_head, k_head;
// Convolution
vector<int> weight_offset(4 + has_depth);
vector<int> in_offset(4 + has_depth);
vector<int> out_offset(4 + has_depth);
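// offset() linearizes a (possibly 5-D) NCHW(D) index in row-major order:
// for shape (d0, d1, d2, ...) and indices (i0, i1, i2, ...) it computes
// ((i0 * d1 + i1) * d2 + i2) * d3 + ...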
auto offset = [](const framework::Tensor *input, const vector<int> &indices) {
framework::DDim shape = input->dims();
size_t count = 0;
for (size_t i = 0; i < indices.size(); ++i) {
count *= shape[i];
count += indices[i];
}
return count;
};
const T *in_data = input->data<T>();
const T *w_data = filter->data<T>();
framework::Tensor output_32;
int32_t *out_data_32 = output_32.mutable_data<int32_t>(out_shape);
memset(out_data_32, 0, output_32.numel() * sizeof(int32_t));
for (int n = 0; n < out_shape[0]; n++) {
for (int g = 0; g < groups; g++) {
o_head = o_g * g;
k_head = k_g * g;
for (int o = 0; o < o_g; o++) {
for (int k = 0; k < k_g; k++) {
for (int z = 0; z < (has_depth ? out_shape[2] : 1); z++) {
for (int y = 0; y < out_shape[2 + has_depth]; y++) {
for (int x = 0; x < out_shape[3 + has_depth]; x++) {
for (int r = 0; r < kernel_d; r++) {
for (int p = 0; p < kernel_h; p++) {
for (int q = 0; q < kernel_w; q++) {
int in_z = z * stride_d - pad_d + r * dilation_d;
int in_y = y * stride_h - pad_h + p * dilation_h;
int in_x = x * stride_w - pad_w + q * dilation_w;
if (in_z >= 0 && in_z < (has_depth ? in_shape[2] : 1) &&
in_y >= 0 && in_y < in_shape[2 + has_depth] &&
in_x >= 0 && in_x < in_shape[3 + has_depth]) {
weight_offset[0] = o + o_head;
weight_offset[1] = k;
if (has_depth) {
weight_offset[2] = r;
}
weight_offset[2 + has_depth] = p;
weight_offset[3 + has_depth] = q;
in_offset[0] = n;
in_offset[1] = k + k_head;
if (has_depth) {
in_offset[2] = in_z;
}
in_offset[2 + has_depth] = in_y;
in_offset[3 + has_depth] = in_x;
out_offset[0] = n;
out_offset[1] = o + o_head;
if (has_depth) {
out_offset[2] = z;
}
out_offset[2 + has_depth] = y;
out_offset[3 + has_depth] = x;
out_data_32[offset(output, out_offset)] +=
in_data[offset(input, in_offset)] *
w_data[offset(filter, weight_offset)];
}
}
}
}
}
}
}
}
}
}
}
T *out_data = output->mutable_data<T>();
int32_t n = out_shape[0];
int32_t c = out_shape[1];
int32_t h = out_shape[2];
int32_t w = out_shape[3];
const int32_t *bias_data = bias->data<int32_t>();
for (int i = 0; i < n; ++i) {
for (int j = 0; j < c; ++j) {
int32_t bias_v = bias_data[j];
for (int k = 0; k < h; ++k) {
for (int l = 0; l < w; ++l) {
int32_t tmp = out_data_32[i * c * h * w + j * h * w + k * w + l];
tmp = qadd_int32(tmp, bias_v);
tmp = std::max(0, tmp);
out_data[i * c * h * w + j * h * w + k * w + l] =
qscale_int32(tmp, scale);
}
}
}
}
}
template <typename T, int Kernel, int Pad, int Stride>
int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
int kernel_h = Kernel;
int kernel_w = Kernel;
int pad_h = Pad;
int pad_w = Pad;
int stride_h = Stride;
int stride_w = Stride;
int dilation_h = 1;
int dilation_w = 1;
int batch_size = 1;
int input_c = in_channels;
int input_h = in_height;
int input_w = in_width;
int output_c = out_channels;
framework::DDim input_shape =
framework::make_ddim({batch_size, input_c, input_h, input_w});
framework::DDim filter_shape =
framework::make_ddim({output_c, input_c, kernel_h, kernel_w});
int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
int output_h = (input_h + 2 * pad_h - kernel_extent_h) / stride_h + 1;
int output_w = (input_w + 2 * pad_w - kernel_extent_w) / stride_w + 1;
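// e.g. input_h = 14, kernel = 7, pad = 3, stride = 2, dilation = 1:
// kernel_extent_h = 7 and output_h = (14 + 6 - 7) / 2 + 1 = 7.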
framework::DDim output_shape = framework::make_ddim(
std::vector<int>({batch_size, output_c, output_h, output_w}));
framework::DDim bias_shape = framework::make_ddim({output_c});
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<framework::Scope>();
inputs["Input"] = std::vector<std::string>({"input"});
inputs["Filter"] = std::vector<std::string>({"filter"});
inputs["Scale"] = std::vector<std::string>({"scale"});
inputs["Y"] = std::vector<std::string>({"bias"});
outputs["Out"] = std::vector<std::string>({"output"});
auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(input, input_shape, -127, 127);
auto filter_var = scope.get()->Var("filter");
auto filter = filter_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(filter, filter_shape, -127, 127);
auto scale_var = scope.get()->Var("scale");
auto scale = scale_var->template GetMutable<framework::LoDTensor>();
scale->Resize(framework::make_ddim({1}));
float scale_v = 0.000828f;
scale->mutable_data<float>()[0] = scale_v;
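// "Scale" is a single-element tensor carrying the requantization factor that
// maps the int32 accumulator back to the int8 output range (see conv2d above).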
auto bias_var = scope.get()->Var("bias");
auto bias = bias_var->template GetMutable<framework::LoDTensor>();
SetupTensor<int32_t>(bias, bias_shape, -127, 127);
auto output_var = scope.get()->Var("output");
framework::AttributeMap attrs;
attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
attrs["dilations"].Set<vector<int>>(
std::vector<int>({dilation_h, dilation_w}));
attrs["groups"].Set<int>(1);
attrs["axis"].Set<int>(0);
auto *op = new operators::FusionConvAddReluInt8Op<CPU, T>(
"fusion_conv_add_relu_int8", inputs, outputs, attrs, scope);
op->InferShape();
op->Init();
op->Run();
framework::Tensor output_cmp;
output_cmp.mutable_data<T>(output_shape);
conv2d<T>(input, filter, bias, attrs, &output_cmp, scale_v);
// compare results
int eq = 0;
int neq = 0;
auto output = output_var->template Get<framework::LoDTensor>();
const T *output_data = output->data<T>();
T *output_cmp_data = output_cmp.data<T>();
for (int i = 0; i < output->numel(); ++i) {
PADDLE_MOBILE_ENFORCE(
output_data[i] == output_cmp_data[i],
"The execution of test_fusion_conv_add_relu_int8_op is failed!");
if (output_data[i] == output_cmp_data[i]) {
++eq;
} else {
++neq;
}
}
std::cout << "eq = " << eq << ", neq = " << neq << std::endl;
delete op;
return 0;
}
} // namespace paddle_mobile
int main(int argc, char *argv[]) {
if (argc < 5) {
LOG(paddle_mobile::kLOG_INFO)
<< "Usage:\n"
<< " ./test-conv-add-relu-int8-op in_channels in_height in_width "
"out_channels\n"
<< " params:\n"
<< " -in_channels: int, input image's channels\n"
<< " -in_height: int, input image's height\n"
<< " -in_width: int, input image's width\n"
<< " -out_channels: int, conv output channels\n";
return 1;
}
int in_channels = atoi(argv[1]);
int in_height = atoi(argv[2]);
int in_width = atoi(argv[3]);
int out_channels = atoi(argv[4]);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8_t, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, 3, 1, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
paddle_mobile::TestConvOp<int8_t, 7, 0, 2>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 1, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
paddle_mobile::TestConvOp<int8_t, 7, 1, 2>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 3, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
paddle_mobile::TestConvOp<int8_t, 7, 3, 2>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, 7, 0, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, 7, 1, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 3, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
paddle_mobile::TestConvOp<int8_t, 7, 3, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 5, stride = 3
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
paddle_mobile::TestConvOp<int8_t, 7, 5, 3>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 3, stride = 4
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
paddle_mobile::TestConvOp<int8_t, 7, 3, 4>(in_channels, in_height, in_width,
out_channels);
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, 3, 0, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, 3, 1, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, 5, 0, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 5, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
paddle_mobile::TestConvOp<int8_t, 5, 2, 1>(in_channels, in_height, in_width,
out_channels);
}
#else
int main() {
std::cout << "FUSION_CONVADDRELU_INT8_OP is not defined!" << std::endl;
return 0;
}
#endif
......@@ -12,147 +12,163 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <framework/program/program-optimize/program_optimize.h>
#include <iostream>
#include <type_traits>
#include "../test_helper.h"
#include "../test_include.h"
#include "framework/operator.h"
#include "operators/fusion_fc_int8_op.h"
#include "operators/fusion_fc_op.h"
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
#define c(i, j) c[(i)*ldc + (j)]
namespace paddle_mobile {
namespace framework {
using framework::AttributeMap;
using framework::DDim;
using framework::Scope;
using framework::make_ddim;
int32_t qadd_int32(int32_t l, int32_t r) {
int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
if (res > std::numeric_limits<int32_t>::max())
return std::numeric_limits<int32_t>::max();
else if (res < std::numeric_limits<int32_t>::min())
return std::numeric_limits<int32_t>::min();
else
return static_cast<int32_t>(res);
}
template <typename Dtype>
class TestFcOp {
public:
explicit TestFcOp(const Program<Dtype> p) : program_(p) {
use_optimize_ = true;
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
// round to zero
float round2zero(float v) {
float res = 0;  // v == 0 takes neither branch; avoid returning an uninitialized value
if (v > 0)
res = std::floor(v);
else if (v < 0)
res = std::ceil(v);
return res;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "fc" && op->Input("X")[0] == "pool2d_13.tmp_0") {
DLOG << " fc attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input X is : " << op->Input("X")[0];
DLOG << " Input Y is : " << op->Input("Y")[0];
DLOG << " Input Y is : " << op->Input("Z")[0];
DLOG << " Output Out is : " << op->Output("Out")[0];
std::shared_ptr<operators::FusionFcOp<Dtype, float>> testOp =
std::make_shared<operators::FusionFcOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(testOp);
int8_t qscale_int32(int32_t v, float scale) {
float res = static_cast<float>(v) * scale;
res = round2zero(res);
if (res > 127)
return static_cast<int8_t>(127);
else if (res < -127)
return static_cast<int8_t>(-127);
else
return static_cast<int8_t>(res);
}
template <typename T, typename S>
int TestFcOP() {
int32_t m = 377;
int32_t n = 1363;
int32_t k = 577;
int32_t lda = k;
int32_t ldb = n;
int32_t ldc = n;
DDim inputA_shape = make_ddim({m, k});
DDim inputB_shape = make_ddim({k, n});
DDim bias_shape = make_ddim({n});
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<Scope>();
inputs["X"] = std::vector<std::string>({"inputA"});
inputs["Y"] = std::vector<std::string>({"inputB"});
inputs["Z"] = std::vector<std::string>({"bias"});
inputs["Scale"] = std::vector<std::string>({"scale"});
outputs["Out"] = std::vector<std::string>({"output"});
auto inputA_var = scope.get()->Var("inputA");
auto inputA = inputA_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(inputA, inputA_shape, -127, 127);
auto inputB_var = scope.get()->Var("inputB");
auto inputB = inputB_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(inputB, inputB_shape, -127, 127);
auto bias_var = scope.get()->Var("bias");
auto bias = bias_var->template GetMutable<framework::LoDTensor>();
SetupTensor<S>(bias, bias_shape, -127, 127);
auto scale_var = scope.get()->Var("scale");
auto scale = scale_var->template GetMutable<framework::LoDTensor>();
scale->Resize(framework::make_ddim({1}));
float scale_v = 0.000828f;
scale->mutable_data<float>()[0] = scale_v;
auto output_var = scope.get()->Var("output");
AttributeMap attrs;
attrs["x_num_col_dims"].Set<int>(1);
attrs["y_num_col_dims"].Set<int>(1);
attrs["axis"].Set<int>(1);
operators::OperatorBase<CPU> *op = nullptr;
#ifdef FUSION_FC_INT8_OP
if (std::is_same<T, int8_t>::value) {
op = new operators::FusionFcInt8Op<CPU, T>("fusion_fc_int8", inputs,
outputs, attrs, scope);
} else {
op = new operators::FusionFcOp<CPU, T>("fusion_fc", inputs, outputs, attrs,
scope);
}
#else
op = new operators::FusionFcOp<CPU, T>("fusion_fc", inputs, outputs, attrs,
scope);
#endif
op->InferShape();
op->Run();
auto output = output_var->template Get<framework::LoDTensor>();
const T *output_data = output->data<T>();
// compare
T *c = static_cast<T *>(memory::Alloc(sizeof(T) * m * n));
T *a = inputA->data<T>();
T *b = inputB->data<T>();
S *bias_data = bias->data<S>();
for (int32_t i = 0; i < m; ++i) {
for (int32_t j = 0; j < n; ++j) {
S bias_v = bias_data[j];
if (std::is_same<T, int8_t>::value) {
int32_t r = 0;
for (int32_t p = 0; p < k; p++) {
r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
}
r = qadd_int32(r, bias_v);
c(i, j) = qscale_int32(r, scale_v);
} else {
T r = 0;
for (int32_t p = 0; p < k; p++) {
r += a(i, p) * b(p, j);
}
r += bias_v;
c(i, j) = r;
}
}
}
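// For int8 inputs the reference mirrors the quantized pipeline: int32
// accumulation, a saturating bias add, then a round-to-zero rescale to int8.
// For float it is a plain GEMM with a bias vector added to every output row.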
std::shared_ptr<Tensor> predict(const Tensor &t1, const Tensor &t2,
const Tensor &t3) {
// feed
auto scope = program_.scope;
Variable *x_feed_value = scope->Var("pool2d_13.tmp_0");
auto tensor_x = x_feed_value->GetMutable<LoDTensor>();
tensor_x->ShareDataWith(t1);
Variable *y_feed_value = scope->Var("loss3_classifier-loc_weights");
auto tensor_y = y_feed_value->GetMutable<LoDTensor>();
tensor_y->ShareDataWith(t2);
Variable *z_feed_value = scope->Var("loss3_classifier-loc_biases");
auto tensor_z = z_feed_value->GetMutable<LoDTensor>();
tensor_z->ShareDataWith(t3);
Variable *con_output = scope->Var("loss3_classifier-loc.tmp_1");
auto *output_tensor = con_output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({3, 10});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<LoDTensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict(t1, t2, t3, 0);
return out_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
void predict(const Tensor &t1, const Tensor &t2, const Tensor &t3,
int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
int32_t eq = 0;
int32_t neq = 0;
for (int32_t i = 0; i < m * n; ++i) {
PADDLE_MOBILE_ENFORCE(output_data[i] == c[i],
"The execution of test_fusion_fc_op is failed!");
if (output_data[i] == c[i]) {
++eq;
} else {
++neq;
}
}
};
template class TestFcOp<CPU>;
} // namespace framework
std::cout << "mnk=" << m << " " << n << " " << k << " eq=" << eq
<< " neq=" << neq << std::endl;
delete op;
return 0;
}
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run Fc Test";
paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
// "../../../test/models/googlenet"
auto program = loader.Load(g_googlenet);
paddle_mobile::framework::ProgramOptimize optimize;
// program.originProgram->Description("origin");
auto optimize_program = optimize.FusionOptimize(program.originProgram);
program.optimizeProgram = optimize_program;
if (optimize_program != nullptr) {
optimize_program->Description("optimize");
} else {
LOG(paddle_mobile::kLOG_ERROR) << "optimize_program is null";
}
/// input x (1,3,224,224)
paddle_mobile::framework::LoDTensor inputx;
SetupTensor<float>(&inputx, {3, 64, 1, 1}, static_cast<float>(1),
static_cast<float>(1));
auto *inputx_ptr = inputx.data<float>();
/// input y (224,)
paddle_mobile::framework::LoDTensor inputy;
SetupTensor<float>(&inputy, {64, 10}, static_cast<float>(1.5),
static_cast<float>(1.5));
auto *inputy_ptr = inputy.data<float>();
paddle_mobile::framework::LoDTensor inputz;
SetupTensor<float>(&inputz, {10}, static_cast<float>(0),
static_cast<float>(1));
auto *inputz_ptr = inputz.data<float>();
paddle_mobile::framework::TestFcOp<paddle_mobile::CPU> testFcOp(program);
auto output = testFcOp.predict(inputx, inputy, inputz);
auto *output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
DLOG << "value of output: " << output_ptr[j];
}
DLOG << "1 (3,64) * 2 (64,10) = 96(3,10)";
DLOG << "output : 96(3,10) + bias(10)";
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
#ifdef FUSION_FC_INT8_OP
paddle_mobile::TestFcOP<int8_t, int32_t>();
#endif
paddle_mobile::TestFcOP<float, float>();
return 0;
}
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/mul_op.h"
......@@ -79,14 +80,14 @@ int TestMulOP() {
PADDLE_MOBILE_ENFORCE(
output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i,
static_cast<int32_t>(output_data[i]), i, static_cast<int32_t>(c[i]));
if (static_cast<int>(output_data[i] == c[i])) {
if (output_data[i] == c[i]) {
++eq;
} else {
++neq;
}
}
DLOG << "mnk=" << m << " " << n << " " << k << " eq=" << eq
<< " neq=" << neq;
std::cout << "mnk=" << m << " " << n << " " << k << " eq=" << eq
<< " neq=" << neq << std::endl;
delete op;
return 0;
}
......@@ -94,7 +95,7 @@ int TestMulOP() {
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(8);
paddle_mobile.SetThreadNum(4);
paddle_mobile::TestMulOP<int8_t, int32_t>();
paddle_mobile::TestMulOP<float, float>();
return 0;
......
......@@ -12,30 +12,281 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_include.h"
#include "operators/kernel/central-arm-func/pool_arm_func.h"
#include "operators/pool_op.h"
int main() {
paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_googlenet));
if (program.originProgram == nullptr) {
DLOG << "program read file";
namespace paddle_mobile {
static int PoolOutputSize(int input_size, int filter_size, int padding,
int stride, bool ceil_mode) {
int output_size;
if (!ceil_mode) {
output_size = (input_size - filter_size + 2 * padding) / stride + 1;
} else {
output_size =
(input_size - filter_size + 2 * padding + stride - 1) / stride + 1;
}
return output_size;
}
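// Example: input_size = 8, filter_size = 3, padding = 0, stride = 2 gives
// (8 - 3) / 2 + 1 = 3 with ceil_mode = false, and (8 - 3 + 1) / 2 + 1 = 4
// with ceil_mode = true (the extra window covers the right/bottom edge).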
template <typename T>
static void PoolAvgPad0(std::vector<int> ksize, std::vector<int> strides,
const framework::Tensor *input,
framework::Tensor *out) {
const int32_t batch_size = input->dims()[0];
const int32_t input_c = input->dims()[1];
const int32_t input_h = input->dims()[2];
const int32_t input_w = input->dims()[3];
const int32_t out_c = out->dims()[1];
const int32_t out_h = out->dims()[2];
const int32_t out_w = out->dims()[3];
const int32_t kernel_h = ksize[0];
const int32_t kernel_w = ksize[1];
const int32_t stride_h = strides[0];
const int32_t stride_w = strides[1];
const int32_t inputdata_channel_stride = input_h * input_w;
const int32_t input_batch_stride = input_c * inputdata_channel_stride;
const int32_t outputdata_channel_stride = out_h * out_w;
const int32_t output_batch_stride = out_c * outputdata_channel_stride;
T *out_data = out->mutable_data<T>();
const T *input_data = input->data<T>();
const T **rows = new const T *[kernel_h];
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < out_c; ++j) {
const T *img_in = input_data + j * inputdata_channel_stride;
T *img_out = out_data + j * outputdata_channel_stride;
for (int k = 0; k < out_h; ++k) {
for (int m = 0; m < kernel_h; ++m) {
rows[m] = img_in + (stride_h * k + m) * input_w;
}
int32_t left = out_w;
while (left > 0) {
float tmp = 0;
for (int m = 0; m < kernel_h; ++m) {
for (int l = 0; l < kernel_w; ++l) {
tmp += rows[m][l];
}
}
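// The int8 path below rounds the average to the nearest integer and clamps it
// to [-127, 127]; for other element types the quotient is stored as computed.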
if (typeid(T) == typeid(int8_t)) {
tmp = tmp / (kernel_h * kernel_w);
if (tmp < -127) {
*img_out = -127;
} else if (tmp > 127) {
*img_out = 127;
} else {
*img_out = static_cast<T>(std::round(tmp));
}
} else {
*img_out = static_cast<T>(tmp / (kernel_h * kernel_w));
}
for (int m = 0; m < kernel_h; ++m) {
rows[m] += stride_w;
}
img_out++;
left--;
}
}
}
input_data += input_batch_stride;
out_data += output_batch_stride;
}
delete[] rows;
}
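// Minimal usage sketch (illustrative only, shapes assumed): average-pool a
// 1x1x4x4 int8 tensor with a 2x2 kernel and stride 2 in NCHW layout:
//   framework::Tensor in, out;
//   in.mutable_data<int8_t>(framework::make_ddim({1, 1, 4, 4}));
//   out.mutable_data<int8_t>(framework::make_ddim({1, 1, 2, 2}));
//   PoolAvgPad0<int8_t>({2, 2}, {2, 2}, &in, &out);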
template <typename T, int CeilMode, int PoolType, int Kernel, int Pad,
int Stride>
int TestPoolOp(int in_channels, int in_height, int in_width) {
int kernel_h = Kernel;
int kernel_w = Kernel;
int pad_h = Pad;
int pad_w = Pad;
int stride_h = Stride;
int stride_w = Stride;
bool ceil_mode = CeilMode != 0;
std::string pooling_type = (PoolType == 0 ? "max" : "avg");
int batch_size = 1;
int input_c = in_channels;
int input_h = in_height;
int input_w = in_width;
framework::DDim input_shape =
framework::make_ddim({batch_size, input_c, input_h, input_w});
std::vector<int64_t> output_shape_v({batch_size, input_c});
output_shape_v.push_back(
PoolOutputSize(input_h, kernel_h, pad_h, stride_h, ceil_mode));
output_shape_v.push_back(
PoolOutputSize(input_w, kernel_w, pad_w, stride_w, ceil_mode));
framework::DDim output_shape = framework::make_ddim(output_shape_v);
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::PoolOp<paddle_mobile::CPU, float>>
executor(program, "pool2d");
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<framework::Scope>();
inputs["X"] = std::vector<std::string>({"input"});
outputs["Out"] = std::vector<std::string>({"output"});
paddle_mobile::framework::Tensor input;
SetupTensor<float>(&input, {1, 64, 112, 112}, static_cast<float>(0),
static_cast<float>(1));
auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 56, 56});
auto output =
executor.Predict(input, "conv2d_0.tmp_1", "pool2d_0.tmp_0", out_ddim);
auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(input, input_shape, -127, 127);
float *output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
DLOG << " value of output: " << output_ptr[j];
auto output_var = scope.get()->Var("output");
framework::AttributeMap attrs;
attrs["pooling_type"].SetString(pooling_type);
attrs["ksize"].Set<vector<int>>(std::vector<int>({kernel_h, kernel_w}));
attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
attrs["ceil_mode"].Set<bool>(false);
attrs["global_pooling"].Set<bool>(false);
auto *op = new operators::PoolOp<CPU, float>("pool2d", inputs, outputs, attrs,
scope);
op->InferShape();
op->Init();
op->Run();
framework::Tensor output_cmp;
output_cmp.mutable_data<T>(output_shape);
if (pooling_type == "avg" && pad_h == 0 && pad_h == pad_w) {
PoolAvgPad0<T>(std::vector<int>{kernel_h, kernel_w},
std::vector<int>{stride_h, stride_w}, input, &output_cmp);
} else {
if (typeid(T) == typeid(int8_t)) {
operators::PoolBasic<int8_t, int32_t>(
pooling_type, std::vector<int>{kernel_h, kernel_w},
std::vector<int>{stride_h, stride_w}, std::vector<int>{pad_h, pad_w},
input, &output_cmp);
} else {
operators::PoolBasic<float, float>(
pooling_type, std::vector<int>{kernel_h, kernel_w},
std::vector<int>{stride_h, stride_w}, std::vector<int>{pad_h, pad_w},
input, &output_cmp);
}
}
// compare results
int eq = 0;
int neq = 0;
auto output = output_var->template Get<framework::LoDTensor>();
const T *output_data = output->data<T>();
T *output_cmp_data = output_cmp.data<T>();
for (int i = 0; i < output->numel(); ++i) {
PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
"The execution of test_pool_op is failed!");
if (output_data[i] == output_cmp_data[i]) {
++eq;
} else {
++neq;
}
}
std::cout << "eq = " << eq << ", neq = " << neq << std::endl;
delete op;
return 0;
}
} // namespace paddle_mobile
int main(int argc, char *argv[]) {
if (argc < 4) {
LOG(paddle_mobile::kLOG_INFO)
<< "Usage:\n"
<< " ./test-pool-op in_channels in_height in_width \n"
<< " params:\n"
<< " -in_channels: int, input image's channels\n"
<< " -in_height: int, input image's height\n"
<< " -in_width: int, input image's width\n";
return 1;
}
int in_channels = atoi(argv[1]);
int in_height = atoi(argv[2]);
int in_width = atoi(argv[3]);
#if __ARM_NEON
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1";
paddle_mobile::TestPoolOp<float, 0, 0, 3, 1, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2";
paddle_mobile::TestPoolOp<float, 0, 0, 3, 0, 2>(in_channels, in_height,
in_width);
#endif
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 0, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 1, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 2, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 0, 2>(in_channels, in_height,
in_width);
// kernel = 3, pad = 1, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=2";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 1, 2>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=2";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 2, 2>(in_channels, in_height,
in_width);
// kernel = 3, pad = 3, stride = 3
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=3, stride=3";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 3, 3>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 1>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=2";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 2>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 3
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=3";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 3>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 3, 0, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 3
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=3";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 3, 0, 3>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1";
paddle_mobile::TestPoolOp<float, 0, 1, 7, 0, 1>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 4
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=4";
paddle_mobile::TestPoolOp<float, 0, 1, 7, 0, 4>(in_channels, in_height,
in_width);
// kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=1";
paddle_mobile::TestPoolOp<float, 0, 1, 5, 0, 1>(in_channels, in_height,
in_width);
}
......@@ -213,6 +213,8 @@ if(NOT FOUND_MATCH)
set(FUSION_CONVADD_OP ON)
set(FUSION_CONVADDPRELU_OP ON)
set(FUSION_CONVADDRELU_OP ON)
set(FUSION_CONVADDRELU_INT8_OP ON)
set(FUSION_FC_INT8_OP ON)
set(FUSION_FC_OP ON)
set(LRN_OP ON)
set(MUL_OP ON)
......@@ -312,6 +314,9 @@ endif()
if (FUSION_CONVADDRELU_OP)
add_definitions(-DFUSION_CONVADDRELU_OP)
endif()
if (FUSION_CONVADDRELU_INT8_OP)
add_definitions(-DFUSION_CONVADDRELU_INT8_OP)
endif()
if (FUSION_CONVADDPRELU_OP)
add_definitions(-DFUSION_CONVADDPRELU_OP)
endif()
......@@ -321,6 +326,9 @@ endif()
if (FUSION_FC_OP)
add_definitions(-DFUSION_FC_OP)
endif()
if(FUSION_FC_INT8_OP)
add_definitions(-DFUSION_FC_INT8_OP)
endif()
if (LRN_OP)
add_definitions(-DLRN_OP)
endif()
......