Commit 5e5aaeb0 authored by liuruilong

Merge remote-tracking branch 'upstream/develop' into develop

...@@ -26,6 +26,7 @@ const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu";
const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
const char *G_OP_TYPE_FC = "fusion_fc";
...@@ -79,6 +80,7 @@ std::unordered_map<
{G_OP_TYPE_BOX_CODER,
{{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}},
{G_OP_TYPE_FUSION_CONV_ADD_BN_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_BN_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}},
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
......
...@@ -90,6 +90,7 @@ extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU;
extern const char *G_OP_TYPE_FC;
extern const char *G_OP_TYPE_FUSION_CONV_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU;
extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU;
......
...@@ -12,27 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "api.h"
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <algorithm>
#include <cstring>
#include "bias_scale.h"
#include "filter.h"
#include "image.h"
#define FPGA_TEST_MODE
#ifdef FPGA_TEST_MODE
#include "common/log.h"
#endif
namespace paddle_mobile {
namespace fpga {
...@@ -164,5 +155,59 @@ int PerformBypass(const struct BypassArgs &args) {
return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
}
void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims();
int channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = image_tensor->mutable_data<float>();
size_t memory_size = channel * height * width * sizeof(float);
float *new_data = (float *)fpga_malloc(memory_size);
fpga_copy(new_data, data_ptr, memory_size);
image::format_image(&new_data, channel, height, width);
image_tensor->reset_data_ptr(new_data);
}
void format_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
int channel = dims[1], height = dims[2], width = dims[3];
size_t memory_size =
height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
ofm_tensor->reset_data_ptr(fpga_malloc(memory_size));
}
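A minimal, standalone sketch (not part of the commit) of how format_ofm sizes its buffer: each HWC row is padded so channel * width is IMAGE_ALIGNMENT-aligned, and the FPGA writes 16-bit halves, so sizeof(half) == 2 is assumed here.
#include <cassert>
#include <cstddef>
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
int main() {
  const int kImageAlignment = 16;  // IMAGE_ALIGNMENT from image.h
  int channel = 3, height = 8, width = 10;
  // channel * width = 30 values per row, padded to 32; two bytes per half.
  size_t memory_size = height * align_to_x(channel * width, kImageAlignment) * 2;
  assert(memory_size == 8 * 32 * 2);  // 512 bytes
  return 0;
}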
void format_filter(framework::Tensor *filter_tensor, int group_num) {
auto dims = filter_tensor->dims();
int num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->mutable_data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
float *new_data = (float *)fpga_malloc(memory_size);
fpga_copy(new_data, data_ptr, memory_size);
float max_value = filter::find_max(new_data, num * channel * height * width);
filter::format_filter(&new_data, num, channel, height, width, group_num,
max_value);
filter_tensor->reset_data_ptr(new_data);
}
void format_fc_matrix(framework::Tensor *filter_tensor, int group_num,
int height, int width) {
auto dims = filter_tensor->dims();
PADDLE_MOBILE_ENFORCE(dims[0] % (height * width) == 0,
"Filter dims[0] should be divisible by height * width");
int num = dims[1], channel = dims[0] / height / width;
auto data_ptr = filter_tensor->mutable_data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
float *new_data = (float *)fpga_malloc(memory_size);
fpga_copy(new_data, data_ptr, memory_size);
float max_value = filter::find_max(new_data, num * channel * height * width);
filter::format_filter(&new_data, num, channel, height, width, group_num,
max_value);
filter_tensor->reset_data_ptr(new_data);
}
void format_bias_scale_array(float **bias_scale_array,
int element_num_per_division, int num) {
bias_scale::format_bias_scale_array(bias_scale_array,
element_num_per_division, num);
}
}  // namespace fpga
}  // namespace paddle_mobile
...@@ -18,6 +18,7 @@ limitations under the License. */
#include <cstddef>
#include <iostream>
#include <limits>
#include "framework/tensor.h"
// memory management;
...@@ -175,6 +176,13 @@ int ComputeFpgaPool(const struct PoolingArgs& args);
int ComputeFpgaEWAdd(const struct EWAddArgs& args);
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
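align_to_x rounds num up to the next multiple of x with pure integer arithmetic; a few hand-checked values as a constexpr mirror (illustrative only, not part of the diff):
constexpr int align_to_x_c(int num, int x) { return (num + x - 1) / x * x; }
static_assert(align_to_x_c(50, 16) == 64, "rounds up to the next block");
static_assert(align_to_x_c(64, 16) == 64, "aligned values pass through");
static_assert(align_to_x_c(1, 8) == 8, "at least one block");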
void format_image(framework::Tensor* image_tensor);
void format_ofm(framework::Tensor* ofm_tensor); // only allocate memory
void format_filter(framework::Tensor* filter_tensor, int group_num);
void format_fc_matrix(framework::Tensor* filter_tensor, int group_num,
int height = 1, int width = 1);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
}  // namespace fpga
}  // namespace paddle_mobile
...@@ -21,6 +21,7 @@ namespace fpga {
namespace bias_scale {
void align_element(float **data_in, int num_per_div_before_alignment, int num) {
int copynum = 0;
float *ptr_unaligned = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
...@@ -33,8 +34,20 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
memset(ptr_aligned, 0, num_element * sizeof(float));
for (int i = 0; i < div_num; i++) {
if (i == div_num - 1) {
copynum = (num_per_div_after_alignment * div_num > num)
? (num % num_per_div_after_alignment)
: (num_per_div_before_alignment);
} else {
copynum = num_per_div_before_alignment;
}
memcpy(ptr_aligned + i * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i,
copynum * sizeof(float));
memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i + num,
copynum * sizeof(float));
}
fpga_free(ptr_unaligned);
...@@ -52,14 +65,22 @@ void interleave(float **data_in, int num_after_alignment) {
memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i,
4 * sizeof(float));
memcpy(ptr_interleaved + 8 * i + 4,
ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float));
}
fpga_free(ptr_uninterleaved);
*data_in = ptr_interleaved;
}
void format_bias_scale_array(float **bias_scale_array,
int element_num_per_division, int num) {
align_element(bias_scale_array, element_num_per_division, num);
int div_num = (num + element_num_per_division - 1) / element_num_per_division;
int element_num_after_division =
align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
interleave(bias_scale_array, div_num * element_num_after_division);
}
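A small standalone sketch of the bookkeeping in format_bias_scale_array; BS_NUM_ALIGNMENT is defined outside this diff, so the value 8 below is an assumption for illustration only:
#include <cassert>
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
int main() {
  const int kBsNumAlignment = 8;  // assumed value of BS_NUM_ALIGNMENT
  int element_num_per_division = 100, num = 230;
  int div_num = (num + element_num_per_division - 1) / element_num_per_division;
  int element_num_after_division =
      align_to_x(element_num_per_division, kBsNumAlignment);
  assert(div_num == 3);
  assert(element_num_after_division == 104);
  // interleave() then walks div_num * element_num_after_division entries,
  // pairing each group of 4 bias values with the matching 4 scale values.
  return 0;
}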
}  // namespace bias_scale
}  // namespace fpga
}  // namespace paddle_mobile
...@@ -22,6 +22,8 @@ namespace bias_scale {
void align_element(float** data_in, int num_per_div_before_alignment, int num);
void interleave(float** data_in, int num_after_alignment);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
}  // namespace bias_scale
}  // namespace fpga
......
...@@ -19,21 +19,190 @@ namespace paddle_mobile {
namespace fpga {
namespace filter {
int calc_division_capacity(int chw) { return 2048 / ((chw + 15) / 16) * 32; }
int calc_split_num(int num, int division_capacity) {
return (num + division_capacity - 1) / division_capacity;
}
int calc_division_number(int num, int group_num, int division_capacity) {
PADDLE_MOBILE_ENFORCE(num % group_num == 0,
"Filter number should be divisible by group number");
int split_num = calc_split_num(num, division_capacity);
PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
"Split number or group number should be 1");
return group_num * split_num;
}
int calc_num_per_div(int num, int group_num, int division_capacity) {
if (group_num == 1) {
if (num > division_capacity) {
return division_capacity;
} else {
return num;
}
} else {
return (num + group_num - 1) / group_num;
}
}
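Hand-worked numbers for the three helpers above (a sketch; the constants come straight from the code):
// For a 3x3x3 filter, chw = 27:
//   calc_division_capacity(27) = 2048 / ((27 + 15) / 16) * 32
//                              = 2048 / 2 * 32 = 32768 filters per division.
// With num = 64 filters and group_num = 1: num <= 32768, so
//   calc_num_per_div(64, 1, 32768) == 64, calc_split_num(64, 32768) == 1,
//   giving calc_division_number == 1 (no split, single group).
constexpr int kCap3x3x3 = 2048 / ((27 + 15) / 16) * 32;
static_assert(kCap3x3x3 == 32768, "3x3x3 filters fit in one division");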
void convert_to_hwc(char **data_in, int num, int channel, int height,
int width) {
char *tmp = *data_in;
int chw = channel * height * width;
char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char));
for (int n = 0; n < num; n++) {
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + n * chw + offset_height + w * channel + c) =
*((*data_in)++);
}
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
float find_max(float *data_in, int data_size) {
float max = 0.0;
for (int i = 0; i < data_size; ++i) {
float value = data_in[i];
float abs = value > 0 ? value : -value;
max = std::max(max, abs);
}
return max;
}
void quantize(float **data_in, int data_size, float max) {
float *tmp = *data_in;
float fix_range = 127;
float scale = fix_range / max;
char *tmp_data = (char *)fpga_malloc(data_size * sizeof(char));
for (int i = 0; i < data_size; i++) {
tmp_data[i] = (char)((*data_in)[i] * scale);
}
*data_in = (float *)tmp_data;
fpga_free(tmp);
}
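quantize maps floats to int8 in place with scale = 127 / max; note the C cast truncates toward zero rather than rounding. A tiny standalone check:
#include <cassert>
int main() {
  float max = 2.0f, fix_range = 127.0f;
  float scale = fix_range / max;                 // 63.5
  // The diff casts to plain char, whose signedness is platform-dependent;
  // signed char is used here to make the example portable.
  assert((signed char)(2.0f * scale) == 127);    // +max maps to +127
  assert((signed char)(-2.0f * scale) == -127);  // -max maps to -127
  assert((signed char)(0.01f * scale) == 0);     // 0.635 truncates to 0
  return 0;
}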
void align_element(char **data_in, int num, int chw) {
int i = 0;
int j = 0;
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
if (align_chw != chw) {
printf("align %d \n", align_chw);
char *tmp = *data_in;
char *data_tmp = (char *)fpga_malloc(num * align_chw * sizeof(char));
memset(data_tmp, 0, num * align_chw);
for (j = 0; j < num; j++) {
memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw);
}
*data_in = data_tmp;
fpga_free(tmp);
}
}
void align_num(char **data_in, int num_per_div_before_alignment, int num,
int chw) {
int i = 0;
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
if (num_per_div_after_alignment != num_per_div_before_alignment) {
char *tmp = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * align_chw;
char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char));
memset(data_tmp, 0, num_element * sizeof(char));
for (i = 0; i < div_num; i++) {
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
num_per_div_before_alignment * align_chw);
}
*data_in = data_tmp;
fpga_free(tmp);
}
}
void reorder(char **data_in, int num_after_alignment, int chw) {
int index = 0;
int new_index;
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * sizeof(char));
char *tmp = *data_in;
for (index = 0; index < num_after_alignment; index++) {
new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) +
(index / 16 % 2 * 4);
memcpy(data_tmp + index * chw_align, *data_in + new_index * chw_align,
chw_align);
}
*data_in = data_tmp;
fpga_free(tmp);
}
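The new_index expression permutes filter slots inside each 32-filter block, presumably to match the FPGA's fetch order; a standalone check of the first few mappings:
#include <cassert>
int reorder_index(int index) {
  return index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) +
         (index / 16 % 2 * 4);
}
int main() {
  assert(reorder_index(0) == 0);
  assert(reorder_index(3) == 3);   // first group of 4 stays put
  assert(reorder_index(4) == 8);   // second group of 4 jumps ahead
  assert(reorder_index(16) == 4);  // upper half interleaves back in
  assert(reorder_index(17) == 5);
  return 0;
}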
void interleave(char **data_in, int num_after_alignment, int chw) {
int i = 0;
int j = 0;
int k = 0;
int interleave_per_num = 16;
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * sizeof(char));
char *tmp = *data_in;
int interleave_num = chw_align * 2 / interleave_per_num;
for (i = 0; i < num_after_alignment; i += 2) {
for (j = 0, k = 0; j < interleave_num; j += 2, k++) {
memcpy(data_tmp + i * chw_align + interleave_per_num * j,
*data_in + i * chw_align + interleave_per_num * k,
interleave_per_num);
memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1),
*data_in + (i + 1) * chw_align + interleave_per_num * k,
interleave_per_num);
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void format_filter(float **data_in, int num, int channel, int height, int width,
int group_num, float max) {
int data_size = channel * height * width * num;
int chw = channel * height * width;
int division_capacity = calc_division_capacity(chw);
int num_per_div_before_alignment =
calc_num_per_div(num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment * div_num;
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in;
convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw);
align_num(quantize_data, num_per_div_before_alignment, num, chw);
reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw);
}
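For orientation: format_filter applies the helpers above in a fixed order: quantize (float to int8), convert_to_hwc, align_element (pad each filter's chw), align_num (pad the filter count per division), reorder, then interleave. The constants FILTER_ELEMENT_ALIGNMENT and FILTER_NUM_ALIGNMENT are defined outside this diff; the /32 grouping and 16-byte strides in reorder/interleave suggest 32-filter blocks and 16-byte bursts, but that is an inference from this hunk, not something it states.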
}  // namespace filter
}  // namespace fpga
......
...@@ -22,14 +22,15 @@ namespace fpga {
namespace filter {
void convert_to_hwc(char** data_in, int num, int channel, int height,
int width);
float find_max(float* data_in, int data_size);
void quantize(float** data_in, int data_size, float max);
void align_element(char** data_in, int num, int chw);
void align_num(char** data_in, int num_per_div_before_alignment, int num,
int chw);
void reorder(char** data_in, int num_after_alignment, int chw);
void interleave(char** data_in, int num_after_alignment, int chw);
void format_filter(float** data_in, int num, int channel, int height, int width,
int group_num, float max);
}  // namespace filter
}  // namespace fpga
}  // namespace paddle_mobile
...@@ -11,3 +11,57 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "image.h"
#include <memory.h>
#include "api.h"
namespace paddle_mobile {
namespace fpga {
namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float));
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + offset_height + w * channel + c) = *((*data_in)++);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void align_element_conv(float **data_in, int height, int cw) {
int i = 0;
int h = 0;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
float *tmp = *data_in;
float *data_tmp = (float *)fpga_malloc(height * align_cw * sizeof(float));
memset(data_tmp, 0, height * align_cw * sizeof(float));
for (h = 0; h < height; h++) {
memcpy((void *)(data_tmp + h * align_cw), (void *)(*data_in + h * cw),
cw * sizeof(float));
}
*data_in = data_tmp;
fpga_free(tmp);
}
}
void format_image(float **data_in, int channel, int height, int width) {
convert_to_hwc(data_in, channel, height, width);
align_element_conv(data_in, height, channel * width);
}
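A standalone spot-check of the CHW-to-HWC walk above on a 2-channel 2x2 image (same loop, plain arrays instead of fpga_malloc):
#include <cassert>
#include <cstdint>
int main() {
  int channel = 2, height = 2, width = 2;
  // CHW source: c0 = {0, 1, 2, 3}, c1 = {10, 11, 12, 13}
  float chw[8] = {0, 1, 2, 3, 10, 11, 12, 13};
  float hwc[8];
  const float *src = chw;
  int64_t amount_per_row = width * channel;
  for (int c = 0; c < channel; c++)
    for (int h = 0; h < height; h++)
      for (int w = 0; w < width; w++)
        hwc[h * amount_per_row + w * channel + c] = *src++;
  assert(hwc[0] == 0 && hwc[1] == 10);  // (h0, w0): both channels adjacent
  assert(hwc[2] == 1 && hwc[3] == 11);  // (h0, w1)
  return 0;
}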
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
...@@ -11,3 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define IMAGE_ALIGNMENT 16 // Aligned to 16
namespace paddle_mobile {
namespace fpga {
namespace image {
void convert_to_hwc(float** data_in, int channel, int height, int width);
void align_element_conv(float** data_in, int height, int cw);
void format_image(float** data_in, int channel, int height, int width);
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
...@@ -254,30 +254,6 @@ class Tensor {
"Tensor's dims_ is out of bound. ");
}
private:
/**
* @note Placeholder hides type T, so it doesn't appear as a
...@@ -313,9 +289,12 @@ class Tensor {
virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; }
#ifndef PADDLE_MOBILE_FPGA
/*! the pointer of memory block. */
std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t>> ptr_;
#else
std::shared_ptr<uint8_t> ptr_;
#endif
/*! the size of memory block. */
size_t size_;
...@@ -344,6 +323,34 @@ class Tensor {
* begins.
*/
size_t offset_;
#ifdef PADDLE_MOBILE_FPGA
public:
inline void reset_data_ptr(void *p) {
((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p);
}
struct FPGAArgs {
friend class Tensor;
inline float *scale_pointer() { return scale_; }
inline float scale() { return *scale_; }
private:
float *scale_;
};
struct FPGAArgs fpga_args() const {
FPGAArgs args;
args.scale_ = scale.get();
return args;
}
void SetFpgaScale(float s) { *(scale.get()) = s; }
private:
std::shared_ptr<float> scale = std::make_shared<float>(0);
#endif
};
#ifdef PADDLE_MOBILE_DEBUG
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNADDRELU_OP
#include "operators/fusion_conv_bn_add_relu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvBNAddReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif
#endif
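math::ConvOutputSize is not shown in this diff; a reference version of the standard formula it presumably implements, with one worked value:
// Assumed shape logic (the real helper lives in operators/math/conv_func.h):
inline int ConvOutputSizeRef(int input_size, int filter_size, int dilation,
                             int padding, int stride) {
  int dkernel = dilation * (filter_size - 1) + 1;  // effective kernel extent
  return (input_size + 2 * padding - dkernel) / stride + 1;
}
// e.g. a 224-wide input, 3x3 filter, pad 1, stride 2, dilation 1:
//   (224 + 2 - 3) / 2 + 1 = 112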
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNADDRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/conv_bn_add_relu_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionConvBNAddReluMatcher : public framework::FusionOpMatcher {
public:
FusionConvBNAddReluMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "Scale"},
{"Mean", "Mean"},
{"Bias", "Bias"},
{"Variance", "Variance"},
{"Y", "BNY"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_CONV_BN_ADD_RELU; }
std::vector<std::pair<int, std::string>> NeedCheck() {
DLOG << " conv bn add relu check add X ";
return {{2, "Y"}, {2, "X"}};
}
};
template <typename DeviceType, typename T>
class FusionConvBNAddReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionConvBNAddReluParam<DeviceType>,
operators::ConvBNAddReluKernel<DeviceType, T>> {
public:
FusionConvBNAddReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvBNAddReluParam<DeviceType>,
operators::ConvBNAddReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvBNAddReluParam<DeviceType>,
operators::ConvBNAddReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar(
new FusionConvBNAddReluMatcher());
#define FUSION_CONV_BN_ADD_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar(
new FusionConvBNAddReluMatcher());
#define FUSION_CONV_BN_ADD_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar(
new FusionConvBNAddReluMatcher());
#define FUSION_CONV_BN_ADD_RELU_REGISTER
#endif
#endif
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_bn_add_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_bn_add_relu);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNADDRELU_OP
#include "operators/kernel/conv_bn_add_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvBNAddReluKernel<CPU, float>::Init(
FusionConvBNAddReluParam<CPU> *param) {
const Tensor *mean = param->InputMean();
const Tensor *variance = param->InputVariance();
const Tensor *scale = param->InputScale();
const Tensor *bias = param->InputBias();
const float epsilon = param->Epsilon();
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
const int C = mean->numel();
float inv_std_ptr[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor *new_scale = new Tensor();
Tensor *new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({C});
auto new_bias_ptr = new_bias->mutable_data<float>({C});
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
return true;
}
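Init folds batchnorm into a per-channel affine so the conv kernel only has to apply y = new_scale * x + new_bias. The identity behind the loop above, with a numeric spot-check:
#include <cassert>
#include <cmath>
int main() {
  // batchnorm(x) = scale * (x - mean) / sqrt(var + eps) + bias
  // folds into new_scale * x + new_bias with:
  //   new_scale = scale / sqrt(var + eps)
  //   new_bias  = bias - mean * new_scale
  float x = 2.0f, scale = 0.5f, bias = 0.1f, mean = 1.0f, var = 3.0f, eps = 1.0f;
  float inv_std = 1.0f / std::sqrt(var + eps);         // 0.5
  float direct = scale * (x - mean) * inv_std + bias;  // 0.35
  float folded = (inv_std * scale) * x + (bias - mean * inv_std * scale);
  assert(std::fabs(direct - folded) < 1e-6f);
  return 0;
}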
template <>
void ConvBNAddReluKernel<CPU, float>::Compute(
const FusionConvBNAddReluParam<CPU> &param) const {
ConvBNAddReluCompute<float>(param);
}
template class ConvBNAddReluKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNADDRELU_OP
#pragma once
#include <vector>
#include "operators/math/depthwise_conv_3x3.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor new_bias = *param.NewBias();
Tensor new_scale = *param.NewScale();
Tensor *output = param.Output();
Tensor *bias1 = param.Bias();
int groups = param.Groups();
DLOG << "yangfei2";
DLOG << bias1->dims();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
Tensor bias_batch = bias1->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1), true, &new_scale,
&new_bias, g, bias_data.data<float>());
}
}
}
template <typename P>
void ConvBNAddReluCompute(const FusionConvBNAddReluParam<CPU> &param) {
Tensor Bias;
Bias.mutable_data<float>({param.Groups()});
if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
// math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
// param.Output(), param.NewScale(),
// param.NewBias(), 1);
math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else {
ConvBNAddReluBasic(param);
}
}
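The dispatch above takes the depthwise 3x3 fast paths only for exact shape matches; a sketch of the predicate, assuming NCHW inputs and [out_c, in_c/groups, kh, kw] filters:
bool UsesDepthwise3x3FastPath(int groups, int in_channels, int out_channels,
                              int kh, int kw, int stride) {
  return groups == in_channels && in_channels == out_channels &&
         kh == kw && kh == 3 && (stride == 1 || stride == 2);
}
// Anything else (grouped conv, 1x1, 5x5, larger strides, ...) falls back to
// ConvBNAddReluBasic's im2col/vol2col + gemm path.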
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVBNADDRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::DDim;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class ConvBNAddReluKernel
: public OpKernelBase<DeviceType, FusionConvBNAddReluParam<DeviceType>> {
public:
void Compute(const FusionConvBNAddReluParam<DeviceType> &param) const;
bool Init(FusionConvBNAddReluParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -716,6 +716,27 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
}
}
// Blocked matrix multiplication
void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *new_scale, float *new_bias,
float *bias) {
#pragma omp parallel for
for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) {
#if __aarch64__
// AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#else
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
// AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#endif
}
}
WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias);
}
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1) {
...@@ -1183,6 +1204,59 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
}
}
// C = A * B, batchnorm(C), C = C + bias, relu(C)
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias) {
int nc1 = nc / 4;
int _nc1 = nc % 4;
float *c_ptr, *C_ptr, *bias_ptr;
float32x4_t cv;
float32x4_t nbias;
float32x2_t scale;
float32x4_t biasv;
float32x4_t zero = vdupq_n_f32(0.0);
for (int i = 0; i < mc; ++i) {
c_ptr = c + i * NC;
C_ptr = C + i * ldc;
bias_ptr = bias + i * ldc;
nbias = vld1q_dup_f32(new_bias);
scale = vld1_dup_f32(new_scale);
new_bias++;
new_scale++;
float scale0 = vget_lane_f32(scale, 0);
for (int j = 0; j < nc1; ++j) {
cv = vld1q_f32(c_ptr);
biasv = vld1q_f32(bias_ptr);
cv = vmlaq_n_f32(nbias, cv, scale0);
cv = vaddq_f32(cv, biasv);
cv = vmaxq_f32(cv, zero);
vst1q_f32(C_ptr, cv);
c_ptr += 4;
C_ptr += 4;
bias_ptr += 4;
}
if (_nc1 != 0) {
cv = vld1q_f32(c_ptr);
biasv = vld1q_f32(bias_ptr);
cv = vmlaq_n_f32(nbias, cv, scale0);
cv = vaddq_f32(cv, biasv);
cv = vmaxq_f32(cv, zero);
if (_nc1 >= 1) {
vst1q_lane_f32(C_ptr, cv, 0);
C_ptr++;
}
if (_nc1 >= 2) {
vst1q_lane_f32(C_ptr, cv, 1);
C_ptr++;
}
if (_nc1 >= 3) {
vst1q_lane_f32(C_ptr, cv, 2);
}
}
}
}
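For readability, a scalar mirror of the NEON write-back above (c is the packed tile with row stride NC, C the output with leading dimension ldc):
void WriteWithBnAddReluRef(int mc, int nc, const float *c, float *C, int ldc,
                           const float *new_scale, const float *new_bias,
                           const float *bias, int pack_stride /* == NC */) {
  for (int i = 0; i < mc; ++i) {
    for (int j = 0; j < nc; ++j) {
      float r = new_bias[i] + c[i * pack_stride + j] * new_scale[i];  // vmlaq_n_f32
      r += bias[i * ldc + j];                                         // vaddq_f32
      C[i * ldc + j] = r > 0.0f ? r : 0.0f;                           // vmaxq_f32
    }
  }
}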
#else
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
...@@ -2081,34 +2155,32 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
if (bias1 == nullptr) {
for (int i = 0; i < mc; ++i) {
for (int j = 0; j < nc; ++j) {
float r = c[i * NC + j] + bias[i];
if (r < 0) {
r *= p[i];
}
C[i * ldc + j] = r;
}
}
} else {
for (int i = 0; i < mc; ++i) {
for (int j = 0; j < nc; ++j) {
float r = c[i * NC + j] + bias[i];
r += bias1[i * ldc + j];
if (r < 0) {
r *= p[i];
}
C[i * ldc + j] = r;
}
}
}
return;
}
int nc1 = nc / 16;
int _nc1 = nc % 16;
int nc2 = _nc1 / 4;
int nc3 = 16 - 4 * (_nc1 % 4);
int step = 4 * (ldc - nc);
int step1 = 4 * (NC - nc);
...@@ -2120,6 +2192,7 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
"loop_mc_%=: \n\t"
"mov r5, %[nc1] \n\t"
"mov r6, %[nc2] \n\t"
"vld1.32 {d0}, [%[bias]] \n\t" "vld1.32 {d0}, [%[bias]] \n\t"
"vld1.32 {d1}, [%[p]] \n\t" "vld1.32 {d1}, [%[p]] \n\t"
"vdup.32 q1, d0[0] \n\t" "vdup.32 q1, d0[0] \n\t"
...@@ -2131,20 +2204,64 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, ...@@ -2131,20 +2204,64 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
"pld [%[c], #32] \n\t" "pld [%[c], #32] \n\t"
"vld1.32 {q3, q4}, [%[c]]! \n\t" "vld1.32 {q3, q4}, [%[c]]! \n\t"
"vld1.32 {q9, q10}, [%[c]]! \n\t"
"vadd.f32 q3, q3, q1 \n\t" "vadd.f32 q3, q3, q1 \n\t"
"vadd.f32 q4, q4, q1 \n\t" "vadd.f32 q4, q4, q1 \n\t"
"vadd.f32 q9, q9, q1 \n\t"
"vadd.f32 q10, q10, q1 \n\t"
"vmax.f32 q5, q3, q14 \n\t" "vmax.f32 q5, q3, q14 \n\t"
"vmin.f32 q7, q3, q14 \n\t" "vmin.f32 q7, q3, q14 \n\t"
"vmax.f32 q6, q4, q14 \n\t" "vmax.f32 q6, q4, q14 \n\t"
"vmin.f32 q8, q4, q14 \n\t" "vmin.f32 q8, q4, q14 \n\t"
"vmax.f32 q11, q9, q14 \n\t"
"vmin.f32 q13, q9, q14 \n\t"
"vmax.f32 q12, q10, q14 \n\t"
"vmin.f32 q15, q10, q14 \n\t"
"vmla.f32 q5, q7, q2 \n\t" "vmla.f32 q5, q7, q2 \n\t"
"vmla.f32 q6, q8, q2 \n\t" "vmla.f32 q6, q8, q2 \n\t"
"vmla.f32 q11, q13, q2 \n\t"
"vmla.f32 q12, q15, q2 \n\t"
"vst1.32 {q5, q6}, [%[C]]! \n\t" "vst1.32 {q5, q6}, [%[C]]! \n\t"
"vst1.32 {q11, q12}, [%[C]]! \n\t"
"subs r5, r5, #1 \n\t" "subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t" "bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t" "end_nc1_%=: \n\t"
"subs r6, r6, #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"vld1.32 {q3}, [%[c]]! \n\t"
"vadd.f32 q3, q3, q1 \n\t"
"vmax.f32 q5, q3, q14 \n\t"
"vmin.f32 q7, q3, q14 \n\t"
"vmla.f32 q5, q7, q2 \n\t"
"vst1.32 {q5}, [%[C]]! \n\t"
"subs r6, r6, #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
"cmp %[nc3], #16 \n\t"
"beq end_nc3_%= \n\t"
"sub %[c], %[c], %[nc3] \n\t"
"sub %[C], %[C], %[nc3] \n\t"
"vld1.32 {q4}, [%[c]]! \n\t"
"vadd.f32 q4, q4, q1 \n\t"
"vmax.f32 q6, q4, q14 \n\t"
"vmin.f32 q8, q4, q14 \n\t"
"vmla.f32 q6, q8, q2 \n\t"
"vst1.32 {q6}, [%[C]]! \n\t"
"end_nc3_%=: \n\t"
"add %[p], %[p], #4 \n\t" "add %[p], %[p], #4 \n\t"
"add %[bias], %[bias], #4 \n\t" "add %[bias], %[bias], #4 \n\t"
"add %[c], %[c], %[step1] \n\t" "add %[c], %[c], %[step1] \n\t"
...@@ -2155,10 +2272,11 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
"end_mc_%=: \n\t"
:
: [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2),
[nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), [p] "r"(p),
[bias] "r"(bias), [bias1] "r"(bias1)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8");
} else {
asm volatile(
"vmov.f32 q14, #0.0 \n\t" "vmov.f32 q14, #0.0 \n\t"
...@@ -2167,6 +2285,7 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, ...@@ -2167,6 +2285,7 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
"loop_mc_%=: \n\t" "loop_mc_%=: \n\t"
"mov r5, %[nc1] \n\t" "mov r5, %[nc1] \n\t"
"mov r6, %[nc2] \n\t"
"vld1.32 {d0}, [%[bias]] \n\t" "vld1.32 {d0}, [%[bias]] \n\t"
"vld1.32 {d1}, [%[p]] \n\t" "vld1.32 {d1}, [%[p]] \n\t"
"vdup.32 q1, d0[0] \n\t" "vdup.32 q1, d0[0] \n\t"
...@@ -2192,25 +2311,74 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, ...@@ -2192,25 +2311,74 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
"vmla.f32 q6, q8, q2 \n\t" "vmla.f32 q6, q8, q2 \n\t"
"vst1.32 {q5, q6}, [%[C]]! \n\t" "vst1.32 {q5, q6}, [%[C]]! \n\t"
"vld1.32 {q3, q4}, [%[c]]! \n\t"
"vld1.32 {q9, q10}, [%[bias1]]! \n\t"
"vadd.f32 q3, q3, q1 \n\t"
"vadd.f32 q4, q4, q1 \n\t"
"vadd.f32 q3, q3, q9 \n\t"
"vadd.f32 q4, q4, q10 \n\t"
"vmax.f32 q5, q3, q14 \n\t"
"vmin.f32 q7, q3, q14 \n\t"
"vmax.f32 q6, q4, q14 \n\t"
"vmin.f32 q8, q4, q14 \n\t"
"vmla.f32 q5, q7, q2 \n\t"
"vmla.f32 q6, q8, q2 \n\t"
"vst1.32 {q5, q6}, [%[C]]! \n\t"
"subs r5, r5, #1 \n\t" "subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t" "bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t" "end_nc1_%=: \n\t"
"subs r6, r6, #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"vld1.32 {q3}, [%[c]]! \n\t"
"vld1.32 {q9}, [%[bias1]]! \n\t"
"vadd.f32 q3, q3, q1 \n\t"
"vadd.f32 q3, q3, q9 \n\t"
"vmax.f32 q5, q3, q14 \n\t"
"vmin.f32 q7, q3, q14 \n\t"
"vmla.f32 q5, q7, q2 \n\t"
"vst1.32 {q5}, [%[C]]! \n\t"
"subs r6, r6, #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
"cmp %[nc3], #16 \n\t"
"beq end_nc3_%= \n\t"
"sub %[c], %[c], %[nc3] \n\t"
"sub %[C], %[C], %[nc3] \n\t"
"sub %[bias1], %[bias1], %[nc3] \n\t"
"vld1.32 {q4}, [%[c]]! \n\t"
"vld1.32 {q10}, [%[bias1]]! \n\t"
"vadd.f32 q4, q4, q1 \n\t"
"vadd.f32 q4, q4, q10 \n\t"
"vmax.f32 q6, q4, q14 \n\t"
"vmin.f32 q8, q4, q14 \n\t"
"vmla.f32 q6, q8, q2 \n\t"
"vst1.32 {q6}, [%[C]]! \n\t"
"end_nc3_%=: \n\t"
"add %[p], %[p], #4 \n\t" "add %[p], %[p], #4 \n\t"
"add %[bias], %[bias], #4 \n\t" "add %[bias], %[bias], #4 \n\t"
"add %[c], %[c], %[step1] \n\t" "add %[c], %[c], %[step1] \n\t"
"add %[C], %[C], %[step] \n\t" "add %[C], %[C], %[step] \n\t"
"add %[bias1], %[bias1], %[step] \n\t"
"subs %[mc], %[mc], #1 \n\t" "subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t" "bge loop_mc_%= \n\t"
"end_mc_%=: \n\t" "end_mc_%=: \n\t"
: :
: [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2),
[step] "r"(step), [step1] "r"(step1), [p] "r"(p), [bias] "r"(bias), [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), [p] "r"(p),
[bias1] "r"(bias1) [bias] "r"(bias), [bias1] "r"(bias1)
: "memory", "r5", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q9", "q10"); "q8", "q9", "q10");
} }
} }
...@@ -2426,6 +2594,59 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
"q8", "q10", "q11", "q12", "q13", "q14");
}
// C = A * B, batchnorm(C), C = C + bias, relu(C)
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias) {
int nc1 = nc / 4;
int _nc1 = nc % 4;
float *c_ptr, *C_ptr, *bias_ptr;
float32x4_t cv;
float32x4_t nbias;
float32x2_t scale;
float32x4_t biasv;
float32x4_t zero = vdupq_n_f32(0.0);
for (int i = 0; i < mc; ++i) {
c_ptr = c + i * NC;
C_ptr = C + i * ldc;
bias_ptr = bias + i * ldc;
nbias = vld1q_dup_f32(new_bias);
scale = vld1_dup_f32(new_scale);
new_bias++;
new_scale++;
float scale0 = vget_lane_f32(scale, 0);
for (int j = 0; j < nc1; ++j) {
cv = vld1q_f32(c_ptr);
biasv = vld1q_f32(bias_ptr);
cv = vmlaq_n_f32(nbias, cv, scale0);
cv = vaddq_f32(cv, biasv);
cv = vmaxq_f32(cv, zero);
vst1q_f32(C_ptr, cv);
c_ptr += 4;
C_ptr += 4;
bias_ptr += 4;
}
if (_nc1 != 0) {
cv = vld1q_f32(c_ptr);
biasv = vld1q_f32(bias_ptr);
cv = vmlaq_n_f32(nbias, cv, scale0);
cv = vaddq_f32(cv, biasv);
cv = vmaxq_f32(cv, zero);
if (_nc1 >= 1) {
vst1q_lane_f32(C_ptr, cv, 0);
C_ptr++;
}
if (_nc1 >= 2) {
vst1q_lane_f32(C_ptr, cv, 1);
C_ptr++;
}
if (_nc1 >= 3) {
vst1q_lane_f32(C_ptr, cv, 2);
}
}
}
}
/*
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc) {
...@@ -2835,7 +3056,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias, float *bias) {
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int L1 = 32 * 1024;
...@@ -2882,8 +3103,14 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
#else
PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#endif
if (bias == nullptr) {
InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
&C(i, j), ldc, relu, new_scale + i, new_bias + i);
} else {
InnerKernelWithBnAdd(mc, nc, alpha, packedA, packedB, beta, packedC,
&C(i, j), ldc, relu, new_scale + i, new_bias + i,
bias + i * ldc + j);
}
}
}
...@@ -3071,7 +3298,8 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias,
float *bias) {
#ifdef _OPENMP
int max_threads = omp_get_max_threads();
#else
...@@ -3148,8 +3376,14 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
float *local_A = packedA + MC * KC * local_threads;
float *local_C = packedC + MC * NC * local_threads;
procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
if (bias == nullptr) {
InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C,
&C(i, 0), ldc, relu, new_scale + i, new_bias + i);
} else {
InnerKernelWithBnAdd(mc, n, alpha, local_A, packedB, beta, local_C,
&C(i, 0), ldc, relu, new_scale + i, new_bias + i,
bias + i * ldc);
}
}
} else {
#pragma omp parallel for
...@@ -3165,8 +3399,14 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
float *local_B = packedB + KC * NC * local_threads;
float *local_C = packedC + MC * NC * local_threads;
procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
if (bias == nullptr) {
InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C,
&C(0, j), ldc, relu, new_scale, new_bias);
} else {
InnerKernelWithBnAdd(m, nc, alpha, packedA, local_B, beta, local_C,
&C(0, j), ldc, relu, new_scale, new_bias,
bias + j);
}
}
}
...@@ -3185,7 +3425,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
int max_threads = 1;
#endif
int L1 = 8 * 1024;
KC = k;
if (m > n) {
// Block matrix A
......
...@@ -81,6 +81,10 @@ void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *new_scale, float *new_bias,
float *bias);
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
...@@ -125,7 +129,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
// C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias);
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1);
/*
// Write back the result of vector-matrix multiplication
// C = A * B
...@@ -152,8 +157,7 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
// 32-bit float matrix multiplication with batchnorm applied to the result
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias, float *bias);
void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
...@@ -166,7 +170,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
// 32-bit float matrix multiplication with batchnorm (OpenMP multi-threaded version)
void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias, float *bias);
void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p,
......
...@@ -56,7 +56,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -56,7 +56,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out, float beta, float alpha, framework::Tensor *matrix_out, float beta,
bool relu, framework::Tensor *new_scale, bool relu, framework::Tensor *new_scale,
framework::Tensor *new_bias, int group) { framework::Tensor *new_bias, int group, float *bias) {
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
...@@ -79,12 +79,12 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -79,12 +79,12 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
SgemmWithBn_omp(M, N, K, alpha, matrix_a.data<float>(), K, SgemmWithBn_omp(M, N, K, alpha, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N, matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
relu, new_scale->data<float>() + group, relu, new_scale->data<float>() + group,
new_bias->data<float>() + group); new_bias->data<float>() + group, bias);
#else #else
SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
N, beta, matrix_out->data<float>(), N, relu, N, beta, matrix_out->data<float>(), N, relu,
new_scale->data<float>() + group, new_scale->data<float>() + group, new_bias->data<float>() + group,
new_bias->data<float>() + group); bias);
#endif #endif
} }
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
......
...@@ -32,7 +32,7 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, ...@@ -32,7 +32,7 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu, framework::Tensor *matrix_out, T beta, bool relu,
framework::Tensor *new_scale, framework::Tensor *new_bias, framework::Tensor *new_scale, framework::Tensor *new_bias,
int group); int group, float *bias = nullptr);
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
......
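The header gives the new trailing parameter a default of nullptr, so every existing matmulWithBn call site keeps compiling unchanged and only the fused conv_bn_add_relu path passes a real pointer. A self-contained demo of this compatibility pattern (matmulWithBnDemo is a stand-in, not the framework function):

#include <cstdio>

void matmulWithBnDemo(int m, int n, float *bias = nullptr) {
  std::printf("m=%d n=%d bias %s\n", m, n, bias ? "fused" : "absent");
}

int main() {
  matmulWithBnDemo(2, 3);             // legacy BN-only call site
  float residual = 1.0f;
  matmulWithBnDemo(2, 3, &residual);  // new conv_bn_add_relu path
  return 0;
}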
...@@ -1472,6 +1472,119 @@ class FusionConvAddBNReluParam : public OpParam { ...@@ -1472,6 +1472,119 @@ class FusionConvAddBNReluParam : public OpParam {
}; };
#endif #endif
#ifdef FUSION_CONVBNADDRELU_OP
template <typename Dtype>
class FusionConvBNAddReluParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionConvBNAddReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
bias_ = InputYFrom<GType>(inputs, scope);
axis_ = GetAttr<int>("axis", attrs);
filter_ = FilterFrom<GType>(inputs, scope);
input_ = InputFrom<GType>(inputs, scope);
output_ = OutFrom<GType>(outputs, scope);
strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs);
dilations_ = GetAttr<vector<int>>("dilations", attrs);
groups = GetAttr<int>("groups", attrs);
input_bias_ = InputBiasFrom<GType>(inputs, scope);
input_mean_ = InputMeanFrom<GType>(inputs, scope);
input_scale_ = InputScaleFrom<GType>(inputs, scope);
input_variance_ = InputVarianceFrom<GType>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
keyBNY_ = getkey("BNY", inputs, 0);
keyX_ = getkey("X", inputs, 0);
keyY_ = getkey("Y", inputs, 0);
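// The fused elementwise add has two inputs, X and Y; one of them is the
// batch-norm output (BNY). Whichever key does not match BNY is the residual
// input, so take that one as bias_.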
if (keyX_ == keyBNY_) {
bias_ = InputYFrom<GType>(inputs, scope);
} else if (keyY_ == keyBNY_) {
bias_ = InputXFrom<GType>(inputs, scope);
}
// is_test_ = GetAttr<bool>("is_test", attrs);
}
RType *Bias() const { return bias_; }
const int &Axis() const { return axis_; }
const RType *Input() const { return input_; }
#ifdef PADDLE_MOBILE_FPGA
RType *Filter() const { return filter_; }
#else
const RType *Filter() const { return filter_; }
#endif
RType *Output() const { return output_; }
const vector<int> &Strides() const { return strides_; }
const vector<int> &Paddings() const { return paddings_; }
const vector<int> &Dilations() const { return dilations_; }
const int &Groups() const { return groups; }
const RType *InputBias() const { return input_bias_; }
const RType *InputMean() const { return input_mean_; }
const RType *InputScale() const { return input_scale_; }
const RType *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
void SetNewScale(RType *new_scale) { new_scale_ = new_scale; }
void SetNewBias(RType *new_bias) { new_bias_ = new_bias; }
const RType *NewScale() const { return new_scale_; }
const RType *NewBias() const { return new_bias_; }
protected:
RType *bias_;
int axis_;
RType *input_;
RType *output_;
RType *filter_;
vector<int> strides_;
vector<int> paddings_;
vector<int> dilations_;
int groups;
RType *input_bias_;
RType *input_mean_;
RType *input_scale_;
RType *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
RType *new_bias_;
RType *new_scale_;
std::string keyBNY_;
std::string keyX_;
std::string keyY_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::ConvArgs fpga_conv_args;
public:
const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
#ifdef FUSION_CONVBN_OP #ifdef FUSION_CONVBN_OP
template <typename Dtype> template <typename Dtype>
class FusionConvBNParam : public OpParam { class FusionConvBNParam : public OpParam {
......
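End to end, the new fused op computes out = relu(batchnorm(conv(input)) + residual). A self-contained scalar sketch of the post-convolution math for one output channel, assuming the kernel folds the BN parameters the same way the other conv_bn fusions do (new_scale = scale / sqrt(variance + epsilon), new_bias = bias - mean * new_scale):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// out[i] = relu(conv_out[i] * new_scale + new_bias + residual[i])
std::vector<float> FusedBnAddRelu(const std::vector<float> &conv_out,
                                  const std::vector<float> &residual,
                                  float scale, float bias, float mean,
                                  float variance, float epsilon) {
  const float new_scale = scale / std::sqrt(variance + epsilon);
  const float new_bias = bias - mean * new_scale;
  std::vector<float> out(conv_out.size());
  for (std::size_t i = 0; i < conv_out.size(); ++i) {
    out[i] = std::max(conv_out[i] * new_scale + new_bias + residual[i], 0.0f);
  }
  return out;
}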
...@@ -35,6 +35,9 @@ elseif("FPGAnets" IN_LIST NET) ...@@ -35,6 +35,9 @@ elseif("FPGAnets" IN_LIST NET)
ADD_EXECUTABLE(test-fpga-concat-op fpga/test_concat_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-fpga-concat-op fpga/test_concat_op.cpp test_helper.h test_include.h)
target_link_libraries(test-fpga-concat-op paddle-mobile) target_link_libraries(test-fpga-concat-op paddle-mobile)
ADD_EXECUTABLE(test-format-data fpga/test_format_data.cpp test_helper.h test_include.h)
target_link_libraries(test-format-data paddle-mobile)
elseif("mobilenetssd" IN_LIST NET) elseif("mobilenetssd" IN_LIST NET)
# gen test # gen test
ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h)
......
...@@ -83,8 +83,8 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) { ...@@ -83,8 +83,8 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
} }
} }
paddle_mobile::operators::math::SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, paddle_mobile::operators::math::SgemmWithBn(
c, ldc, relu, scale, bias); m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, nullptr);
int eq = 0; int eq = 0;
int neq = 0; int neq = 0;
for (int i = 0; i < m * n; ++i) { for (int i = 0; i < m * n; ++i) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
#include "fpga/api.h"
namespace frame = paddle_mobile::framework;
namespace fpga = paddle_mobile::fpga;
using std::cout;
using std::endl;
int main() {
std::vector<int> dims{1, 1, 3, 3};
std::vector<float> elements{1, 2, 3, 4, 5, 6, 7, 8, 9};
frame::DDim ddim = frame::make_ddim(dims);
frame::Tensor image(elements, ddim);
int num = image.numel();
float *data_ptr = image.mutable_data<float>();
for (int i = 0; i < num; i++) {
cout << data_ptr[i] << " ";
}
cout << endl;
fpga::format_image(&image);
data_ptr = image.mutable_data<float>();
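// 48 here is presumably height * align_to_x(width * channel, IMAGE_ALIGNMENT)
// = 3 * 16, assuming the 16-float image alignment used by the FPGA format;
// check the alignment constant in the fpga image code before relying on it.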
for (int i = 0; i < 48; i++) {
cout << data_ptr[i] << " ";
}
cout << endl;
auto dd = image.dims();
cout << dd[0] << " " << dd[1] << " " << dd[2] << " " << dd[3] << endl;
return 0;
}
...@@ -87,6 +87,7 @@ if ("resnet" IN_LIST NET) ...@@ -87,6 +87,7 @@ if ("resnet" IN_LIST NET)
set(ELEMENTWISEADD_OP ON) set(ELEMENTWISEADD_OP ON)
set(POOL_OP ON) set(POOL_OP ON)
set(BATCHNORM_OP ON) set(BATCHNORM_OP ON)
set(FUSION_CONVBNADDRELU_OP ON)
set(MUL_OP ON) set(MUL_OP ON)
set(RESHAPE_OP ON) set(RESHAPE_OP ON)
set(SOFTMAX_OP ON) set(SOFTMAX_OP ON)
...@@ -141,6 +142,7 @@ if(NOT FOUND_MATCH) ...@@ -141,6 +142,7 @@ if(NOT FOUND_MATCH)
set(FUSION_CONVADDADDPRELU_OP ON) set(FUSION_CONVADDADDPRELU_OP ON)
set(FUSION_DWCONVBNRELU_OP ON) set(FUSION_DWCONVBNRELU_OP ON)
set(FUSION_CONVBNRELU_OP ON) set(FUSION_CONVBNRELU_OP ON)
set(FUSION_CONVBNADDRELU_OP ON)
set(PRELU_OP ON) set(PRELU_OP ON)
set(RESIZE_OP ON) set(RESIZE_OP ON)
set(SCALE_OP ON) set(SCALE_OP ON)
...@@ -244,6 +246,10 @@ if (FUSION_CONVBNRELU_OP) ...@@ -244,6 +246,10 @@ if (FUSION_CONVBNRELU_OP)
add_definitions(-DFUSION_CONVBNRELU_OP) add_definitions(-DFUSION_CONVBNRELU_OP)
endif() endif()
if (FUSION_CONVBNADDRELU_OP)
add_definitions(-DFUSION_CONVBNADDRELU_OP)
endif()
if (PRELU_OP) if (PRELU_OP)
add_definitions(-DPRELU_OP) add_definitions(-DPRELU_OP)
endif() endif()
......