Commit 64a0feac authored by chonwhite

merged from 1.4.0

Parent 9c15846a
File mode changed from 100644 to 100755
......@@ -84,6 +84,14 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
}
}
int calc_pack_num(int num_per_group, int group, int division_capacity) {
auto n = 1;
while ((num_per_group * (group + n - 1) / n) > division_capacity) {
n++;
}
return (n);
}
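// A worked trace of calc_pack_num (values illustrative, not from this
// commit): num_per_group = 48, group = 4, division_capacity = 100.
//   n = 1: 48 * (4 + 1 - 1) / 1 = 192 > 100 -> n = 2
//   n = 2: 48 * (4 + 2 - 1) / 2 = 120 > 100 -> n = 3
//   n = 3: 48 * (4 + 3 - 1) / 3 =  96 <= 100 -> returns 3
// i.e. the groups must be spread over 3 packs to fit one division.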
void convert_to_hwc(int8_t* chw_data,
int8_t* hwc_data,
int num,
......@@ -221,7 +229,6 @@ int8_t* format_filter(float* data_in,
align_to_x(num_per_div_before_alignment, filter_num_alignment);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
// int num_after_alignment = num_per_div_after_alignment * div_num;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
......@@ -232,10 +239,9 @@ int8_t* format_filter(float* data_in,
for (int n = 0; n < num; n++) {
float* filter_start = data_in + n * chw;
float f_max = find_max(filter_start, chw);
int8_t* quantized_start = quantized_data + n * chw;
quantize(filter_start, quantized_start, chw, max);
filter_max.push_back(max);
filter_max.push_back(1);
}
int8_t* hwc_data =
......@@ -257,6 +263,7 @@ int8_t* format_filter(float* data_in,
int filter_num_alignment = get_filter_num_alignment();
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, filter_num_alignment);
int num_element = div_num * num_per_div_after_alignment * chw_aligned;
int8_t* num_aligned_data =
reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
......
......@@ -31,6 +31,7 @@ int calc_division_capacity(int chw);
int calc_split_num(int num, int division_capacity);
int calc_division_number(int num, int group_num, int division_capacity);
int calc_num_per_div(int num, int group_num, int division_capacity);
int calc_pack_num(int num_per_group, int group, int division_capacity);
float find_max(float* data_in, int data_size);
int8_t* format_filter(float* data_in,
......@@ -40,11 +41,13 @@ int8_t* format_filter(float* data_in,
int height,
int width,
int group_num,
float max, // NOLINT
float max,
std::vector<float>& filter_max); // NOLINT
void convert_to_hwn(int16_t** data_in, int num, int height, int width);
size_t align_element_n(int16_t** data_in, int num, int height, int width);
// void quantize_to_fp16(float** data_in, int num, int height, int width,
// float* scale_ptr);
size_t format_dwconv_filter(
float** data_in, int num, int height, int width, float* scale_ptr);
......
......@@ -28,7 +28,7 @@ limitations under the License. */
namespace paddle {
namespace zynqmp {
#define PADDLE_OS_LINUX
#define PADDLE_MOBILE_OS_LINUX
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
......@@ -38,7 +38,7 @@ static size_t memory_size_max = 0;
static size_t memory_size = 0;
static inline int do_ioctl(uint64_t req, const void *arg) {
#ifdef PADDLE_OS_LINUX
#ifdef PADDLE_MOBILE_OS_LINUX
return ioctl(fd, req, arg);
#else
return -1;
......@@ -61,18 +61,32 @@ void reset_device() {
// memory management;
void *fpga_malloc(size_t size) {
#ifdef ENABLE_DEBUG
std::cout << "fpga_malloc:" << size << std::endl;
#endif
#ifdef PADDLE_OS_LINUX
#ifdef PADDLE_MOBILE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
if (ptr == NULL) {
if (ptr == MAP_FAILED) {
std::cout << "not enough memory !";
exit(-1);
}
if (errno == ENOMEM) {
std::cout << "mmap failed with not enough memory !";
exit(-1);
}
if (errno == EINVAL) {
std::cout << "mmap failed with invalid arguments ! (size=" << size << ")"
<< std::endl;
exit(-1);
}
if (ptr == NULL) {
std::cout << "NULL returned, errno=" << errno
<< ", mmap failed with other errors other than memory usage !"
<< std::endl;
exit(-1);
}
memory_map.insert(std::make_pair(ptr, size));
memory_size += size;
if (memory_size > memory_size_max) {
memory_size_max = memory_size;
}
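// A minimal alternative sketch of the checks above (checked_mmap is
// hypothetical, assuming the POSIX headers this file already uses): errno
// is only meaningful after mmap64 actually fails, so it is captured inside
// the MAP_FAILED branch.
// void *checked_mmap(size_t size, int fd) {
//   void *ptr = mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
//   if (ptr == MAP_FAILED) {
//     int err = errno;  // save before any later call can clobber it
//     std::cout << "mmap failed, errno=" << err
//               << (err == ENOMEM ? " (not enough memory)" : "")
//               << (err == EINVAL ? " (invalid arguments)" : "")
//               << ", size=" << size << std::endl;
//     exit(-1);
//   }
//   return ptr;
// }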
......@@ -88,7 +102,7 @@ size_t fpga_get_memory_size_max() { return memory_size_max; }
size_t fpga_diagnose_memory(int detailed) {
size_t total = 0;
auto iter = memory_map.begin(); // std::map<void *, size_t>::iterator
auto iter = memory_map.begin();
while (iter != memory_map.end()) {
total += iter->second;
iter++;
......@@ -98,13 +112,15 @@ size_t fpga_diagnose_memory(int detailed) {
void fpga_free(void *ptr) {
size_t size = 0;
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
auto iter = memory_map.find(ptr);
if (iter != memory_map.end()) {
size = iter->second;
memory_map.erase(iter);
}
memory_size -= size;
#ifdef PADDLE_OS_LINUX
#ifdef PADDLE_MOBILE_OS_LINUX
munmap(ptr, size);
#else
free(ptr);
......@@ -231,6 +247,7 @@ int perform_bypass(const struct BypassArgs &args) {
ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
scale = std::max(scale, scales[0]);
}
args.output.scale_address[0] = scale;
args.output.scale_address[1] = 1.0f / scale;
return ret;
......@@ -239,6 +256,26 @@ int perform_bypass(const struct BypassArgs &args) {
int compute_fpga_concat(const struct ConcatArgs &args) { return -1; }
int compute_fpga_scale(const struct ScaleArgs &args) {
#ifdef ENABLE_DEBUG
std::cout << "======Compute Scale======";
std::cout << "scale_address:" << args.scale_address << std::endl;
std::cout << "bias_address:" << args.bias_address << std::endl;
std::cout << "wc_alignment:" << args.wc_alignment << std::endl;
std::cout << "channel_alignment:" << args.channel_alignment << std::endl;
std::cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_SCALE, &args);
}
......
......@@ -28,7 +28,6 @@ namespace zynqmp {
typedef int16_t half;
#define IMAGE_ALIGNMENT 16 // Aligned to 16
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT 8
#define BIAS_NUM_ALIGNMENT 16
......@@ -44,14 +43,11 @@ enum DLayoutType {
};
enum ActiveType {
TYPE_RELU = 0,
TYPE_RELU6 = 1,
TYPE_LEAK_RELU = 2,
TYPE_SIGMOID = 3,
};
struct VersionArgs {
void* buffer;
TYPE_NONE = 0,
TYPE_RELU = 1,
TYPE_RELU6 = 2,
TYPE_LEAKY_RELU = 3,
TYPE_SIGMOID = 4,
};
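// Note on the renumbering: TYPE_NONE = 0 is inserted so a zero-initialized
// ActiveParam means "no activation"; every other value shifts by one and
// TYPE_LEAK_RELU becomes TYPE_LEAKY_RELU, which is why the PE call sites
// later in this commit switch on the new names.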
struct DeviceInfo {
......@@ -67,6 +63,11 @@ struct DeviceInfo {
uint32_t reserved6;
};
struct VersionArgs {
void* buffer;
size_t size;
};
struct MemoryCopyArgs {
void* src;
void* dest;
......@@ -78,7 +79,9 @@ struct MemoryCacheArgs {
size_t size;
};
struct MemoryBarrierArgs {};
struct MemoryBarrierArgs {
uint16_t dummy;
};
struct BNArgs {
bool enabled;
......
......@@ -30,8 +30,13 @@ struct ReLUParam {
float leaky_relu_factor = 0.0f;
};
struct ActiveParam {
enum ActiveType type = TYPE_NONE;
float leaky_relu_factor;
};
struct PEParam {
ReLUParam relu;
ActiveParam activeParam;
};
struct InputParam : PEParam {
......@@ -100,24 +105,6 @@ struct DepthwiseConvParam : ConvParam {
Tensor* quantizedFilter_ = new Tensor();
};
struct GRUParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* h0 = nullptr;
Tensor* weight = nullptr;
Tensor* bias = nullptr;
Tensor* batch_gate = nullptr;
Tensor* batch_reset_hidden_prev = nullptr;
Tensor* batch_hidden = nullptr;
Tensor* hidden = nullptr;
std::string gate_activation = "sigmoid";
std::string activation = "tanh";
bool is_reverse = false;
bool origin_mode = false;
};
enum PoolingType : int {
MAX = 0,
AVERAGE = 1,
......@@ -155,7 +142,8 @@ struct ElementwiseAddParam : PEParam {
struct ElementwiseMulParam : PEParam {
public:
std::vector<Tensor*> inputs;
Tensor* input_x;
Tensor* input_y = nullptr;
Tensor* output = nullptr;
};
......@@ -223,6 +211,17 @@ struct PriorBoxParam : PEParam {
float offset;
};
struct YoloBoxParam : PEParam {
Tensor* input;
Tensor* imgSize;
Tensor* outputBoxes;
Tensor* outputScores;
int downsampleRatio;
std::vector<int> anchors;
int classNum;
float confThresh;
};
struct ScaleParam : PEParam {
public:
Tensor* input = nullptr;
......@@ -255,5 +254,24 @@ struct CropParam : PEParam {
std::vector<int> offsets;
std::vector<int> shape;
};
struct GRUParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* h0 = nullptr;
Tensor* weight = nullptr;
Tensor* bias = nullptr;
Tensor* batch_gate = nullptr;
Tensor* batch_reset_hidden_prev = nullptr;
Tensor* batch_hidden = nullptr;
Tensor* hidden = nullptr;
std::string gate_activation = "sigmoid";
std::string activation = "tanh";
bool is_reverse = false;
bool origin_mode = false;
};
} // namespace zynqmp
} // namespace paddle
......@@ -25,6 +25,7 @@ limitations under the License. */
#include "lite/backends/fpga/KD/pes/conv_process.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/scale_pe.hpp"
#include "lite/backends/fpga/KD/pes/split_pe.hpp"
namespace paddle {
namespace zynqmp {
......@@ -41,6 +42,8 @@ class ConvPE : public PE {
void apply() {
split_axis = fill_split_arg(param_);
split_channel = param_.groups != 1 && param_.splitParams().size() > 1;
if (split_axis == 0 && param_.splitParams().size() > 1) {
ConcatParam& concat_param = concatPE_.param();
for (auto conv_param : param_.splitParams()) {
......@@ -51,110 +54,28 @@ class ConvPE : public PE {
concatPE_.apply();
}
if (split_channel) {
SplitParam& split_param = splitPE_.param();
split_param.input = param_.input;
for (auto conv_param : param_.splitParams()) {
split_param.outputs.push_back(&conv_param->input);
}
splitPE_.init();
splitPE_.apply();
}
if (DLEngine::get_instance().isZU3() &&
param_.input->shape().dimSize() == 4 &&
param_.input->shape().width() == 1 &&
param_.input->shape().width() >= 2048) {
param_.input->shape().channel() >= 2048) {
use_cpu_ = true;
}
if (param_.filter->shape().width() == 1 &&
param_.filter->shape().height() == 1) { // NOLINT
// use_cpu_ = true;
}
if (!use_cpu_) { // NOLINT
if (!use_cpu_) {
// param_.filter->releaseData();
}
}
void cpu_conv_hwc() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
Tensor float_input;
Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float_input.syncToCPU();
float* out = float_output.mutableData<float>(FP32, output->shape());
int out_width = output->shape().width();
int out_channel = output->shape().channel();
int in_channel = input->shape().channel();
float* filter_data = param_.filter->data<float>();
int image_height = input->shape().height();
int image_width = input->shape().width();
int image_channels = input->shape().channel();
int image_pad_h = param_.paddings[0];
int image_pad_w = param_.paddings[1];
int kernel_height = param_.filter->shape().height();
int kernel_width = param_.filter->shape().width();
int kernel_step_h = param_.strides[0];
int kernel_step_w = param_.strides[1];
int pooled_height_ = output->shape().height();
int pooled_width_ = out_width;
int filter_chw = image_channels * kernel_height * kernel_width;
float max = 0;
for (int ph = 0; ph < pooled_height_; ph++) {
for (int pw = 0; pw < pooled_width_; pw++) {
int hstart = ph * kernel_step_h - image_pad_h;
int wstart = pw * kernel_step_w - image_pad_w;
int hend =
std::min(hstart + kernel_height, static_cast<int>(image_height));
int wend =
std::min(wstart + kernel_width, static_cast<int>(image_width));
hstart = std::max(hstart, static_cast<int>(0));
wstart = std::max(wstart, static_cast<int>(0));
for (int oc = 0; oc < out_channel; oc++) {
float sum = 0.0f;
const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
for (int c = 0; c < image_channels; c++) {
for (int h = hstart; h < hend; h++) {
int hi = 0;
if (ph == 0) {
hi = h - hstart + image_pad_h;
} else {
hi = h - hstart;
}
for (int w = wstart; w < wend; w++) {
int wi = 0;
if (pw == 0) {
wi = w - wstart + image_pad_w;
} else {
wi = w - wstart;
}
const int index = (h * image_width + w) * image_channels + c;
int weight_index = oc * filter_chw +
kernel_width * kernel_height * c +
kernel_width * hi + wi;
float value = image_addr[index] * filter_data[weight_index];
sum += value;
}
}
}
if (param_.relu.enabled && sum < 0) {
sum = -sum;
}
if (sum > max) {
max = sum;
}
out[pool_index] = sum;
}
}
}
float_output.flush();
output->copyFrom(&float_output);
output->scale()[0] = max / 127;
output->scale()[1] = 127 / max;
// exit(-1);
}
void cpu_compute() {
Tensor* input = param_.input;
Tensor* output = param_.output;
......@@ -164,93 +85,98 @@ class ConvPE : public PE {
Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float_input.syncToCPU();
// float16* data_out = output->data<float16>();
float* out = float_output.mutableData<float>(FP32, output->shape());
float* bias_data = param_.bias()->data<float>();
int out_width = output->shape().width();
int out_channel = output->shape().channel();
int in_channel = input->shape().channel();
float* filter_data = param_.filter->data<float>();
float* mi = new float[in_channel];
float max = 0;
int out_index = 0;
for (int i = 0; i < out_channel; i++) {
float* image = image_addr;
float* filter_ptr = filter_data + i * in_channel;
float* out_ptr = mi;
#pragma omp parallel for
for (int j = 0; j < in_channel; j++) {
// float32x4_t x0 = vld1q_f32(image);
// float32x4_t x1 = vld1q_f32(filter_ptr);
// float32x4_t r = vmulq_f32(x0, x1);
// vst1q_f32(out_ptr, r);
// image += 4;
// filter_ptr += 4;
// out_ptr += 4;
float value = image_addr[j] * filter_ptr[j];
mi[j] = value;
}
for (int h = 0; h < output->shape().height(); h++) {
for (int w = 0; w < output->shape().width(); w++) {
float sum = 0;
// #pragma omp parallel for
for (int j = 0; j < in_channel; j++) {
int image_index = h * out_width * in_channel + w * in_channel + j;
float value = image_addr[image_index] * filter_ptr[j];
sum += value;
}
sum += bias_data[i];
if (param_.relu.enabled && sum < 0) {
sum = 0;
}
if (sum > max) {
max = sum;
}
out_index = h * out_width * out_channel + w * out_channel + i;
out[out_index] = sum;
}
float sum = 0;
for (int j = 0; j < in_channel; j++) {
sum += mi[j];
}
out[i] = sum;
}
delete[] mi;
float_output.flush();
output->copyFrom(&float_output);
output->scale()[0] = max / 127;
output->scale()[1] = 127 / max;
}
bool dispatch() {
fpga_reset();
if (use_cpu_) {
cpu_compute();
return true;
}
inplace_.leaky_relu_enable =
(param_.relu.leaky_relu_factor != 0) ? true : false;
inplace_.relu_enable =
inplace_.leaky_relu_enable ? false : param_.relu.enabled;
if (param_.activeParam.type == TYPE_RELU) {
inplace_.relu_enable = true;
} else if (param_.activeParam.type == TYPE_RELU6) {
inplace_.relu6_enable = true;
} else if (param_.activeParam.type == TYPE_SIGMOID) {
inplace_.sigmoid_enable = true;
} else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
inplace_.leaky_relu_enable = true;
}
inplace_.power_enable = false;
inplace_.normalize_enable = false;
if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
inplace_.relu6_enable || inplace_.sigmoid_enable) {
config_inplace(inplace_);
if (inplace_.leaky_relu_enable) {
activeParamterArgs.type = TYPE_LEAK_RELU;
activeParamterArgs.type = TYPE_LEAKY_RELU;
activeParamterArgs.leaky_relu_factor =
fp32_2_fp16(param_.relu.leaky_relu_factor);
fp32_2_fp16(param_.activeParam.leaky_relu_factor);
config_activation(activeParamterArgs);
}
}
std::vector<BasicConvParam*>& params = param_.splitParams();
if (split_channel) {
// splitPE_.param().input->saveToFile("input_image",true);
splitPE_.dispatch();
}
int ret = 0;
for (auto conv_param : params) {
// conv_param->input.printScale();
// if (split_channel) {
// conv_param->input.saveToFile("pack_image",true);
// }
ret |= compute_fpga_conv_basic(conv_param->args);
}
if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
inplace_.relu6_enable || inplace_.sigmoid_enable) {
inplace_.relu_enable = false;
inplace_.leaky_relu_enable = false;
inplace_.relu6_enable = false;
inplace_.sigmoid_enable = false;
config_inplace(inplace_);
if (inplace_.leaky_relu_enable) {
activeParamterArgs.type = TYPE_LEAK_RELU;
activeParamterArgs.type = TYPE_LEAKY_RELU;
activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0);
config_activation(activeParamterArgs);
}
......@@ -258,16 +184,29 @@ class ConvPE : public PE {
size_t size = params.size();
if (split_axis == 0 && ret == 0 && size > 1) {
// std::cout << "concat size:" << size << std::endl;
concatPE_.dispatch();
}
if (split_axis == 1 && ret == 0 && size > 1) {
// for (int n = 0; n < size - 1; n++) {
ElementwiseAddParam& add_param = addPE_.param();
add_param.inputs = {&params[0]->output, &params[1]->output};
add_param.output = param_.output;
addPE_.init();
addPE_.apply();
addPE_.dispatch();
// param_.output->printScale();
// params[0]->input.saveToFile("conv_1.txt");
// params[1]->input.saveToFile("conv_2.txt");
// params[0]->output.saveToFile("ew_o1.txt");
// params[1]->output.saveToFile("ew_o2.txt");
// std::cout << "\n ================== EW ================== \n";
// }
}
return ret == 0;
}
......@@ -275,8 +214,10 @@ class ConvPE : public PE {
private:
bool use_cpu_ = false;
bool split_channel = false;
ConvParam param_;
ConcatPE concatPE_;
SplitPE splitPE_;
ElementwiseAddPE addPE_;
int split_axis = 0;
InplaceArgs inplace_ = {0};
......
......@@ -53,6 +53,16 @@ inline int get_split_num(Tensor* filter) {
return filter::calc_split_num(aligned_num, div_capacity);
}
inline int get_pack_num(Tensor* filter, int group_num) {
auto chw = filter->shape().channel() * filter->shape().height() *
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
int filter_num_alignment = filter::get_filter_num_alignment();
int aligned_num_per_group = align_to_x(num / group_num, filter_num_alignment);
return filter::calc_pack_num(aligned_num_per_group, group_num, div_capacity);
}
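// Illustrative trace of get_pack_num (shape values assumed, and
// calc_division_capacity's result taken as a given): a filter of
// N=64, C=16, H=3, W=3 with group_num = 4 gives num / group_num = 16,
// aligned up to filter_num_alignment (32) -> aligned_num_per_group = 32,
// and calc_pack_num(32, 4, calc_division_capacity(144)) then decides how
// many packs the 4 groups are split into.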
inline void fill_scale_bias_const(ConvParam* param_) {
int channel = param_->output->shape().channel();
Shape sb_shape(N, {channel});
......@@ -117,6 +127,50 @@ inline void combine_add_bn_params(BatchnormParam* bn,
param_->bias()->setDataLocation(CPU);
}
inline int gcd_(int a, int b) {
while (b) {
int temp = a;
a = b;
b = temp % b;
}
return a;
}
inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }
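// Example of how these feed the repeat logic below (values illustrative):
// gcd_(6, 16) iterates (6,16) -> (16,6) -> (6,4) -> (4,2) -> (2,0) and
// returns 2, so lcm_(6, 16) = 6 * 16 / 2 = 48; a channel count of 6 with
// alignment 16 therefore gets repeat = 48 / 6 = 8 copies to reach an
// aligned stride.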
inline void format_bias_scale_new(Tensor* bias,
Tensor* scale,
Tensor* scale_bias) {
Shape& bias_shape = bias->shape();
int channel = bias_shape.channel();
int repeat = 1;
int alignment = 16;
int length = channel;
if (channel % alignment != 0 || channel < alignment) {
int c_lcm = lcm_(channel, alignment);
repeat = c_lcm / (channel);
}
Shape shape(N, {2 * channel * repeat});
float16* scale_bias_data = scale_bias->mutableData<float16>(FP16, shape);
float* bias_data_float = bias->data<float>();
float* scale_data_float = scale->data<float>();
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
float16 value_bias = float_to_half(bias_data_float[j]);
scale_bias_data[i * length + j] = value_bias;
}
}
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
float16 value_scale = float_to_half(scale_data_float[j]);
scale_bias_data[i * length + j + length * repeat] = value_scale;
}
}
}
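// Resulting scale_bias layout, as a sketch (FP16, length == channel):
//   [ bias tiled repeat times ][ scale tiled repeat times ]
//   bias  value j of copy i -> index i * length + j
//   scale value j of copy i -> index length * repeat + i * length + j
// e.g. channel = 6 -> repeat = 8, so the buffer holds 2 * 6 * 8 = 96 halves.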
inline void format_scale_bias(Tensor* scale,
Tensor* bias,
Tensor* filter,
......@@ -180,8 +234,10 @@ inline void format_scale_bias(Tensor* scale,
inline void format_filter(Tensor* filter,
Tensor* quantized_filter,
int group,
std::vector<float>& scales) { // NOLINT
std::vector<float>& scales, // NOLINT
float max) {
float max_value = find_max(*filter);
// max_value = max; //TODO: global quantization for filter
Shape& filter_shape = filter->shape();
int mem_size;
......@@ -206,10 +262,22 @@ inline void format_filter(Tensor* filter,
memcpy(src, quantized_data, mem_size);
quantized_filter->flush();
for (size_t i = 0; i < max_values.size(); i++) {
scales.push_back(max_values[i] / max_value);
}
fpga_free(quantized_data);
// for (size_t i = 0; i < max_values.size(); i++) {
// // scales.push_back(max_values[i] / max_value);
// scales.push_back(1.0f);
// }
// filter->saveToFile("filter.txt");
// std::ofstream ofs;
// ofs.open("quant.txt");
// for (int i = 0; i < mem_size; i++) {
// float value = quantized_data[i];
// ofs << value << std::endl;
// }
// ofs.close();
// exit(-1);
}
inline void format_dw_filter(Tensor* filter,
......@@ -256,18 +324,11 @@ inline void split_filter_num(const ConvParam& c_param) {
Tensor* out = param.output;
Tensor* filter = param.filter;
auto channel = out->shape().channel();
int split_num = param.groups == 1 ? get_split_num(param.filter) : 1;
int split_num = get_split_num(param.filter);
int filter_num_per_div = get_filter_num_per_div(filter, param.groups);
auto chw = filter->shape().channel() * filter->shape().height() *
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
int filter_num_alignment = filter::get_filter_num_alignment();
int aligned_num =
align_to_x(num / param.groups, filter_num_alignment) * param.groups;
float max = find_max(*filter);
split_num = filter::calc_split_num(aligned_num, div_capacity);
Shape& out_shape = out->shape();
for (int i = 0; i < split_num; i++) {
BasicConvParam* conv_param = new BasicConvParam();
......@@ -310,45 +371,29 @@ inline void split_filter_num(const ConvParam& c_param) {
new_filter.flush();
conv_param->filter.mutableData<float>(FP32, f_shape);
if (param.groups != 1) {
int mem_factor =
32 / filter_num_per_div; // TODO(chonwhite): change 32 to param;
conv_param->filter.setMemScale(mem_factor);
}
std::vector<float> v; // TODO(chonwhite): change local variable name
format_filter(&new_filter, &(conv_param->filter), param.groups, v);
std::vector<float> v; // TODO(chonwhite) change variable name;
format_filter(&new_filter, &(conv_param->filter), param.groups, v, max);
conv_param->filter.setDataType(INT8);
int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT);
Tensor scale;
Tensor bias;
int chnnnel_start = i * filter_num_per_div;
Shape s_shape(N, {filter_num});
Shape s_shape(NC, {1, filter_num});
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_num; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
}
for (int n = 0; n < filter_num; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
}
Shape sb_shape(N, {sb_num});
format_scale_bias(&scale,
&bias,
&conv_param->filter,
&conv_param->scaleBias,
param.groups);
format_bias_scale_new(&bias, &scale, &conv_param->scaleBias);
conv_param->scaleBias.flush();
float* bs_data = conv_param->scaleBias.data<float>();
args.group_num = param.groups;
args.relu_enabled = param.relu.enabled;
args.sb_address = conv_param->scaleBias.data<float>();
args.sb_address = bs_data;
args.sb_address = conv_param->scaleBias.data<float16>();
args.kernel.stride_h = param.strides[1];
args.kernel.stride_w = param.strides[0];
args.kernel.height = new_filter.shape().height();
......@@ -364,7 +409,135 @@ inline void split_filter_num(const ConvParam& c_param) {
args.image.height = input->shape().height();
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
args.dilation = param.dilations[0];
args.output.address = out_address;
args.output.scale_address = out_scale_address;
param.splitParams().push_back(conv_param);
}
}
inline void pack_channel_filter(const ConvParam& c_param) {
ConvParam& param = const_cast<ConvParam&>(c_param);
Tensor* input = param.input;
Tensor* out = param.output;
Tensor* filter = param.filter;
int filter_num_alignment = filter::get_filter_num_alignment();
auto filter_num = filter->shape().num();
int pack_num = get_pack_num(param.filter, param.groups);
int group_per_pack = (param.groups + pack_num - 1) / pack_num;
int filter_per_group = filter_num / param.groups;
int filter_per_pack = filter_per_group * group_per_pack;
int channel_per_pack = filter->shape().channel() * group_per_pack;
float max = find_max(*filter);
Shape& out_shape = out->shape();
for (int i = 0; i < pack_num; i++) {
BasicConvParam* conv_param = new BasicConvParam();
conv_param->output.setDataLocation(Device);
conv_param->output.setAligned(true);
float16* out_address = nullptr;
float* out_scale_address = nullptr;
float16* input_address = nullptr;
ConvArgs& args = conv_param->args;
if (pack_num == 1) {
out_address = out->data<float16>();
out_scale_address = out->scale();
}
int new_group = param.groups;
int filter_current_pack = filter->shape().num();
int channel_current_pack = input->shape().channel();
new_group = i == pack_num - 1
? param.groups - (pack_num - 1) * group_per_pack
: group_per_pack;
filter_current_pack = new_group * filter_per_group;
channel_current_pack = new_group * filter->shape().channel();
if (pack_num == 1) {
input_address = input->data<float16>();
} else {
Shape in_shape(NCHW,
{1,
channel_current_pack,
input->shape().height(),
input->shape().width()});
input_address = conv_param->input.mutableData<float16>(FP16, in_shape);
}
if (pack_num != 1) {
Shape shape(
NHWC,
{1, out_shape.height(), out_shape.width(), filter_current_pack});
out_address = conv_param->output.mutableData<float16>(FP16, shape);
out_scale_address = conv_param->output.scale();
}
Shape f_shape(NCHW,
{filter_current_pack,
filter->shape().channel(),
filter->shape().height(),
filter->shape().width()});
Tensor new_filter;
float* new_filter_data = new_filter.mutableData<float>(FP32, f_shape);
int filter_hwc = filter->shape().height() * filter->shape().width() *
filter->shape().channel();
memcpy(new_filter_data,
filter->data<float>() + i * filter_per_pack * filter_hwc,
filter_current_pack * filter_hwc * sizeof(float));
new_filter.flush();
conv_param->filter.mutableData<float>(FP32, f_shape);
float mem_factor = filter_num_alignment / filter_per_pack;
conv_param->filter.setMemScale(mem_factor);
std::vector<float> v; // TODO(chonwhite) change variable name
format_filter(&new_filter, &(conv_param->filter), new_group, v, max);
conv_param->filter.setDataType(INT8);
Tensor scale;
Tensor bias;
int chnnnel_start = i * filter_per_pack;
Shape s_shape(NC, {1, filter_current_pack});
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_current_pack; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
}
for (int n = 0; n < filter_current_pack; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
}
format_bias_scale_new(&bias, &scale, &conv_param->scaleBias);
conv_param->scaleBias.flush();
args.group_num = new_group;
args.sb_address = conv_param->scaleBias.data<float16>();
args.kernel.stride_h = param.strides[1];
args.kernel.stride_w = param.strides[0];
args.kernel.height = new_filter.shape().height();
args.kernel.width = new_filter.shape().width();
args.filter_address = conv_param->filter.data<int8_t>();
args.filter_num = filter_current_pack;
args.filter_scale_address = conv_param->filter.scale();
args.image.address = input_address;
args.image.scale_address = input->scale();
args.image.channels = channel_current_pack;
args.image.width = input->shape().width();
args.image.height = input->shape().height();
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
args.dilation = param.dilations[0];
args.output.address = out_address;
......@@ -384,6 +557,8 @@ inline void split_channel(const ConvParam& c_param) {
Shape bs_shape(N, {channel});
float max = 1.0f;
for (int i = 0; i < num; i++) {
BasicConvParam* conv_param = new BasicConvParam();
......@@ -407,7 +582,8 @@ inline void split_channel(const ConvParam& c_param) {
}
new_filter.flush();
std::vector<float> scales;
format_filter(&new_filter, &(conv_param->filter), param.groups, scales);
format_filter(
&new_filter, &(conv_param->filter), param.groups, scales, max);
Tensor bias;
Tensor scale;
......@@ -420,7 +596,6 @@ inline void split_channel(const ConvParam& c_param) {
}
scale.flush();
bias.flush();
// Shape sb_shape(N, {2 * channel});
format_scale_bias(&scale,
&bias,
&conv_param->filter,
......@@ -430,7 +605,6 @@ inline void split_channel(const ConvParam& c_param) {
ConvArgs& args = conv_param->args;
args.group_num = param.groups;
args.relu_enabled = param.relu.enabled;
args.sb_address = conv_param->scaleBias.data<float>();
args.kernel.stride_h = param.strides[1];
args.kernel.stride_w = param.strides[0];
......@@ -448,7 +622,6 @@ inline void split_channel(const ConvParam& c_param) {
args.image.height = conv_param->input.shape().height();
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
args.dilation = param.dilations[0];
args.output.address = conv_param->output.mutableData<void>();
args.output.scale_address = conv_param->output.scale();
......@@ -460,13 +633,17 @@ inline int fill_split_arg(const ConvParam& c_param) {
ConvParam& param = const_cast<ConvParam&>(c_param);
Tensor* input = param.input;
Tensor* output = param.output;
if (output->shape().dimSize() == 4 && input->shape().channel() > 2047 &&
input->shape().width() == 1) {
split_channel(c_param);
return 1;
} else {
} else if (param.groups == 1) {
split_filter_num(c_param);
return 0;
} else {
pack_channel_filter(c_param);
return 0;
}
}
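// The return value above doubles as the split axis that ConvPE::apply
// consumes (a reading of this commit, stated as a sketch):
//   4-D output, in_channel > 2047, in_width == 1 -> split_channel, axis 1
//   groups == 1                                  -> split_filter_num, axis 0
//   grouped conv otherwise                       -> pack_channel_filter, axis 0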
......
......@@ -24,6 +24,17 @@ namespace zynqmp {
class DepthwiseConvPE : public PE {
public:
inline int gcd_(int a, int b) {
while (b) {
int temp = a;
a = b;
b = temp % b;
}
return a;
}
inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
......@@ -37,17 +48,40 @@ class DepthwiseConvPE : public PE {
Tensor* output = param.output;
int channel = output->shape().channel();
float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
int repeat = 1;
int alignment = 16;
int length = channel;
if (channel % alignment != 0 || channel < alignment) {
int c_lcm = lcm_(channel, alignment);
repeat = c_lcm / (channel);
}
Shape shape(N, {channel * repeat});
float16* b_data = bias_.mutableData<float16>(FP16, shape);
if (param_.bias()->dataType() == FP32) {
float* new_bias_data = param_.bias()->data<float>();
// convert bias from float to float16
for (int i = 0; i < channel; i++) {
b_data[i] = float_to_half(new_bias_data[i]);
// for (int i = 0; i < channel; i++) {
// b_data[i] = float_to_half(new_bias_data[i]);
// }
// tile bias across hw so it fills the 16-aligned stride
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
float16 value = float_to_half(new_bias_data[j]);
b_data[i * length + j] = value;
}
}
bias_.flush();
} else {
float16* new_bias_data = param_.bias()->data<float16>();
memcpy(b_data, new_bias_data, channel * sizeof(float16));
// memcpy(b_data, new_bias_data, channel * sizeof(float16));
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
// float16 value = float_to_half(bias_data_float[j]);
b_data[i * length + j] = new_bias_data[j];
}
}
bias_.flush();
}
......@@ -62,7 +96,7 @@ class DepthwiseConvPE : public PE {
float16* scale_data = param_.scale()->data<float16>();
float16* filter_data = param.quantizedFilter()->mutableData<float16>(
FP16, param.filter->shape());
// memcpy(filter_data, scale_data, channel * sizeof(float16));
memcpy(filter_data,
scale_data,
param.filter->shape().numel() * sizeof(float16));
......@@ -90,20 +124,33 @@ class DepthwiseConvPE : public PE {
args.sub_conv_num = 1;
param.args = args;
inplace_.relu_enable = param_.relu.enabled;
inplace_.power_enable = false;
inplace_.normalize_enable = false;
}
bool dispatch() {
param_.input->syncToDevice();
if (param_.relu.enabled) {
inplace_.relu_enable = param_.relu.enabled;
if (param_.activeParam.type == TYPE_RELU) {
inplace_.relu_enable = true;
} else if (param_.activeParam.type == TYPE_RELU6) {
inplace_.relu6_enable = true;
} else if (param_.activeParam.type == TYPE_SIGMOID) {
inplace_.sigmoid_enable = true;
} else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
inplace_.leaky_relu_enable = true;
}
if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
inplace_.relu6_enable || inplace_.sigmoid_enable) {
config_inplace(inplace_);
}
bool ret = compute_fpga_dwconv(param_.args) == 0;
if (param_.relu.enabled) {
if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
inplace_.relu6_enable || inplace_.sigmoid_enable) {
inplace_.relu_enable = false;
inplace_.leaky_relu_enable = false;
inplace_.relu6_enable = false;
inplace_.sigmoid_enable = false;
config_inplace(inplace_);
}
return ret;
......
......@@ -58,15 +58,29 @@ class ElementwiseAddPE : public PE {
bool dispatch() {
param_.inputs[0]->syncToDevice();
param_.inputs[1]->syncToDevice();
InplaceArgs inplace_args = {0};
if (param_.relu.enabled) {
inplace_args.relu_enable = true;
config_inplace(inplace_args);
// InplaceArgs inplace_ = {0};
if (param_.activeParam.type == TYPE_RELU) {
inplace_.relu_enable = true;
} else if (param_.activeParam.type == TYPE_RELU6) {
inplace_.relu6_enable = true;
} else if (param_.activeParam.type == TYPE_SIGMOID) {
inplace_.sigmoid_enable = true;
} else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
inplace_.leaky_relu_enable = true;
}
if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
inplace_.relu6_enable || inplace_.sigmoid_enable) {
config_inplace(inplace_);
}
compute_fpga_ewadd(param_.ewargs);
if (param_.relu.enabled) {
inplace_args.relu_enable = false;
config_inplace(inplace_args);
if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
inplace_.relu6_enable || inplace_.sigmoid_enable) {
inplace_.relu_enable = false;
inplace_.relu6_enable = false;
inplace_.sigmoid_enable = false;
inplace_.leaky_relu_enable = false;
config_inplace(inplace_);
}
return true;
}
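// The enable -> config -> compute -> disable pattern above repeats in
// ConvPE, DepthwiseConvPE and this PE; a hypothetical RAII guard could
// factor it out (sketch only, InplaceGuard is not part of this commit):
// class InplaceGuard {
//  public:
//   explicit InplaceGuard(InplaceArgs& args) : args_(args) {
//     config_inplace(args_);  // enable whatever flags the caller set
//   }
//   ~InplaceGuard() {
//     args_.relu_enable = false;
//     args_.relu6_enable = false;
//     args_.sigmoid_enable = false;
//     args_.leaky_relu_enable = false;
//     config_inplace(args_);  // restore the inactive state
//   }
//  private:
//   InplaceArgs& args_;
// };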
......@@ -75,6 +89,7 @@ class ElementwiseAddPE : public PE {
private:
ElementwiseAddParam param_;
InplaceArgs inplace_ = {0};
};
} // namespace zynqmp
......
......@@ -29,17 +29,17 @@ class ElementwiseMulPE : public PE {
}
void apply() {
Tensor* input = param_.inputs[0];
Tensor* input = param_.input_x;
Tensor* output = param_.output;
int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
int wc_aligned = align_to_x(param_.input_x->shape().numel(), 32);
Shape s(N, {wc_aligned});
float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
memset(bias_data, 0, wc_aligned * sizeof(float16));
ScaleArgs& args = args_;
args.scale_address = param_.inputs[1]->data<void>();
args.scale_address = param_.input_y->data<void>();
args.bias_address = bias_tensor.data<void>();
args.wc_alignment = wc_aligned;
args.channel_alignment = wc_aligned;
......
......@@ -37,10 +37,9 @@ class FullyConnectedPE : public PE {
ConvParam& convParam_ = convPE_.param();
Tensor* input = param_.input;
convParam_.input = param_.input;
num_ = param_.input->shape().num();
convParam_.output = param_.output;
// convParam_.relu = param_.relu;
convParam_.activeParam.type = param_.activeParam.type;
convParam_.groups = 1;
convParam_.strides = {1, 1};
convParam_.paddings = {0, 0};
......@@ -49,6 +48,9 @@ class FullyConnectedPE : public PE {
int num = param_.filter->shape().channel();
int chw = param_.filter->shape().num();
// if (num == 2) {
// return;
// }
int height = param_.input->shape().height();
int width = param_.input->shape().width();
......@@ -66,6 +68,7 @@ class FullyConnectedPE : public PE {
new_filter_data[i * chw + j] = scale;
}
}
conv_filter->flush();
convParam_.filter = conv_filter;
......@@ -84,15 +87,51 @@ class FullyConnectedPE : public PE {
convPE_.apply();
}
bool dispatch() { return convPE_.dispatch(); }
void cpu_compute() {
int num = param_.filter->shape().channel();
int chw = param_.filter->shape().num();
float* filter_data = param_.filter->data<float>();
float max = 0.0f;
Tensor* input = param_.input;
Tensor* output = param_.output;
float16* input_data = input->data<float16>();
float16* output_data = output->data<float16>();
for (int i = 0; i < num; i++) {
float sum = 0;
float bias = param_.bias->data<float>()[i];
for (int j = 0; j < chw; j++) {
float scale = filter_data[j * num + i];
float data = half_to_float(input_data[j]);
sum += scale * data;
}
output_data[i] = float_to_half(sum + bias);
if (max < output_data[i]) {
max = output_data[i];
}
}
output->flush();
output->scale()[0] = max / 127.0f;
output->scale()[1] = 127.0f / max;
}
bool dispatch() {
// int num = param_.filter->shape().channel();
// if (num == 2) {
// cpu_compute();
// return 1;
// } else {
return convPE_.dispatch();
// }
}
FullyConnectedParam& param() { return param_; }
private:
FullyConnectedParam param_;
ConvPE convPE_;
Tensor tempOut_;
int num_ = 1;
};
} // namespace zynqmp
} // namespace paddle
......@@ -76,7 +76,9 @@ class GRUPE : public PE {
reset_hidden_.mutableData<void>(FP16, hidden_shape);
ElementwiseMulParam& mul_param = mul_pe_.param();
mul_param.inputs = {&reset_gate_, &prev_hidden_};
// mul_param.inputs = {&reset_gate_, &prev_hidden_};
mul_param.input_x = &reset_gate_;
mul_param.input_y = &prev_hidden_;
mul_param.output = &reset_hidden_;
mul_pe_.init();
mul_pe_.apply();
......
......@@ -72,8 +72,10 @@ class NormPE : public PE {
input_float.mutableData<float>(FP32, param_.input->shape());
float_out.mutableData<float>(FP32, param_.output->shape());
// param_.input->syncToDevice();
input_float.copyFrom(param_.input);
input_float.syncToCPU();
// input_float.saveToFile("normalize_", true);
int channel = input_float.shape().channel();
int height = input_float.shape().height();
......@@ -85,6 +87,7 @@ class NormPE : public PE {
float* out_ptr = float_out.data<float>();
int loop = height * width;
#pragma omp parallel for
for (int i = 0; i < loop; i++) {
float sum = param_.epsilon;
for (int c = 0; c < channel; c++) {
......@@ -98,11 +101,26 @@ class NormPE : public PE {
}
}
float_out.flush();
// float_out.saveToFile("normalize_", true);
param_.output->copyFrom(&float_out);
}
bool dispatch() {
cpuCompute();
// std::cout << "CPU normalize ---------------------" << std::endl;
// param_.input->syncToDevice();
// // param_.input->saveToFile("normalize_fpga_", true);
// config_norm_param(norm_param_args_);
// inplace_args_.normalize_enable = true;
// config_inplace(inplace_args_);
// perform_bypass(bypass_args_);
// inplace_args_.normalize_enable = false;
// config_inplace(inplace_args_);
// compute_norm(norm_args_);
// param_.output->saveToFile("normalize_fpga_", true);
// std::cout << "FPGA normalize ---------------------" << std::endl;
return true;
}
......
......@@ -35,14 +35,17 @@ class PoolingPE : public PE {
Tensor* input = param_.input;
Tensor* output = param_.output;
uint32_t k_height = param_.kernelSize[0];
uint32_t k_width = param_.kernelSize[1];
uint32_t k_height = 1;
uint32_t k_width = 1;
if (param_.globalPooling) {
k_width = input->shape().width();
k_height = input->shape().height();
param_.kernelSize[0] = k_height;
param_.kernelSize[1] = k_width;
} else {
k_height = param_.kernelSize[0];
k_width = param_.kernelSize[1];
}
PoolingArgs args = {0};
......@@ -65,9 +68,12 @@ class PoolingPE : public PE {
args.out_width = output->shape().width();
param_.poolingArgs = args;
// use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
// &&
// (k_width > 7 || k_height > 7);
use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
(k_width > 255 || k_height > 255);
use_cpu_ = param_.type == AVERAGE;
// use_cpu_ = param_.type == AVERAGE;
}
void compute() {
......@@ -111,6 +117,8 @@ class PoolingPE : public PE {
for (int c = 0; c < image_channels; ++c) {
const int pool_index = (ph * pooled_width_ + pw) * image_channels + c;
float sum = 0;
// const int index =
// (hstart * image_width + wstart) * image_channels + c;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int index = (h * image_width + w) * image_channels + c;
......@@ -139,7 +147,9 @@ class PoolingPE : public PE {
Tensor float_input;
float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
// float_input.saveToFile("pool_float.txt");
float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
float scale_max = 0;
......@@ -156,6 +166,7 @@ class PoolingPE : public PE {
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush();
// exit(-1);
}
void cpu_compute() {
......@@ -185,6 +196,7 @@ class PoolingPE : public PE {
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush();
// exit(-1);
}
bool dispatch() {
......
......@@ -53,20 +53,37 @@ class SplitPE : public PE {
int64_t src_after = src_stride_numel[axis];
int64_t dst_after = dst_stride_numel[axis];
// PADDLE_MOBILE_ENFORCE(src_stride_numel.size() == dst_stride_numel.size(),
// "src and dst tensor should have the same dims
// size.");
for (int64_t i = 0; i < axis; ++i) {
if (i < axis) {
// PADDLE_MOBILE_ENFORCE(src_stride_numel[i] / src_stride_numel[axis] ==
// dst_stride_numel[i] /
// dst_stride_numel[axis],
// "src and dst should have the same elements "
// "except the specified axis.");
} else if (i == axis) {
continue;
} else {
// PADDLE_MOBILE_ENFORCE(src_stride_numel[i] == dst_stride_numel[i],
// "src and dst should have the same elements "
// "except the specified axis.");
}
}
for (int64_t i = 0; i < before; ++i) {
memory::Copy(dst + i * dst_after, src + i * src_after, sizeof(T) * size);
memcpy(dst + i * dst_after, src + i * src_after, sizeof(T) * size);
}
}
void split3D() { int axis = param_.axis; }
void split3D() {
int axis = param_.axis;
// float16* dst = param_.output->data<float16>();
// std::vector<int>& dst_dims = ;
// StridedNumelCopyWithAxis();
}
bool dispatch() {
Tensor* input = param_.input;
......@@ -88,6 +105,7 @@ class SplitPE : public PE {
in_stride,
out_stride[axis]);
input_offset += out_stride[axis];
// out->flush();
}
return true;
}
......@@ -95,21 +113,26 @@ class SplitPE : public PE {
std::vector<Tensor*> outputs = param_.outputs;
int in_channel = input->shape().channel();
int split_channel = input->shape().channel() / param_.num;
// int split_channel = input->shape().channel() / param_.num;
int hw = input->shape().height() * input->shape().width();
float16* in_data = input->data<float16>();
for (int i = 0; i < hw; i++) {
int channel_stride = 0;
for (int n = 0; n < outputs.size(); n++) {
Tensor* out = outputs[n];
float16* out_data = out->data<float16>();
memcpy(out_data + i * split_channel,
in_data + i * in_channel + n * split_channel,
split_channel * sizeof(float16));
memcpy(out_data + i * out->shape().channel(),
in_data + i * in_channel + channel_stride,
out->shape().channel() * sizeof(float16));
channel_stride += out->shape().channel();
}
}
for (int n = 0; n < outputs.size(); n++) {
Tensor* out = outputs[n];
out->flush();
out->copyScaleFrom(input);
}
return true;
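// Copy pattern of the channel split above, on assumed sizes: an input of
// channel 24 split into outputs of channel 16 and 8 copies, per pixel i,
//   out0[i*16 .. i*16+15] <- in[i*24    .. i*24+15]
//   out1[i*8  .. i*8+7 ]  <- in[i*24+16 .. i*24+23]
// channel_stride advances by each output's channel, so unequal splits work,
// unlike the old fixed split_channel = in_channel / num.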
......@@ -120,5 +143,6 @@ class SplitPE : public PE {
private:
SplitParam param_;
};
} // namespace zynqmp
} // namespace paddle
......@@ -300,14 +300,12 @@ class Tensor {
}
void flush() {
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
size_t memorySize = placeHolder_->memorySize();
fpga_flush(placeHolder_->data(), memorySize);
}
void invalidate() {
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
size_t memorySize = placeHolder_->memorySize();
fpga_invalidate(placeHolder_->data(), memorySize);
}
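// Both cache operations now take their size from the placeholder itself,
// so the flushed/invalidated range matches the actual allocation instead
// of being recomputed from the shape and mem_scale_factor_.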
......@@ -396,6 +394,7 @@ class Tensor {
invalidate();
std::ofstream ofs;
ofs.open(path);
ofs << scale()[0] << " / " << scale()[1] << std::endl;
for (int i = 0; i < shape_->numel(); i++) {
float value = 0;
......
......@@ -115,14 +115,14 @@ void TypeTargetTransformPass::AddIoCopyInst(
} else {
// TODO(MyPandaShaoxiang) should set same place with input?
auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name);
// Set the place for io_copy_output_arg node, the target should be equal to
// to.target()
// The precision and layout should be equal to from.precision(),
// from.layout()
#ifndef LITE_WITH_FPGA
// Set the place for io_copy_output_arg node, the target should be equal to
// to.target()
// The precision and layout should be equal to from.precision(),
// from.layout()
#ifndef LITE_WITH_FPGA
io_copy_output_arg->AsArg().type =
LiteType::GetTensorTy(to.target(), from.precision(), from.layout());
#endif
#endif
auto* io_copy_inst = graph->NewInstructNode();
bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
......
......@@ -138,6 +138,8 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
void RuntimeProgram::Run() {
for (auto& inst : instructions_) {
std::string op_type = inst.op()->op_info()->Type();
VLOG(4) << ">> Running kernel: " << inst.op()->op_info()->Repr()
<< " on Target " << TargetToStr(inst.kernel()->target());
#ifndef LITE_WITH_FPGA
if (op_type == "feed" || op_type == "fetch") continue;
......
......@@ -145,4 +145,3 @@ REGISTER_LITE_KERNEL(fill_constant_batch_size_like,
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();
......@@ -8,7 +8,7 @@ set(fpga_deps fpga_target_wrapper kernel_fpga)
# add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
# add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
# add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
......
......@@ -46,7 +46,10 @@ void ConvCompute::PrepareForRun() {
conv_param.dilations = *param.dilations;
fill_scale_bias_const(&conv_param);
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
conv_param.relu.enabled = param.fuse_relu;
if (param.fuse_relu) {
conv_param.activeParam.type = zynqmp::TYPE_RELU;
}
dw_conv_pe_.init();
dw_conv_pe_.apply();
......@@ -65,7 +68,9 @@ void ConvCompute::PrepareForRun() {
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
}
conv_param.relu.enabled = param.fuse_relu;
if (param.fuse_relu) {
conv_param.activeParam.type = zynqmp::TYPE_RELU;
}
conv_pe_.init();
conv_pe_.apply();
}
......@@ -77,9 +82,23 @@ void ConvCompute::Run() {
param.groups == param.x->ZynqTensor()->shape().channel()) {
dw_conv_pe_.dispatch();
} else {
zynqmp::ConvParam& conv_param = conv_pe_.param();
if (conv_param.output->shape().channel() == 12 &&
conv_param.output->shape().height() == 13) {
conv_param.input->saveToFile("conv_in", true);
conv_param.output->saveToFile("conv_o", true);
}
conv_pe_.dispatch();
if (conv_param.output->shape().channel() == 12 &&
conv_param.output->shape().height() == 13) {
// conv_param.input->saveToFile("conv_in", true);
conv_param.output->saveToFile("conv_out", true);
}
#ifdef FPGA_PRINT_TENSOR
zynqmp::ConvParam& conv_param = conv_pe_.param();
// zynqmp::ConvParam& conv_param = conv_pe_.param();
Debugger::get_instance().registerOutput("conv", conv_param.output);
#endif
}
......
......@@ -33,7 +33,8 @@ void ElementwiseAddCompute::PrepareForRun() {
ew_param.inputs = {param.X->ZynqTensor(), param.Y->ZynqTensor()};
ew_param.output = param.Out->ZynqTensor();
ew_param.axis = param.axis;
ew_param.relu.enabled = false;
ew_param.activeParam.type = zynqmp::TYPE_NONE;
pe_.init();
pe_.apply();
......@@ -56,7 +57,7 @@ void ElementwiseAddActivationCompute::PrepareForRun() {
ew_param.inputs = {param.X->ZynqTensor(), param.Y->ZynqTensor()};
ew_param.output = param.Out->ZynqTensor();
ew_param.axis = param.axis;
ew_param.relu.enabled = true;
ew_param.activeParam.type = zynqmp::TYPE_RELU;
pe_.init();
pe_.apply();
}
......@@ -76,7 +77,7 @@ void ElementwiseMulCompute::PrepareForRun() {
scale_param.input = param.X->ZynqTensor();
scale_param.output = param.Out->ZynqTensor();
scale_param.relu.enabled = false;
scale_param.activeParam.type = zynqmp::TYPE_NONE;
int channel = scale_param.input->shape().channel();
zynqmp::Tensor* scale = new zynqmp::Tensor();
......
......@@ -34,7 +34,9 @@ void PoolCompute::PrepareForRun() {
zynqmp::PoolingParam& pool_param = pe_.param();
pool_param.input = param.x->ZynqTensor();
pool_param.output = param.output->ZynqTensor();
pool_param.relu.enabled = false;
pool_param.activeParam.type = zynqmp::TYPE_RELU;
pool_param.type = param.pooling_type == "max" ? zynqmp::PoolingType::MAX
: zynqmp::PoolingType::AVERAGE;
pool_param.globalPooling = param.global_pooling;
......