Commit 4ea3ca7a authored by xiaogang, committed by GitHub

fix: update fpga backend and kernel (#2848)

* fix: update fpga backend and kernel
test=develop

* style: style fix
test=develop
Parent 4c1bf784
@@ -32,7 +32,8 @@ class Debugger {
   }
   void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
-    if (op_type != "conv") {  // NOLINT
+    if (op_config[op_type]) {
+      tensor->saveToFile(op_type, true);
     }
   }
@@ -40,8 +41,21 @@ class Debugger {
   std::unordered_map<std::string, bool> op_config;
   Debugger() {
     op_config["concat"] = true;
+    op_config["pooling"] = true;
     op_config["conv"] = true;
+    op_config["dwconv"] = true;
+    op_config["ew_add"] = true;
     op_config["crop"] = true;
+    op_config["feed"] = true;
+    op_config["mul"] = true;
+    op_config["fetch"] = true;
+    op_config["boxes"] = true;
+    op_config["scores"] = true;
+    op_config["nms"] = true;
+    op_config["pb_boxes"] = true;
+    op_config["pb_variances"] = true;
+    // op_config["fc"] = true;
+    op_config["softmax"] = true;
   }
 };
@@ -131,9 +145,7 @@ inline void save_tensor(const lite::Tensor* t,
     chw_to_hwc(const_cast<lite::Tensor*>(t), dst);
     data = dst;
   }
   save_float(data, name, t->numel());
   delete[] dst;
 }
 }  // namespace lite
......
@@ -31,7 +31,7 @@ void saveToFile(std::string name, void* data_in, int size) {
   std::ofstream ofs;
   ofs.open(name);
-  int8_t* data = static_cast<int8_t*> data_in;
+  int8_t* data = static_cast<int8_t*>(data_in);
   for (int i = 0; i < size; i++) {
     float value = data[i];
     ofs << value << std::endl;
@@ -84,6 +84,14 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
   }
 }
+
+int calc_pack_num(int num_per_group, int group, int division_capacity) {
+  auto n = 1;
+  while ((num_per_group * (group + n - 1) / n) > division_capacity) {
+    n++;
+  }
+  return (n);
+}
 void convert_to_hwc(int8_t* chw_data,
                     int8_t* hwc_data,
                     int num,
@@ -231,10 +239,9 @@ int8_t* format_filter(float* data_in,
   for (int n = 0; n < num; n++) {
     float* filter_start = data_in + n * chw;
-    float f_max = find_max(filter_start, chw);
     int8_t* quantized_start = quantized_data + n * chw;
     quantize(filter_start, quantized_start, chw, max);
-    filter_max.push_back(max);
+    filter_max.push_back(1);
   }
   int8_t* hwc_data =
@@ -256,6 +263,7 @@ int8_t* format_filter(float* data_in,
   int filter_num_alignment = get_filter_num_alignment();
   int num_per_div_after_alignment =
       align_to_x(num_per_div_before_alignment, filter_num_alignment);
+
   int num_element = div_num * num_per_div_after_alignment * chw_aligned;
   int8_t* num_aligned_data =
       reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
......
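The new calc_pack_num searches upward for the smallest pack count n at which one pack's worth of filters, estimated by the integer expression num_per_group * (group + n - 1) / n, no longer exceeds the hardware division capacity. A minimal standalone sketch of the same search with made-up numbers (the capacity and group sizes below are illustrative, not taken from the commit):

#include <iostream>

// Same loop as the new calc_pack_num: grow n until one pack's worth of
// filters fits within division_capacity.
int calc_pack_num(int num_per_group, int group, int division_capacity) {
  int n = 1;
  while ((num_per_group * (group + n - 1) / n) > division_capacity) {
    n++;
  }
  return n;
}

int main() {
  // Hypothetical: 32 aligned filters per group, 4 groups, capacity 96.
  // n=1 gives 32*4=128 > 96; n=2 gives 32*5/2=80 <= 96, so 2 packs.
  std::cout << calc_pack_num(32, 4, 96) << std::endl;  // prints 2
  return 0;
}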
@@ -31,6 +31,7 @@ int calc_division_capacity(int chw);
 int calc_split_num(int num, int division_capacity);
 int calc_division_number(int num, int group_num, int division_capacity);
 int calc_num_per_div(int num, int group_num, int division_capacity);
+int calc_pack_num(int num_per_group, int group, int division_capacity);
 float find_max(float* data_in, int data_size);
 int8_t* format_filter(float* data_in,
@@ -40,11 +41,13 @@ int8_t* format_filter(float* data_in,
                       int height,
                       int width,
                       int group_num,
-                      float max,  // NOLINT
+                      float max,
                       std::vector<float>& filter_max);  // NOLINT
 void convert_to_hwn(int16_t** data_in, int num, int height, int width);
 size_t align_element_n(int16_t** data_in, int num, int height, int width);
+// void quantize_to_fp16(float** data_in, int num, int height, int width,
+//                       float* scale_ptr);
 size_t format_dwconv_filter(
     float** data_in, int num, int height, int width, float* scale_ptr);
......
@@ -28,7 +28,7 @@ limitations under the License. */
 namespace paddle {
 namespace zynqmp {
-#define PADDLE_OS_LINUX
+#define PADDLE_MOBILE_OS_LINUX
 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
@@ -38,7 +38,7 @@ static size_t memory_size_max = 0;
 static size_t memory_size = 0;
 static inline int do_ioctl(uint64_t req, const void *arg) {
-#ifdef PADDLE_OS_LINUX
+#ifdef PADDLE_MOBILE_OS_LINUX
   return ioctl(fd, req, arg);
 #else
   return -1;
@@ -61,17 +61,33 @@ void reset_device() {
 // memory management;
 void *fpga_malloc(size_t size) {
-#ifdef ENABLE_DEBUG
-#endif
-#ifdef PADDLE_OS_LINUX
+#ifdef PADDLE_MOBILE_OS_LINUX
   void *ptr = reinterpret_cast<void *>(
       mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
-  if (ptr == NULL) {
+  if (ptr == MAP_FAILED) {
     std::cout << "not enough memory !";
     exit(-1);
   }
+  if (errno == ENOMEM) {
+    std::cout << "mmap failed with not enough memory !";
+    exit(-1);
+  }
+  if (errno == EINVAL) {
+    std::cout << "mmap failed with invalid arguments ! (size=" << size << ")"
+              << std::endl;
+    exit(-1);
+  }
+  if (ptr == NULL) {
+    std::cout << "NULL returned, errno=" << errno
+              << ", mmap failed with other errors other than memory usage !"
+              << std::endl;
+    exit(-1);
+  }
   memory_map.insert(std::make_pair(ptr, size));
   memory_size += size;
   if (memory_size > memory_size_max) {
     memory_size_max = memory_size;
   }
@@ -87,7 +103,7 @@ size_t fpga_get_memory_size_max() { return memory_size_max; }
 size_t fpga_diagnose_memory(int detailed) {
   size_t total = 0;
-  auto iter = memory_map.begin();  // std::map<void *, size_t>::iterator
+  auto iter = memory_map.begin();
   while (iter != memory_map.end()) {
     total += iter->second;
     iter++;
@@ -97,13 +113,15 @@ size_t fpga_diagnose_memory(int detailed) {
 void fpga_free(void *ptr) {
   size_t size = 0;
-  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
+  auto iter = memory_map.find(ptr);
   if (iter != memory_map.end()) {
     size = iter->second;
     memory_map.erase(iter);
   }
   memory_size -= size;
-#ifdef PADDLE_OS_LINUX
+
+#ifdef PADDLE_MOBILE_OS_LINUX
   munmap(ptr, size);
 #else
   free(ptr);
@@ -230,6 +248,7 @@ int perform_bypass(const struct BypassArgs &args) {
     ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
     scale = std::max(scale, scales[0]);
   }
+
   args.output.scale_address[0] = scale;
   args.output.scale_address[1] = 1.0f / scale;
   return ret;
@@ -238,6 +257,26 @@ int perform_bypass(const struct BypassArgs &args) {
 int compute_fpga_concat(const struct ConcatArgs &args) { return -1; }
 int compute_fpga_scale(const struct ScaleArgs &args) {
+#ifdef ENABLE_DEBUG
+  std::cout << "======Compute Scale======";
+  std::cout << "scale_address:" << args.scale_address << std::endl;
+  std::cout << "bias_address:" << args.bias_address << std::endl;
+  std::cout << "wc_alignment:" << args.wc_alignment << std::endl;
+  std::cout << "channel_alignment:" << args.channel_alignment << std::endl;
+  std::cout << " image_address:" << args.image.address
+            << " image_scale_address:" << args.image.scale_address
+            << " image_channels:" << args.image.channels
+            << " image_height:" << args.image.height
+            << " image_width:" << args.image.width
+            << " pad_height:" << args.image.pad_height
+            << " pad_width:" << args.image.pad_width;
+  std::cout << " out_address:" << args.output.address
+            << " out_scale_address:" << args.output.scale_address;
+#endif
+
   return do_ioctl(IOCTL_CONFIG_SCALE, &args);
 }
......
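The fpga_malloc fix above hinges on the mmap contract: on failure mmap returns MAP_FAILED ((void *)-1) and sets errno, so the old `ptr == NULL` test let failed mappings slip through. A minimal sketch of the corrected check, independent of the FPGA driver (the fd and size here are placeholders):

#include <sys/mman.h>
#include <cstdio>
#include <cstdlib>

void *checked_mmap(int fd, size_t size) {
  void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (ptr == MAP_FAILED) {
    // errno distinguishes the cause, e.g. ENOMEM vs. EINVAL.
    perror("mmap");
    exit(-1);
  }
  return ptr;
}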
@@ -28,7 +28,6 @@ namespace zynqmp {
 typedef int16_t half;
 #define IMAGE_ALIGNMENT 16           // Aligned to 16
-#define FILTER_NUM_ALIGNMENT 32      // Filter number aligned to 32
 #define FILTER_ELEMENT_ALIGNMENT 16  // Filter element number aligned to 16
 #define BS_NUM_ALIGNMENT 8
 #define BIAS_NUM_ALIGNMENT 16
@@ -44,14 +43,11 @@ enum DLayoutType {
 };
 enum ActiveType {
-  TYPE_RELU = 0,
-  TYPE_RELU6 = 1,
-  TYPE_LEAK_RELU = 2,
-  TYPE_SIGMOID = 3,
-};
-
-struct VersionArgs {
-  void* buffer;
-};
+  TYPE_NONE = 0,
+  TYPE_RELU = 1,
+  TYPE_RELU6 = 2,
+  TYPE_LEAKY_RELU = 3,
+  TYPE_SIGMOID = 4,
+};
 struct DeviceInfo {
@@ -67,6 +63,11 @@ struct DeviceInfo {
   uint32_t reserved6;
 };
+struct VersionArgs {
+  void* buffer;
+  size_t size;
+};
+
 struct MemoryCopyArgs {
   void* src;
   void* dest;
@@ -78,7 +79,9 @@ struct MemoryCacheArgs {
   size_t size;
 };
-struct MemoryBarrierArgs {};
+struct MemoryBarrierArgs {
+  uint16_t dummy;
+};
 struct BNArgs {
   bool enabled;
......
@@ -30,8 +30,13 @@ struct ReLUParam {
   float leaky_relu_factor = 0.0f;
 };
+struct ActiveParam {
+  enum ActiveType type = TYPE_NONE;
+  float leaky_relu_factor;
+};
 struct PEParam {
-  ReLUParam relu;
+  ActiveParam activeParam;
 };
 struct InputParam : PEParam {
@@ -100,24 +105,6 @@ struct DepthwiseConvParam : ConvParam {
   Tensor* quantizedFilter_ = new Tensor();
 };
-struct GRUParam : PEParam {
- public:
-  Tensor* input = nullptr;
-  Tensor* h0 = nullptr;
-  Tensor* weight = nullptr;
-  Tensor* bias = nullptr;
-  Tensor* batch_gate = nullptr;
-  Tensor* batch_reset_hidden_prev = nullptr;
-  Tensor* batch_hidden = nullptr;
-  Tensor* hidden = nullptr;
-
-  std::string gate_activation = "sigmoid";
-  std::string activation = "tanh";
-  bool is_reverse = false;
-  bool origin_mode = false;
-};
 enum PoolingType : int {
   MAX = 0,
   AVERAGE = 1,
@@ -155,7 +142,8 @@ struct ElementwiseAddParam : PEParam {
 struct ElementwiseMulParam : PEParam {
  public:
-  std::vector<Tensor*> inputs;
+  Tensor* input_x;
+  Tensor* input_y = nullptr;
   Tensor* output = nullptr;
 };
@@ -223,6 +211,17 @@ struct PriorBoxParam : PEParam {
   float offset;
 };
+struct YoloBoxParam : PEParam {
+  Tensor* input;
+  Tensor* imgSize;
+  Tensor* outputBoxes;
+  Tensor* outputScores;
+  int downsampleRatio;
+  std::vector<int> anchors;
+  int classNum;
+  float confThresh;
+};
 struct ScaleParam : PEParam {
  public:
   Tensor* input = nullptr;
@@ -255,5 +254,24 @@ struct CropParam : PEParam {
   std::vector<int> offsets;
   std::vector<int> shape;
 };
+struct GRUParam : PEParam {
+ public:
+  Tensor* input = nullptr;
+  Tensor* h0 = nullptr;
+  Tensor* weight = nullptr;
+  Tensor* bias = nullptr;
+  Tensor* batch_gate = nullptr;
+  Tensor* batch_reset_hidden_prev = nullptr;
+  Tensor* batch_hidden = nullptr;
+  Tensor* hidden = nullptr;
+
+  std::string gate_activation = "sigmoid";
+  std::string activation = "tanh";
+  bool is_reverse = false;
+  bool origin_mode = false;
+};
 }  // namespace zynqmp
 }  // namespace paddle
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "lite/backends/fpga/KD/pes/conv_process.hpp"
 #include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
 #include "lite/backends/fpga/KD/pes/scale_pe.hpp"
+#include "lite/backends/fpga/KD/pes/split_pe.hpp"
 namespace paddle {
 namespace zynqmp {
@@ -41,6 +42,8 @@ class ConvPE : public PE {
   void apply() {
     split_axis = fill_split_arg(param_);
+    split_channel = param_.groups != 1 && param_.splitParams().size() > 1;
     if (split_axis == 0 && param_.splitParams().size() > 1) {
       ConcatParam& concat_param = concatPE_.param();
       for (auto conv_param : param_.splitParams()) {
@@ -51,107 +54,28 @@ class ConvPE : public PE {
       concatPE_.apply();
     }
+    if (split_channel) {
+      SplitParam& split_param = splitPE_.param();
+      split_param.input = param_.input;
+      for (auto conv_param : param_.splitParams()) {
+        split_param.outputs.push_back(&conv_param->input);
+      }
+      splitPE_.init();
+      splitPE_.apply();
+    }
     if (DLEngine::get_instance().isZU3() &&
         param_.input->shape().dimSize() == 4 &&
         param_.input->shape().width() == 1 &&
-        param_.input->shape().width() >= 2048) {
+        param_.input->shape().channel() >= 2048) {
       use_cpu_ = true;
     }
-    if (!use_cpu_) {
-      if (param_.filter->shape().width() == 1 &&
-          param_.filter->shape().height() == 1) {  // NOLINT
-      }
-      if (!use_cpu_) {  // NOLINT
-      }
-    }
+    // param_.filter->releaseData();
   }
-  void cpu_conv_hwc() {
-    Tensor* input = param_.input;
-    Tensor* output = param_.output;
-    input->syncToCPU();
-    Tensor float_input;
-    Tensor float_output;
-    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
-    float_input.copyFrom(input);
-    float_input.syncToCPU();
-    float* out = float_output.mutableData<float>(FP32, output->shape());
-    int out_width = output->shape().width();
-    int out_channel = output->shape().channel();
-    int in_channel = input->shape().channel();
-    float* filter_data = param_.filter->data<float>();
-    int image_height = input->shape().height();  // exit(-1);
-    int image_width = input->shape().width();
-    int image_channels = input->shape().channel();
-    int image_pad_h = param_.paddings[0];
-    int image_pad_w = param_.paddings[1];
-    int kernel_height = param_.filter->shape().height();
-    int kernel_width = param_.filter->shape().width();
-    int kernel_step_h = param_.strides[0];
-    int kernel_step_w = param_.strides[1];
-    int pooled_height_ = output->shape().height();
-    int pooled_width_ = out_width;
-    int filter_chw = image_channels * kernel_height * kernel_width;
-    float max = 0;
-    for (int ph = 0; ph < pooled_height_; ph++) {
-      for (int pw = 0; pw < pooled_width_; pw++) {
-        int hstart = ph * kernel_step_h - image_pad_h;
-        int wstart = pw * kernel_step_w - image_pad_w;
-        int hend =
-            std::min(hstart + kernel_height, static_cast<int>(image_height));
-        int wend =
-            std::min(wstart + kernel_width, static_cast<int>(image_width));
-        hstart = std::max(hstart, static_cast<int>(0));
-        wstart = std::max(wstart, static_cast<int>(0));
-        for (int oc = 0; oc < out_channel; oc++) {
-          float sum = 0.0f;
-          const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
-          for (int c = 0; c < image_channels; c++) {
-            for (int h = hstart; h < hend; h++) {
-              int hi = 0;
-              if (ph == 0) {
-                hi = h - hstart + image_pad_h;
-              } else {
-                hi = h - hstart;
-              }
-              for (int w = wstart; w < wend; w++) {
-                int wi = 0;
-                if (pw == 0) {
-                  wi = w - wstart + image_pad_w;
-                } else {
-                  wi = w - wstart;
-                }
-                const int index = (h * image_width + w) * image_channels + c;
-                int weight_index = oc * filter_chw +
-                                   kernel_width * kernel_height * c +
-                                   kernel_width * hi + wi;
-                float value = image_addr[index] * filter_data[weight_index];
-                sum += value;
-              }
-            }
-          }
-          if (param_.relu.enabled && sum < 0) {
-            sum = -sum;
-          }
-          if (sum > max) {
-            max = sum;
-          }
-          out[pool_index] = sum;
-        }
-      }
-    }
-    float_output.flush();
-    output->copyFrom(&float_output);
-    output->scale()[0] = max / 127;
-    output->scale()[1] = 127 / max;
-  }
   void cpu_compute() {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
@@ -161,93 +85,98 @@ class ConvPE : public PE {
     Tensor float_output;
     float* image_addr = float_input.mutableData<float>(FP32, input->shape());
     float_input.copyFrom(input);
-    float_input.syncToCPU();
+    // float16* data_out = output->data<float16>();
     float* out = float_output.mutableData<float>(FP32, output->shape());
-    float* bias_data = param_.bias()->data<float>();
-    int out_width = output->shape().width();
     int out_channel = output->shape().channel();
     int in_channel = input->shape().channel();
     float* filter_data = param_.filter->data<float>();
     float* mi = new float[in_channel];
-    float max = 0;
-    int out_index = 0;
     for (int i = 0; i < out_channel; i++) {
       float* image = image_addr;
       float* filter_ptr = filter_data + i * in_channel;
       float* out_ptr = mi;
-      for (int h = 0; h < output->shape().height(); h++) {
-        for (int w = 0; w < output->shape().width(); w++) {
-          float sum = 0;
-          // #pragma omp parallel for
-          for (int j = 0; j < in_channel; j++) {
-            int image_index = h * out_width * in_channel + w * in_channel + j;
-            float value = image_addr[image_index] * filter_ptr[j];
-            sum += value;
-          }
-          sum += bias_data[i];
-          if (param_.relu.enabled && sum < 0) {
-            sum = 0;
-          }
-          if (sum > max) {
-            max = sum;
-          }
-          out_index = h * out_width * out_channel + w * out_channel + i;
-          out[out_index] = sum;
-        }
-      }
+#pragma omp parallel for
+      for (int j = 0; j < in_channel; j++) {
+        // float32x4_t x0 = vld1q_f32(image);
+        // float32x4_t x1 = vld1q_f32(filter_ptr);
+        // float32x4_t r = vmulq_f32(x0, x1);
+        // vst1q_f32(out_ptr, r);
+        // image += 4;
+        // filter_ptr += 4;
+        // out_ptr += 4;
+        float value = image_addr[j] * filter_ptr[j];
+        mi[j] = value;
+      }
+      float sum = 0;
+      for (int j = 0; j < in_channel; j++) {
+        sum += mi[j];
+      }
+      out[i] = sum;
     }
     delete[] mi;
     float_output.flush();
     output->copyFrom(&float_output);
-    output->scale()[0] = max / 127;
-    output->scale()[1] = 127 / max;
   }
   bool dispatch() {
+    fpga_reset();
     if (use_cpu_) {
       cpu_compute();
       return true;
     }
-    inplace_.leaky_relu_enable =
-        (param_.relu.leaky_relu_factor != 0) ? true : false;
-    inplace_.relu_enable =
-        inplace_.leaky_relu_enable ? false : param_.relu.enabled;
-    inplace_.power_enable = false;
-    inplace_.normalize_enable = false;
-    if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
+    if (param_.activeParam.type == TYPE_RELU) {
+      inplace_.relu_enable = true;
+    } else if (param_.activeParam.type == TYPE_RELU6) {
+      inplace_.relu6_enable = true;
+    } else if (param_.activeParam.type == TYPE_SIGMOID) {
+      inplace_.sigmoid_enable = true;
+    } else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
+      inplace_.leaky_relu_enable = true;
+    }
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
       config_inplace(inplace_);
       if (inplace_.leaky_relu_enable) {
-        activeParamterArgs.type = TYPE_LEAK_RELU;
+        activeParamterArgs.type = TYPE_LEAKY_RELU;
         activeParamterArgs.leaky_relu_factor =
-            fp32_2_fp16(param_.relu.leaky_relu_factor);
+            fp32_2_fp16(param_.activeParam.leaky_relu_factor);
         config_activation(activeParamterArgs);
       }
     }
     std::vector<BasicConvParam*>& params = param_.splitParams();
+    if (split_channel) {
+      // splitPE_.param().input->saveToFile("input_image",true);
+      splitPE_.dispatch();
+    }
     int ret = 0;
     for (auto conv_param : params) {
+      // conv_param->input.printScale();
+      // if (split_channel) {
+      //   conv_param->input.saveToFile("pack_image",true);
+      // }
       ret |= compute_fpga_conv_basic(conv_param->args);
     }
-    if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
       inplace_.relu_enable = false;
       inplace_.leaky_relu_enable = false;
+      inplace_.relu6_enable = false;
+      inplace_.sigmoid_enable = false;
       config_inplace(inplace_);
       if (inplace_.leaky_relu_enable) {
-        activeParamterArgs.type = TYPE_LEAK_RELU;
+        activeParamterArgs.type = TYPE_LEAKY_RELU;
         activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0);
         config_activation(activeParamterArgs);
       }
@@ -255,16 +184,29 @@ class ConvPE : public PE {
     size_t size = params.size();
     if (split_axis == 0 && ret == 0 && size > 1) {
+      // std::cout << "concat size:" << size << std::endl;
       concatPE_.dispatch();
     }
     if (split_axis == 1 && ret == 0 && size > 1) {
+      // for (int n = 0; n < size - 1; n++) {
       ElementwiseAddParam& add_param = addPE_.param();
       add_param.inputs = {&params[0]->output, &params[1]->output};
       add_param.output = param_.output;
       addPE_.init();
       addPE_.apply();
       addPE_.dispatch();
+      // param_.output->printScale();
+      // params[0]->input.saveToFile("conv_1.txt");
+      // params[1]->input.saveToFile("conv_2.txt");
+      // params[0]->output.saveToFile("ew_o1.txt");
+      // params[1]->output.saveToFile("ew_o2.txt");
+      // std::cout << "\n ================== EW ================== \n";
+      // }
     }
     return ret == 0;
   }
@@ -272,8 +214,10 @@ class ConvPE : public PE {
  private:
   bool use_cpu_ = false;
+  bool split_channel = false;
   ConvParam param_;
   ConcatPE concatPE_;
+  SplitPE splitPE_;
   ElementwiseAddPE addPE_;
   int split_axis = 0;
   InplaceArgs inplace_ = {0};
......
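The dispatch code above translates the requested ActiveParam into InplaceArgs flags before the launch and clears them all afterwards, and DepthwiseConvPE and ElementwiseAddPE below repeat the same sequence. A hedged sketch of that shared mapping, assuming only the flag fields this commit uses (relu_enable, relu6_enable, sigmoid_enable, leaky_relu_enable); the helper is illustrative and not part of the commit:

// Set exactly one activation flag from the requested type; passing
// TYPE_NONE clears all four, which matches the post-dispatch reset.
static void set_active_flags(InplaceArgs* inplace, ActiveType type) {
  inplace->relu_enable = (type == TYPE_RELU);
  inplace->relu6_enable = (type == TYPE_RELU6);
  inplace->sigmoid_enable = (type == TYPE_SIGMOID);
  inplace->leaky_relu_enable = (type == TYPE_LEAKY_RELU);
}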
@@ -53,6 +53,16 @@ inline int get_split_num(Tensor* filter) {
   return filter::calc_split_num(aligned_num, div_capacity);
 }
+inline int get_pack_num(Tensor* filter, int group_num) {
+  auto chw = filter->shape().channel() * filter->shape().height() *
+             filter->shape().width();
+  auto num = filter->shape().num();
+  int div_capacity = filter::calc_division_capacity(chw);
+  int filter_num_alignment = filter::get_filter_num_alignment();
+  int aligned_num_per_group = align_to_x(num / group_num, filter_num_alignment);
+  return filter::calc_pack_num(aligned_num_per_group, group_num, div_capacity);
+}
+
 inline void fill_scale_bias_const(ConvParam* param_) {
   int channel = param_->output->shape().channel();
   Shape sb_shape(N, {channel});
@@ -117,6 +127,50 @@ inline void combine_add_bn_params(BatchnormParam* bn,
   param_->bias()->setDataLocation(CPU);
 }
+inline int gcd_(int a, int b) {
+  while (b) {
+    int temp = a;
+    a = b;
+    b = temp % b;
+  }
+  return a;
+}
+
+inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }
+
+inline void format_bias_scale_new(Tensor* bias,
+                                  Tensor* scale,
+                                  Tensor* scale_bias) {
+  Shape& bias_shape = bias->shape();
+  int channel = bias_shape.channel();
+  int repeat = 1;
+  int alignment = 16;
+  int length = channel;
+  if (channel % alignment != 0 || channel < alignment) {
+    int c_lcm = lcm_(channel, alignment);
+    repeat = c_lcm / (channel);
+  }
+  Shape shape(N, {2 * channel * repeat});
+  float16* scale_bias_data = scale_bias->mutableData<float16>(FP16, shape);
+  float* bias_data_float = bias->data<float>();
+  float* scale_data_float = scale->data<float>();
+  for (int i = 0; i < repeat; i++) {
+    for (int j = 0; j < length; j++) {
+      float16 value_bias = float_to_half(bias_data_float[j]);
+      scale_bias_data[i * length + j] = value_bias;
+    }
+  }
+  for (int i = 0; i < repeat; i++) {
+    for (int j = 0; j < length; j++) {
+      float16 value_scale = float_to_half(scale_data_float[j]);
+      scale_bias_data[i * length + j + length * repeat] = value_scale;
+    }
+  }
+}
 inline void format_scale_bias(Tensor* scale,
                               Tensor* bias,
                               Tensor* filter,
@@ -180,8 +234,10 @@ inline void format_scale_bias(Tensor* scale,
 inline void format_filter(Tensor* filter,
                           Tensor* quantized_filter,
                           int group,
-                          std::vector<float>& scales) {  // NOLINT
+                          std::vector<float>& scales,  // NOLINT
+                          float max) {
   float max_value = find_max(*filter);
+  // max_value = max; //TODO: global quantization for filter
   Shape& filter_shape = filter->shape();
   int mem_size;
@@ -206,10 +262,22 @@ inline void format_filter(Tensor* filter,
   memcpy(src, quantized_data, mem_size);
   quantized_filter->flush();
+  fpga_free(quantized_data);
-  for (size_t i = 0; i < max_values.size(); i++) {
-    scales.push_back(max_values[i] / max_value);
-  }
+  // for (size_t i = 0; i < max_values.size(); i++) {
+  //   // scales.push_back(max_values[i] / max_value);
+  //   scales.push_back(1.0f);
+  // }
+  // filter->saveToFile("filter.txt");
+  // std::ofstream ofs;
+  // ofs.open("quant.txt");
+  // for (int i = 0; i < mem_size; i++) {
+  //   float value = quantized_data[i];
+  //   ofs << value << std::endl;
+  // }
+  // ofs.close();
+  // exit(-1);
 }
 inline void format_dw_filter(Tensor* filter,
@@ -256,17 +324,10 @@ inline void split_filter_num(const ConvParam& c_param) {
   Tensor* out = param.output;
   Tensor* filter = param.filter;
   auto channel = out->shape().channel();
-  int split_num = param.groups == 1 ? get_split_num(param.filter) : 1;
+  int split_num = get_split_num(param.filter);
   int filter_num_per_div = get_filter_num_per_div(filter, param.groups);
-  auto chw = filter->shape().channel() * filter->shape().height() *
-             filter->shape().width();
-  auto num = filter->shape().num();
-  int div_capacity = filter::calc_division_capacity(chw);
-  int filter_num_alignment = filter::get_filter_num_alignment();
-  int aligned_num =
-      align_to_x(num / param.groups, filter_num_alignment) * param.groups;
-  split_num = filter::calc_split_num(aligned_num, div_capacity);
+  float max = find_max(*filter);
   Shape& out_shape = out->shape();
   for (int i = 0; i < split_num; i++) {
@@ -310,45 +371,29 @@ inline void split_filter_num(const ConvParam& c_param) {
     new_filter.flush();
     conv_param->filter.mutableData<float>(FP32, f_shape);
-    if (param.groups != 1) {
-      int mem_factor =
-          32 / filter_num_per_div;  // TODO(chonwhite): change 32 to param;
-      conv_param->filter.setMemScale(mem_factor);
-    }
-    std::vector<float> v;  // TODO(chonwhite): change local variable name
-    format_filter(&new_filter, &(conv_param->filter), param.groups, v);
+    std::vector<float> v;  // TODO(chonwhite) change variable name;
+    format_filter(&new_filter, &(conv_param->filter), param.groups, v, max);
     conv_param->filter.setDataType(INT8);
-    int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT);
     Tensor scale;
     Tensor bias;
     int chnnnel_start = i * filter_num_per_div;
-    Shape s_shape(N, {filter_num});
+    Shape s_shape(NC, {1, filter_num});
     float* scale_data = scale.mutableData<float>(FP32, s_shape);
     float* bias_data = bias.mutableData<float>(FP32, s_shape);
     for (int n = 0; n < filter_num; n++) {
-      scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
+      scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
     }
     for (int n = 0; n < filter_num; n++) {
       bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
     }
-    Shape sb_shape(N, {sb_num});
-    format_scale_bias(&scale,
-                      &bias,
-                      &conv_param->filter,
-                      &conv_param->scaleBias,
-                      param.groups);
+    format_bias_scale_new(&bias, &scale, &conv_param->scaleBias);
     conv_param->scaleBias.flush();
-    float* bs_data = conv_param->scaleBias.data<float>();
     args.group_num = param.groups;
-    args.relu_enabled = param.relu.enabled;
-    args.sb_address = conv_param->scaleBias.data<float>();
-    args.sb_address = bs_data;
+    args.sb_address = conv_param->scaleBias.data<float16>();
     args.kernel.stride_h = param.strides[1];
     args.kernel.stride_w = param.strides[0];
     args.kernel.height = new_filter.shape().height();
@@ -372,6 +417,135 @@ inline void split_filter_num(const ConvParam& c_param) {
   }
 }
+
+inline void pack_channel_filter(const ConvParam& c_param) {
+  ConvParam& param = const_cast<ConvParam&>(c_param);
+  Tensor* input = param.input;
+  Tensor* out = param.output;
+  Tensor* filter = param.filter;
+  int filter_num_alignment = filter::get_filter_num_alignment();
+  auto filter_num = filter->shape().num();
+  int pack_num = get_pack_num(param.filter, param.groups);
+  int group_per_pack = (param.groups + pack_num - 1) / pack_num;
+  int filter_per_group = filter_num / param.groups;
+  int filter_per_pack = filter_per_group * group_per_pack;
+  int channel_per_pack = filter->shape().channel() * group_per_pack;
+  float max = find_max(*filter);
+  Shape& out_shape = out->shape();
+  for (int i = 0; i < pack_num; i++) {
+    BasicConvParam* conv_param = new BasicConvParam();
+    conv_param->output.setDataLocation(Device);
+    conv_param->output.setAligned(true);
+    float16* out_address = nullptr;
+    float* out_scale_address = nullptr;
+    float16* input_address = nullptr;
+    ConvArgs& args = conv_param->args;
+    if (pack_num == 1) {
+      out_address = out->data<float16>();
+      out_scale_address = out->scale();
+    }
+    int new_group = param.groups;
+    int filter_current_pack = filter->shape().num();
+    int channel_current_pack = input->shape().channel();
+    new_group = i == pack_num - 1
+                    ? param.groups - (pack_num - 1) * group_per_pack
+                    : group_per_pack;
+    filter_current_pack = new_group * filter_per_group;
+    channel_current_pack = new_group * filter->shape().channel();
+    if (pack_num == 1) {
+      input_address = input->data<float16>();
+    } else {
+      Shape in_shape(NCHW,
+                     {1,
+                      channel_current_pack,
+                      input->shape().height(),
+                      input->shape().width()});
+      input_address = conv_param->input.mutableData<float16>(FP16, in_shape);
+    }
+    if (pack_num != 1) {
+      Shape shape(
+          NHWC,
+          {1, out_shape.height(), out_shape.width(), filter_current_pack});
+      out_address = conv_param->output.mutableData<float16>(FP16, shape);
+      out_scale_address = conv_param->output.scale();
+    }
+    Shape f_shape(NCHW,
+                  {filter_current_pack,
+                   filter->shape().channel(),
+                   filter->shape().height(),
+                   filter->shape().width()});
+    Tensor new_filter;
+    float* new_filter_data = new_filter.mutableData<float>(FP32, f_shape);
+    int filter_hwc = filter->shape().height() * filter->shape().width() *
+                     filter->shape().channel();
+    memcpy(new_filter_data,
+           filter->data<float>() + i * filter_per_pack * filter_hwc,
+           filter_current_pack * filter_hwc * sizeof(float));
+    new_filter.flush();
+    conv_param->filter.mutableData<float>(FP32, f_shape);
+    float mem_factor = filter_num_alignment / filter_per_pack;
+    conv_param->filter.setMemScale(mem_factor);
+    std::vector<float> v;  // TODO(chonwhite) change variable name
+    format_filter(&new_filter, &(conv_param->filter), new_group, v, max);
+    conv_param->filter.setDataType(INT8);
+    Tensor scale;
+    Tensor bias;
+    int chnnnel_start = i * filter_per_pack;
+    Shape s_shape(NC, {1, filter_current_pack});
+    float* scale_data = scale.mutableData<float>(FP32, s_shape);
+    float* bias_data = bias.mutableData<float>(FP32, s_shape);
+    for (int n = 0; n < filter_current_pack; n++) {
+      scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
+    }
+    for (int n = 0; n < filter_current_pack; n++) {
+      bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
+    }
+    format_bias_scale_new(&bias, &scale, &conv_param->scaleBias);
+    conv_param->scaleBias.flush();
+    args.group_num = new_group;
+    args.sb_address = conv_param->scaleBias.data<float16>();
+    args.kernel.stride_h = param.strides[1];
+    args.kernel.stride_w = param.strides[0];
+    args.kernel.height = new_filter.shape().height();
+    args.kernel.width = new_filter.shape().width();
+    args.filter_address = conv_param->filter.data<int8_t>();
+    args.filter_num = filter_current_pack;
+    args.filter_scale_address = conv_param->filter.scale();
+    args.image.address = input_address;
+    args.image.scale_address = input->scale();
+    args.image.channels = channel_current_pack;
+    args.image.width = input->shape().width();
+    args.image.height = input->shape().height();
+    args.image.pad_width = param.paddings[1];
+    args.image.pad_height = param.paddings[0];
+    args.dilation = param.dilations[0];
+    args.output.address = out_address;
+    args.output.scale_address = out_scale_address;
+    param.splitParams().push_back(conv_param);
+  }
+}
 inline void split_channel(const ConvParam& c_param) {
   ConvParam& param = const_cast<ConvParam&>(c_param);
   Tensor* input = param.input;
@@ -383,6 +557,8 @@ inline void split_channel(const ConvParam& c_param) {
   Shape bs_shape(N, {channel});
+  float max = 1.0f;
   for (int i = 0; i < num; i++) {
     BasicConvParam* conv_param = new BasicConvParam();
@@ -406,7 +582,8 @@ inline void split_channel(const ConvParam& c_param) {
     }
     new_filter.flush();
     std::vector<float> scales;
-    format_filter(&new_filter, &(conv_param->filter), param.groups, scales);
+    format_filter(
+        &new_filter, &(conv_param->filter), param.groups, scales, max);
     Tensor bias;
     Tensor scale;
@@ -428,7 +605,6 @@ inline void split_channel(const ConvParam& c_param) {
     ConvArgs& args = conv_param->args;
     args.group_num = param.groups;
-    args.relu_enabled = param.relu.enabled;
     args.sb_address = conv_param->scaleBias.data<float>();
     args.kernel.stride_h = param.strides[1];
     args.kernel.stride_w = param.strides[0];
@@ -457,13 +633,17 @@ inline int fill_split_arg(const ConvParam& c_param) {
   ConvParam& param = const_cast<ConvParam&>(c_param);
   Tensor* input = param.input;
   Tensor* output = param.output;
   if (output->shape().dimSize() == 4 && input->shape().channel() > 2047 &&
       input->shape().width() == 1) {
     split_channel(c_param);
     return 1;
-  } else {
+  } else if (param.groups == 1) {
     split_filter_num(c_param);
     return 0;
+  } else {
+    pack_channel_filter(c_param);
+    return 0;
   }
 }
......
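format_bias_scale_new above tiles the per-channel bias and scale until their length is a multiple of the 16-element alignment, using repeat = lcm(channel, 16) / channel. A small worked example of that arithmetic (the channel counts are chosen for illustration):

#include <iostream>

int gcd_(int a, int b) {
  while (b) {
    int t = a % b;
    a = b;
    b = t;
  }
  return a;
}

int lcm_(int a, int b) { return a * b / gcd_(a, b); }

int main() {
  const int alignment = 16;
  // channel=24: lcm(24,16)=48, so the data is tiled 2x; channel=32 is
  // already a multiple of 16, so repeat stays 1; channel=10 needs 8 copies.
  for (int channel : {24, 32, 10}) {
    int repeat = 1;
    if (channel % alignment != 0 || channel < alignment) {
      repeat = lcm_(channel, alignment) / channel;
    }
    std::cout << "channel=" << channel << " repeat=" << repeat << std::endl;
  }
  return 0;
}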
@@ -24,6 +24,17 @@ namespace zynqmp {
 class DepthwiseConvPE : public PE {
  public:
+  inline int gcd_(int a, int b) {
+    while (b) {
+      int temp = a;
+      a = b;
+      b = temp % b;
+    }
+    return a;
+  }
+
+  inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }
+
   bool init() {
     Tensor* output = param_.output;
     output->setAligned(true);
@@ -37,17 +48,40 @@ class DepthwiseConvPE : public PE {
     Tensor* output = param.output;
     int channel = output->shape().channel();
-    float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
+    int repeat = 1;
+    int alignment = 16;
+    int length = channel;
+    if (channel % alignment != 0 || channel < alignment) {
+      int c_lcm = lcm_(channel, alignment);
+      repeat = c_lcm / (channel);
+    }
+    Shape shape(N, {channel * repeat});
+    float16* b_data = bias_.mutableData<float16>(FP16, shape);
     if (param_.bias()->dataType() == FP32) {
       float* new_bias_data = param_.bias()->data<float>();
       // convert the bias from float to float16
-      for (int i = 0; i < channel; i++) {
-        b_data[i] = float_to_half(new_bias_data[i]);
-      }
+      // for (int i = 0; i < channel; i++) {
+      //   b_data[i] = float_to_half(new_bias_data[i]);
+      // }
+      // pad the bias to the 16-element hw alignment
+      for (int i = 0; i < repeat; i++) {
+        for (int j = 0; j < length; j++) {
+          float16 value = float_to_half(new_bias_data[j]);
+          b_data[i * length + j] = value;
+        }
+      }
       bias_.flush();
     } else {
       float16* new_bias_data = param_.bias()->data<float16>();
-      memcpy(b_data, new_bias_data, channel * sizeof(float16));
+      // memcpy(b_data, new_bias_data, channel * sizeof(float16));
+      for (int i = 0; i < repeat; i++) {
+        for (int j = 0; j < length; j++) {
+          // float16 value = float_to_half(bias_data_float[j]);
+          b_data[i * length + j] = new_bias_data[j];
+        }
+      }
       bias_.flush();
     }
@@ -62,6 +96,8 @@ class DepthwiseConvPE : public PE {
     float16* scale_data = param_.scale()->data<float16>();
     float16* filter_data = param.quantizedFilter()->mutableData<float16>(
         FP16, param.filter->shape());
+    // memcpy(filter_data, scale_data, channel * sizeof(float16));
+
     memcpy(filter_data,
            scale_data,
            param.filter->shape().numel() * sizeof(float16));
@@ -89,20 +125,33 @@ class DepthwiseConvPE : public PE {
     args.sub_conv_num = 1;
     param.args = args;
-    inplace_.relu_enable = param_.relu.enabled;
     inplace_.power_enable = false;
     inplace_.normalize_enable = false;
   }
   bool dispatch() {
     param_.input->syncToDevice();
-    if (param_.relu.enabled) {
-      inplace_.relu_enable = param_.relu.enabled;
+    if (param_.activeParam.type == TYPE_RELU) {
+      inplace_.relu_enable = true;
+    } else if (param_.activeParam.type == TYPE_RELU6) {
+      inplace_.relu6_enable = true;
+    } else if (param_.activeParam.type == TYPE_SIGMOID) {
+      inplace_.sigmoid_enable = true;
+    } else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
+      inplace_.leaky_relu_enable = true;
+    }
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
       config_inplace(inplace_);
     }
     bool ret = compute_fpga_dwconv(param_.args) == 0;
-    if (param_.relu.enabled) {
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
       inplace_.relu_enable = false;
+      inplace_.leaky_relu_enable = false;
+      inplace_.relu6_enable = false;
+      inplace_.sigmoid_enable = false;
       config_inplace(inplace_);
     }
     return ret;
......
@@ -58,15 +58,29 @@ class ElementwiseAddPE : public PE {
   bool dispatch() {
     param_.inputs[0]->syncToDevice();
     param_.inputs[1]->syncToDevice();
-    InplaceArgs inplace_args = {0};
-    if (param_.relu.enabled) {
-      inplace_args.relu_enable = true;
-      config_inplace(inplace_args);
+    // InplaceArgs inplace_ = {0};
+    if (param_.activeParam.type == TYPE_RELU) {
+      inplace_.relu_enable = true;
+    } else if (param_.activeParam.type == TYPE_RELU6) {
+      inplace_.relu6_enable = true;
+    } else if (param_.activeParam.type == TYPE_SIGMOID) {
+      inplace_.sigmoid_enable = true;
+    } else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
+      inplace_.leaky_relu_enable = true;
+    }
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
+      config_inplace(inplace_);
     }
     compute_fpga_ewadd(param_.ewargs);
-    if (param_.relu.enabled) {
-      inplace_args.relu_enable = false;
-      config_inplace(inplace_args);
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
+      inplace_.relu_enable = false;
+      inplace_.relu6_enable = false;
+      inplace_.sigmoid_enable = false;
+      inplace_.leaky_relu_enable = false;
+      config_inplace(inplace_);
     }
     return true;
   }
@@ -75,6 +89,7 @@ class ElementwiseAddPE : public PE {
  private:
   ElementwiseAddParam param_;
+  InplaceArgs inplace_ = {0};
 };
 }  // namespace zynqmp
......
@@ -29,17 +29,17 @@ class ElementwiseMulPE : public PE {
   }
   void apply() {
-    Tensor* input = param_.inputs[0];
+    Tensor* input = param_.input_x;
     Tensor* output = param_.output;
-    int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
+    int wc_aligned = align_to_x(param_.input_x->shape().numel(), 32);
     Shape s(N, {wc_aligned});
     float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
     memset(bias_data, 0, wc_aligned * sizeof(float16));
     ScaleArgs& args = args_;
-    args.scale_address = param_.inputs[1]->data<void>();
+    args.scale_address = param_.input_y->data<void>();
     args.bias_address = bias_tensor.data<void>();
     args.wc_alignment = wc_aligned;
     args.channel_alignment = wc_aligned;
......
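The ElementwiseMulPE change above wires the second operand through the scale op: input_y becomes the per-element scale and the bias buffer is zero-filled, so a scale of the form x * scale + bias degenerates to an elementwise multiply. A hedged sketch of the equivalent CPU arithmetic under that assumption (the function and names are illustrative, not part of the commit):

#include <cstddef>

// out[i] = in[i] * scale[i] + bias[i]; with bias zeroed, this is the
// elementwise multiply the PE offloads to the scale hardware.
void scale_op(const float* in, const float* scale, const float* bias,
              float* out, size_t n) {
  for (size_t i = 0; i < n; i++) {
    out[i] = in[i] * scale[i] + bias[i];
  }
}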
@@ -37,10 +37,9 @@ class FullyConnectedPE : public PE {
     ConvParam& convParam_ = convPE_.param();
     Tensor* input = param_.input;
     convParam_.input = param_.input;
+    num_ = param_.input->shape().num();
     convParam_.output = param_.output;
-    convParam_.relu = param_.relu;
+    // convParam_.relu = param_.relu;
+    convParam_.activeParam.type = param_.activeParam.type;
     convParam_.groups = 1;
     convParam_.strides = {1, 1};
     convParam_.paddings = {0, 0};
@@ -49,6 +48,9 @@ class FullyConnectedPE : public PE {
     int num = param_.filter->shape().channel();
     int chw = param_.filter->shape().num();
+    // if (num == 2) {
+    //   return;
+    // }
     int height = param_.input->shape().height();
     int width = param_.input->shape().width();
@@ -66,6 +68,7 @@ class FullyConnectedPE : public PE {
         new_filter_data[i * chw + j] = scale;
       }
     }
+
     conv_filter->flush();
     convParam_.filter = conv_filter;
@@ -84,15 +87,51 @@ class FullyConnectedPE : public PE {
     convPE_.apply();
   }
-  bool dispatch() { return convPE_.dispatch(); }
+  void cpu_compute() {
+    int num = param_.filter->shape().channel();
+    int chw = param_.filter->shape().num();
+
+    float* filter_data = param_.filter->data<float>();
+    float max = 0.0f;
+    Tensor* input = param_.input;
+    Tensor* output = param_.output;
+    float16* input_data = input->data<float16>();
+    float16* output_data = output->data<float16>();
+
+    for (int i = 0; i < num; i++) {
+      float sum = 0;
+      float bias = param_.bias->data<float>()[i];
+      for (int j = 0; j < chw; j++) {
+        float scale = filter_data[j * num + i];
+        float data = half_to_float(input_data[j]);
+        sum += scale * data;
+      }
+      output_data[i] = float_to_half(sum + bias);
+      if (max < output_data[i]) {
+        max = output_data[i];
+      }
+    }
+
+    output->flush();
+    output->scale()[0] = max / 127.0f;
+    output->scale()[1] = 127.0f / max;
+  }
+
+  bool dispatch() {
+    // int num = param_.filter->shape().channel();
+    // if (num == 2) {
+    //   cpu_compute();
+    //   return 1;
+    // } else {
+    return convPE_.dispatch();
+    // }
+  }
 
   FullyConnectedParam& param() { return param_; }
 
  private:
   FullyConnectedParam param_;
   ConvPE convPE_;
+  Tensor tempOut_;
+  int num_ = 1;
 };
 }  // namespace zynqmp
 }  // namespace paddle
@@ -47,8 +47,10 @@ class GRUPE : public PE {
     zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
     float16* prev_hidden_data =
         prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
+    // set previous hidden data to 0;
     memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
+    // copy 2/3 weight from param.weight;
     zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
     float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
     memset(weight_data, 0, weight_shape.numel() * sizeof(float));
@@ -74,7 +76,9 @@ class GRUPE : public PE {
     reset_hidden_.mutableData<void>(FP16, hidden_shape);
     ElementwiseMulParam& mul_param = mul_pe_.param();
-    mul_param.inputs = {&reset_gate_, &prev_hidden_};
+    // mul_param.inputs = {&reset_gate_, &prev_hidden_};
+    mul_param.input_x = &reset_gate_;
+    mul_param.input_y = &prev_hidden_;
     mul_param.output = &reset_hidden_;
     mul_pe_.init();
     mul_pe_.apply();
@@ -115,11 +119,8 @@ class GRUPE : public PE {
     if (hidden_prev) {
       // TODO(chonwhite): change to pre_out;
       prev_hidden_.copyFrom(value.pre_output);
-      prev_hidden_.saveToFile("prev_.txt");
     }
     mul_pe_.dispatch();
-    reset_hidden_.saveToFile("reset_hidden_.txt");
     update_gate_data += stride_update;
     reset_gate_data += stride_update;
......
@@ -72,8 +72,10 @@ class NormPE : public PE {
     input_float.mutableData<float>(FP32, param_.input->shape());
     float_out.mutableData<float>(FP32, param_.output->shape());
+    // param_.input->syncToDevice();
     input_float.copyFrom(param_.input);
     input_float.syncToCPU();
+    // input_float.saveToFile("normalize_", true);
     int channel = input_float.shape().channel();
     int height = input_float.shape().height();
@@ -85,6 +87,7 @@ class NormPE : public PE {
     float* out_ptr = float_out.data<float>();
     int loop = height * width;
+#pragma omp parallel for
     for (int i = 0; i < loop; i++) {
       float sum = param_.epsilon;
       for (int c = 0; c < channel; c++) {
@@ -98,11 +101,26 @@ class NormPE : public PE {
       }
     }
     float_out.flush();
+    // float_out.saveToFile("normalize_", true);
     param_.output->copyFrom(&float_out);
   }
   bool dispatch() {
     cpuCompute();
+    // std::cout << "CPU normalize ---------------------" << std::endl;
+    // param_.input->syncToDevice();
+    // // param_.input->saveToFile("normalize_fpga_", true);
+    // config_norm_param(norm_param_args_);
+    // inplace_args_.normalize_enable = true;
+    // config_inplace(inplace_args_);
+    // perform_bypass(bypass_args_);
+    // inplace_args_.normalize_enable = false;
+    // config_inplace(inplace_args_);
+    // compute_norm(norm_args_);
+    // param_.output->saveToFile("normalize_fpga_", true);
+    // std::cout << "FPGA normalize ---------------------" << std::endl;
     return true;
   }
......
@@ -35,14 +35,17 @@ class PoolingPE : public PE {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
-    uint32_t k_height = param_.kernelSize[0];
-    uint32_t k_width = param_.kernelSize[1];
+    uint32_t k_height = 1;
+    uint32_t k_width = 1;
     if (param_.globalPooling) {
       k_width = input->shape().width();
       k_height = input->shape().height();
       param_.kernelSize[0] = k_height;
       param_.kernelSize[1] = k_width;
+    } else {
+      k_height = param_.kernelSize[0];
+      k_width = param_.kernelSize[1];
     }
     PoolingArgs args = {0};
@@ -65,9 +68,12 @@ class PoolingPE : public PE {
     args.out_width = output->shape().width();
     param_.poolingArgs = args;
+    // use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
+    // &&
+    //            (k_width > 7 || k_height > 7);
     use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
-               (k_width > 7 || k_height > 7);
-    use_cpu_ = param_.type == AVERAGE;
+               (k_width > 255 || k_height > 255);
+    // use_cpu_ = param_.type == AVERAGE;
   }
   void compute() {
@@ -76,6 +82,7 @@ class PoolingPE : public PE {
     input->syncToCPU();
     Tensor float_input;
+    // Tensor float_output;
     float* image_addr = float_input.mutableData<float>(FP32, input->shape());
     float_input.copyFrom(input);
     float16* data_out = output->data<float16>();
@@ -110,6 +117,8 @@ class PoolingPE : public PE {
       for (int c = 0; c < image_channels; ++c) {
         const int pool_index = (ph * pooled_width_ + pw) * image_channels + c;
         float sum = 0;
+        // const int index =
+        //     (hstart * image_width + wstart) * image_channels + c;
         for (int h = hstart; h < hend; ++h) {
           for (int w = wstart; w < wend; ++w) {
             const int index = (h * image_width + w) * image_channels + c;
...@@ -138,7 +147,9 @@ class PoolingPE : public PE { ...@@ -138,7 +147,9 @@ class PoolingPE : public PE {
Tensor float_input; Tensor float_input;
float_input.mutableData<float>(FP32, input->shape()); float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input); float_input.copyFrom(input);
// float_input.saveToFile("pool_float.txt");
float16* data_out = output->data<float16>(); float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1]; int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
float scale_max = 0; float scale_max = 0;
...@@ -155,6 +166,7 @@ class PoolingPE : public PE { ...@@ -155,6 +166,7 @@ class PoolingPE : public PE {
output->scale()[0] = scale_max / 127.0f; output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max; output->scale()[1] = 127.0f / scale_max;
output->flush(); output->flush();
// exit(-1);
} }
void cpu_compute() { void cpu_compute() {
...@@ -184,11 +196,14 @@ class PoolingPE : public PE { ...@@ -184,11 +196,14 @@ class PoolingPE : public PE {
output->scale()[0] = scale_max / 127.0f; output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max; output->scale()[1] = 127.0f / scale_max;
output->flush(); output->flush();
// exit(-1);
} }
bool dispatch() { bool dispatch() {
if (use_cpu_) { if (use_cpu_) {
// cpu_compute();
compute(); compute();
// exit(-1);
return true; return true;
} }
param_.input->syncToDevice(); param_.input->syncToDevice();
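For context, the CPU fallback reduces each NHWC window per channel and divides by the window size. A minimal sketch assuming stride equals kernel size and no padding (the real code also tracks the fp16 quantization scale):

```cpp
// NHWC average pooling, stride == kernel, no padding (sketch only).
void avg_pool_nhwc(const float* in, float* out,
                   int width, int channels,
                   int k_h, int k_w, int pooled_h, int pooled_w) {
  for (int ph = 0; ph < pooled_h; ++ph) {
    for (int pw = 0; pw < pooled_w; ++pw) {
      for (int c = 0; c < channels; ++c) {
        float sum = 0;
        for (int h = ph * k_h; h < (ph + 1) * k_h; ++h) {
          for (int w = pw * k_w; w < (pw + 1) * k_w; ++w) {
            sum += in[(h * width + w) * channels + c];  // NHWC indexing
          }
        }
        out[(ph * pooled_w + pw) * channels + c] = sum / (k_h * k_w);
      }
    }
  }
}
```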
...
@@ -89,7 +89,6 @@ class ScalePE : public PE {
 }
 }
 }
-
 float* scale_data_float = param_.scale->data<float>();
 for (int i = 0; i < repeat; i++) {
 for (int j = 0; j < length; j++) {
...
@@ -53,20 +53,37 @@ class SplitPE : public PE {
 int64_t src_after = src_stride_numel[axis];
 int64_t dst_after = dst_stride_numel[axis];
+// PADDLE_MOBILE_ENFORCE(src_stride_numel.size() == dst_stride_numel.size(),
+// "src and dst tensor should have the same dims
+// size.");
 for (int64_t i = 0; i < axis; ++i) {
 if (i < axis) {
+// PADDLE_MOBILE_ENFORCE(src_stride_numel[i] / src_stride_numel[axis] ==
+// dst_stride_numel[i] /
+// dst_stride_numel[axis],
+// "src and dst should have the same elements "
+// "except the specified axis.");
 } else if (i == axis) {
 continue;
 } else {
+// PADDLE_MOBILE_ENFORCE(src_stride_numel[i] == dst_stride_numel[i],
+// "src and dst should have the same elements "
+// "except the specified axis.");
 }
 }
 for (int64_t i = 0; i < before; ++i) {
-memory::Copy(dst + i * dst_after, src + i * src_after, sizeof(T) * size);
+memcpy(dst + i * dst_after, src + i * src_after, sizeof(T) * size);
 }
 }
-void split3D() { int axis = param_.axis; }
+void split3D() {
+int axis = param_.axis;
+// float16* dst = param_.output->data<float16>();
+// std::vector<int>& dst_dims = ;
+// StridedNumelCopyWithAxis();
+}
 bool dispatch() {
 Tensor* input = param_.input;
@@ -88,6 +105,7 @@ class SplitPE : public PE {
 in_stride,
 out_stride[axis]);
 input_offset += out_stride[axis];
+// out->flush();
 }
 return true;
 }
@@ -95,21 +113,26 @@ class SplitPE : public PE {
 std::vector<Tensor*> outputs = param_.outputs;
 int in_channel = input->shape().channel();
-int split_channel = input->shape().channel() / param_.num;
+// int split_channel = input->shape().channel() / param_.num;
 int hw = input->shape().height() * input->shape().width();
 float16* in_data = input->data<float16>();
 for (int i = 0; i < hw; i++) {
+int channel_stride = 0;
 for (int n = 0; n < outputs.size(); n++) {
 Tensor* out = outputs[n];
 float16* out_data = out->data<float16>();
-memcpy(out_data + i * split_channel,
-in_data + i * in_channel + n * split_channel,
-split_channel * sizeof(float16));
+memcpy(out_data + i * out->shape().channel(),
+in_data + i * in_channel + channel_stride,
+out->shape().channel() * sizeof(float16));
+channel_stride += out->shape().channel();
 }
 }
 for (int n = 0; n < outputs.size(); n++) {
 Tensor* out = outputs[n];
+out->flush();
 out->copyScaleFrom(input);
 }
 return true;
@@ -120,5 +143,6 @@ class SplitPE : public PE {
 private:
 SplitParam param_;
 };
+
 } // namespace zynqmp
 } // namespace paddle
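The substantive fix here: each output now advances the source offset by its own channel count (channel_stride), so splits whose outputs have unequal channel counts copy from the correct slice; the old code assumed one uniform split_channel. A self-contained sketch of the pattern, with uint16_t standing in for the 2-byte float16 type:

```cpp
#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

using float16 = uint16_t;  // stand-in for the zynqmp half type

// Split `in` (hw positions x in_channel, NHWC) into outputs with
// possibly unequal channel counts; channel_stride tracks the offset.
void split_channels(const float16* in, int in_channel, int hw,
                    std::vector<std::pair<float16*, int>>& outs) {
  for (int i = 0; i < hw; i++) {
    int channel_stride = 0;
    for (auto& [out_data, out_channel] : outs) {
      std::memcpy(out_data + i * out_channel,
                  in + i * in_channel + channel_stride,
                  out_channel * sizeof(float16));
      channel_stride += out_channel;
    }
  }
}
```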
@@ -300,14 +300,12 @@ class Tensor {
 }
 void flush() {
-size_t memorySize =
-shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
+size_t memorySize = placeHolder_->memorySize();
 fpga_flush(placeHolder_->data(), memorySize);
 }
 void invalidate() {
-size_t memorySize =
-shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
+size_t memorySize = placeHolder_->memorySize();
 fpga_invalidate(placeHolder_->data(), memorySize);
 }
@@ -385,6 +383,7 @@ class Tensor {
 invalidate();
 std::ofstream ofs;
 ofs.open(path);
+ofs << scale()[0] << " / " << scale()[1] << std::endl;
 for (int i = 0; i < shape_->numel(); i++) {
 float value = 0;
...
@@ -46,7 +46,10 @@ void ConvCompute::PrepareForRun() {
 conv_param.dilations = *param.dilations;
 fill_scale_bias_const(&conv_param);
 conv_param.bias()->copyFrom(param.bias->ZynqTensor());
-conv_param.relu.enabled = param.fuse_relu;
+if (param.fuse_relu) {
+conv_param.activeParam.type = zynqmp::TYPE_RELU;
+}
 dw_conv_pe_.init();
 dw_conv_pe_.apply();
@@ -65,7 +68,16 @@ void ConvCompute::PrepareForRun() {
 conv_param.bias()->copyFrom(param.bias->ZynqTensor());
 }
-conv_param.relu.enabled = param.fuse_relu;
+if (param.fuse_relu) {
+conv_param.activeParam.type = zynqmp::TYPE_RELU;
+}
+// conv_param.filter->saveToFile("conv_filter_", true);
+// if (param.bias != nullptr) {
+// std::cout << "param.bias != nullptr" << std::endl;
+// conv_param.bias()->saveToFile("conv_bias_", true);
+// }
 conv_pe_.init();
 conv_pe_.apply();
 }
@@ -76,8 +88,14 @@ void ConvCompute::Run() {
 if (param.x->ZynqTensor()->shape().channel() != 1 &&
 param.groups == param.x->ZynqTensor()->shape().channel()) {
 dw_conv_pe_.dispatch();
+#ifdef FPGA_PRINT_TENSOR
+zynqmp::DepthwiseConvParam& dwconv_param = dw_conv_pe_.param();
+Debugger::get_instance().registerOutput("dwconv", dwconv_param.output);
+#endif
 } else {
+// zynqmp::ConvParam& conv_param = conv_pe_.param();
 conv_pe_.dispatch();
 #ifdef FPGA_PRINT_TENSOR
 zynqmp::ConvParam& conv_param = conv_pe_.param();
 Debugger::get_instance().registerOutput("conv", conv_param.output);
@@ -103,3 +121,21 @@ REGISTER_LITE_KERNEL(
 PRECISION(kFP16),
 DATALAYOUT(kNHWC))})
 .Finalize();
+
+REGISTER_LITE_KERNEL(depthwise_conv2d,
+kFPGA,
+kFP16,
+kNHWC,
+paddle::lite::kernels::fpga::ConvCompute,
+def)
+.BindInput("Input",
+{LiteType::GetTensorTy(TARGET(kFPGA),
+PRECISION(kFP16),
+DATALAYOUT(kNHWC))})
+.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
+.BindOutput("Output",
+{LiteType::GetTensorTy(TARGET(kFPGA),
+PRECISION(kFP16),
+DATALAYOUT(kNHWC))})
+.Finalize();
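With this registration, depthwise_conv2d shares ConvCompute with conv2d, and Run() picks the depthwise PE when every input channel forms its own group. The selection reduces to a predicate equivalent to this sketch:

```cpp
// Depthwise path is taken when groups == input channels (and the
// input is not single-channel), mirroring the check in Run() above.
bool use_depthwise_path(int groups, int input_channels) {
  return input_channels != 1 && groups == input_channels;
}
```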
@@ -33,7 +33,8 @@ void ElementwiseAddCompute::PrepareForRun() {
 ew_param.inputs = {param.X->ZynqTensor(), param.Y->ZynqTensor()};
 ew_param.output = param.Out->ZynqTensor();
 ew_param.axis = param.axis;
-ew_param.relu.enabled = false;
+ew_param.activeParam.type = zynqmp::TYPE_NONE;
+
 pe_.init();
 pe_.apply();
@@ -56,7 +57,7 @@ void ElementwiseAddActivationCompute::PrepareForRun() {
 ew_param.inputs = {param.X->ZynqTensor(), param.Y->ZynqTensor()};
 ew_param.output = param.Out->ZynqTensor();
 ew_param.axis = param.axis;
-ew_param.relu.enabled = true;
+ew_param.activeParam.type = zynqmp::TYPE_RELU;
 pe_.init();
 pe_.apply();
 }
@@ -76,7 +77,7 @@ void ElementwiseMulCompute::PrepareForRun() {
 scale_param.input = param.X->ZynqTensor();
 scale_param.output = param.Out->ZynqTensor();
-scale_param.relu.enabled = false;
+scale_param.activeParam.type = zynqmp::TYPE_NONE;
 int channel = scale_param.input->shape().channel();
 zynqmp::Tensor* scale = new zynqmp::Tensor();
@@ -124,7 +125,10 @@ REGISTER_LITE_KERNEL(elementwise_add,
 {LiteType::GetTensorTy(TARGET(kFPGA),
 PRECISION(kFP16),
 DATALAYOUT(kNHWC))})
-.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
+.BindInput("Y",
+{LiteType::GetTensorTy(TARGET(kFPGA),
+PRECISION(kFP16),
+DATALAYOUT(kNHWC))})
 .BindOutput("Out",
 {LiteType::GetTensorTy(TARGET(kFPGA),
 PRECISION(kFP16),
...
@@ -93,18 +93,25 @@ void elementwise_compute_ref(const operators::ElementwiseParam& param,
 }
 // do elementwise add/sub/max...
 if (elt_type == "add") {
-for (int i = 0; i < batch; ++i) {
-for (int j = 0; j < channels; ++j) {
-int offset = (i * channels + j) * num;
-const dtype* din_ptr = x_data + offset;
-const dtype diny_data = y_data[j];
-dtype* dout_ptr = out_data + offset;
-for (int k = 0; k < num; ++k) {
-*dout_ptr = sum(*din_ptr, diny_data);
-dout_ptr++;
-din_ptr++;
-}
-}
+// for (int i = 0; i < batch; ++i) {
+// for (int j = 0; j < channels; ++j) {
+// int offset = (i * channels + j) * num;
+// const dtype* din_ptr = x_data + offset;
+// const dtype diny_data = y_data[j];
+// dtype* dout_ptr = out_data + offset;
+// for (int k = 0; k < num; ++k) {
+// *dout_ptr =
+// zynqmp::float_to_half(sum(zynqmp::half_to_float(*din_ptr),
+// zynqmp::half_to_float(diny_data)));
+// dout_ptr++;
+// din_ptr++;
+// }
+// }
+// }
+int count = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
+for (int i = 0; i < count; ++i) {
+out_data[i] = zynqmp::float_to_half(sum(
+zynqmp::half_to_float(x_data[i]), zynqmp::half_to_float(y_data[i])));
 }
 } else if (elt_type == "sub") {
 for (int i = 0; i < batch; ++i) {
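The reference now compares in float space: each float16 operand is widened, added, and narrowed back, matching what the FPGA kernel produces. A self-contained sketch with minimal half<->float helpers (normals only, truncating; the real zynqmp helpers are assumed to be more careful):

```cpp
#include <cstdint>
#include <cstring>

// Minimal fp16 -> fp32 (normals and inf/NaN; denormals flushed to zero).
static float half_to_float(uint16_t h) {
  uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
  uint32_t exp = (h >> 10) & 0x1fu;
  uint32_t mant = h & 0x3ffu;
  uint32_t f = sign;
  if (exp == 31) f |= 0x7f800000u | (mant << 13);             // inf / NaN
  else if (exp != 0) f |= ((exp + 112u) << 23) | (mant << 13);  // normal
  float out;
  std::memcpy(&out, &f, sizeof(out));
  return out;
}

// Minimal fp32 -> fp16 (truncates the mantissa; no rounding).
static uint16_t float_to_half(float v) {
  uint32_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  uint16_t sign = static_cast<uint16_t>((bits >> 16) & 0x8000u);
  int32_t exp = static_cast<int32_t>((bits >> 23) & 0xffu) - 127 + 15;
  uint32_t mant = bits & 0x7fffffu;
  if (exp <= 0) return sign;             // underflow -> signed zero
  if (exp >= 31) return sign | 0x7c00u;  // overflow -> inf
  return sign | static_cast<uint16_t>(exp << 10) | (mant >> 13);
}

// Elementwise add in float space, as the new reference path does.
void add_ref_fp16(const uint16_t* x, const uint16_t* y,
                  uint16_t* out, int count) {
  for (int i = 0; i < count; ++i) {
    out[i] = float_to_half(half_to_float(x[i]) + half_to_float(y[i]));
  }
}
```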
@@ -148,9 +155,9 @@ TEST(elementwise_add, compute) {
 lite::Tensor x, y, output, output_ref;
 for (auto n : {1}) {
-for (auto c : {8}) {
-for (auto h : {8}) {
-for (auto w : {8}) {
+for (auto h : {72}) {
+for (auto w : {192}) {
+for (auto c : {24}) {
 for (auto axis : {0}) {
 for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
 auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
@@ -174,10 +181,16 @@ TEST(elementwise_add, compute) {
 auto* output_ref_data =
 output_ref.mutable_data<float16>(TARGET(kFPGA));
 for (int i = 0; i < x_dim.production(); i++) {
-x_data[i] = zynqmp::float_to_half(i);
+float sign = i % 3 == 0 ? -0.03 : 0.05f;
+float x = sign * (i % 128);
+std::cout << "x:" << x << std::endl;
+x_data[i] = zynqmp::float_to_half(x);
 }
 for (int i = 0; i < y_dim.production(); i++) {
-y_data[i] = zynqmp::float_to_half(i);
+float sign = i % 3 == 0 ? -0.03 : 0.05f;
+float y = sign * (i % 128);
+std::cout << "y:" << y << std::endl;
+y_data[i] = zynqmp::float_to_half(y);
 }
 param.X = &x;
 param.Y = &y;
@@ -190,7 +203,14 @@ TEST(elementwise_add, compute) {
 elementwise_compute_ref<float16>(param, "add", "");
 for (int i = 0; i < output.dims().production(); i++) {
-EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
+std::cout << "output_data:"
+<< zynqmp::half_to_float(output_data[i])
+<< ",output_ref_data:"
+<< zynqmp::half_to_float(output_ref_data[i])
+<< std::endl;
+EXPECT_NEAR(zynqmp::half_to_float(output_data[i]),
+zynqmp::half_to_float(output_ref_data[i]),
+1e-5);
 }
 }
 }
@@ -209,73 +229,74 @@ TEST(fusion_elementwise_add_activation_fpga, retrive_op) {
 ASSERT_TRUE(fusion_elementwise_add_activation.front());
 }
-TEST(fusion_elementwise_add_activation_fpga, init) {
-ElementwiseAddActivationCompute fusion_elementwise_add_activation;
-ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
-ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
-}
-TEST(fusion_elementwise_add_activation_fpga, compute) {
-ElementwiseAddActivationCompute fusion_elementwise_add_activation;
-operators::FusionElementwiseActivationParam param;
-lite::Tensor x, y, output, output_ref;
-for (auto act_type : {"relu"}) {
-for (auto n : {1}) {
-for (auto c : {8}) {
-for (auto h : {8}) {
-for (auto w : {8}) {
-for (auto axis : {0}) {
-for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
-auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
-auto y_dim = DDim(yd);
-int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
-if (axis_t + y_dim.size() > 4) continue;
-bool flag = false;
-for (int i = 0; i < y_dim.size(); i++) {
-if (x_dim[i + axis_t] != y_dim[i]) flag = true;
-}
-if (flag) continue;
-x.Resize(x_dim);
-y.Resize(y_dim);
-output.Resize(x_dim);
-output_ref.Resize(x_dim);
-auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
-auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
-auto* output_data = output.mutable_data<float16>(TARGET(kFPGA));
-auto* output_ref_data =
-output_ref.mutable_data<float16>(TARGET(kFPGA));
-for (int i = 0; i < x_dim.production(); i++) {
-float sign = i % 3 == 0 ? -1.0f : 1.0f;
-x_data[i] = zynqmp::float_to_half(i * sign);
-}
-for (int i = 0; i < y_dim.production(); i++) {
-float sign = i % 2 == 0 ? 0.5f : -0.5f;
-y_data[i] = zynqmp::float_to_half(i * sign);
-}
-param.X = &x;
-param.Y = &y;
-param.axis = axis;
-param.Out = &output;
-param.act_type = act_type;
-fusion_elementwise_add_activation.SetParam(param);
-fusion_elementwise_add_activation.PrepareForRun();
-fusion_elementwise_add_activation.Run();
-param.Out = &output_ref;
-elementwise_compute_ref<float16>(param, "add", act_type);
-for (int i = 0; i < output.dims().production(); i++) {
-EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
-}
-}
-}
-}
-}
-}
-}
-}
-}
+// TEST(fusion_elementwise_add_activation_fpga, init) {
+// ElementwiseAddActivationCompute fusion_elementwise_add_activation;
+// ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
+// ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
+// }
+// TEST(fusion_elementwise_add_activation_fpga, compute) {
+// ElementwiseAddActivationCompute fusion_elementwise_add_activation;
+// operators::FusionElementwiseActivationParam param;
+// lite::Tensor x, y, output, output_ref;
+// for (auto act_type : {"relu"}) {
+// for (auto n : {1}) {
+// for (auto c : {8}) {
+// for (auto h : {8}) {
+// for (auto w : {8}) {
+// for (auto axis : {0}) {
+// for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
+// auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
+// auto y_dim = DDim(yd);
+// int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
+// if (axis_t + y_dim.size() > 4) continue;
+// bool flag = false;
+// for (int i = 0; i < y_dim.size(); i++) {
+// if (x_dim[i + axis_t] != y_dim[i]) flag = true;
+// }
+// if (flag) continue;
+// x.Resize(x_dim);
+// y.Resize(y_dim);
+// output.Resize(x_dim);
+// output_ref.Resize(x_dim);
+// auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
+// auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
+// auto* output_data =
+// output.mutable_data<float16>(TARGET(kFPGA));
+// auto* output_ref_data =
+// output_ref.mutable_data<float16>(TARGET(kFPGA));
+// for (int i = 0; i < x_dim.production(); i++) {
+// float sign = i % 3 == 0 ? -1.0f : 1.0f;
+// x_data[i] = zynqmp::float_to_half(i * sign);
+// }
+// for (int i = 0; i < y_dim.production(); i++) {
+// float sign = i % 2 == 0 ? 0.5f : -0.5f;
+// y_data[i] = zynqmp::float_to_half(i * sign);
+// }
+// param.X = &x;
+// param.Y = &y;
+// param.axis = axis;
+// param.Out = &output;
+// param.act_type = act_type;
+// fusion_elementwise_add_activation.SetParam(param);
+// fusion_elementwise_add_activation.PrepareForRun();
+// fusion_elementwise_add_activation.Run();
+// param.Out = &output_ref;
+// elementwise_compute_ref<float16>(param, "add", act_type);
+// for (int i = 0; i < output.dims().production(); i++) {
+// EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
+// }
+// }
+// }
+// }
+// }
+// }
+// }
+// }
+// }
+// }
 } // namespace fpga
 } // namespace kernels
@@ -283,4 +304,4 @@ TEST(fusion_elementwise_add_activation_fpga, compute) {
 } // namespace paddle
 USE_LITE_KERNEL(elementwise_add, kFPGA, kFP16, kNHWC, def);
-USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
+// USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
@@ -67,3 +67,13 @@ REGISTER_LITE_KERNEL(
 PRECISION(kFP16),
 DATALAYOUT(kNHWC))})
 .Finalize();
+
+REGISTER_LITE_KERNEL(feed,
+kFPGA,
+kFP16,
+kNHWC,
+paddle::lite::kernels::fpga::FeedCompute,
+def_host)
+.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+.Finalize();
@@ -43,8 +43,14 @@ void FetchCompute::PrepareForRun() {
 }
 void FetchCompute::Run() {
-pe_.dispatch();
 auto& param = this->Param<param_t>();
+auto fetch_list = param.fetch_list;
+if (fetch_list->size() <= static_cast<size_t>(param.col)) {
+fetch_list->resize(param.col + 1);
+}
+Tensor& out = param.fetch_list->at(param.col);
+out.Resize(param.input->dims());
+pe_.dispatch();
 #ifdef FPGA_PRINT_TENSOR
 zynqmp::OutputParam& fetch_param = pe_.param();
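The reordering above makes Run() size the fetch list before dispatching: slot param.col must exist (and be shaped to the input dims) before the PE writes into it. The guard reduces to this pattern:

```cpp
#include <vector>

// Grow `fetch_list` so that index `col` is valid before writing to it
// (T stands in for lite::Tensor here).
template <typename T>
T& ensure_fetch_slot(std::vector<T>* fetch_list, int col) {
  if (fetch_list->size() <= static_cast<size_t>(col)) {
    fetch_list->resize(col + 1);
  }
  return fetch_list->at(col);
}
```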
@@ -67,10 +73,7 @@ REGISTER_LITE_KERNEL(fetch,
 {LiteType::GetTensorTy(TARGET(kFPGA),
 PRECISION(kAny),
 DATALAYOUT(kAny))})
-.BindOutput("Out",
-{LiteType::GetTensorTy(TARGET(kHost),
-PRECISION(kAny),
-DATALAYOUT(kAny))})
+.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
 .Finalize();
 REGISTER_LITE_KERNEL(fetch,
@@ -79,12 +82,6 @@ REGISTER_LITE_KERNEL(fetch,
 kNHWC,
 paddle::lite::kernels::fpga::FetchCompute,
 host_host)
-.BindInput("X",
-{LiteType::GetTensorTy(TARGET(kHost),
-PRECISION(kAny),
-DATALAYOUT(kAny))})
-.BindOutput("Out",
-{LiteType::GetTensorTy(TARGET(kHost),
-PRECISION(kAny),
-DATALAYOUT(kAny))})
+.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
 .Finalize();
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include <unistd.h>
+#include <iostream>
 #include <string>
 #include <vector>
@@ -83,6 +84,7 @@ void GRUCompute::PrepareForRun() {
 void GRUCompute::Run() {
 auto& param = this->Param<param_t>();
 param.hidden->mutable_data<float>();
+
 // inputs
 auto input = param.input;
 auto h0 = param.h0;
@@ -130,6 +132,7 @@ void GRUCompute::Run() {
 // //3.
 gru_value.prev_out_value = ordered_h0.mutable_data<float>();
 gru_tensors.pre_output = ordered_h0.ZynqTensor();
+
 } else {
 gru_value.prev_out_value = nullptr;
 gru_tensors.pre_output = nullptr;
@@ -169,6 +172,7 @@ void GRUCompute::Run() {
 float* hidden_data =
 hidden_out.mutableData<float>(zynqmp::FP32, float_input_shape);
 gru_tensors.gate = &float_input;
+
 gru_tensors.output = &hidden_out;
@@ -187,11 +191,6 @@ void GRUCompute::Run() {
 *(batch_hidden->mutable_lod()) = batch_gate->lod();
 batch_hidden->mutable_data<float>();
 to_seq(*batch_hidden, hidden);
-
-save_tensor(const_cast<Tensor*>(input), "_input.txt");
-save_tensor(hidden, "_gru.txt");
-exit(-1);
 }
 } // namespace fpga
...
@@ -119,7 +119,79 @@ class IoCopyFpgaToHostCompute
 auto out_lod = param.y->mutable_lod();
 *out_lod = param.x->lod();
 }
+std::string doc() const override { return "Copy IO from FPGA to HOST"; }
+};
+
+void hwc_to_chw(float* chw_data,
+float* hwc_data,
+int num,
+int channel,
+int height,
+int width) {
+int chw = channel * height * width;
+int wc = width * channel;
+int wh = width * height;
+int index = 0;
+for (int n = 0; n < num; n++) {
+for (int h = 0; h < height; h++) {
+for (int w = 0; w < width; w++) {
+for (int c = 0; c < channel; c++) {
+chw_data[n * chw + c * wh + h * width + w] = hwc_data[index];
+index++;
+}
+}
+}
+}
+}
+
+class IoCopyFpgaToHostCHWCompute
+: public KernelLite<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> {
+ public:
+void Run() override {
+auto& param = Param<operators::IoCopyParam>();
+CHECK(param.x->target() == TARGET(kHost) ||
+param.x->target() == TARGET(kFPGA));
+Tensor hwc;
+hwc.Resize(param.y->dims());
+float* hwc_data = hwc.mutable_data<float>();
+float* chw_data = param.y->mutable_data<float>();
+param.y->ZynqTensor()->setDataType(zynqmp::FP32);
+param.x->ZynqTensor()->syncToDevice();
+if (param.x->ZynqTensor()->aligned() &&
+param.x->ZynqTensor()->shape().shouldAlign()) {
+zynqmp::Tensor tempTensor;
+tempTensor.mutableData<float16>(zynqmp::FP16,
+param.x->ZynqTensor()->shape());
+tempTensor.copyFrom(param.x->ZynqTensor());
+tempTensor.setAligned(true);
+tempTensor.unalignImage();
+hwc.ZynqTensor()->copyFrom(&tempTensor);
+} else {
+hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
+}
+int num = 1;
+int channel = 1;
+int height = 1;
+int width = 1;
+auto dims = param.y->ZynqTensor()->shape();
+hwc_to_chw(chw_data,
+hwc_data,
+dims.num(),
+dims.channel(),
+dims.height(),
+dims.width());
+param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
+param.y->ZynqTensor()->flush();
+auto out_lod = param.y->mutable_lod();
+*out_lod = param.x->lod();
+}
 std::string doc() const override { return "Copy IO from FPGA to HOST"; }
 };
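A quick way to sanity-check the hwc_to_chw helper introduced above (hypothetical 2x2x2 values, chosen so the transposition is visible; assumes the function is in scope):

```cpp
#include <cstdio>

int main() {
  // One 2(H) x 2(W) x 2(C) image in HWC order; channels interleaved.
  float hwc[8] = {0, 4, 1, 5, 2, 6, 3, 7};
  float chw[8] = {0};
  hwc_to_chw(chw, hwc, /*num=*/1, /*channel=*/2, /*height=*/2, /*width=*/2);
  for (float v : chw) std::printf("%.0f ", v);  // prints: 0 1 2 3 4 5 6 7
  return 0;
}
```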
@@ -170,7 +242,7 @@ REGISTER_LITE_KERNEL(io_copy,
 PRECISION(kFP16),
 DATALAYOUT(kNHWC))})
 .BindOutput("Out",
-{LiteType::GetTensorTy(TARGET(kARM),
+{LiteType::GetTensorTy(TARGET(kHost),
 PRECISION(kFloat),
 DATALAYOUT(kNHWC))})
 .Finalize();
@@ -179,8 +251,8 @@ REGISTER_LITE_KERNEL(io_copy,
 kFPGA,
 kAny,
 kAny,
-paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute,
-device_to_host_22)
+paddle::lite::kernels::fpga::IoCopyFpgaToHostCHWCompute,
+device_to_host_chw)
 .BindInput("Input",
 {LiteType::GetTensorTy(TARGET(kFPGA),
 PRECISION(kFP16),
...
@@ -384,6 +384,7 @@ void MulticlassNmsCompute::Run() {
 scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
 outs->ZynqTensor()->copyFrom(out.ZynqTensor());
 }
+outs->Resize({static_cast<int64_t>(e - s), out_dim});
 }
 }
 LoD lod;
...
@@ -34,7 +34,9 @@ void PoolCompute::PrepareForRun() {
 zynqmp::PoolingParam& pool_param = pe_.param();
 pool_param.input = param.x->ZynqTensor();
 pool_param.output = param.output->ZynqTensor();
-pool_param.relu.enabled = false;
+
+pool_param.activeParam.type = zynqmp::TYPE_RELU;
+
 pool_param.type = param.pooling_type == "max" ? zynqmp::PoolingType::MAX
 : zynqmp::PoolingType::AVERAGE;
 pool_param.globalPooling = param.global_pooling;
...