未验证 提交 cde383dc 编写于 作者: C Chon 提交者: GitHub

[FPGA]merge Edgeboard internal codebase 1.5 into develop (#4392)


* classifications and yolov3 works
上级 af2770d3
......@@ -14,15 +14,19 @@
#pragma once
#include <map>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include "lite/core/program.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
#define FPGA_PRINT_TENSOR
// uncomment line below to print tensors;
// #define FPGA_PRINT_TENSOR
class Debugger {
public:
......@@ -37,25 +41,34 @@ class Debugger {
}
}
void setEnable(bool en) { enabled_ = en; }
private:
std::map<std::string, bool> op_config;
bool enabled_ = false;
std::unordered_map<std::string, bool> op_config;
std::unordered_map<std::string, float> tick_tock_map;
Debugger() {
op_config["concat"] = true;
op_config["pooling"] = true;
op_config["conv"] = true;
op_config["dropout"] = true;
op_config["dwconv"] = true;
op_config["ew_add"] = true;
op_config["ew_mul"] = true;
op_config["crop"] = true;
op_config["feed"] = true;
op_config["mul"] = true;
op_config["fetch"] = true;
op_config["fc"] = true;
op_config["mul"] = true;
op_config["boxes"] = true;
op_config["scores"] = true;
op_config["nms"] = true;
op_config["pb_boxes"] = true;
op_config["pb_variances"] = true;
// op_config["fc"] = true;
op_config["reshape"] = true;
op_config["softmax"] = true;
op_config["split"] = true;
}
};
......
......@@ -21,7 +21,7 @@ DLEngine::DLEngine() {
open_device();
int ret = get_device_info(info_);
filter::set_filter_capacity(info_.filter_cap);
filter::set_colunm(info_.colunm);
filter::set_colunm(info_.column);
}
} // namespace zynqmp
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <stdio.h>
#include "lite/backends/fpga/KD/llapi/filter.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -28,15 +29,13 @@ class DLEngine {
return s_instance;
}
DeviceInfo& deviceInfo();
DeviceInfoArgs& deviceInfo();
bool isZU3() { return info_.device_type / 100 == 3; }
float* out_data = nullptr;
private:
DLEngine();
DeviceInfo info_;
DeviceInfoArgs info_;
};
} // namespace zynqmp
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/fpga_cv.hpp"

#include <cstring>
using paddle::zynqmp::float16;
// Resizes an fp32 image of (input_height x input_width x input_channel) to
// (output_height x output_width) on the FPGA resize engine, writing the
// result as uint8 into |output|.
// Fixes vs. previous version:
//  - both fpga_malloc'd buffers were leaked on every call; they are now freed,
//  - removed dead ImageInputArgs and unused `index` local,
//  - ratio bit-pattern transfer uses memcpy instead of a strict-aliasing
//    violating reinterpret_cast on float*.
void fpga_resize(float* input,
                 int input_width,
                 int input_height,
                 int input_channel,
                 uint8_t* output,
                 int output_width,
                 int output_height) {
  // Clear any leftover in-place activation configuration before driving the
  // resize engine.
  paddle::zynqmp::InplaceArgs inplace_args = {0, 0, 0};
  paddle::zynqmp::config_inplace(inplace_args);

  int input_size = input_width * input_height * input_channel;
  // Convert the fp32 input to fp16 in FPGA-visible memory.
  float16* input_image_address =
      reinterpret_cast<float16*>(paddle::zynqmp::fpga_malloc(
          input_size * sizeof(float16)));
  for (int i = 0; i < input_size; i++) {
    input_image_address[i] = float16(1.0 * input[i]);
  }

  paddle::zynqmp::ResizeArgs resize_args = {0};
  resize_args.input_width = input_width;
  resize_args.input_height = input_height;
  resize_args.image_channel = input_channel;
  resize_args.output_width = output_width;
  resize_args.output_height = output_height;
  float height_ratio = static_cast<float>(input_height) /
                       static_cast<float>(resize_args.output_height);
  float width_ratio = static_cast<float>(input_width) /
                      static_cast<float>(resize_args.output_width);
  // The driver expects the raw fp32 bit pattern in these uint32 fields;
  // memcpy performs the pun without undefined behavior.
  memcpy(&resize_args.height_ratio, &height_ratio, sizeof(height_ratio));
  memcpy(&resize_args.width_ratio, &width_ratio, sizeof(width_ratio));

  int output_size =
      resize_args.output_width * resize_args.output_height * input_channel;
  float16* fpga_output = reinterpret_cast<float16*>(
      paddle::zynqmp::fpga_malloc(output_size * sizeof(float16)));
  resize_args.input_image_address = input_image_address;
  resize_args.output_image_address = fpga_output;
  memset(fpga_output, 0, output_size * sizeof(float16));

  // Push both buffers out of the CPU cache before the FPGA touches them.
  paddle::zynqmp::fpga_flush(input_image_address,
                             input_size * sizeof(float16));
  paddle::zynqmp::fpga_flush(resize_args.output_image_address,
                             output_size * sizeof(float16));
  int ret = paddle::zynqmp::compute_fpga_resize(resize_args);
  if (ret == 0) {
    paddle::zynqmp::fpga_invalidate(resize_args.output_image_address,
                                    output_size * sizeof(float16));
  }
  // Narrow fp16 results to uint8 for the caller.
  for (int i = 0; i < output_size; i++) {
    output[i] = fpga_output[i];
  }
  // Fix: these two buffers previously leaked on every call.
  paddle::zynqmp::fpga_free(input_image_address);
  paddle::zynqmp::fpga_free(fpga_output);
}
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdlib.h>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/pe.hpp"
// Resizes an fp32 image of (input_height x input_width x input_channel) to
// (output_height x output_width) on the FPGA resize engine, writing the
// result as uint8 into |output|.
// NOTE(review): layout is presumably HWC-interleaved — confirm against the
// implementation in fpga_cv.cpp.
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height);
......@@ -26,6 +26,7 @@ enum LayoutType {
N,
NC,
NCHW,
CNHW,
NHWC,
NHW,
};
......@@ -75,6 +76,19 @@ struct NHWC : Layout {
}
};
// Layout descriptor for tensors stored in CNHW order:
// dims = {channel, num, height, width}.
struct CNHW : Layout {
  int numIndex() { return 1; }
  int channelIndex() { return 0; }
  int heightIndex() { return 2; }
  int widthIndex() { return 3; }
  // Plain element count: product of all four dimensions.
  int elementCount(const std::vector<int>& dims) {
    int count = 1;
    for (int i = 0; i < 4; ++i) {
      count *= dims[i];
    }
    return count;
  }
  // The innermost C*W plane is what the hardware aligns.
  int alignedElementCount(const std::vector<int>& dims) {
    int aligned_cw = align_image(dims[0] * dims[3]);
    return dims[1] * dims[2] * aligned_cw;
  }
};
struct NC : Layout {
int numIndex() { return 0; }
int channelIndex() { return 1; }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define PADDLE_LITE_ZU5
#define FPGA_PRINT_MODE
#define PADDLE_LITE_PROFILE
......@@ -31,7 +31,7 @@ void saveToFile(std::string name, void* data_in, int size) {
std::ofstream ofs;
ofs.open(name);
int8_t* data = static_cast<int8_t*>(data_in);
int8_t* data = reinterpret_cast<int8_t*>(data_in);
for (int i = 0; i < size; i++) {
float value = data[i];
ofs << value << std::endl;
......@@ -86,6 +86,11 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
int calc_pack_num(int num_per_group, int group, int division_capacity) {
auto n = 1;
if (num_per_group * group % division_capacity == 0) {
n = num_per_group * group / division_capacity;
return n;
}
while ((num_per_group * (group + n - 1) / n) > division_capacity) {
n++;
}
......@@ -239,9 +244,10 @@ int8_t* format_filter(float* data_in,
for (int n = 0; n < num; n++) {
float* filter_start = data_in + n * chw;
float f_max = find_max(filter_start, chw);
int8_t* quantized_start = quantized_data + n * chw;
quantize(filter_start, quantized_start, chw, max);
filter_max.push_back(1);
quantize(filter_start, quantized_start, chw, f_max);
filter_max.push_back(f_max);
}
int8_t* hwc_data =
......@@ -377,7 +383,6 @@ size_t format_dwconv_filter(
float** data_in, int num, int height, int width, float* scale_ptr) {
quantize_to_fp16(data_in, num, height, width, scale_ptr);
int16_t** quantize_data = reinterpret_cast<int16_t**>(data_in);
convert_to_hwn(quantize_data, num, height, width);
size_t size = align_element_n(quantize_data, num, height, width);
fpga_flush(*quantize_data,
......@@ -385,6 +390,7 @@ size_t format_dwconv_filter(
sizeof(int16_t));
return size;
}
} // namespace filter
} // namespace zynqmp
} // namespace paddle
......@@ -28,7 +28,7 @@ limitations under the License. */
namespace paddle {
namespace zynqmp {
#define PADDLE_MOBILE_OS_LINUX
#define PADDLE_OS_LINUX
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
......@@ -38,7 +38,7 @@ static size_t memory_size_max = 0;
static size_t memory_size = 0;
static inline int do_ioctl(uint64_t req, const void *arg) {
#ifdef PADDLE_MOBILE_OS_LINUX
#ifdef PADDLE_OS_LINUX
return ioctl(fd, req, arg);
#else
return -1;
......@@ -48,6 +48,11 @@ static inline int do_ioctl(uint64_t req, const void *arg) {
int open_device() {
if (fd == -1) {
fd = open(device_path, O_RDWR);
if (fd == -1) {
std::cout << "please check if driver has insmoded!" << std::endl;
exit(-1);
}
}
return fd;
}
......@@ -61,28 +66,33 @@ void reset_device() {
// memory management;
void *fpga_malloc(size_t size) {
#ifdef PADDLE_MOBILE_OS_LINUX
#ifdef PADDLE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
if (ptr == MAP_FAILED) {
std::cout << "not enough memory !";
exit(-1);
}
if (errno == ENOMEM) {
std::cout << "mmap failed with not enough memory !";
exit(-1);
}
if (errno == EINVAL) {
std::cout << "mmap failed with invalid arguments ! (size=" << size << ")"
<< std::endl;
exit(-1);
if (errno == ENOMEM) {
std::cout << "mmap failed with not enough memory ! (size=" << size << ")"
<< std::endl;
throw(-1);
}
if (errno == EINVAL) {
std::cout << "mmap failed with invalid arguments ! (size=" << size << ")"
<< std::endl;
throw(-1);
}
std::cout << "mmap failed with other than memory usage and invalid "
"arguments! errno="
<< errno << ", (size=" << size << ")" << std::endl;
throw(-1);
}
if (ptr == NULL) {
std::cout << "NULL returned, errno=" << errno
<< ", mmap failed with other errors other than memory usage !"
<< ", null retured, mmap failed with other errors other than "
"memory usage !"
<< std::endl;
exit(-1);
throw(-1);
}
memory_map.insert(std::make_pair(ptr, size));
......@@ -103,7 +113,7 @@ size_t fpga_get_memory_size_max() { return memory_size_max; }
size_t fpga_diagnose_memory(int detailed) {
size_t total = 0;
auto iter = memory_map.begin();
auto iter = memory_map.begin(); // std::map<void *, size_t>::iterator
while (iter != memory_map.end()) {
total += iter->second;
iter++;
......@@ -113,7 +123,7 @@ size_t fpga_diagnose_memory(int detailed) {
void fpga_free(void *ptr) {
size_t size = 0;
auto iter = memory_map.find(ptr);
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
if (iter != memory_map.end()) {
size = iter->second;
memory_map.erase(iter);
......@@ -121,7 +131,8 @@ void fpga_free(void *ptr) {
memory_size -= size;
#ifdef PADDLE_MOBILE_OS_LINUX
#ifdef PADDLE_OS_LINUX
munmap(ptr, size);
#else
free(ptr);
......@@ -175,19 +186,6 @@ int compute_fpga_conv_basic(const struct ConvArgs &args) {
return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
int compute_fpga_conv(const struct SplitConvArgs &args) {
int split_num = args.split_num;
int ret = -1;
for (int i = 0; i < split_num; i++) {
ret = compute_fpga_conv_basic(args.conv_arg[i]);
}
if (split_num > 1) {
exit(-1);
}
return ret;
}
int compute_fpga_pool(const struct PoolingArgs &args) {
return do_ioctl(IOCTL_CONFIG_POOLING, &args);
}
......@@ -196,9 +194,8 @@ int compute_fpga_ewadd(const struct EWAddArgs &args) {
return do_ioctl(IOCTL_CONFIG_EW, &args);
}
int get_device_info(const struct DeviceInfo &args) {
int ret = do_ioctl(IOCTL_DEVICE_INFO, &args);
return ret;
int get_device_info(const struct DeviceInfoArgs &args) {
return do_ioctl(IOCTL_DEVICE_INFO, &args);
}
int perform_bypass(const struct BypassArgs &args) {
......@@ -257,26 +254,6 @@ int perform_bypass(const struct BypassArgs &args) {
int compute_fpga_concat(const struct ConcatArgs &args) { return -1; }
int compute_fpga_scale(const struct ScaleArgs &args) {
#ifdef ENABLE_DEBUG
std::cout << "======Compute Scale======";
std::cout << "scale_address:" << args.scale_address << std::endl;
std::cout << "bias_address:" << args.bias_address << std::endl;
std::cout << "wc_alignment:" << args.wc_alignment << std::endl;
std::cout << "channel_alignment:" << args.channel_alignment << std::endl;
std::cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_SCALE, &args);
}
......@@ -288,6 +265,10 @@ int config_activation(const struct ActiveParamterArgs &args) {
return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args);
}
int config_global_pool(const struct GlobalPoolArgs &args) {
return do_ioctl(IOCTL_CONFIG_GLOBAL_POOL_PARAMETER, &args);
}
int config_inplace(const struct InplaceArgs &args) {
return do_ioctl(IOCTL_CONFIG_INPLACE, &args);
}
......@@ -304,6 +285,10 @@ int compute_fpga_resize(const struct ResizeArgs &args) {
return do_ioctl(IOCTL_CONFIG_RESIZE, &args);
}
int compute_preprocess(const struct PreprocessArgs &args) {
return do_ioctl(IOCTL_PREPROCESS, &args);
}
int16_t fp32_2_fp16(float fp32_num) {
unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT
auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
......
......@@ -29,6 +29,9 @@ typedef int16_t half;
#define IMAGE_ALIGNMENT 16 // Aligned to 16
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
// #define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32 replace
// by filter.hpp "get_filter_num_alignment()"
// #define FILTER_ELEMENT_ALIGNMENT 64 // Filter element number aligned to 64
#define BS_NUM_ALIGNMENT 8
#define BIAS_NUM_ALIGNMENT 16
......@@ -50,11 +53,11 @@ enum ActiveType {
TYPE_SIGMOID = 4,
};
struct DeviceInfo {
struct DeviceInfoArgs {
uint32_t filter_cap;
uint32_t version;
uint16_t device_type;
uint32_t colunm;
uint32_t column;
uint32_t reserved1;
uint32_t reserved2;
uint32_t reserved3;
......@@ -114,6 +117,14 @@ struct ImageOutputArgs {
float* scale_address; // output scale address;
};
// Deconvolution (transposed convolution) parameters attached to ConvArgs
// and consumed by the conv engine ioctl.
struct DeconvArgs {
bool enabled; // true when the conv engine runs in deconv mode
uint16_t sub_kernel_num; // deconv stride; the deconv is split into this
// many sub-conv operations on the FPGA
uint16_t invalid_col_num; // columns dropped on the left and right of each
// output row directly inside the FPGA
};
struct ConvArgs {
bool relu_enabled;
void* sb_address; // scale and bias are interlaced;
......@@ -123,6 +134,7 @@ struct ConvArgs {
uint32_t group_num;
uint32_t dilation;
struct DeconvArgs deconv;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
......@@ -189,6 +201,29 @@ struct NormalizeArgs {
uint32_t* output_scale_address;
};
// Arguments for the on-FPGA preprocessing pipeline (resize plus optional
// per-channel mean/scale normalization of a video frame), passed via
// IOCTL_PREPROCESS. This mirrors the kernel-driver ABI: field names —
// including the "vedio"/"fomat" misspellings — must match the driver and
// cannot be renamed here.
struct PreprocessArgs {
void* input_image_address;
void* output_image_address;
uint32_t input_width;
uint32_t input_height;
uint32_t output_width;
uint32_t output_height;
// NOTE(review): presumably fp32 bit patterns of input/output size ratios,
// as in ResizeArgs — confirm with the driver.
uint32_t height_ratio;
uint32_t width_ratio;
// Per-channel mean subtraction values (applied when mean_scale_enabled).
uint16_t mean0;
uint16_t mean1;
uint16_t mean2;
// Per-channel scale factors (applied when mean_scale_enabled).
uint16_t scale0;
uint16_t scale1;
uint16_t scale2;
uint32_t rd_ring_buf_size;
uint32_t wr_ring_buf_size;
uint32_t vedio_in_fomat;
uint32_t vedio_out_fomat;
uint32_t vedio_source;
bool mean_scale_enabled; // enables the mean/scale normalization stage
};
struct ResizeArgs {
void* input_image_address;
void* output_image_address;
......@@ -214,10 +249,14 @@ struct NormalizeParameterArgs {
};
struct ActiveParamterArgs {
ActiveType type;
enum ActiveType type;
uint16_t leaky_relu_factor;
};
// Argument block for IOCTL_CONFIG_GLOBAL_POOL_PARAMETER.
struct GlobalPoolArgs {
// NOTE(review): presumably an fp16-encoded scaling factor applied by the
// global-pool unit — confirm against the driver documentation.
uint16_t global_pool_factor;
};
struct InplaceArgs {
bool leaky_relu_enable;
bool relu_enable;
......@@ -225,6 +264,7 @@ struct InplaceArgs {
bool relu6_enable;
bool power_enable;
bool normalize_enable;
bool global_pool_en;
};
struct FpgaRegWriteArgs {
......@@ -238,13 +278,13 @@ struct FpgaRegReadArgs {
};
struct FpgaResetArgs {
uint32_t val;
uint32_t dummy;
};
#define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4)
// #define IOCTL_MEMORY_MAGIC (('M' + 'E' + 'M' + 'Y') / 4)
#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
#define IOCTL_DEVICE_INFO _IOW(IOCTL_FPGA_MAGIC, 100, struct DeviceInfo)
#define IOCTL_SEPARATOR_0 10
......@@ -263,7 +303,6 @@ struct FpgaResetArgs {
#define IOCTL_CONFIG_SCALE _IOW(IOCTL_FPGA_MAGIC, 25, struct ScaleArgs)
#define IOCTL_CONFIG_NORMALIZE _IOW(IOCTL_FPGA_MAGIC, 26, struct NormalizeArgs)
#define IOCTL_CONFIG_RESIZE _IOW(IOCTL_FPGA_MAGIC, 30, struct ResizeArgs)
#define IOCTL_CONFIG_DWCONV _IOW(IOCTL_FPGA_MAGIC, 31, struct DWconvArgs)
#define IOCTL_CONFIG_INPLACE _IOW(IOCTL_FPGA_MAGIC, 40, struct InplaceArgs)
......@@ -273,61 +312,19 @@ struct FpgaResetArgs {
_IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs)
#define IOCTL_CONFIG_ACTIVATION_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 43, struct ActiveParamterArgs)
#define IOCTL_CONFIG_GLOBAL_POOL_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 44, struct GlobalPoolArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs)
#define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs)
//============================== API =============================
struct DeconvArgs {
uint32_t sub_conv_num;
uint32_t group_num;
uint32_t filter_num;
uint32_t omit_size;
uint32_t sub_output_width;
uint32_t sub_output_height;
struct ImageOutputArgs output;
struct SplitConvArgs* split_conv_args;
};
struct SplitArgs {
uint32_t image_num;
int16_t* image_in;
float* scale_in;
void** images_out;
float** scales_out;
uint32_t* out_channel_nums;
uint32_t height;
uint32_t width;
};
#define IOCTL_DEVICE_INFO _IOW(IOCTL_FPGA_MAGIC, 100, struct DeviceInfoArgs)
struct ConcatArgs {
uint32_t image_num;
half** images_in;
float** scales_in;
void* image_out;
float* scale_out;
uint32_t* channel_num;
uint32_t height;
uint32_t width;
};
#define IOCTL_SEPARATOR_2 200
#define IOCTL_PREPROCESS _IOW(IOCTL_FPGA_MAGIC, 201, struct PreprocessArgs)
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct ConvArgs* conv_arg;
struct ConcatArgs concat_arg;
};
struct GroupConvArgs {
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct SplitConvArgs* conv_args;
struct ConcatArgs concat_arg;
};
//============================== API =============================
inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
int open_device();
......@@ -345,11 +342,10 @@ void fpga_copy(void* dst, const void* src, int size);
int fpga_flush(void* address, size_t size);
int fpga_invalidate(void* address, size_t size);
int get_device_info(const struct DeviceInfo& args);
int get_device_info(const struct DeviceInfoArgs& args);
int perform_bypass(const struct BypassArgs& args);
int compute_fpga_conv_basic(const struct ConvArgs& args);
int compute_fpga_conv(const struct SplitConvArgs& args);
int compute_fpga_pool(const struct PoolingArgs& args);
int compute_fpga_ewadd(const struct EWAddArgs& args);
int compute_fpga_scale(const struct ScaleArgs& args);
......@@ -357,6 +353,7 @@ int compute_fpga_concat(const struct ConcatArgs& args);
int compute_fpga_resize(const struct ResizeArgs& args);
int config_activation(const struct ActiveParamterArgs& args);
int config_global_pool(const struct GlobalPoolArgs& args);
int config_power(const struct PowerArgs& args);
int compute_fpga_dwconv(const struct DWconvArgs& args);
int config_norm_param(const struct NormalizeParameterArgs& args);
......@@ -368,6 +365,7 @@ int flush_cache(void* addr, int size);
int invalidate_cache(void* addr, int size);
int fpga_reset();
int compute_preprocess(const struct PreprocessArgs& args);
int16_t fp32_2_fp16(float fp32_num);
float fp16_2_fp32(int16_t fp16_num);
......
......@@ -78,31 +78,59 @@ struct ConvParam : PEParam {
Tensor* filter = nullptr;
int groups = 1;
bool deconv = false;
std::vector<int> strides;
std::vector<int> paddings;
std::vector<int> kernelSize;
std::vector<int> dilations;
Tensor* scale() { return scale_; }
Tensor* scale() { return &scale_; }
Tensor* bias() { return bias_; }
Tensor* bias() { return &bias_; }
std::vector<BasicConvParam*>& splitParams() { return splitParams_; }
~ConvParam() {
for (BasicConvParam* p : splitParams_) {
delete p;
}
splitParams_.clear();
}
protected:
std::vector<BasicConvParam*> splitParams_;
Tensor* scale_ = new Tensor();
Tensor* bias_ = new Tensor();
Tensor scale_;
Tensor bias_;
};
struct BasicDWConvParam {
Tensor input;
Tensor output;
Tensor filter;
Tensor bias;
DWconvArgs args;
Tensor quantizedFilter;
Tensor quantizedBias;
};
struct DepthwiseConvSplitParam : ConvParam {
public:
DWconvArgs args;
std::vector<BasicDWConvParam*>& splitParams() { return splitParams_; }
protected:
std::vector<BasicDWConvParam*> splitParams_;
};
struct DepthwiseConvParam : ConvParam {
public:
Tensor* quantizedFilter() { return quantizedFilter_; }
Tensor* quantizedFilter() { return &quantizedFilter_; }
DWconvArgs args;
protected:
Tensor* quantizedFilter_ = new Tensor();
Tensor quantizedFilter_;
};
enum PoolingType : int {
......@@ -124,6 +152,16 @@ struct PoolingParam : PEParam {
PoolingArgs poolingArgs = {0};
};
struct PoolingSplitParam : ConvParam {
public:
PoolingArgs args;
std::vector<PoolingParam*>& splitParams() { return splitParams_; }
protected:
std::vector<PoolingParam*> splitParams_;
};
struct ConcatParam : PEParam {
public:
std::vector<Tensor*> inputs;
......@@ -154,13 +192,13 @@ struct FullyConnectedParam : PEParam {
Tensor* bias = nullptr;
Tensor* output = nullptr;
Tensor* quantizedFilter() { return quantizedFilter_; }
Tensor* quantizedFilter() { return &quantizedFilter_; }
Tensor* biasScale() { return biasScale_; }
Tensor* biasScale() { return &biasScale_; }
protected:
Tensor* quantizedFilter_ = new Tensor();
Tensor* biasScale_ = new Tensor();
Tensor quantizedFilter_;
Tensor biasScale_;
};
struct SoftmaxParam : PEParam {
......@@ -229,15 +267,15 @@ struct ScaleParam : PEParam {
Tensor* scale = nullptr;
Tensor* bias = nullptr;
Tensor* alignedScale() { return alignedScale_; }
Tensor* alignedScale() { return &alignedScale_; }
Tensor* alignedBias() { return alignedBias_; }
Tensor* alignedBias() { return &alignedBias_; }
ScaleArgs args = {0};
protected:
Tensor* alignedScale_ = new Tensor();
Tensor* alignedBias_ = new Tensor();
Tensor alignedScale_;
Tensor alignedBias_;
};
struct ResizeParam : PEParam {
......
......@@ -29,6 +29,11 @@ class ConcatPE : public PE {
Tensor* output = param_.output;
output->setAligned(false);
output->setDataLocation(CPU);
bool cacheable = true;
for (auto in : param_.inputs) {
cacheable &= in->cacheable();
}
output->setCacheable(cacheable);
return true;
}
......
......@@ -32,6 +32,107 @@ namespace zynqmp {
class ConvPE : public PE {
public:
// CPU reference implementation of this layer's convolution for HWC-layout
// tensors: copies the input up to fp32, runs a naive direct convolution with
// the per-output-channel scale and bias fused into the result, then copies
// the result back into the output tensor.
void cpu_conv_half_hwc() {
Tensor* input = param_.input;
Tensor* output = param_.output;
Shape& input_shape = input->shape();
Shape& out_shape = output->shape();
int image_height = input_shape.height();
int image_width = input_shape.width();
int image_channels = input_shape.channel();
int image_pad_h = param_.paddings[0];
// NOTE(review): pad_w reads paddings[0], not paddings[1] — only correct for
// symmetric padding; confirm for non-square pads.
int image_pad_w = param_.paddings[0];
int kernel_height = param_.filter->shape().height();
int kernel_width = param_.filter->shape().width();
int kernel_step_h = param_.strides[0];
int kernel_step_w = param_.strides[1];
// Dilation is hard-coded to 1, so the dilated-kernel math below is inert.
int dilation_rate = 1;
int out_channel = out_shape.channel();
int pooled_height_ = out_shape.height();
int pooled_width_ = out_shape.width();
int filter_chw = image_channels * kernel_height * kernel_width;
// Effective (dilated) kernel extent in each direction.
int kernel_rw = kernel_width + (dilation_rate - 1) * (kernel_width - 1);
int kernel_rh = kernel_height + (dilation_rate - 1) * (kernel_height - 1);
float* weight = param_.filter->data<float>();
Tensor float_input;
Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
input->syncToDevice();
float_input.copyFrom(input);
float_input.invalidate();
// NOTE(review): debug dump left enabled — writes a file on every call.
float_input.saveToFile("fi", true);
float* out = float_output.mutableData<float>(FP32, output->shape());
for (int ph = 0; ph < pooled_height_; ph++) {
for (int pw = 0; pw < pooled_width_; pw++) {
// Top-left corner of the receptive field in input coordinates
// (negative while inside the padding region).
int hstart = ph * kernel_step_h - image_pad_h;
int wstart = pw * kernel_step_w - image_pad_w;
int hend = std::min(hstart + kernel_rh, static_cast<int>(image_height));
int wend = std::min(wstart + kernel_rw, static_cast<int>(image_width));
// First in-bounds sample position when the window starts in the padding.
// NOTE(review): here ceil() closes before the division, unlike
// wstart_plus below where the division is inside ceil(); the two forms
// only coincide because dilation_rate == 1 — verify before ever enabling
// dilation.
int hstart_plus =
dilation_rate *
ceil(static_cast<float>(image_pad_h - ph * kernel_step_h)) /
static_cast<float>(dilation_rate) -
image_pad_h + ph * kernel_step_h;
int wstart_plus =
dilation_rate *
ceil(static_cast<float>(image_pad_w - pw * kernel_step_w) /
static_cast<float>(dilation_rate)) -
image_pad_w + pw * kernel_step_w;
int hstart_ = hstart < 0 ? hstart_plus : hstart;
int wstart_ = wstart < 0 ? wstart_plus : wstart;
for (int oc = 0; oc < out_channel; oc++) {
float sum = 0.0f;
const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
for (int c = 0; c < image_channels; c++) {
for (int h = hstart_; h < hend; h += dilation_rate) {
// Kernel row index corresponding to input row h.
int hi = 0;
if (hstart < 0) {
hi = (kernel_rh - (hend - h)) / dilation_rate;
} else {
hi = (h - hstart_) / dilation_rate;
}
for (int w = wstart_; w < wend; w += dilation_rate) {
// Kernel column index corresponding to input column w.
int wi = 0;
if (wstart < 0) {
wi = (kernel_rw - (wend - w)) / dilation_rate;
} else {
wi = (w - wstart_) / dilation_rate;
}
// Input is HWC-interleaved; filter is laid out [oc][c][kh][kw].
const int index = (h * image_width + w) * image_channels + c;
int weight_index = oc * filter_chw +
kernel_width * kernel_height * c +
kernel_width * hi + wi;
float value = image_addr[index] * weight[weight_index];
sum += value;
}
}
}
// Fuse the per-output-channel scale and bias into the accumulated sum.
float s = param_.scale()->data<float>()[oc];
float b = param_.bias()->data<float>()[oc];
out[pool_index] = sum * s + b;
}
}
}
float_output.flush();
// NOTE(review): more debug dumps left enabled below.
float_output.saveToFile("fo", true);
output->copyFrom(&float_output);
output->invalidate();
output->saveToFile("out", true);
// exit(-1);
}
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
......@@ -40,28 +141,30 @@ class ConvPE : public PE {
}
void apply() {
split_axis = fill_split_arg(param_);
split_channel = param_.groups != 1 && param_.splitParams().size() > 1;
if (split_axis == 0 && param_.splitParams().size() > 1) {
ConcatParam& concat_param = concatPE_.param();
for (auto conv_param : param_.splitParams()) {
concat_param.inputs.push_back(&conv_param->output);
if (param_.deconv == false) {
split_axis = fill_split_arg(param_);
split_channel = param_.groups != 1 && param_.splitParams().size() > 1;
if (split_axis == 0 && param_.splitParams().size() > 1) {
ConcatParam& concat_param = concatPE_.param();
for (auto conv_param : param_.splitParams()) {
concat_param.inputs.push_back(&conv_param->output);
}
concat_param.output = param_.output;
concatPE_.init();
concatPE_.apply();
}
concat_param.output = param_.output;
concatPE_.init();
concatPE_.apply();
}
if (split_channel) {
SplitParam& split_param = splitPE_.param();
split_param.input = param_.input;
for (auto conv_param : param_.splitParams()) {
split_param.outputs.push_back(&conv_param->input);
if (split_channel) {
SplitParam& split_param = splitPE_.param();
split_param.input = param_.input;
for (auto conv_param : param_.splitParams()) {
split_param.outputs.push_back(&conv_param->input);
}
splitPE_.init();
splitPE_.apply();
}
splitPE_.init();
splitPE_.apply();
}
if (DLEngine::get_instance().isZU3() &&
......@@ -70,8 +173,9 @@ class ConvPE : public PE {
param_.input->shape().channel() >= 2048) {
use_cpu_ = true;
}
if (!use_cpu_) {
// param_.filter->releaseData();
param_.filter->releaseData();
}
// exit(-1);
......@@ -120,16 +224,17 @@ class ConvPE : public PE {
}
delete[] mi;
float_output.flush();
output->flush();
output->copyFrom(&float_output);
output->invalidate();
}
bool dispatch() {
fpga_reset();
if (use_cpu_) {
cpu_compute();
return true;
}
inplace_.global_pool_en = false;
if (param_.activeParam.type == TYPE_RELU) {
inplace_.relu_enable = true;
} else if (param_.activeParam.type == TYPE_RELU6) {
......@@ -146,24 +251,20 @@ class ConvPE : public PE {
if (inplace_.leaky_relu_enable) {
activeParamterArgs.type = TYPE_LEAKY_RELU;
activeParamterArgs.leaky_relu_factor =
fp32_2_fp16(param_.activeParam.leaky_relu_factor);
float_to_half(param_.activeParam.leaky_relu_factor);
config_activation(activeParamterArgs);
}
}
std::vector<BasicConvParam*>& params = param_.splitParams();
if (split_channel) {
if (split_channel && param_.deconv == false) {
// splitPE_.param().input->saveToFile("input_image",true);
splitPE_.dispatch();
}
int ret = 0;
for (auto conv_param : params) {
// conv_param->input.printScale();
// if (split_channel) {
// conv_param->input.saveToFile("pack_image",true);
// }
ret |= compute_fpga_conv_basic(conv_param->args);
}
......@@ -173,18 +274,18 @@ class ConvPE : public PE {
inplace_.leaky_relu_enable = false;
inplace_.relu6_enable = false;
inplace_.sigmoid_enable = false;
inplace_.global_pool_en = false;
config_inplace(inplace_);
if (inplace_.leaky_relu_enable) {
if (param_.activeParam.type == TYPE_LEAKY_RELU) {
activeParamterArgs.type = TYPE_LEAKY_RELU;
activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0);
activeParamterArgs.leaky_relu_factor = float_to_half(0);
config_activation(activeParamterArgs);
}
}
size_t size = params.size();
if (split_axis == 0 && ret == 0 && size > 1) {
// std::cout << "concat size:" << size << std::endl;
if (split_axis == 0 && ret == 0 && size > 1 && param_.deconv == false) {
concatPE_.dispatch();
}
if (split_axis == 1 && ret == 0 && size > 1) {
......
......@@ -171,6 +171,30 @@ inline void format_bias_scale_new(Tensor* bias,
}
}
// Replicates the per-channel fp32 bias into an fp16 buffer whose total
// length is a multiple of 16, matching the hardware's bias alignment.
inline void format_16_bias(Tensor* bias, Tensor* quantized_bias, int channel) {
  const int hw_align = 16;
  int repeat = 1;
  // When the channel count is not already 16-aligned, replicate the bias
  // vector lcm(channel, 16) / channel times so the total length aligns.
  if (channel % hw_align != 0 || channel < hw_align) {
    repeat = lcm_(channel, hw_align) / channel;
  }
  Shape shape(N, {channel * repeat});
  float16* dst = quantized_bias->mutableData<float16>(FP16, shape);
  float* src = bias->data<float>();
  for (int r = 0; r < repeat; ++r) {
    float16* block = dst + r * channel;
    for (int c = 0; c < channel; ++c) {
      block[c] = float_to_half(src[c]);
    }
  }
  quantized_bias->flush();
}
inline void format_scale_bias(Tensor* scale,
Tensor* bias,
Tensor* filter,
......@@ -237,7 +261,6 @@ inline void format_filter(Tensor* filter,
std::vector<float>& scales, // NOLINT
float max) {
float max_value = find_max(*filter);
// max_value = max; //TODO: global quantization for filter
Shape& filter_shape = filter->shape();
int mem_size;
......@@ -264,20 +287,9 @@ inline void format_filter(Tensor* filter,
quantized_filter->flush();
fpga_free(quantized_data);
// for (size_t i = 0; i < max_values.size(); i++) {
// // scales.push_back(max_values[i] / max_value);
// scales.push_back(1.0f);
// }
// filter->saveToFile("filter.txt");
// std::ofstream ofs;
// ofs.open("quant.txt");
// for (int i = 0; i < mem_size; i++) {
// float value = quantized_data[i];
// ofs << value << std::endl;
// }
// ofs.close();
// exit(-1);
for (size_t i = 0; i < max_values.size(); i++) {
scales.push_back(max_values[i] / max_value);
}
}
inline void format_dw_filter(Tensor* filter,
......@@ -371,8 +383,9 @@ inline void split_filter_num(const ConvParam& c_param) {
new_filter.flush();
conv_param->filter.mutableData<float>(FP32, f_shape);
std::vector<float> v; // TODO(chonwhite) change variable name;
format_filter(&new_filter, &(conv_param->filter), param.groups, v, max);
std::vector<float> quant_scale;
format_filter(
&new_filter, &(conv_param->filter), param.groups, quant_scale, max);
conv_param->filter.setDataType(INT8);
Tensor scale;
......@@ -384,14 +397,14 @@ inline void split_filter_num(const ConvParam& c_param) {
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_num; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
scale_data[n] =
param.scale()->data<float>()[n + chnnnel_start] * quant_scale[n];
}
for (int n = 0; n < filter_num; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
}
format_bias_scale_new(&bias, &scale, &conv_param->scaleBias);
conv_param->scaleBias.flush();
args.group_num = param.groups;
args.sb_address = conv_param->scaleBias.data<float16>();
args.kernel.stride_h = param.strides[1];
......@@ -431,7 +444,6 @@ inline void pack_channel_filter(const ConvParam& c_param) {
int channel_per_pack = filter->shape().channel() * group_per_pack;
float max = find_max(*filter);
Shape& out_shape = out->shape();
for (int i = 0; i < pack_num; i++) {
......@@ -500,8 +512,9 @@ inline void pack_channel_filter(const ConvParam& c_param) {
float mem_factor = filter_num_alignment / filter_per_pack;
conv_param->filter.setMemScale(mem_factor);
std::vector<float> v; // TODO(chonwhite) change variable name
format_filter(&new_filter, &(conv_param->filter), new_group, v, max);
std::vector<float> quant_scale;
format_filter(
&new_filter, &(conv_param->filter), new_group, quant_scale, max);
conv_param->filter.setDataType(INT8);
Tensor scale;
......@@ -513,7 +526,8 @@ inline void pack_channel_filter(const ConvParam& c_param) {
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_current_pack; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
scale_data[n] =
param.scale()->data<float>()[n + chnnnel_start] * quant_scale[n];
}
for (int n = 0; n < filter_current_pack; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
......@@ -591,7 +605,7 @@ inline void split_channel(const ConvParam& c_param) {
float* bias_data = bias.mutableData<float>(FP32, bs_shape);
float* scale_data = scale.mutableData<float>(FP32, bs_shape);
for (int c = 0; c < channel; c++) {
scale_data[c] = 1;
scale_data[c] = scales[c];
bias_data[c] = param.bias()->data<float>()[c] / num;
}
scale.flush();
......@@ -666,6 +680,105 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
return ret == 0;
}
// Splits a depthwise conv whose per-row working set (width * channel *
// kernel_h) exceeds the hardware buffer budget into `num` equal channel-wise
// slices. One BasicDWConvParam (filter slice, scale/bias slice and DWconv
// args) is built per slice and appended to param.splitParams(); the caller
// (DepthwiseConvSplitPE) scatters the input into the slice inputs and
// gathers the slice outputs back. Ownership of the new'd params passes to
// the splitParams() vector.
inline void dwconv_split_channel(DepthwiseConvSplitParam& param) { // NOLINT
  Tensor* input = param.input;
  Tensor* output = param.output;
  Tensor* filter = param.filter;
  input->syncToCPU();
  int h_kernel = filter->shape().height();
  int w_kernel = filter->shape().width();
  int c = input->shape().channel();
  int w = input->shape().width();
  // Working-set size the hardware must buffer for one output row.
  int wc_h_kernel = w * c * h_kernel;
  int dwconv_limit = 131072;  // 2^17; hardware capacity -- TODO confirm unit
  int num = ceil(wc_h_kernel * 1.0f / dwconv_limit);
  // Grow num until it divides the channel count evenly (equal slices).
  while (input->shape().channel() % num != 0) {
    num++;
  }
  int channel = input->shape().channel() / num;
  if (channel % 16 != 0) {
    // NOTE(review): alignment violation is only logged, not enforced;
    // consider failing hard here.
    std::cout << "input channel must div by 16" << std::endl;
    // throw -1;
  }
  Shape bs_shape(N, {channel});
  float16* output_address = nullptr;
  float16* input_address = nullptr;
  float* out_scale_address = nullptr;
  for (int i = 0; i < num; i++) {
    BasicDWConvParam* dwconv_param = new BasicDWConvParam();
    // input && output;
    Shape in_shape(
        NCHW, {1, channel, input->shape().height(), input->shape().width()});
    if (num == 1) {
      // Single slice: operate directly on the original tensors.
      input_address = input->data<float16>();
      output_address = output->data<float16>();
      out_scale_address = output->scale();
    } else {
      // Private per-slice tensors, filled by SplitPE / drained by ConcatPE.
      input_address = dwconv_param->input.mutableData<float16>(FP16, in_shape);
      output_address =
          dwconv_param->output.mutableData<float16>(FP16, in_shape);
      out_scale_address = dwconv_param->output.scale();
    }
    // filter transformation;
    Shape f_shape(NCHW, {channel, 1, h_kernel, w_kernel});
    Tensor split_filter;
    float* split_filter_data = split_filter.mutableData<float>(FP32, f_shape);
    int filter_hwc = h_kernel * w_kernel * channel;
    memcpy(split_filter_data,
           filter->data<float>() + i * filter_hwc,
           filter_hwc * sizeof(float));
    split_filter.flush();
    // Per-slice views of the scale/bias vectors.
    Tensor split_scale;
    Tensor split_bias;
    float* scale_data = split_scale.mutableData<float>(FP32, bs_shape);
    float* bias_data = split_bias.mutableData<float>(FP32, bs_shape);
    for (int c = 0; c < channel; c++) {
      scale_data[c] = param.scale()->data<float>()[i * channel + c];
      bias_data[c] = param.bias()->data<float>()[i * channel + c];
    }
    split_bias.flush();
    // NOTE(review): these are copies of the member tensors sized via
    // mutableData below, yet the format_* calls write into the members
    // directly; whether the copies share storage with the members depends
    // on Tensor's copy semantics -- verify this is not dead (or
    // conflicting) work.
    Tensor quantized_filter = dwconv_param->quantizedFilter;
    Tensor quantized_bias = dwconv_param->quantizedBias;
    quantized_filter.mutableData<float16>(FP16, f_shape);
    quantized_bias.mutableData<float16>(FP16, f_shape);
    format_dw_filter(
        &split_filter, &(dwconv_param->quantizedFilter), scale_data);
    format_16_bias(&split_bias, &(dwconv_param->quantizedBias), channel);
    DWconvArgs& args = dwconv_param->args;
    args.bias_address = dwconv_param->quantizedBias.data<float16>();
    args.filter_address = dwconv_param->quantizedFilter.data<float16>();
    // NOTE(review): width takes f_shape.height() and height takes
    // f_shape.width() -- looks swapped relative to the NCHW f_shape above;
    // confirm against the driver's expected kernel layout.
    args.kernel.width = f_shape.height();
    args.kernel.height = f_shape.width();
    args.kernel.stride_w = param.strides[0];
    args.kernel.stride_h = param.strides[1];
    args.image.address = input_address;
    args.image.channels = channel;
    args.image.height = input->shape().height();
    args.image.width = input->shape().width();
    args.image.pad_width = param.paddings[0];
    args.image.pad_height = param.paddings[1];
    // NOTE(review): every slice reuses the whole input's scale; assumed
    // shared across channel slices -- confirm.
    args.image.scale_address = input->scale();
    args.output.address = output_address;
    args.output.scale_address = out_scale_address;
    args.out_width = param.output->shape().width();
    args.out_height = param.output->shape().height();
    args.sub_conv_num = 1;
    param.splitParams().push_back(dwconv_param);
  }
}
} // namespace zynqmp
} // namespace paddle
......
......@@ -61,11 +61,6 @@ class DepthwiseConvPE : public PE {
float16* b_data = bias_.mutableData<float16>(FP16, shape);
if (param_.bias()->dataType() == FP32) {
float* new_bias_data = param_.bias()->data<float>();
// bias从float转换成float16
// for (int i = 0; i < channel; i++) {
// b_data[i] = float_to_half(new_bias_data[i]);
// }
// bias 按16对齐填充hw
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
float16 value = float_to_half(new_bias_data[j]);
......@@ -75,10 +70,8 @@ class DepthwiseConvPE : public PE {
bias_.flush();
} else {
float16* new_bias_data = param_.bias()->data<float16>();
// memcpy(b_data, new_bias_data, channel * sizeof(float16));
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
// float16 value = float_to_half(bias_data_float[j]);
b_data[i * length + j] = new_bias_data[j];
}
}
......@@ -92,12 +85,10 @@ class DepthwiseConvPE : public PE {
format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
} else {
// filter 全为1时,且channal为对齐时
// TODO(chonwhite) filter fall one and channel aligned case
float16* scale_data = param_.scale()->data<float16>();
float16* filter_data = param.quantizedFilter()->mutableData<float16>(
FP16, param.filter->shape());
// memcpy(filter_data, scale_data, channel * sizeof(float16));
memcpy(filter_data,
scale_data,
param.filter->shape().numel() * sizeof(float16));
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/pes/conv_process.hpp"
namespace paddle {
namespace zynqmp {
class DepthwiseConvSplitPE : public PE {
public:
inline int gcd_(int a, int b) {
while (b) {
int temp = a;
a = b;
b = temp % b;
}
return a;
}
inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
output->setDataLocation(Device);
return true;
}
void apply() {
DepthwiseConvSplitParam& param = param_;
Tensor* input = param.input;
Tensor* output = param.output;
int channel = output->shape().channel();
dwconv_split_channel(param);
if (param.splitParams().size() > 1) {
SplitParam& split_param = splitPE_.param();
split_param.input = param_.input;
for (auto dwconv_param : param_.splitParams()) {
split_param.outputs.push_back(&dwconv_param->input);
}
splitPE_.init();
splitPE_.apply();
ConcatParam& concat_param = concatPE_.param();
for (auto dwconv_param : param_.splitParams()) {
concat_param.inputs.push_back(&dwconv_param->output);
}
concat_param.output = param_.output;
concatPE_.init();
concatPE_.apply();
}
}
bool dispatch() {
param_.input->syncToDevice();
if (param_.activeParam.type == TYPE_RELU) {
inplace_.relu_enable = true;
} else if (param_.activeParam.type == TYPE_RELU6) {
inplace_.relu6_enable = true;
} else if (param_.activeParam.type == TYPE_SIGMOID) {
inplace_.sigmoid_enable = true;
} else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
inplace_.leaky_relu_enable = true;
}
if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
inplace_.relu6_enable || inplace_.sigmoid_enable) {
config_inplace(inplace_);
}
std::vector<BasicDWConvParam*>& params = param_.splitParams();
if (params.size() > 1) {
splitPE_.dispatch();
}
int ret = 0;
for (auto dwconv_param : params) {
ret |= compute_fpga_dwconv(dwconv_param->args);
}
if (params.size() > 1) {
concatPE_.dispatch();
}
if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
inplace_.relu6_enable || inplace_.sigmoid_enable) {
inplace_.relu_enable = false;
inplace_.leaky_relu_enable = false;
inplace_.relu6_enable = false;
inplace_.sigmoid_enable = false;
config_inplace(inplace_);
}
return ret;
}
DepthwiseConvSplitParam& param() { return param_; }
private:
DepthwiseConvSplitParam param_;
ConcatPE concatPE_;
SplitPE splitPE_;
InplaceArgs inplace_ = {0};
};
} // namespace zynqmp
} // namespace paddle
......@@ -38,7 +38,7 @@ class FullyConnectedPE : public PE {
Tensor* input = param_.input;
convParam_.input = param_.input;
convParam_.output = param_.output;
// convParam_.relu = param_.relu;
convParam_.activeParam.type = param_.activeParam.type;
convParam_.groups = 1;
convParam_.strides = {1, 1};
......@@ -48,9 +48,6 @@ class FullyConnectedPE : public PE {
int num = param_.filter->shape().channel();
int chw = param_.filter->shape().num();
// if (num == 2) {
// return;
// }
int height = param_.input->shape().height();
int width = param_.input->shape().width();
......
......@@ -41,7 +41,9 @@ class InputPE : public PE {
src = &half_tensor;
}
output->mutableData<void>();
src->alignImage(output, true);
src->alignImage();
output->copyFrom(src);
// src->alignImage(output, true);
return true;
}
......
......@@ -23,6 +23,7 @@ limitations under the License. */
namespace paddle {
namespace zynqmp {
class NormPE : public PE {
public:
bool init() {
......@@ -106,21 +107,19 @@ class NormPE : public PE {
}
bool dispatch() {
cpuCompute();
// std::cout << "CPU normalize ---------------------" << std::endl;
// param_.input->syncToDevice();
// // param_.input->saveToFile("normalize_fpga_", true);
// config_norm_param(norm_param_args_);
// inplace_args_.normalize_enable = true;
// config_inplace(inplace_args_);
// perform_bypass(bypass_args_);
// inplace_args_.normalize_enable = false;
// config_inplace(inplace_args_);
// compute_norm(norm_args_);
// param_.output->saveToFile("normalize_fpga_", true);
// cpuCompute();
// std::cout << "FPGA normalize ---------------------" << std::endl;
param_.input->syncToDevice();
config_norm_param(norm_param_args_);
inplace_args_.normalize_enable = true;
config_inplace(inplace_args_);
perform_bypass(bypass_args_);
inplace_args_.normalize_enable = false;
config_inplace(inplace_args_);
compute_norm(norm_args_);
return true;
}
......@@ -135,5 +134,6 @@ class NormPE : public PE {
NormalizeArgs norm_args_ = {0};
};
} // namespace zynqmp
} // namespace paddle
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
......@@ -25,8 +26,6 @@ class OutputPE : public PE {
bool init() {
Tensor* output = param_.output;
output->setAligned(false);
DLEngine::get_instance().out_data = reinterpret_cast<float*>(
fpga_malloc(output->shape().numel() * sizeof(float)));
return true;
}
......@@ -43,15 +42,7 @@ class OutputPE : public PE {
} else {
output->copyFrom(input);
}
//
output->syncToCPU();
if (DLEngine::get_instance().out_data == nullptr) {
DLEngine::get_instance().out_data = reinterpret_cast<float*>(
fpga_malloc(output->shape().numel() * sizeof(float)));
}
memcpy(DLEngine::get_instance().out_data,
output->data<void>(),
output->shape().numel() * sizeof(float));
return true;
}
......
......@@ -50,14 +50,17 @@ class PoolingPE : public PE {
PoolingArgs args = {0};
args.mode = param_.type;
auto paddings = *param_.paddings;
args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
if (param_.globalPooling) {
args.kernel_reciprocal = fp32_2_fp16(1.0f);
} else {
args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
}
args.image.address = input->data<float16>();
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
args.image.pad_height = paddings[0];
args.image.pad_width = paddings[2];
args.image.pad_height = param_.paddings[0];
args.image.pad_width = param_.paddings[1];
args.image.scale_address = input->scale();
args.output.address = output->mutableData<float16>();
args.output.scale_address = output->scale();
......@@ -69,11 +72,8 @@ class PoolingPE : public PE {
args.out_width = output->shape().width();
param_.poolingArgs = args;
// use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
// && (k_width > 7 || k_height > 7);
use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
(k_width > 255 || k_height > 255);
// use_cpu_ = param_.type == AVERAGE;
}
void compute() {
......@@ -86,13 +86,12 @@ class PoolingPE : public PE {
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
auto paddings = *param_.paddings;
int image_height = input->shape().height();
int image_width = input->shape().width();
int image_channels = input->shape().channel();
int image_pad_h = paddings[0];
int image_pad_w = paddings[2];
int image_pad_h = param_.paddings[0];
int image_pad_w = param_.paddings[1];
int kernel_height = param_.kernelSize[1];
int kernel_width = param_.kernelSize[0];
int kernel_step_h = param_.strides[0];
......@@ -118,8 +117,7 @@ class PoolingPE : public PE {
for (int c = 0; c < image_channels; ++c) {
const int pool_index = (ph * pooled_width_ + pw) * image_channels + c;
float sum = 0;
// const int index =
// (hstart * image_width + wstart) * image_channels + c;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int index = (h * image_width + w) * image_channels + c;
......@@ -127,6 +125,7 @@ class PoolingPE : public PE {
sum += value;
}
}
float value = sum / kernel;
if (value > max) {
max = value;
......@@ -148,7 +147,6 @@ class PoolingPE : public PE {
Tensor float_input;
float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
// float_input.saveToFile("pool_float.txt");
float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
......@@ -167,7 +165,6 @@ class PoolingPE : public PE {
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush();
// exit(-1);
}
void cpu_compute() {
......@@ -197,18 +194,41 @@ class PoolingPE : public PE {
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush();
// exit(-1);
}
bool dispatch() {
if (use_cpu_) {
// cpu_compute();
compute();
// exit(-1);
return true;
}
param_.input->syncToDevice();
return compute_fpga_pool(param_.poolingArgs) == 0;
if (param_.globalPooling) {
inplace_.relu_enable = false;
inplace_.leaky_relu_enable = false;
inplace_.relu6_enable = false;
inplace_.sigmoid_enable = false;
inplace_.global_pool_en = true;
config_inplace(inplace_);
int kernel_height = param_.kernelSize[1];
int kernel_width = param_.kernelSize[0];
globalPoolArgs.global_pool_factor =
float_to_half(1.0f / (kernel_height * kernel_width));
config_global_pool(globalPoolArgs);
}
int ret = (compute_fpga_pool(param_.poolingArgs) == 0);
if (param_.globalPooling) {
inplace_.relu_enable = false;
inplace_.leaky_relu_enable = false;
inplace_.relu6_enable = false;
inplace_.sigmoid_enable = false;
inplace_.global_pool_en = false;
config_inplace(inplace_);
globalPoolArgs.global_pool_factor = float_to_half(0);
config_global_pool(globalPoolArgs);
}
return ret;
}
PoolingParam& param() { return param_; }
......@@ -216,6 +236,8 @@ class PoolingPE : public PE {
private:
PoolingParam param_;
bool use_cpu_;
InplaceArgs inplace_ = {0};
GlobalPoolArgs globalPoolArgs;
};
} // namespace zynqmp
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef pooling_process_hpp
#define pooling_process_hpp
#include <string.h>
#include <cmath>
#include <vector>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/tensor.hpp"
namespace paddle {
namespace zynqmp {
// Splits a pooling op whose per-row working set (width * channel *
// kernel_h) exceeds the hardware buffer budget into `num` equal
// channel-wise slices, appending one heap-allocated PoolingParam per slice
// to `splitParams`. Ownership of the new'd params (and, when num > 1, of
// their private input/output tensors) passes to the caller
// (PoolingSplitPE), which scatters/gathers the slices.
inline void pooling_split_channel(
    PoolingParam& param, // NOLINT
    std::vector<PoolingParam*>& splitParams) { // NOLINT
  Tensor* input = param.input;
  Tensor* output = param.output;
  input->syncToCPU();
  int h_kernel = param.kernelSize[0];
  int w_kernel = param.kernelSize[1];
  if (param.globalPooling) {
    // Global pooling covers the whole feature map.
    h_kernel = input->shape().height();
    w_kernel = input->shape().width();
  }
  int c = input->shape().channel();
  int w = input->shape().width();
  int wc_h_kernel = w * c * h_kernel;
  // Same hardware limit as the depthwise-conv split (shared line buffer);
  // TODO confirm unit.
  int dwconv_limit = 131072;
  int num = ceil(wc_h_kernel * 1.0f / dwconv_limit);
  // Grow num until it divides the channel count evenly (equal slices).
  while (input->shape().channel() % num != 0) {
    num++;
  }
  // Exact division after the loop above; ceil is effectively a no-op.
  int channel = ceil(input->shape().channel() * 1.0f / num);
  float16* output_address = nullptr;
  float16* input_address = nullptr;
  float* out_scale_address = nullptr;
  for (int i = 0; i < num; i++) {
    PoolingParam* pooling_param = new PoolingParam();
    // input && output;
    Shape in_shape(
        NCHW, {1, channel, input->shape().height(), input->shape().width()});
    Shape out_shape(
        NCHW, {1, channel, output->shape().height(), output->shape().width()});
    if (num == 1) {
      // Single slice: alias the original tensors directly (not owned).
      pooling_param->input = input;
      pooling_param->output = output;
      input_address = input->data<float16>();
      output_address = output->data<float16>();
      out_scale_address = output->scale();
    } else {
      // Private per-slice tensors, filled by SplitPE / drained by ConcatPE.
      pooling_param->input = new Tensor();
      pooling_param->output = new Tensor();
      input_address =
          pooling_param->input->mutableData<float16>(FP16, in_shape);
      output_address =
          pooling_param->output->mutableData<float16>(FP16, out_shape);
      out_scale_address = pooling_param->output->scale();
    }
    PoolingArgs& args = pooling_param->poolingArgs;
    args.mode = param.type;
    // Average-pool divisor; for global pooling the hardware's global-pool
    // unit supplies the factor instead, so 1.0 is programmed here.
    args.kernel_reciprocal = fp32_2_fp16(1.0f / (w_kernel * h_kernel));
    if (param.globalPooling) {
      args.kernel_reciprocal = fp32_2_fp16(1.0f);
    }
    args.image.address = input_address;
    args.image.channels = channel;
    args.image.height = input->shape().height();
    args.image.width = input->shape().width();
    // NOTE(review): paddings[0]/[1] used as h/w here; other pooling code in
    // this backend has used paddings[0]/[2] -- confirm the convention.
    args.image.pad_height = param.paddings[0];
    args.image.pad_width = param.paddings[1];
    // NOTE(review): every slice reuses the whole input's scale; assumed
    // shared across channel slices -- confirm.
    args.image.scale_address = input->scale();
    args.output.address = output_address;
    args.output.scale_address = out_scale_address;
    args.kernel.height = h_kernel;
    args.kernel.width = w_kernel;
    args.kernel.stride_h = param.strides[0];
    args.kernel.stride_w = param.strides[1];
    args.out_height = output->shape().height();
    args.out_width = output->shape().width();
    splitParams.push_back(pooling_param);
  }
}
} // namespace zynqmp
} // namespace paddle
#endif /* pooling_process_hpp */
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/pes/concat_pe.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/pooling_process.hpp"
#include "lite/backends/fpga/KD/pes/scale_pe.hpp"
#include "lite/backends/fpga/KD/pes/split_pe.hpp"
namespace paddle {
namespace zynqmp {
// Pooling that exceeds the FPGA line-buffer limit, executed as several
// channel-wise slices prepared by pooling_split_channel(). Kernels larger
// than the hardware maximum (255) on a 1x1 output fall back to a CPU
// average pool.
class PoolingSplitPE : public PE {
 public:
  // Iterative Euclid gcd; helper for lcm_ (alignment computations).
  inline int gcd_(int a, int b) {
    while (b) {
      int temp = a;
      a = b;
      b = temp % b;
    }
    return a;
  }

  inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }

  bool init() {
    Tensor* output = param_.output;
    output->setAligned(true);
    output->setDataLocation(Device);
    return true;
  }

  void apply() {
    PoolingParam& param = param_;
    Tensor* input = param.input;
    Tensor* output = param.output;

    int k_height = 0;
    int k_width = 0;
    if (param_.globalPooling) {
      // Global pooling covers the whole feature map; materialize the
      // kernel size so the split logic below sees concrete dimensions.
      k_width = input->shape().width();
      k_height = input->shape().height();
      param_.kernelSize[0] = k_height;
      param_.kernelSize[1] = k_width;
    } else {
      // NOTE(review): [0]=h / [1]=w here, while dispatch() reads [1] as
      // height -- confirm the kernelSize index convention.
      k_height = param_.kernelSize[0];
      k_width = param_.kernelSize[1];
    }

    // Kernels beyond the hardware limit on a 1x1 output go to the CPU.
    use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
               (k_width > 255 || k_height > 255);
    if (use_cpu_) {
      return;
    }

    pooling_split_channel(param, splitParams_);

    if (splitParams_.size() > 1) {
      // Scatter the input channels into each slice's private input tensor.
      SplitParam& split_param = splitPE_.param();
      split_param.input = param_.input;
      for (auto pooling_param : splitParams_) {
        split_param.outputs.push_back(pooling_param->input);
      }
      splitPE_.init();
      splitPE_.apply();

      // Gather the slice outputs back into the final output tensor.
      ConcatParam& concat_param = concatPE_.param();
      for (auto pooling_param : splitParams_) {
        concat_param.inputs.push_back(pooling_param->output);
      }
      concat_param.output = param_.output;
      concatPE_.init();
      concatPE_.apply();
    }
  }

  // CPU fallback: average pooling with edge-clipped windows (the divisor is
  // the number of in-bounds elements, i.e. padding is exclusive). Also
  // derives the output quantization scale from the running max value.
  void compute() {
    Tensor* input = param_.input;
    Tensor* output = param_.output;
    input->syncToCPU();

    Tensor float_input;
    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
    float_input.copyFrom(input);
    float16* data_out = output->data<float16>();

    int image_height = input->shape().height();
    int image_width = input->shape().width();
    int image_channels = input->shape().channel();
    int image_pad_h = param_.paddings[0];
    int image_pad_w = param_.paddings[1];
    int kernel_height = param_.kernelSize[1];
    int kernel_width = param_.kernelSize[0];
    int kernel_step_h = param_.strides[0];
    int kernel_step_w = param_.strides[1];

    int pooled_height_ = output->shape().height();
    int pooled_width_ = output->shape().width();

    int kernel = kernel_height * kernel_width;
    float max = 0;

    for (int ph = 0; ph < pooled_height_; ++ph) {
      for (int pw = 0; pw < pooled_width_; ++pw) {
        int hstart = ph * kernel_step_h - image_pad_h;
        int wstart = pw * kernel_step_w - image_pad_w;
        int hend = std::min(hstart + kernel_height, image_height);
        int wend = std::min(wstart + kernel_width, image_width);
        hstart = std::max(hstart, 0);
        wstart = std::max(wstart, 0);
        // Divisor counts only the in-bounds window elements.
        kernel = (hend - hstart) * (wend - wstart);
        for (int c = 0; c < image_channels; ++c) {
          const int pool_index = (ph * pooled_width_ + pw) * image_channels + c;
          float sum = 0;
          for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
              const int index = (h * image_width + w) * image_channels + c;
              sum += image_addr[index];
            }
          }
          float value = sum / kernel;
          if (value > max) {
            max = value;
          }
          data_out[pool_index] = float_to_half(value);
        }
      }
    }
    output->scale()[0] = max / 127.0f;
    output->scale()[1] = 127.0f / max;
    output->flush();
  }

  bool dispatch() {
    Tensor* output = param_.output;
    param_.input->syncToDevice();
    if (use_cpu_) {
      compute();
      return true;
    }

    if (splitParams_.size() > 1) {
      splitPE_.dispatch();
    }

    int ret = 0;
    InplaceArgs inplace_args = {0};
    GlobalPoolArgs global_pool_args;

    if (param_.globalPooling) {
      // Route through the global-pool unit: enable it and program the
      // averaging factor 1/(kh*kw) before launching the slices.
      inplace_args.relu_enable = false;
      inplace_args.leaky_relu_enable = false;
      inplace_args.relu6_enable = false;
      inplace_args.sigmoid_enable = false;
      inplace_args.global_pool_en = true;
      config_inplace(inplace_args);
      int kernel_height = param_.kernelSize[1];
      int kernel_width = param_.kernelSize[0];
      global_pool_args.global_pool_factor =
          fp32_2_fp16(1.0f / (kernel_height * kernel_width));
      config_global_pool(global_pool_args);
    }

    for (auto pooling_param : splitParams_) {
      ret |= compute_fpga_pool(pooling_param->poolingArgs);
      // Propagate the (last) slice's output scale to the final output.
      float* scale_address = pooling_param->poolingArgs.output.scale_address;
      output->scale()[0] = scale_address[0];
      output->scale()[1] = scale_address[1];
    }

    if (param_.globalPooling) {
      // Restore pass-through state so subsequent ops are unaffected.
      inplace_args.relu_enable = false;
      inplace_args.leaky_relu_enable = false;
      inplace_args.relu6_enable = false;
      inplace_args.sigmoid_enable = false;
      inplace_args.global_pool_en = false;
      config_inplace(inplace_args);
      global_pool_args.global_pool_factor = fp32_2_fp16(1.0f);
      config_global_pool(global_pool_args);
    }

    if (splitParams_.size() > 1) {
      concatPE_.dispatch();
    }
    // The driver returns 0 on success; report success as true, matching
    // PoolingPE. (Returning the raw accumulated error code would yield
    // false on success.)
    return ret == 0;
  }

  ~PoolingSplitPE() {
    for (auto pooling_param : splitParams_) {
      if (splitParams_.size() > 1) {
        // Private tensors exist only when a real split happened; with a
        // single slice they alias param_.input/output and must not be
        // deleted.
        delete pooling_param->input;
        delete pooling_param->output;
      }
      // The PoolingParam itself is always heap-allocated by
      // pooling_split_channel() (the original leaked it for num == 1).
      delete pooling_param;
    }
    splitParams_.clear();
  }

  PoolingParam& param() { return param_; }

 private:
  PoolingParam param_;
  ConcatPE concatPE_;
  SplitPE splitPE_;
  std::vector<PoolingParam*> splitParams_;
  bool use_cpu_ = false;
};
} // namespace zynqmp
} // namespace paddle
......@@ -93,8 +93,8 @@ void PriorBoxPE::compute_prior_box() {
const float &step_h = param.stepH;
const float &offset = param.offset;
Tensor *output_boxes = this->cachedBoxes_;
Tensor *output_variances = this->cachedVariances_;
Tensor *output_boxes = this->cachedBoxes_.get();
Tensor *output_variances = this->cachedVariances_.get();
Tensor boxes;
Tensor variances;
......@@ -241,7 +241,6 @@ void PriorBoxPE::compute_prior_box() {
}
boxes.flush();
boxes.syncToCPU();
variances.flush();
output_boxes->copyFrom(&boxes);
output_variances->copyFrom(&variances);
......@@ -251,8 +250,8 @@ void PriorBoxPE::apply() {}
bool PriorBoxPE::dispatch() {
if (cachedBoxes_ == nullptr) {
cachedBoxes_ = new Tensor();
cachedVariances_ = new Tensor();
cachedBoxes_.reset(new Tensor());
cachedVariances_.reset(new Tensor());
cachedBoxes_->mutableData<float>(FP32, param_.outputBoxes->shape());
cachedVariances_->mutableData<float>(FP32, param_.outputVariances->shape());
cachedBoxes_->setDataLocation(CPU);
......@@ -260,12 +259,14 @@ bool PriorBoxPE::dispatch() {
compute_prior_box();
}
param_.outputBoxes->copyFrom(this->cachedBoxes_);
param_.outputBoxes->copyFrom(this->cachedBoxes_.get());
param_.outputVariances->copyFrom(this->cachedVariances_.get());
param_.outputVariances->copyFrom(this->cachedVariances_);
param_.outputBoxes->flush();
param_.outputBoxes->syncToCPU();
param_.outputVariances->flush();
param_.outputBoxes->setCached(true);
param_.outputVariances->setCached(true);
return true;
}
} // namespace zynqmp
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
......@@ -23,9 +24,11 @@ class PriorBoxPE : public PE {
public:
bool init() {
param_.outputBoxes->setAligned(false);
param_.outputVariances->setAligned(false);
param_.outputBoxes->setDataLocation(CPU);
param_.outputBoxes->setCacheable(true);
param_.outputVariances->setAligned(false);
param_.outputVariances->setDataLocation(CPU);
param_.outputVariances->setCacheable(true);
return true;
}
......@@ -37,8 +40,9 @@ class PriorBoxPE : public PE {
private:
PriorBoxParam param_;
Tensor* cachedBoxes_ = nullptr;
Tensor* cachedVariances_ = nullptr;
// TODO(chonwhite) use unique_ptr;
std::unique_ptr<Tensor> cachedBoxes_;
std::unique_ptr<Tensor> cachedVariances_;
void compute_prior_box();
};
......
......@@ -23,43 +23,27 @@ class ReluPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
output->setDataLocation(Device);
output->setAligned(param_.input->aligned());
output->setDataLocation(CPU);
return true;
}
void apply() {
Tensor* src = param_.input;
args_.input_data_type = DATA_TYPE_FP16;
args_.output_data_type = DATA_TYPE_FP16;
args_.input_layout_type = LAYOUT_HWC;
args_.output_layout_type = LAYOUT_HWC;
args_.image = {.address = src->data<void>(),
.scale_address = src->scale(),
.channels = (uint32_t)src->shape().channel(),
.width = (uint32_t)src->shape().width(),
.height = (uint32_t)src->shape().height(),
.pad_width = 0u,
.pad_height = 0u};
args_.output = {
.address = param_.output->data<void>(),
.scale_address = param_.output->scale(),
};
inplace_.relu_enable = false;
inplace_.power_enable = false;
inplace_.normalize_enable = false;
}
void apply() {}
bool dispatch() {
inplace_.relu_enable = true;
config_inplace(inplace_);
param_.input->syncToDevice();
param_.output->copyFrom(param_.input);
param_.output->invalidate();
inplace_.relu_enable = false;
config_inplace(inplace_);
param_.input->invalidate();
int16_t* input_data = param_.input->data<int16_t>();
float16* out_data = param_.output->data<float16>();
for (int i = 0; i < param_.input->shape().alignedElementCount(); i++) {
int16_t v = param_.input->data<float16>()[i];
if (v > 0) {
out_data[i] = input_data[i];
} else {
out_data[i] = zero;
}
}
param_.output->copyScaleFrom(param_.input);
param_.output->flush();
return true;
}
......@@ -67,8 +51,7 @@ class ReluPE : public PE {
private:
InputParam param_;
BypassArgs args_;
InplaceArgs inplace_;
float16 zero = float_to_half(0.0f);
};
} // namespace zynqmp
......
......@@ -73,9 +73,38 @@ class ResizePE : public PE {
scale[0] = max / 127.0;
scale[1] = 127.0 / max;
}
  // CPU nearest-neighbour upsampling by an integer factor: every input
  // pixel (all `channel` values at once, HWC layout) is replicated into a
  // factor x factor square of output pixels.
  // NOTE(review): assumes out_width is an exact integer multiple of
  // in_width and that height scales by the same factor -- confirm the op's
  // shape constraints.
  void cpu_compute() {
    Shape& in_shape = param_.input->shape();
    Shape& out_shape = param_.output->shape();
    int channel = in_shape.channel();
    int in_height = in_shape.height();
    int in_width = in_shape.width();
    int out_width = out_shape.width();
    int factor = out_shape.width() / in_shape.width();
    param_.input->syncToCPU();
    for (int h = 0; h < in_height; h++) {
      for (int w = 0; w < in_width; w++) {
        // Source pixel: HWC, so one pixel is `channel` contiguous halves.
        int src_index = in_width * channel * h + w * channel;
        float16* src = param_.input->data<float16>() + src_index;
        // Replicate into the factor x factor block of destination pixels.
        for (int v = 0; v < factor; v++) {
          for (int i = 0; i < factor; i++) {
            int dst_index = out_width * channel * h * factor +
                            out_width * channel * v + w * channel * factor +
                            channel * i;
            float16* dst = param_.output->data<float16>() + dst_index;
            memcpy(dst, src, channel * sizeof(float16));
          }
        }
      }
    }
    param_.output->flush();
    // Pure replication leaves the value range (and hence the quantization
    // scale) unchanged.
    param_.output->copyScaleFrom(param_.input);
  }
// Runs the resize. The FPGA op is kicked off first, but its result (and
// return status) is intentionally ignored: cpu_compute() always rewrites
// the full output buffer afterwards.
bool dispatch() {
  // Fix: the status was previously stored in an unused local `ret`;
  // the call is kept for its device-side effects only.
  compute_fpga_resize(args_);
  cpu_compute();
  return true;
}
......
......@@ -141,22 +141,22 @@ class ScalePE : public PE {
Tensor* output = param_.output;
Tensor float_input;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
input->syncToCPU();
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
float* scale_data = param_.scale->data<float>();
float16* scale_data = param_.scale->data<float16>();
int wh = input->shape().width() * input->shape().height();
float16* in_data = input->data<float16>();
float max = 0;
for (int i = 0; i < wh; i++) {
for (int c = 0; c < input->shape().channel(); c++) {
int index = i * input->shape().channel() + c;
float value = half_to_float(in_data[index]) * scale_data[c];
float x = image_addr[index];
float y = half_to_float(scale_data[c]);
float value = x * y;
data_out[index] = float_to_half(value);
if (value < 0) {
......@@ -180,7 +180,6 @@ class ScalePE : public PE {
param_.scale->shape().numel() * sizeof(float16));
dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
dw_param.quantizedFilter()->flush();
}
param_.input->syncToDevice();
......
......@@ -59,6 +59,7 @@ static void softmax(Tensor *X, Tensor *Y) {
int batch_size = X->shape().num();
int num_classes = dims[X->shape().dimSize() - 1];
int channels = X->shape().numel() / batch_size / num_classes;
float *x = X->data<float>();
float *y = Y->mutableData<float>();
......@@ -140,11 +141,11 @@ bool SoftmaxPE::init() {
bool SoftmaxPE::dispatch() {
Tensor *input = param_.input;
Tensor *output = param_.output;
input->syncToCPU();
Tensor float_input;
Tensor float_output;
float_input.mutableData<float>(DataType::FP32, input->shape());
input->syncToDevice();
float_input.copyFrom(input);
float *out_data =
......@@ -154,6 +155,7 @@ bool SoftmaxPE::dispatch() {
float_output.flush();
output->copyFrom(&float_output);
output->flush();
return true;
}
......
......@@ -105,7 +105,7 @@ class SplitPE : public PE {
in_stride,
out_stride[axis]);
input_offset += out_stride[axis];
// out->flush();
out->flush();
}
return true;
}
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
namespace paddle {
namespace zynqmp {
float sigmoid(float x) { return 1.0 / (1.0 + std::exp(-x)); }
// Decodes one raw YOLO prediction (4 consecutive floats at
// x[index .. index+3]) into a center-form box (cx, cy, w, h) scaled to the
// original image:
//   cx, cy : sigmoid offset inside grid cell (w, h), rescaled by image size
//   w,  h  : exp-scaled anchor dimensions, normalized by the network input
// `anchors` holds flat (w, h) pairs; `an_idx` selects the pair.
// `grid_size` is the number of cells per side; `input_size` the network
// input resolution in pixels.
inline void GetYoloBox(float* box,
                       const float* x,
                       const int* anchors,
                       int w,
                       int h,
                       int an_idx,
                       int grid_size,
                       int input_size,
                       int index,
                       int img_height,
                       int img_width) {
  box[0] = (w + sigmoid(x[index])) * img_width * 1.0f / grid_size;
  box[1] = (h + sigmoid(x[index + 1])) * img_height * 1.0f / grid_size;
  box[2] = std::exp(x[index + 2]) * anchors[2 * an_idx] * img_width * 1.0f /
           input_size;
  box[3] = std::exp(x[index + 3]) * anchors[2 * an_idx + 1] * img_height *
           1.0f / input_size;
}
// Flattened offset of one prediction entry in the raw YOLO output tensor,
// laid out as [batch][anchor][entry][hw]: `an_stride` is the span of one
// anchor, `stride` the span of one entry, `hw_idx` the spatial position.
inline int GetEntryIndex(int batch,
                         int an_idx,
                         int hw_idx,
                         int an_num,
                         int an_stride,
                         int stride,
                         int entry) {
  const int anchor_offset = (batch * an_num + an_idx) * an_stride;
  const int entry_offset = entry * stride;
  return anchor_offset + entry_offset + hw_idx;
}
// Converts a center-form box (cx, cy, w, h) in `box` into corner form
// (x_min, y_min, x_max, y_max) at boxes[box_idx..box_idx+3], clamping the
// corners to the valid image area [0, img_width-1] x [0, img_height-1].
inline void CalcDetectionBox(float* boxes,
                             float* box,
                             const int box_idx,
                             const int img_height,
                             const int img_width) {
  const float half_w = box[2] / 2;
  const float half_h = box[3] / 2;
  const float x_limit = static_cast<float>(img_width - 1);
  const float y_limit = static_cast<float>(img_height - 1);

  boxes[box_idx] = std::max(box[0] - half_w, 0.0f);
  boxes[box_idx + 1] = std::max(box[1] - half_h, 0.0f);
  boxes[box_idx + 2] = std::min(box[0] + half_w, x_limit);
  boxes[box_idx + 3] = std::min(box[1] + half_h, y_limit);
}
// Writes per-class detection scores: each class probability (sigmoid of
// the raw logit at input[label_idx + c]) multiplied by the objectness
// confidence `conf`, stored at scores[score_idx + c].
inline void CalcLabelScore(float* scores,
                           const float* input,
                           const int label_idx,
                           const int score_idx,
                           const int class_num,
                           const float conf) {
  float* dst = scores + score_idx;
  const float* logits = input + label_idx;
  for (int c = 0; c < class_num; ++c) {
    dst[c] = conf * sigmoid(logits[c]);
  }
}
// CPU implementation of the yolo_box op: decodes a raw YOLO feature map
// into detection boxes (corner form, clamped to the image) and per-class
// scores, skipping anchors whose objectness is below confThresh.
class YoloBoxPE : public PE {
 public:
  // Outputs are produced on the CPU in unaligned layout.
  bool init() {
    param_.outputBoxes->setAligned(false);
    param_.outputScores->setAligned(false);
    param_.outputBoxes->setDataLocation(CPU);
    param_.outputScores->setDataLocation(CPU);
    return true;
  }

  bool dispatch() {
    auto* input = param_.input;
    auto* imgsize = param_.imgSize;
    auto* boxes = param_.outputBoxes;
    auto* scores = param_.outputScores;
    auto anchors = param_.anchors;

    int class_num = param_.classNum;
    float conf_thresh = param_.confThresh;
    int downsample_ratio = param_.downsampleRatio;

    const int height = input->shape().height();
    const int width = input->shape().width();
    const int an_num = anchors.size() / 2;
    // Network input resolution implied by the feature-map size.
    int input_size = downsample_ratio * height;

    // Anchor (w, h) pairs copied into an INT32 tensor for GetYoloBox.
    Tensor anchors_;
    Shape anchors_shape(N, {an_num * 2});
    auto anchors_data = anchors_.mutableData<int32_t>(INT32, anchors_shape);
    std::copy(anchors.begin(), anchors.end(), anchors_data);

    input->syncToCPU();

    // Work on an FP32, unaligned copy of the input.
    Tensor input_float;
    input_float.setDataLocation(CPU);
    float* input_data = input_float.mutableData<float>(FP32, input->shape());
    input_float.setAligned(input->aligned());
    input_float.copyFrom(input);
    input_float.unalignImage();

    // imgsize holds {height, width} of the original image.
    int32_t* imgsize_data = imgsize->mutableData<int32_t>();

    Tensor boxes_float;
    Tensor scores_float;
    boxes_float.setDataLocation(CPU);
    float* boxes_float_data =
        boxes_float.mutableData<float>(FP32, boxes->shape());
    memset(boxes_float_data, 0, boxes->shape().numel() * sizeof(float));
    scores_float.setDataLocation(CPU);
    float* scores_float_data =
        scores_float.mutableData<float>(FP32, scores->shape());
    memset(scores_float_data, 0, scores->shape().numel() * sizeof(float));

    float box[4];
    int img_height = imgsize_data[0];
    int img_width = imgsize_data[1];

    int channel = input_float.shape().channel();
    for (int h = 0; h < height; h++) {
      for (int w = 0; w < width; w++) {
        for (int n = 0; n < an_num; n++) {
          // Each anchor occupies (5 + class_num) channels:
          // [x, y, w, h, objectness, class logits...].
          int obj_idx =
              channel * width * h + channel * w + n * (5 + class_num) + 4;
          float conf = sigmoid(input_data[obj_idx]);
          if (conf < conf_thresh) {
            continue;  // below-threshold anchors keep their zeroed outputs
          }

          int box_idx =
              channel * width * h + channel * w + n * (5 + class_num) + 0;
          GetYoloBox(box,
                     input_data,
                     anchors_data,
                     w,
                     h,
                     n,
                     height,
                     input_size,
                     box_idx,
                     img_height,
                     img_width);

          box_idx = h * an_num * 4 * width + an_num * 4 * w + n * 4;
          CalcDetectionBox(
              boxes_float_data, box, box_idx, img_height, img_width);

          int label_idx =
              channel * width * h + channel * w + n * (5 + class_num) + 5;
          int score_idx = h * an_num * class_num * width +
                          an_num * class_num * w + n * class_num;
          CalcLabelScore(scores_float_data,
                         input_data,
                         label_idx,
                         score_idx,
                         class_num,
                         conf);
        }
      }
    }
    boxes->copyFrom(&boxes_float);
    scores->copyFrom(&scores_float);
    input->setAligned(true);
    // BUG FIX: the function previously fell off the end of a bool-returning
    // function (undefined behavior).
    return true;
  }

  void apply() {}

  YoloBoxParam& param() { return param_; }

 private:
  YoloBoxParam param_;
};
} // namespace zynqmp
} // namespace paddle
......@@ -32,6 +32,10 @@ static struct N n_;
class Shape {
public:
std::function<int(Shape& s)> aligment_fuction = [](Shape& s) { // NOLINT
return s.layout_->alignedElementCount(s.dims_);
};
explicit Shape(std::vector<int> dims) { dims_ = dims; }
Shape(LayoutType type, std::vector<int> dims) {
......@@ -44,6 +48,10 @@ class Shape {
setLayoutType(src.layoutType_);
}
void setAligmentFunction(std::function<int(Shape& s)> f) { // NOLINT
aligment_fuction = f;
}
bool shouldAlign() {
return layout_->alignedElementCount(dims_) != layout_->elementCount(dims_);
}
......@@ -72,13 +80,11 @@ class Shape {
std::vector<int> dims() { return dims_; }
size_t memorySize(int cellSize) {
return layout_->alignedElementCount(dims_) * cellSize;
}
size_t memorySize(int cellSize) { return aligment_fuction(*this) * cellSize; }
int numel() { return layout_->elementCount(dims_); }
int alignedElementCount() { return layout_->alignedElementCount(dims_); }
int alignedElementCount() { return aligment_fuction(*this); }
void setLayoutType(LayoutType layout) {
this->layoutType_ = layout;
......
......@@ -38,6 +38,7 @@ enum DataType : int {
FP16 = 1,
INT8 = 2,
INT32 = 3,
INT64 = 4,
};
enum DataSyncStatus : int {
......@@ -58,6 +59,8 @@ inline int CellSize(DataType type) {
return sizeof(int32_t);
case INT8:
return sizeof(int8_t);
case INT64:
return sizeof(int64_t);
default:
return 0;
}
......@@ -66,17 +69,16 @@ inline int CellSize(DataType type) {
class PlaceHolder {
public:
PlaceHolder() {}
explicit PlaceHolder(size_t size) {
size_ = size;
data_ = fpga_malloc(size_);
memset(data_, 0, size);
fpga_flush(data_, size);
}
void* data() { return data_; }
void set_data(const void* ptr) { data_ = const_cast<void*>(ptr); }
size_t memorySize() { return size_; }
void set_size(size_t new_size) { size_ = new_size; }
~PlaceHolder() { fpga_free(data_); }
......@@ -99,7 +101,7 @@ class Tensor {
return nullptr;
}
void* ptr = reinterpret_cast<char*>(this->placeHolder_->data()) +
offset * CellSize(dataType_);
offset_ * CellSize(dataType_);
return reinterpret_cast<Dtype*>(ptr);
}
......@@ -116,7 +118,7 @@ class Tensor {
template <typename Dtype>
Dtype* mutableData() {
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
shape_->memorySize(CellSize(dataType_)) * mem_factor_ + 16;
if (placeHolder_ != nullptr) {
if (memorySize > placeHolder_->memorySize()) {
placeHolder_.reset(new PlaceHolder(memorySize));
......@@ -134,6 +136,10 @@ class Tensor {
return placeHolder_->memorySize();
}
void setMemScale(float mem_factor) { mem_factor_ = mem_factor; }
void setOffset(int offset) { offset_ = offset; }
void setDataType(DataType dataType) { this->dataType_ = dataType; }
DataType dataType() { return this->dataType_; }
......@@ -240,10 +246,6 @@ class Tensor {
}
}
void setMemScale(float scale_factor) {
this->mem_scale_factor_ = scale_factor;
}
void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); }
void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) {
......@@ -254,7 +256,7 @@ class Tensor {
this->dataType_ = src->dataType_;
this->aligned_ = src->aligned_;
this->dateLocation_ = src->dateLocation_;
this->offset = offset;
this->offset_ = offset;
shape_ = new Shape(const_cast<Shape&>(shape));
}
......@@ -279,16 +281,13 @@ class Tensor {
.height = 1,
.pad_width = 0u,
.pad_height = 0u};
ImageOutputArgs output = {
args.output = {
.address = data<void>(), .scale_address = scale(),
};
args.output = output;
src->syncToDevice();
size_t aligned_remainder = src->shape().numel() % 16;
if (aligned_remainder > 0) {
size_t dtype_size =
src->dataType_ == FP32 ? sizeof(float) : sizeof(float16);
size_t dtype_size = CellSize(src->dataType_);
void* dst = src->data<char>() + src->shape().numel() * dtype_size;
memset(dst, 0, aligned_remainder * dtype_size);
fpga_flush(dst, aligned_remainder * dtype_size);
......@@ -299,14 +298,10 @@ class Tensor {
this->invalidate();
}
void flush() {
size_t memorySize = placeHolder_->memorySize();
fpga_flush(placeHolder_->data(), memorySize);
}
void flush() { fpga_flush(placeHolder_->data(), placeHolder_->memorySize()); }
void invalidate() {
size_t memorySize = placeHolder_->memorySize();
fpga_invalidate(placeHolder_->data(), memorySize);
fpga_invalidate(placeHolder_->data(), placeHolder_->memorySize());
}
void sync() {
......@@ -348,16 +343,17 @@ class Tensor {
}
}
void printScale(std::string type) { printScale(); }
std::string dimsFileName() {
return paddle::lite::to_string(shape_->num()) + "_" +
paddle::lite::to_string(shape_->channel()) + "_" +
paddle::lite::to_string(shape_->height()) + "_" +
paddle::lite::to_string(shape_->width()) + ".txt";
return std::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" +
std::to_string(shape_->width()) + ".txt";
}
void saveToFile() { std::string path = dimsFileName(); }
void saveToFile() {
std::string path = dimsFileName();
// saveToFile(path);
}
void saveToFile(std::string prefix, bool with_shape) {
std::string path = prefix;
......@@ -371,34 +367,61 @@ class Tensor {
void saveToFile(std::string path) {
syncToCPU();
invalidate();
std::ofstream ofs;
static int counter = 0;
std::string npath = paddle::lite::to_string(counter) + "_" + path;
std::string npath = std::to_string(counter) + "_" + path;
counter++;
save_file_with_name(npath);
}
// Dumps the tensor to a text file for debugging: a one-line header
// (data type, scale[0], tensor id) followed by one value per line.
// Aligned tensors are first copied into an unaligned temporary so the
// file reflects logical element order.
void save_file_with_name(std::string path) {
  invalidate();
  Tensor* t = this;
  Tensor unaligned;
  if (this->aligned_) {
    unaligned.dataType_ = this->dataType_;
    unaligned.aligned_ = this->aligned_;
    unaligned.mutableData<void>(dataType_, *shape_);
    unaligned.copyFrom(this);
    unaligned.unalignImage();
    unaligned.syncToCPU();
    t = &unaligned;  // read from the unaligned copy below
  }

  std::ofstream ofs;
  ofs.open(path);
  ofs << "type:" << dataType_ << " scale: " << scale()[0] << " id:" << id_
      << std::endl;

  for (int i = 0; i < shape_->numel(); i++) {
    float value = 0;
    switch (dataType_) {
      case FP16:
        value = half_to_float(t->data<float16>()[i]);
        break;
      case FP32:
        value = t->data<float>()[i];
        break;
      case INT8:
        value = t->data<int8_t>()[i];
        break;
      case INT32:
        // BUG FIX: read via `t` like the other cases; previously this read
        // from `this`, bypassing the unaligned copy for aligned tensors.
        value = t->data<int32_t>()[i];
        break;
      case INT64:
        value = t->data<int64_t>()[i];
        break;
      default:
        std::cout << "Unknown type!! \n";
        exit(-1);
    }
    ofs << value << std::endl;
  }
  ofs.close();
}
void releaseData() { placeHolder_.reset(); }
void readFromFile(std::string path) {
std::ifstream file_stream;
file_stream.open(path);
......@@ -408,48 +431,25 @@ class Tensor {
int num = shape_->numel();
invalidate();
float max = 0.0f;
if (dataType_ == FP16) {
float16* data = mutableData<float16>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = float_to_half(value);
}
} else {
float* data = mutableData<float>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = value;
}
float16* data = mutableData<float16>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = float_to_half(value);
}
flush();
placeHolder_->scale_[0] = max / 127.0f;
placeHolder_->scale_[1] = 127.0f / max;
}
// Streams a human-readable dump of the tensor (dims + every element) —
// debugging aid only. Note: only FP32 is read natively; every other
// data type is treated as FP16 and converted, so integer tensors would
// print garbage here.
friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) {
  os << "tensor:"
     << "\n";
  os << "dims: {";
  for (int i = 0; i < tensor.shape().dimSize(); ++i) {
    os << tensor.shape()[i] << " ";
  }
  os << "}\n";
  for (int i = 0; i < tensor.shape().numel(); i++) {
    float value = 0;
    if (tensor.dataType() == FP32) {
      value = tensor.data<float>()[i];
    } else {
      value = half_to_float(tensor.data<float16>()[i]);
    }
    os << value << " ";
  }
  os << "\n";
  return os;
}
void setCacheable(bool cacheable) { cacheable_ = cacheable; }
bool cacheable() { return cacheable_; }
void setCached(bool cached) { cached_ = cached; }
bool cached() { return cached_; }
~Tensor() {
if (shape_ != nullptr) {
......@@ -459,8 +459,10 @@ class Tensor {
}
private:
int offset = 0;
float mem_scale_factor_ = 1.0f;
bool cacheable_ = false;
bool cached_ = false;
int offset_ = 0;
float mem_factor_ = 1.0f;
std::shared_ptr<PlaceHolder> placeHolder_;
Shape* shape_ = nullptr;
DataType dataType_ = FP32;
......
......@@ -22,21 +22,17 @@ using value_type = int64_t;
value_type DDimLite::production() const {
value_type res = 1;
for (size_t i = 0; i < this->size(); i++) {
res *= (*this)[i];
for (size_t i = 0; i < data_.size(); i++) {
res *= data_[i];
}
return res;
}
value_type DDimLite::count(int start, int end) const {
if (start < 0) {
start = 0;
}
if (end > size()) {
end = size();
}
start = (std::max)(start, 0);
end = (std::min)(end, static_cast<int>(data_.size()));
if (end < start) {
end = start;
return 0;
}
value_type sum = 1;
for (auto i = start; i < end; ++i) {
......@@ -46,11 +42,13 @@ value_type DDimLite::count(int start, int end) const {
}
DDimLite DDimLite::Slice(int start, int end) const {
std::vector<value_type> vec;
start = (std::max)(start, 0);
end = (std::min)(end, static_cast<int>(data_.size()));
std::vector<value_type> new_dim(end - start);
for (int i = start; i < end; i++) {
vec.push_back((*this)[i]);
new_dim[i - start] = data_[i];
}
return DDimLite(vec);
return DDim(new_dim);
}
std::string DDimLite::repr() const {
......@@ -69,7 +67,7 @@ std::string DDimLite::repr() const {
}
void TensorLite::ShareDataWith(const TensorLite &other) {
buffer_ = other.buffer_;
buffer_ = other.buffer_; // TODO(chonwhite) delete buffer;
dims_ = other.dims_;
zynq_tensor_ = other.zynq_tensor_;
target_ = other.target_;
......@@ -78,30 +76,35 @@ void TensorLite::ShareDataWith(const TensorLite &other) {
throw - 1;
}
void *TensorLite::mutable_data(size_t memory_size) {
memory_size_ = memory_size;
buffer_->ResetLazy(target_, memory_size_);
// throw -1;
std::cout << memory_size << std::endl;
return buffer_->data();
}
void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
target_ = target;
return mutable_data(memory_size);
}
void TensorLite::CopyDataFrom(const TensorLite &other) {
dims_ = other.dims_;
target_ = other.target_;
lod_ = other.lod_;
auto dt = zynq_tensor_->dataType();
auto shape = other.zynq_tensor_->shape();
if (zynq_tensor_.get() == nullptr) {
zynq_tensor_.reset(new zynqmp::Tensor());
}
auto dt = zynq_tensor_->dataType();
Resize(other.dims());
auto shape = other.zynq_tensor_->shape();
zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape);
this->ZynqTensor()->copyFrom(other.ZynqTensor());
precision_ = other.precision_;
memcpy(this->ZynqTensor()->data<void>(),
other.ZynqTensor()->data<void>(),
other.ZynqTensor()->shape().numel() * sizeof(float));
}
void *TensorLite::mutable_data(size_t memory_size) {
memory_size_ = memory_size; // TODO(chonwhite) delete buffer;
buffer_->ResetLazy(target_, memory_size_);
return buffer_->data();
}
void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
target_ = target;
return mutable_data(memory_size);
}
} // namespace lite
......
......@@ -78,7 +78,11 @@ class DDimLite {
}
friend bool operator!=(const DDimLite &a, const DDimLite &b) {
return !(a == b);
if (a.size() != b.size()) return true;
for (size_t i = 0; i < a.size(); i++) {
if (a[i] != b[i]) return true;
}
return false;
}
private:
......@@ -93,7 +97,7 @@ class TensorLite {
TensorLite() : buffer_(std::make_shared<Buffer>()) {}
template <typename DType, typename DimT, TargetType Target>
void Assign(DType *data, const DimT &dim) {
void Assign(const DType *data, const DimT &dim) {
Resize(dim);
auto *dst = mutable_data<DType, void>(Target);
CopySync<Target>(
......@@ -107,10 +111,11 @@ class TensorLite {
// Typed read-only view of the underlying zynqmp tensor, shifted by this
// tensor's offset_ (pointer arithmetic in elements of R, not bytes —
// NOTE(review): confirm offset_ is meant to be an element count here).
template <typename T, typename R = T>
const R *data() const {
  return zynq_tensor_->data<R>() + offset_;
}
void Resize(const DDimLite &ddim) { dims_ = ddim; }
void Resize(const std::vector<int64_t> &x) { dims_ = DDimLite(x); }
void Resize(const std::vector<int64_t> &x) { dims_.ConstructFrom(x); }
const DDimLite &dims() const { return dims_; }
int64_t numel() const { return dims_.production(); }
......@@ -142,7 +147,16 @@ class TensorLite {
void *mutable_data(size_t memory_size);
void *mutable_data(TargetType target, size_t memory_size);
const void *raw_data() const { return buffer_->data(); }
const void *raw_data() const {
return buffer_->data();
} // TODO(chonwhite) delete buffer;
void clear() {
// zynq_tensor_->releaseData();
if (zynq_tensor_) {
memset(zynq_tensor_->data<void>(), 0, zynq_tensor_->memorySize());
}
}
size_t data_size() const { return this->dims().production(); }
......@@ -150,17 +164,19 @@ class TensorLite {
size_t offset() const { return offset_; }
bool IsInitialized() const { return buffer_->data(); }
void clear() {
buffer_->Free();
offset_ = 0;
}
bool IsInitialized() const {
return buffer_->data();
} // TODO(chonwhite) delete buffer;
// Other share data to this.
void ShareDataWith(const TensorLite &other);
void CopyDataFrom(const TensorLite &other);
void ResetBuffer(std::shared_ptr<Buffer> buffer, size_t memory_size) {
// TODO(chonwhite) deal with buffer;
}
template <typename T>
TensorLite Slice(int64_t begin, int64_t end) const;
......@@ -169,7 +185,7 @@ class TensorLite {
TargetType target() const { return target_; }
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_.get(); }
friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
os << "Tensor:" << '\n';
......@@ -189,7 +205,7 @@ class TensorLite {
// set values of precision_ and persistable_ after updating it.
// If your tensor is just a temp tensor, such as activations,
// you can ignore these two attributes.
PrecisionType precision_{PrecisionType::kUnk};
PrecisionType precision_{PrecisionType::kFloat};
bool persistable_{false};
DDimLite dims_;
......@@ -198,12 +214,62 @@ class TensorLite {
size_t memory_size_{};
size_t offset_{0};
zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
std::shared_ptr<zynqmp::Tensor> zynq_tensor_;
template <typename T>
void mutable_data_internal();
};
// Maps a C++ scalar type to the zynqmp DataType enum. Unrecognized types
// (including float itself) resolve to FP32.
template <typename T>
zynqmp::DataType get_date_type() {
  if (typeid(T) == typeid(zynqmp::float16)) {
    return zynqmp::FP16;
  }
  if (typeid(T) == typeid(int) || typeid(T) == typeid(int32_t)) {
    return zynqmp::INT32;
  }
  if (typeid(T) == typeid(int8_t)) {
    return zynqmp::INT8;
  }
  if (typeid(T) == typeid(int64_t)) {
    return zynqmp::INT64;
  }
  return zynqmp::FP32;
}
// Maps a C++ scalar type to the lite PrecisionType enum. Unrecognized
// types resolve to PrecisionType::kUnk.
template <typename T>
PrecisionType get_precistion_type() {
  if (typeid(T) == typeid(float)) {
    return PrecisionType::kFloat;
  }
  if (typeid(T) == typeid(zynqmp::float16)) {
    return PrecisionType::kFP16;
  }
  if (typeid(T) == typeid(int) || typeid(T) == typeid(int32_t)) {
    return PrecisionType::kInt32;
  }
  if (typeid(T) == typeid(int8_t)) {
    return PrecisionType::kInt8;
  }
  if (typeid(T) == typeid(int64_t)) {
    return PrecisionType::kInt64;
  }
  return PrecisionType::kUnk;
}
template <typename T, typename R>
R *TensorLite::mutable_data() {
std::vector<int> v;
......@@ -229,14 +295,13 @@ R *TensorLite::mutable_data() {
break;
}
zynqmp::Shape input_shape(layout_type, v);
zynqmp::DataType data_type = get_date_type<T>();
precision_ = get_precistion_type<T>();
zynqmp::DataType data_type = zynqmp::FP32;
if (typeid(T) == typeid(float)) {
data_type = zynqmp::FP32;
}
if (typeid(T) == typeid(zynqmp::float16)) {
data_type = zynqmp::FP16;
if (zynq_tensor_.get() == nullptr) {
zynq_tensor_.reset(new zynqmp::Tensor());
}
return zynq_tensor_->mutableData<R>(data_type, input_shape);
}
......@@ -268,14 +333,13 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
memcpy(dst_data,
src_data + static_cast<size_t>(begin * base) * sizeof(T),
dst_dims.production() * sizeof(T));
dst.ZynqTensor()->saveToFile("_slice", true);
return dst;
}
}
template <typename T>
void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const {
// TODO(chonwhite) delete this function;
CHECK_GE(begin, 0);
CHECK_LE(end, dims_[0]);
CHECK_LT(begin, end);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/program.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
// Debugging hook invoked around each instruction of a program run: logs
// op execution and optionally dumps named output tensors to text files.
class Monitor {
 public:
  static Monitor& get_instance() {
    static Monitor s_instance;
    return s_instance;
  }

  void inferStart() {}

  // Called before an instruction runs; logs the op type and kernel name.
  void preRun(Instruction& inst) {  // NOLINT
    auto op = const_cast<OpLite*>(inst.op());
    auto op_type = op->Type();
    VLOG(4) << "Running op:" << op_type << " on " << inst.kernel()->name();
  }

  // Called after an instruction runs; dumps each output tensor whose
  // (sanitized) name appears in `tensor_names`.
  void postRun(Instruction& inst) {  // NOLINT
    auto op = const_cast<OpLite*>(inst.op());
    auto op_info = op->op_info();

    // Tensors to dump; empty by default — edit locally when debugging.
    static const std::vector<std::string> tensor_names = {};
    // Capture by reference: the previous by-value capture copied the
    // vector on every postRun call.
    auto should_print = [&tensor_names](const std::string& name) -> bool {
      return std::find(tensor_names.begin(), tensor_names.end(), name) !=
             tensor_names.end();
    };

    for (auto name : op_info->output_names()) {
      VLOG(4) << "\n out_tensor:" << name;
      auto* var = op->scope()->FindVar(name);
      if (!var->IsType<lite::Tensor>()) {
        continue;
      }
      lite::Tensor* tensor =
          const_cast<lite::Tensor*>(&var->Get<lite::Tensor>());
      if (tensor->ZynqTensor() == nullptr) {
        continue;
      }
      // '/' is illegal in file names; replace the last one with '_'.
      std::string substr = "/";
      std::size_t found = name.rfind(substr);
      VLOG(4) << "\n out_tensor:::" << name << "," << found;
      if (found != std::string::npos) {
        name.replace(found, substr.length(), "_");
      }
      VLOG(4) << "\n out_tensor:::" << name;
      if (should_print(name)) {
        tensor->ZynqTensor()->saveToFile(name, true);
      }
    }
  }

  void inferEnd() {}

 private:
};
} // namespace lite
} // namespace paddle
......@@ -44,6 +44,7 @@ lite_cc_library(mir_passes
elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
fpga_kernel_place_correct_pass.cc
type_target_cast_pass.cc
type_layout_cast_pass.cc
type_precision_cast_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fpga_kernel_place_correct_pass.h"
#include <memory>
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
// Pass entry point: delegates to CorrectArgumentPlace (declared in the
// header), which patches kernel/argument places for FPGA tool operators.
void KernelPlaceCorrectPass::Apply(const std::unique_ptr<SSAGraph> &graph) {
  CorrectArgumentPlace(graph.get());
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(kernel_place_correct_pass,
paddle::lite::mir::KernelPlaceCorrectPass)
.BindTargets({TARGET(kFPGA)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "lite/core/mir/pass.h"
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
namespace mir {
/*
* Correct the place of the variables in the SSAGrpah, it will inference the
* variables' place by the kernels outputs them.
*/
class KernelPlaceCorrectPass : public DebugPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
private:
void CorrectArgumentPlace(SSAGraph* graph) {
auto& valid_places = graph->valid_places();
auto valid_places_has_target = [&](TargetType t) -> bool {
for (auto& p : valid_places) {
if (p.target == t) {
return true;
}
}
return false;
};
std::map<std::string, bool> lite_with_targets{
{"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
{"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
for (auto& x : graph->StmtTopologicalOrder()) {
auto& inst = x->AsStmt();
// The IoCopyOp is a tool operator, it won't support the type inference.
// in fpga, we has io_copy+cali+layout tool ops, so we need type inference
// for
// tool operator
if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
VLOG(3) << "inst.op_type() == 'io_copy', continue";
if (inst.op_type() == "io_copy") continue;
}
// deal with inputs
VLOG(4) << "checking op " << inst.op_info()->Repr();
auto get_argname = [&](
const std::string& node_name,
const std::map<std::string, std::vector<std::string>>& argname_map)
-> std::string {
for (auto& ele : argname_map) {
auto it =
std::find(ele.second.begin(), ele.second.end(), node_name);
if (it != ele.second.end()) return ele.first;
}
return "";
};
auto in = x->inlinks.front();
if (!in) {
break;
}
auto out = x->outlinks.front();
auto p = in->AsArg().type->precision();
std::string node_name = out->AsArg().name;
std::string arg_name = get_argname(node_name, inst.op_info()->outputs());
auto op_type = inst.op_type();
if (op_type == "reshape" || op_type == "reshape2") {
for (auto* x_in : x->inlinks) {
std::string in_name =
get_argname(x_in->AsArg().name, inst.op_info()->inputs());
if (in_name == "X") {
in = x_in;
}
}
p = in->AsArg().type->precision();
if (p != PrecisionType::kFP16) {
UpdateTarget(inst, TargetType::kHost);
UpdateTensor(inst, in, out, TargetType::kHost);
}
}
if (inst.op_type() == "fetch") {
UpdateTarget(inst, TargetType::kFPGA);
}
if (inst.op_type() == "split" || inst.op_type() == "transpose" ||
inst.op_type() == "transpose2") {
if (p != PrecisionType::kFP16) {
UpdateTarget(inst, TargetType::kARM);
for (auto* x_out : x->outlinks) {
UpdateTensor(inst, in, x_out, TargetType::kARM);
}
}
}
if (inst.op_type() == "concat") {
if (p != PrecisionType::kFP16) {
UpdateTarget(inst, TargetType::kARM);
UpdateTensor(inst, in, out, TargetType::kARM);
}
}
if (inst.op_type() == "elementwise_mul") {
UpdateTarget(inst, TargetType::kFPGA);
for (auto* in : x->inlinks) {
std::string in_name =
get_argname(in->AsArg().name, inst.op_info()->inputs());
if (in_name == "Y") {
in = in;
p = in->AsArg().type->precision();
std::unique_ptr<KernelBase> best_match;
for (auto& k : inst.kernels()) {
auto kp = k->GetInputDeclType(in_name)->precision();
if (kp == p) {
best_match = std::move(k);
}
}
inst.kernels().clear();
inst.kernels().emplace_back(std::move(best_match));
break;
}
}
}
}
}
// Update me's kUnk fields by other's fields.
// Rebind the statement's candidate kernels to a single place whose target is
// `new_target`. Moving onto ARM or Host additionally forces the place to
// float precision and NCHW layout, since those backends consume plain
// float/NCHW tensors.
void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) {  // NOLINT
  auto place = inst.place();
  place.target = new_target;
  const bool needs_float_nchw =
      new_target == TargetType::kARM || new_target == TargetType::kHost;
  if (needs_float_nchw) {
    place.precision = PrecisionType::kFloat;
    place.layout = DataLayoutType::kNCHW;
  }
  std::vector<Place> places{place};
  inst.ResetKernels(places);
}
// Rewrite the `out` argument node's tensor type so that it agrees with the
// kernel input declaration looked up from `in` — or, when `new_target` is
// ARM/Host, with that backend's required float/NCHW form.
//
// Fix vs. original: the local `arg_name` (looked up from the op's outputs)
// was computed but never used; the dead lookup is removed. The identical
// ARM and Host branches are also merged.
void UpdateTensor(mir::Node::Stmt& inst,  // NOLINT
                  Node* in,
                  Node* out,
                  TargetType new_target = TargetType::kUnk) {
  // Reverse lookup: map a var node name back to its op argument name.
  auto get_argname = [&](
      const std::string& node_name,
      const std::map<std::string, std::vector<std::string>>& argname_map)
      -> std::string {
    for (auto& ele : argname_map) {
      auto it = std::find(ele.second.begin(), ele.second.end(), node_name);
      if (it != ele.second.end()) return ele.first;
    }
    return "";
  };
  std::string in_name =
      get_argname(in->AsArg().name, inst.op_info()->inputs());

  auto type = inst.picked_kernel().GetInputDeclType(in_name);
  auto tmp_ptype = in->AsArg().type->precision();
  auto tmp_target = type->target();
  auto tmp_layout = type->layout();

  // ARM and Host both consume float tensors in NCHW layout.
  if (new_target == TargetType::kARM || new_target == TargetType::kHost) {
    tmp_target = new_target;
    tmp_ptype = PrecisionType::kFloat;
    tmp_layout = DataLayoutType::kNCHW;
  }

  out->AsArg().type =
      LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout);
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -32,9 +32,12 @@ void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser(graph.get());
#endif
#endif
fusion::FcFuser fuser2(false);
fuser2(graph.get());
#ifdef LITE_WITH_FPGA
fusion::FcFuser fpga_fuser(true);
fpga_fuser(graph.get());
#endif
}
} // namespace mir
......
......@@ -51,10 +51,17 @@ class IoCopyKernelPickPass : public StmtPass {
// directly.
if (TargetCompatibleTo(*outy, *out_arg_ty)) {
LOG(INFO) << "get a IOCopy kernel";
if (kernel->target() == TargetType::kFPGA) {
node.outlinks.front()->AsArg().type = LiteType::GetTensorTy(
kernel->target(), kernel->precision(), kernel->layout());
}
auto x = std::move(kernel);
kernels.clear();
kernels.emplace_back(std::move(x));
is_found = true;
break;
}
}
......
......@@ -148,6 +148,49 @@ class StaticKernelPickPass : public mir::StmtPass {
}
}
if (kernel.target() == TARGET(kFPGA)) {
VLOG(4) << "alias:" << kernel.alias();
/**
* we want to use fpga kernel as much as possible, so we give it a very
*high score,
* so this kernel can be picked, it may be not the best option, and we
*shall correct
* it in kernel_place_correct_pass
*
* 4000 is a arbitrary high score that can purpress all the other kernels.
**/
final_score = 4000;
for (size_t i = 0; i < in_names.size(); ++i) {
std::string tmp;
CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp));
if (in_types.count(in_names[i]) &&
in_types.at(in_names[i]) ==
kernel.GetInputDeclType(tmp)->precision()) {
final_score += 100; // multiple inputs pick the most matched one;
}
}
for (size_t i = 0; i < out_names.size(); ++i) {
std::string tmp;
CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp));
VLOG(4) << tmp << " == "
<< PrecisionToStr(kernel.GetOutputDeclType(tmp)->precision());
if (out_types.count(out_names[i]) > 0) {
VLOG(4) << "decType: "
<< PrecisionToStr(kernel.GetOutputDeclType(tmp)->precision());
VLOG(4) << "cout:" << out_types.count(out_names[i]) << " type_name: "
<< PrecisionToStr(out_types.at(out_names[i]));
}
if (out_types.count(out_names[i]) &&
out_types.at(out_names[i]) ==
kernel.GetOutputDeclType(tmp)->precision()) {
final_score += 100; // multiple outputs pick the most matched one;
}
}
}
VLOG(4) << "[score(final)]:" << final_score;
VLOG(2) << "-------- pick summary for " << instruct.op_type()
<< " --------";
......
......@@ -80,99 +80,100 @@ class Optimizer {
InitControlFlowOpUnusedInputsAndOutputsEliminatePass();
if (passes.empty() || passes.size() == 1) {
std::vector<std::string> passes_local{{
"lite_quant_dequant_fuse_pass", //
"weight_quantization_preprocess_pass", //
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
"lite_conv_conv_fuse_pass", //
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
"lite_conv_activation_fuse_pass", //
"lite_var_conv_2d_activation_fuse_pass", //
"lite_match_matrix_activation_fuse_pass", //
"lite_fc_fuse_pass", //
"lite_shuffle_channel_fuse_pass", //
"lite_transpose_softmax_transpose_fuse_pass", //
"lite_interpolate_fuse_pass", //
"identity_scale_eliminate_pass", //
"lite_scales_fuse_pass", //
"lite_sequence_reverse_embedding_fuse_pass", //
"elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", //
"lite_scale_activation_fuse_pass", //
std::vector<std::string> passes_local{
{"lite_quant_dequant_fuse_pass", //
"weight_quantization_preprocess_pass", //
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
"lite_conv_conv_fuse_pass", //
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
"lite_conv_activation_fuse_pass", //
"lite_var_conv_2d_activation_fuse_pass", //
"lite_match_matrix_activation_fuse_pass", //
"lite_fc_fuse_pass", //
"lite_shuffle_channel_fuse_pass", //
"lite_transpose_softmax_transpose_fuse_pass", //
"lite_interpolate_fuse_pass", //
"identity_scale_eliminate_pass", //
"lite_scales_fuse_pass", //
"lite_sequence_reverse_embedding_fuse_pass", //
"elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", //
"lite_scale_activation_fuse_pass", //
#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
(defined LITE_WITH_ARM)
"lite_elementwise_activation_fuse_pass", //
"lite_elementwise_activation_fuse_pass", //
#endif
"identity_dropout_eliminate_pass",
"__xpu__resnet_fuse_pass",
"__xpu__resnet_d_fuse_pass",
"__xpu__resnet_cbam_fuse_pass",
"__xpu__conv2d_fuse_pass",
"__xpu__conv2d_link_previous_out_max_pass",
"__xpu__sfa_head_meanstd_fuse_pass",
"__xpu__sfa_head_moment_fuse_pass",
"__xpu__mmdnn_fuse_pass",
"__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
"__xpu__fc_fuse_pass",
"quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer
// the output scale and
// fix the attribute
// 'enable_int8' for all
// of the quantized ops.
"npu_subgraph_pass",
"huawei_ascend_npu_subgraph_pass",
"xpu_subgraph_pass",
"bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass",
"mlu_subgraph_pass",
"control_flow_op_unused_inputs_and_outputs_eliminate_pass",
"static_kernel_pick_pass", // pick original kernel from graph
"remove_tf_redundant_ops_pass",
"variable_place_inference_pass", // inference arg/var's
"mlu_postprocess_pass",
// info(target/precision/layout/device)
// using kernel info
"argument_type_display_pass", // debug pass: show arg-type-node's
// info
// (target/precision/layout/device)
"type_target_cast_pass", // add io_copy/io_copy_once if meet
// different targets when last and next
// node
"variable_place_inference_pass", //
"argument_type_display_pass", //
"io_copy_kernel_pick_pass", //
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_precision_cast_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_layout_cast_pass", // add layout/layout_once op if meet
// different layout when last and next node
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass",
"runtime_context_assign_pass",
"argument_type_display_pass",
"lite_reshape_fuse_pass",
"memory_optimize_pass" // you can comment this line when enable
// PRECISION_PROFILE
}};
"identity_dropout_eliminate_pass",
"__xpu__resnet_fuse_pass",
"__xpu__resnet_d_fuse_pass",
"__xpu__resnet_cbam_fuse_pass",
"__xpu__conv2d_fuse_pass",
"__xpu__conv2d_link_previous_out_max_pass",
"__xpu__sfa_head_meanstd_fuse_pass",
"__xpu__sfa_head_moment_fuse_pass",
"__xpu__mmdnn_fuse_pass",
"__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
"__xpu__fc_fuse_pass",
"quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer
// the output scale and
// fix the attribute
// 'enable_int8' for all
// of the quantized ops.
"npu_subgraph_pass",
"huawei_ascend_npu_subgraph_pass",
"xpu_subgraph_pass",
"bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass",
"mlu_subgraph_pass",
"control_flow_op_unused_inputs_and_outputs_eliminate_pass",
"static_kernel_pick_pass", // pick original kernel from graph
"remove_tf_redundant_ops_pass",
"variable_place_inference_pass", // inference arg/var's
"mlu_postprocess_pass",
// info(target/precision/layout/device)
// using kernel info
"argument_type_display_pass", // debug pass: show arg-type-node's
// info
// (target/precision/layout/device)
"type_target_cast_pass", // add io_copy/io_copy_once if meet
// different targets when last and next
// node
"variable_place_inference_pass", //
"argument_type_display_pass", //
"io_copy_kernel_pick_pass", //
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_precision_cast_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_layout_cast_pass", // add layout/layout_once op if meet
// different layout when last and next node
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass",
"runtime_context_assign_pass",
"argument_type_display_pass",
"lite_reshape_fuse_pass",
#if !(defined(LITE_WITH_FPGA) || defined(LITE_WITH_PRECISION_PROFILE))
"memory_optimize_pass"
#endif
}};
if (passes.size() == 1) {
// multi_stream_analysis_pass must be in the front of
......
......@@ -139,7 +139,9 @@ struct Instruction {
#ifdef LITE_WITH_PROFILE
void set_profiler(profile::Profiler* profiler) {
profiler_ = profiler;
#ifndef LITE_WITH_FPGA
if (op_->Type() != "feed" && op_->Type() != "fetch") {
#endif
profile::OpCharacter ch;
ch.op_lite = static_cast<void*>(const_cast<paddle::lite::OpLite*>(op()));
ch.target = kernel()->target();
......@@ -150,7 +152,9 @@ struct Instruction {
// append `ch.kernel_func_name` in StopTiming
profile_id_ = profiler->NewTimer(ch);
kernel_->SetProfiler(profiler_, profile_id_);
#ifndef LITE_WITH_FPGA
}
#endif
}
void SetProfileRuntimeOpInfo(paddle::lite::profile::OpCharacter* ch) {
......
......@@ -5,28 +5,36 @@ endif()
set(fpga_deps fpga_target_wrapper kernel_fpga)
# add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
# add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(cast_compute_fpga FPGA basic SRCS cast_compute.cc DEPS ${fpga_deps})
add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
# add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
# add_kernel(feed_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(interpolate_compute_fpga FPGA basic SRCS interpolate_compute.cc DEPS ${fpga_deps})
add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
# add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps})
add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps})
add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps})
# add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps})
add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps})
# add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op)
add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op)
# add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps})
add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps})
# add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
# add_kernel(transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS ${fpga_deps})
add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
add_kernel(split_compute_fpga FPGA basic SRCS split_compute.cc DEPS ${fpga_deps})
add_kernel(transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS ${fpga_deps})
add_kernel(io_copy_compute_fpga FPGA basic SRCS io_copy_compute.cc DEPS ${fpga_deps})
add_kernel(calib_compute_fpga FPGA basic SRCS calib_compute.cc DEPS ${fpga_deps})
......@@ -34,6 +42,8 @@ add_kernel(layout_compute_fpga FPGA basic SRCS layout_compute.cc DEPS ${fpga_dep
add_kernel(feed_compute_fpga FPGA basic SRCS feed_compute.cc DEPS ${fpga_deps})
add_kernel(fetch_compute_fpga FPGA basic SRCS fetch_compute.cc DEPS ${fpga_deps})
add_kernel(yolo_box_compute_fpga FPGA basic SRCS yolo_box_compute.cc DEPS ${fpga_deps})
# add_kernel(while_compute_fpga FPGA extra SRCS while_compute.cc DEPS ${fpga_deps})
# add_kernel(write_to_array_compute_fpga FPGA extra SRCS write_to_array_compute.cc DEPS ${fpga_deps})
......
......@@ -25,16 +25,38 @@ using float16 = zynqmp::float16;
void ReluCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
auto output_data = param.Out->mutable_data<float16>();
zynqmp::InputParam& input_param = pe_.param();
zynqmp::InputParam& relu_param = pe_.param();
input_param.input = param.X->ZynqTensor();
input_param.output = param.Out->ZynqTensor();
relu_param.input = param.X->ZynqTensor();
relu_param.output = param.Out->ZynqTensor();
pe_.init();
pe_.apply();
}
void ReluCompute::Run() { pe_.dispatch(); }
// CPU fallback for sigmoid on FP16 tensors: converts each half to float,
// applies 1 / (1 + e^-x), writes the half result back, and records the
// int8-style scale pair (max/127, 127/max) on the output tensor.
//
// Fixes vs. original: removes the unused `output_data` local (the
// mutable_data call is kept for its allocation side effect), qualifies
// `std::exp`, and guards the scale division against `max == 0` (empty
// tensor), which previously produced an inf scale.
// TODO(chonwhite) use fpga and arm implementation;
void SigmoidCompute::Run() {
  auto& param = this->Param<param_t>();
  param.Out->mutable_data<float16>();  // ensure the output buffer exists
  int numel = param.Out->numel();
  float16* in_data = param.X->ZynqTensor()->data<float16>();
  float16* out_data = param.Out->ZynqTensor()->data<float16>();
  param.X->ZynqTensor()->syncToCPU();

  float max = 0.0f;
  for (int i = 0; i < numel; i++) {
    float value = zynqmp::half_to_float(in_data[i]);
    value = 1.0f / (1.0f + std::exp(-value));
    out_data[i] = zynqmp::float_to_half(value);
    max = std::max(std::abs(value), max);
  }
  if (max > 0.0f) {
    param.Out->ZynqTensor()->scale()[0] = max / 127.0;
    param.Out->ZynqTensor()->scale()[1] = 127.0 / max;
  }
  param.Out->ZynqTensor()->flush();
}
} // namespace fpga
} // namespace kernels
} // namespace lite
......@@ -51,3 +73,19 @@ REGISTER_LITE_KERNEL(
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Register the CPU-fallback sigmoid kernel under the FPGA target with
// FP16 precision and NHWC layout on both the input and output tensors.
REGISTER_LITE_KERNEL(sigmoid,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::SigmoidCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .Finalize();
......@@ -14,6 +14,8 @@
#pragma once
#include <algorithm>
#include <map>
#include <string>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/relu_pe.hpp"
#include "lite/core/kernel.h"
......@@ -24,6 +26,13 @@ namespace lite {
namespace kernels {
namespace fpga {
// Lookup table from op-attribute activation names (e.g. "relu") to the
// zynqmp active types used by FPGA kernels that fuse activations; the empty
// string maps to TYPE_NONE.
// NOTE(review): a non-inline `static` map defined in a header gives every
// translation unit its own copy — confirm this duplication is intentional.
static std::map<std::string, zynqmp::ActiveType> activation_map = {
    {"relu", zynqmp::TYPE_RELU},
    {"relu6", zynqmp::TYPE_RELU6},
    {"leaky_relu", zynqmp::TYPE_LEAKY_RELU},
    {"sigmoid", zynqmp::TYPE_SIGMOID},
    {"", zynqmp::TYPE_NONE}};
class ReluCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
......@@ -40,6 +49,16 @@ class ReluCompute
zynqmp::Tensor output_;
};
// CPU-fallback sigmoid kernel registered for the FPGA target (FP16/NHWC);
// the element-wise computation lives in SigmoidCompute::Run.
class SigmoidCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::ActivationParam;

  void Run() override;

  virtual ~SigmoidCompute() = default;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
......
......@@ -12,14 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/activation_compute.h"
#include <gtest/gtest.h>
#include <memory>
#include <utility>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/kernels/fpga/activation_compute.h"
namespace paddle {
namespace lite {
......@@ -39,7 +37,8 @@ void activation_compute_ref(const operators::ActivationParam& param) {
}
TEST(activation_fpga, retrive_op) {
auto activation = KernelRegistry::Global().Create("relu");
auto activation =
KernelRegistry::Global().Create<TARGET(kFPGA), PRECISION(kFP16)>("relu");
ASSERT_FALSE(activation.empty());
ASSERT_TRUE(activation.front());
}
......
......@@ -44,6 +44,16 @@ void CalibComputeFP16ToFp32::Run() {
return;
}
// Calib kernel for float -> int32.
// NOTE(review): `din` and `dout` are fetched but no element-wise conversion
// is performed — only the LoD is propagated, and the copyFrom call is
// commented out. Confirm whether the conversion is handled elsewhere in the
// FPGA runtime or this kernel is unfinished.
void CalibComputeFloat2Int::Run() {
  auto& param = this->Param<operators::CalibParam>();
  const auto* din = param.input->data<float>();
  auto* dout = param.output->mutable_data<int>();
  // param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
  auto out_lod = param.output->mutable_lod();
  *out_lod = param.input->lod();
  return;
}
} // namespace fpga
} // namespace kernels
} // namespace lite
......@@ -58,10 +68,26 @@ REGISTER_LITE_KERNEL(calib,
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Register the float -> int32 calib variant.
// NOTE(review): registered under the kFPGA/kFP16/kNHWC kernel key but binds
// ARM float NCHW input and ARM int32 NCHW output — confirm the mismatch
// between the kernel key and the bound tensor types is intended.
REGISTER_LITE_KERNEL(calib,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::CalibComputeFloat2Int,
                     float_2_int_fpga)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kARM),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kNCHW))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kARM),
                                       PRECISION(kInt32),
                                       DATALAYOUT(kNCHW))})
    .Finalize();
......@@ -70,7 +96,7 @@ REGISTER_LITE_KERNEL(calib,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::CalibComputeFP16ToFp32,
fp16_to_fp32_fpga)
float_to_int_fpga)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......
......@@ -45,6 +45,18 @@ class CalibComputeFP16ToFp32
private:
};
// Calib kernel class converting host float tensors to int32; see the
// float_2_int_fpga registration in the matching .cc for its bound types.
class CalibComputeFloat2Int
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::CalibParam;

  void Run() override;

  ~CalibComputeFloat2Int() override{};

 private:
};
} // namespace fpga
} // namespace kernels
} // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/cast_compute.h"
#include <algorithm>
// #include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
// Element-wise conversion helper: casts one value from `in_type` to
// `out_type` with static_cast semantics (truncation for float -> int).
template <class in_type, class out_type>
out_type TransOp(in_type in) {
  const out_type converted = static_cast<out_type>(in);
  return converted;
}
void CastCompute::PrepareForRun() {}
void CastCompute::Run() {
auto& ctx = this->ctx_->template As<ARMContext>();
auto& param = this->Param<operators::CastParam>();
auto input_dims = param.X->dims();
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register the cast kernel for the FPGA target.
// NOTE(review): the kernel key is kFP16/kNHWC but the "X" input is declared
// kFloat — confirm the precision mismatch is intended.
REGISTER_LITE_KERNEL(
    cast, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::CastCompute, def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kNHWC))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include "lite/backends/arm/math/type_trans.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
// Cast kernel class for the FPGA target. Run() is currently a stub that
// performs no conversion (see cast_compute.cc).
class CastCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::CastParam;

  void PrepareForRun() override;

  void Run() override;

  ~CastCompute() {}

 private:
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -25,12 +25,46 @@ namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
using lite_api::ActivationType;
void ConvCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
param.output->mutable_data<float16>();
int pad_h = (*param.paddings)[0];
int pad_w = (*param.paddings)[2];
zynqmp::ActiveType active_type = zynqmp::TYPE_NONE;
float leaky_relu_factor = 0;
switch (param.activation_param.active_type) {
case ActivationType::kIndentity:
active_type = zynqmp::TYPE_NONE;
break;
case ActivationType::kRelu:
active_type = zynqmp::TYPE_RELU;
break;
case ActivationType::kRelu6:
active_type = zynqmp::TYPE_RELU6;
break;
case ActivationType::kPRelu:
case ActivationType::kLeakyRelu:
active_type = zynqmp::TYPE_LEAKY_RELU;
leaky_relu_factor = param.activation_param.Leaky_relu_alpha;
break;
case ActivationType::kSigmoid:
active_type = zynqmp::TYPE_SIGMOID;
break;
case ActivationType::kTanh:
case ActivationType::kSwish:
case ActivationType::kExp:
case ActivationType::kAbs:
case ActivationType::kHardSwish:
case ActivationType::kReciprocal:
default:
throw("not supported activation");
break;
}
// ====================================================
if (param.x->ZynqTensor()->shape().channel() != 1 &&
param.groups == param.x->ZynqTensor()->shape().channel()) {
......@@ -45,12 +79,13 @@ void ConvCompute::PrepareForRun() {
conv_param.paddings = std::vector<int>({pad_h, pad_w});
conv_param.dilations = *param.dilations;
fill_scale_bias_const(&conv_param);
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
if (param.fuse_relu) {
conv_param.activeParam.type = zynqmp::TYPE_RELU;
if (param.bias != nullptr) {
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
}
conv_param.activeParam.type = active_type;
conv_param.activeParam.leaky_relu_factor = leaky_relu_factor;
dw_conv_pe_.init();
dw_conv_pe_.apply();
} else {
......@@ -68,15 +103,8 @@ void ConvCompute::PrepareForRun() {
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
}
if (param.fuse_relu) {
conv_param.activeParam.type = zynqmp::TYPE_RELU;
}
// conv_param.filter->saveToFile("conv_filter_", true);
// if (param.bias != nullptr) {
// std::cout << "param.bias != nullptr" << std::endl;
// conv_param.bias()->saveToFile("conv_bias_", true);
// }
conv_param.activeParam.type = active_type;
conv_param.activeParam.leaky_relu_factor = leaky_relu_factor;
conv_pe_.init();
conv_pe_.apply();
......@@ -93,9 +121,7 @@ void ConvCompute::Run() {
Debugger::get_instance().registerOutput("dwconv", dwconv_param.output);
#endif
} else {
// zynqmp::ConvParam& conv_param = conv_pe_.param();
conv_pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ConvParam& conv_param = conv_pe_.param();
Debugger::get_instance().registerOutput("conv", conv_param.output);
......
......@@ -35,8 +35,6 @@ class ConvCompute
void Run() override;
~ConvCompute() {}
private:
zynqmp::ConvPE conv_pe_;
zynqmp::DepthwiseConvPE dw_conv_pe_;
......
......@@ -16,6 +16,7 @@
#include <string>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/kernels/fpga/activation_compute.h"
namespace paddle {
namespace lite {
......@@ -29,11 +30,9 @@ void ElementwiseAddCompute::PrepareForRun() {
auto& param = Param<operators::ElementwiseParam>();
param.Out->mutable_data<float16>();
ew_param.inputs = {param.X->ZynqTensor(), param.Y->ZynqTensor()};
ew_param.output = param.Out->ZynqTensor();
ew_param.axis = param.axis;
ew_param.activeParam.type = zynqmp::TYPE_NONE;
pe_.init();
......@@ -50,14 +49,17 @@ void ElementwiseAddCompute::Run() {
void ElementwiseAddActivationCompute::PrepareForRun() {
zynqmp::ElementwiseAddParam& ew_param = pe_.param();
auto& param = Param<operators::FusionElementwiseActivationParam>();
if (param.act_type != "relu") {
if (activation_map.count(param.act_type)) {
ew_param.activeParam.type = activation_map[param.act_type];
} else {
LOG(FATAL) << "unsupported Activation type: " << param.act_type;
}
param.Out->mutable_data<float16>();
ew_param.inputs = {param.X->ZynqTensor(), param.Y->ZynqTensor()};
ew_param.output = param.Out->ZynqTensor();
ew_param.axis = param.axis;
ew_param.activeParam.type = zynqmp::TYPE_RELU;
pe_.init();
pe_.apply();
}
......@@ -76,25 +78,33 @@ void ElementwiseMulCompute::PrepareForRun() {
scale_param.input = param.X->ZynqTensor();
scale_param.output = param.Out->ZynqTensor();
scale_param.activeParam.type = zynqmp::TYPE_NONE;
int channel = scale_param.input->shape().channel();
zynqmp::Tensor* scale = new zynqmp::Tensor();
zynqmp::Tensor* bias = new zynqmp::Tensor();
scale_param.scale = scale;
scale_param.bias = bias;
scale_param.scale = &scale_;
scale_param.bias = &bias_;
zynqmp::Shape shape(zynqmp::N, {channel});
float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape);
float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape);
float scale_value = param.Y->data<float>()[0];
zynqmp::float16* scale_data =
scale_.mutableData<zynqmp::float16>(zynqmp::FP16, shape);
zynqmp::float16* bias_data =
bias_.mutableData<zynqmp::float16>(zynqmp::FP16, shape);
zynqmp::float16 scale_value = 0;
if (param.Y->ZynqTensor()->dataType() == zynqmp::FP32) {
scale_value = zynqmp::float_to_half(param.Y->data<float>()[0]);
} else {
scale_value = param.Y->data<zynqmp::float16>()[0];
}
for (int i = 0; i < channel; ++i) {
for (int i = 0; i < channel; i++) {
if (param.Y->dims().production() != 1) {
scale_value = param.Y->ZynqTensor()->data<float>()[i];
if (param.Y->ZynqTensor()->dataType() == zynqmp::FP32) {
scale_value = zynqmp::float_to_half(param.Y->data<float>()[i]);
} else {
scale_value = param.Y->data<zynqmp::float16>()[i];
}
}
scale_data[i] = scale_value;
bias_data[i] = 0;
bias_data[i] = zero_;
}
pe_.init();
......@@ -102,10 +112,18 @@ void ElementwiseMulCompute::PrepareForRun() {
}
void ElementwiseMulCompute::Run() {
auto& param = Param<operators::ElementwiseParam>();
if (!param.Y->persistable()) {
// TODO(chonwhite) alignment;
param.Y->ZynqTensor()->invalidate();
scale_.copyFrom(param.Y->ZynqTensor());
scale_.flush();
}
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ScaleParam& scale_param = pe_.param();
Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input);
Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
#endif
}
......@@ -161,7 +179,27 @@ REGISTER_LITE_KERNEL(elementwise_mul,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::ElementwiseMulCompute,
def)
ew_mul_fpga)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_mul,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::ElementwiseMulCompute,
ew_mul_y_arm)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......
......@@ -61,6 +61,9 @@ class ElementwiseMulCompute
private:
zynqmp::ScalePE pe_;
zynqmp::Tensor scale_;
zynqmp::Tensor bias_;
zynqmp::float16 zero_ = zynqmp::float_to_half(0.0f);
};
} // namespace fpga
......
......@@ -13,6 +13,8 @@
// limitations under the License.
#include "lite/kernels/fpga/fc_compute.h"
#include "lite/kernels/fpga/activation_compute.h"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
......@@ -35,6 +37,11 @@ void FcCompute::PrepareForRun() {
fc_param.output = param.output->ZynqTensor();
fc_param.filter = param.w->ZynqTensor();
fc_param.bias = param.bias->ZynqTensor();
fc_param.bias->flush();
if (activation_map.count(param.activation_type)) {
fc_param.activeParam.type = activation_map[param.activation_type];
}
pe_.init();
pe_.apply();
......@@ -42,6 +49,7 @@ void FcCompute::PrepareForRun() {
void FcCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::FullyConnectedParam& fc_param = pe_.param();
Debugger::get_instance().registerOutput("fc", fc_param.output);
......
......@@ -12,17 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/fc_compute.h"
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#include <memory>
#include <random>
#include <utility>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/kernels/fpga/fc_compute.h"
namespace paddle {
namespace lite {
......@@ -78,7 +76,8 @@ void FillData(T* a,
}
TEST(fc_fpga, retrive_op) {
auto fc = KernelRegistry::Global().Create("fc");
auto fc =
KernelRegistry::Global().Create<TARGET(kFPGA), PRECISION(kFP16)>("fc");
ASSERT_FALSE(fc.empty());
ASSERT_TRUE(fc.front());
}
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/kernels/fpga/feed_compute.h"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
......@@ -28,7 +29,14 @@ void FeedCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
Tensor& x = param.feed_list->at(param.col);
param.out->Resize(x.dims());
param.out->mutable_data<float16>();
auto in_type = x.ZynqTensor()->dataType();
if (in_type == zynqmp::FP32 || in_type == zynqmp::FP16) {
param.out->mutable_data<float16>();
}
if (in_type == zynqmp::INT32) {
param.out->mutable_data<int32_t>();
}
// ====================================================
zynqmp::InputParam& feed_param = pe_.param();
feed_param.input = x.ZynqTensor();
......@@ -40,8 +48,8 @@ void FeedCompute::PrepareForRun() {
void FeedCompute::Run() {
auto& param = this->Param<param_t>();
Tensor& x = param.feed_list->at(param.col);
pe_.param().input = x.ZynqTensor();
pe_.dispatch();
auto out_lod = param.out->mutable_lod();
*out_lod = x.lod();
......@@ -61,7 +69,7 @@ REGISTER_LITE_KERNEL(
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......@@ -73,7 +81,13 @@ REGISTER_LITE_KERNEL(feed,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::FeedCompute,
def_host)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
feed_int32)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt32),
DATALAYOUT(kNCHW))})
.Finalize();
......@@ -12,7 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include "lite/backends/fpga/KD/pes/input_pe.hpp"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
......@@ -26,6 +32,34 @@ class FeedCompute
public:
using param_t = operators::FeedParam;
// Maps the host-side input type of `feed` to the FPGA-side output type:
// float host tensors are promoted to FP16/NHWC for the FPGA, any other
// precision keeps its precision/layout; only the target becomes kFPGA.
std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
  std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
  *res = [](const std::map<std::string, const Type*>& inputs,
            const std::string& out) -> const Type* {
    CHECK(!inputs.empty());
    // NOTE(review): the original body printed the type pointer and then
    // called exit(-1) here -- leftover debug code that aborted the whole
    // process whenever this handler ran. It has been removed so the real
    // inference logic below is reachable.
    auto* type = inputs.at("Input");  // TODO(review): confirm the key; the
                                      // kernel's declared input slot is "X".
    CHECK(type->target() == TARGET(kHost));
    auto in_place = type->place();
    auto target = TARGET(kFPGA);
    auto precision = in_place.precision;
    auto layout = in_place.layout;
    if (in_place.precision == PRECISION(kFloat)) {
      precision = PRECISION(kFP16);
      layout = DATALAYOUT(kNHWC);
    }
    auto* out_type =
        Type::Get(type->id(), target, precision, layout, in_place.device);
    return out_type;
  };
  return res;
}
void PrepareForRun() override;
void Run() override;
......
......@@ -23,17 +23,36 @@ namespace fpga {
using float16 = zynqmp::float16;
// Resizes `out` to `input`'s dims and allocates a host buffer whose element
// type mirrors the zynq tensor's data type: FP16/FP32 both map to float,
// INT32 to int32_t, INT64 to int64_t; any other type allocates nothing.
void resize_output(const Tensor* input, Tensor& out) {  // NOLINT
  out.Resize(input->dims());
  const auto dtype = input->ZynqTensor()->dataType();
  if (dtype == zynqmp::FP16 || dtype == zynqmp::FP32) {
    out.mutable_data<float>();
  } else if (dtype == zynqmp::INT32) {
    out.mutable_data<int32_t>();
  } else if (dtype == zynqmp::INT64) {
    out.mutable_data<int64_t>();
  }
}
void FetchCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
// ====================================================
zynqmp::OutputParam& fetch_param = pe_.param();
auto fetch_list = param.fetch_list;
if (fetch_list->size() <= static_cast<size_t>(param.col)) {
fetch_list->resize(param.col + 1);
}
Tensor& out = param.fetch_list->at(param.col);
out.Resize(param.input->dims());
out.mutable_data<float>();
resize_output(param.input, out);
fetch_param.input = param.input->ZynqTensor();
fetch_param.output = out.ZynqTensor();
......@@ -48,13 +67,16 @@ void FetchCompute::Run() {
if (fetch_list->size() <= static_cast<size_t>(param.col)) {
fetch_list->resize(param.col + 1);
}
Tensor& out = param.fetch_list->at(param.col);
out.Resize(param.input->dims());
resize_output(param.input, out);
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::OutputParam& fetch_param = pe_.param();
Debugger::get_instance().registerOutput("fetch", fetch_param.output);
Debugger::get_instance().setEnable(true);
#endif
}
......@@ -63,19 +85,6 @@ void FetchCompute::Run() {
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(fetch,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::FetchCompute,
fpga_host)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
REGISTER_LITE_KERNEL(fetch,
kFPGA,
kFP16,
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/interpolate_compute.h"
#include <string>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
// Intentional no-op: bilinear interpolation has no FPGA implementation here;
// the kernel is registered but performs no computation when dispatched.
void BilinearInterpCompute::Run() {}
// Nearest-neighbour resize of one NHWC fp16 image: for every output pixel,
// copies the c-channel vector of the nearest source pixel.
// `scale_x`/`scale_y` are kept for interface compatibility only; the
// effective scales are recomputed from the in/out dimensions below.
// `with_align` selects align-corners mapping (round-to-nearest) versus the
// plain truncating mapping.
void nearest_interp(const float16* src,
                    int w_in,
                    int h_in,
                    int c,
                    float16* dst,
                    int w_out,
                    int h_out,
                    float scale_x,
                    float scale_y,
                    bool with_align) {
  float scale_w_new = (with_align)
                          ? (static_cast<float>(w_in - 1) / (w_out - 1))
                          : (static_cast<float>(w_in) / (w_out));
  float scale_h_new = (with_align)
                          ? (static_cast<float>(h_in - 1) / (h_out - 1))
                          : (static_cast<float>(h_in) / (h_out));
  // Align-corners rounds to the nearest source index; otherwise truncate.
  const float round_off = with_align ? 0.5f : 0.0f;
  for (int h = 0; h < h_out; ++h) {
    // Fixed: the non-aligned path used `dst + h * w_out` (missing `* c`),
    // which advanced rows by the wrong stride and corrupted every output
    // with more than one channel.
    float16* dst_p = dst + h * w_out * c;
    int near_y = static_cast<int>(scale_h_new * h + round_off);
    for (int w = 0; w < w_out; ++w) {
      int near_x = static_cast<int>(scale_w_new * w + round_off);
      const float16* src_n = src + (near_y * w_in + near_x) * c;
      memcpy(dst_p, src_n, c * sizeof(float16));
      dst_p += c;
    }
  }
}
void NearestInterpCompute::PrepareForRun() {
auto& param = Param<operators::InterpolateParam>();
lite::Tensor* X = param.X;
lite::Tensor* OutSize = param.OutSize;
lite::Tensor* Out = param.Out;
Out->mutable_data<float16>();
zynqmp::ResizeParam& norm_param = pe_.param();
norm_param.input = X->ZynqTensor();
norm_param.output = Out->ZynqTensor();
pe_.init();
pe_.apply();
}
// Reads the first int32 element of each shape tensor and collects them into
// a plain vector (one extent per tensor).
inline std::vector<int> get_new_shape(
    std::vector<const lite::Tensor*> list_new_shape_tensor) {
  std::vector<int> vec_new_shape;
  vec_new_shape.reserve(list_new_shape_tensor.size());
  for (const auto* tensor : list_new_shape_tensor) {
    vec_new_shape.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
  }
  return vec_new_shape;
}
// Copies the entire contents of `new_data_tensor` into a std::vector<T>.
// Assumes the tensor's data is host-accessible as T.
template <typename T>
inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
  const auto* new_data = new_data_tensor->data<T>();
  // NOTE: an unused local `lite::Tensor cpu_starts_tensor` was declared here
  // in the original; it served no purpose and has been removed.
  return std::vector<T>(new_data,
                        new_data + new_data_tensor->dims().production());
}
// CPU nearest-neighbour interpolation over an fp16 tensor laid out NHWC in
// memory (logical dims are NCHW: {n, c, h, w}).
// Output size resolution priority: SizeTensor > Scale tensor / `scale`
// scalar > OutSize tensor. `interpolate_type` is currently unused.
void interpolate(lite::Tensor* X,
                 lite::Tensor* OutSize,
                 std::vector<const lite::Tensor*> SizeTensor,
                 lite::Tensor* Scale,
                 lite::Tensor* Out,
                 int out_height,
                 int out_width,
                 float scale,
                 bool with_align,
                 std::string interpolate_type) {
  int in_h = X->dims()[2];
  int in_w = X->dims()[3];
  if (SizeTensor.size() > 0) {
    auto new_size = get_new_shape(SizeTensor);
    out_height = new_size[0];
    out_width = new_size[1];
  } else {
    auto scale_tensor = Scale;
    if (scale_tensor != nullptr) {
      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
      scale = scale_data[0];
    }
    if (scale > 0) {
      out_height = static_cast<int>(in_h * scale);
      out_width = static_cast<int>(in_w * scale);
    }
    auto out_size = OutSize;
    if (out_size != nullptr) {
      auto out_size_data = get_new_data_from_tensor<int>(out_size);
      out_height = out_size_data[0];
      out_width = out_size_data[1];
    }
  }
  float height_scale = scale;
  float width_scale = scale;
  if (out_width > 0 && out_height > 0) {
    // Fixed: the original cast AFTER dividing two integers
    // (static_cast<float>(out_height / dims[2])), truncating the ratio to a
    // whole number whenever out/in was not an exact multiple.
    height_scale = static_cast<float>(out_height) / static_cast<float>(in_h);
    width_scale = static_cast<float>(out_width) / static_cast<float>(in_w);
  }
  int num_cout = X->dims()[0];
  int c_cout = X->dims()[1];
  Out->Resize({num_cout, c_cout, out_height, out_width});

  float16* dout = Out->mutable_data<float16>();
  const float16* din = X->data<float16>();
  int out_num = Out->dims()[0];
  int out_c = Out->dims()[1];
  int out_h = Out->dims()[2];
  int out_w = Out->dims()[3];
  // Per-image strides. Fixed: the memory is NHWC (nearest_interp addresses
  // pixels as (y * w + x) * c), so one batch element spans h * w * c values;
  // the original advanced by h * w only, touching the wrong memory for every
  // batch index > 0.
  int spatial_in = in_h * in_w * out_c;
  int spatial_out = out_h * out_w * out_c;
  for (int i = 0; i < out_num; ++i) {
    nearest_interp(din + spatial_in * i,
                   in_w,
                   in_h,
                   out_c,
                   dout + spatial_out * i,
                   out_w,
                   out_h,
                   1.f / width_scale,
                   1.f / height_scale,
                   with_align);
  }
}
void NearestInterpCompute::Run() {
auto& param = Param<operators::InterpolateParam>();
lite::Tensor* X = param.X;
lite::Tensor* OutSize = param.OutSize;
auto SizeTensor = param.SizeTensor;
auto Scale = param.Scale;
lite::Tensor* Out = param.Out;
float scale = param.scale;
int out_w = param.out_w;
int out_h = param.out_h;
bool align_corners = param.align_corners;
std::string interp_method = "";
X->ZynqTensor()->syncToCPU();
interpolate(X,
OutSize,
SizeTensor,
Scale,
Out,
out_h,
out_w,
scale,
align_corners,
interp_method);
Out->ZynqTensor()->flush();
Out->ZynqTensor()->copyScaleFrom(X->ZynqTensor());
}
} /* namespace fpga */
} /* namespace kernels */
} /* namespace lite */
} /* namespace paddle */
// Registers the (no-op) bilinear_interp kernel for the FPGA target: FP16
// NHWC tensors on the FPGA for X/Out, size/scale side-inputs on ARM.
REGISTER_LITE_KERNEL(bilinear_interp,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::BilinearInterpCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Registers the nearest_interp kernel (CPU fallback implementation above)
// with the same tensor placement as bilinear_interp.
REGISTER_LITE_KERNEL(nearest_interp,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::NearestInterpCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/backends/fpga/KD/pes/resize_pe.hpp"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
// FPGA kernel shell for bilinear_interp. Run() is an empty stub in the
// corresponding .cc file -- no bilinear computation is performed yet.
class BilinearInterpCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void Run() override;
virtual ~BilinearInterpCompute() = default;
};
// FPGA kernel for nearest_interp. PrepareForRun() wires up a zynqmp ResizePE,
// while Run() (see the .cc file) performs the interpolation on the CPU after
// syncing the input tensor to host memory.
class NearestInterpCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
virtual ~NearestInterpCompute() = default;
private:
// Processing element used to configure the resize on the FPGA side.
zynqmp::ResizePE pe_;
};
} /* namespace fpga */
} /* namespace kernels */
} /* namespace lite */
} /* namespace paddle */
......@@ -31,26 +31,67 @@ void PoolCompute::PrepareForRun() {
auto& param = Param<operators::PoolParam>();
param.output->mutable_data<float16>();
zynqmp::PoolingParam& pool_param = pe_.param();
pool_param.input = param.x->ZynqTensor();
pool_param.output = param.output->ZynqTensor();
pool_param.activeParam.type = zynqmp::TYPE_RELU;
pool_param.type = param.pooling_type == "max" ? zynqmp::PoolingType::MAX
: zynqmp::PoolingType::AVERAGE;
pool_param.globalPooling = param.global_pooling;
pool_param.kernelSize = param.ksize;
pool_param.strides = param.strides;
int pad_h = (*param.paddings)[0];
int pad_w = (*param.paddings)[2];
pool_param.paddings = std::vector<int>({pad_h, pad_w});
pe_.init();
pe_.apply();
int h_kernel = param.ksize[0];
int w_kernel = param.ksize[1];
if (param.global_pooling) {
h_kernel = param.x->ZynqTensor()->shape().height();
w_kernel = param.x->ZynqTensor()->shape().width();
}
int c = param.x->ZynqTensor()->shape().channel();
int w = param.x->ZynqTensor()->shape().width();
int wc_h_kernel = w * c * h_kernel;
int dwconv_limit = 131072;
int num = ceil(wc_h_kernel * 1.0f / dwconv_limit);
split_num_ = num;
if (num == 1) {
zynqmp::PoolingParam& pool_param = pe_.param();
pool_param.input = param.x->ZynqTensor();
pool_param.output = param.output->ZynqTensor();
pool_param.type = param.pooling_type == "max"
? zynqmp::PoolingType::MAX
: zynqmp::PoolingType::AVERAGE;
pool_param.globalPooling = param.global_pooling;
pool_param.kernelSize = param.ksize;
pool_param.strides = param.strides;
int pad_h = (*param.paddings)[0];
int pad_w = (*param.paddings)[2];
pool_param.paddings = std::vector<int>({pad_h, pad_w});
pe_.init();
pe_.apply();
} else {
zynqmp::PoolingParam& pool_param = split_pe_.param();
pool_param.input = param.x->ZynqTensor();
pool_param.output = param.output->ZynqTensor();
pool_param.type = param.pooling_type == "max"
? zynqmp::PoolingType::MAX
: zynqmp::PoolingType::AVERAGE;
pool_param.globalPooling = param.global_pooling;
pool_param.kernelSize = param.ksize;
pool_param.strides = param.strides;
int pad_h = (*param.paddings)[0];
int pad_w = (*param.paddings)[2];
pool_param.paddings = std::vector<int>({pad_h, pad_w});
split_pe_.init();
split_pe_.apply();
}
}
void PoolCompute::Run() {
pe_.dispatch();
if (split_num_ == 1) {
zynqmp::PoolingParam& pool_param = pe_.param();
pe_.dispatch();
} else {
split_pe_.dispatch();
zynqmp::PoolingParam& pool_param = split_pe_.param();
}
#ifdef FPGA_PRINT_TENSOR
zynqmp::PoolingParam& pool_param = pe_.param();
Debugger::get_instance().registerOutput("pooling", pool_param.output);
......
......@@ -16,6 +16,7 @@
#include <algorithm>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/pooling_pe.hpp"
#include "lite/backends/fpga/KD/pes/pooling_split_pe.hpp"
#include "lite/core/kernel.h"
#include "lite/operators/pool_op.h"
......@@ -36,6 +37,8 @@ class PoolCompute
private:
zynqmp::PoolingPE pe_;
zynqmp::PoolingSplitPE split_pe_;
int split_num_ = 1;
};
} // namespace fpga
......
......@@ -12,15 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/pooling_compute.h"
#include <gtest/gtest.h>
#include <limits>
#include <string>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/core/op_registry.h"
#include "lite/kernels/fpga/pooling_compute.h"
namespace paddle {
namespace lite {
......@@ -278,7 +277,8 @@ TEST(pool_fpga, compute) {
}
TEST(pool_fpga, retrive_op) {
auto pool = KernelRegistry::Global().Create("pool2d");
auto pool = KernelRegistry::Global().Create<TARGET(kFPGA), PRECISION(kFP16)>(
"pool2d");
ASSERT_FALSE(pool.empty());
ASSERT_TRUE(pool.front());
}
......
......@@ -64,7 +64,7 @@ void PriorBoxCompute::PrepareForRun() {
float offset = param.offset;
std::vector<float> aspect_ratios_vec;
ExpandAspectRatios(aspect_ratio, is_flip, &aspect_ratios_vec);
size_t prior_num = aspect_ratios_vec.size() * min_size.size();
int prior_num = aspect_ratios_vec.size() * min_size.size();
prior_num += max_size.size();
std::vector<std::string> order = param.order;
bool min_max_aspect_ratios_order = param.min_max_aspect_ratios_order;
......@@ -78,6 +78,7 @@ void PriorBoxCompute::PrepareForRun() {
param.boxes->mutable_data<float>();
param.variances->mutable_data<float>();
zynqmp::PriorBoxParam& priobox_param = pe_.param();
priobox_param.input = param.input->ZynqTensor();
priobox_param.image = param.image->ZynqTensor();
......
......@@ -25,14 +25,24 @@ namespace fpga {
class ReshapeCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
virtual ~ReshapeCompute() = default;
};
class FlattenCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void Run() override;
virtual ~FlattenCompute() = default;
};
class ReshapeComputeFpgaToHost
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
virtual ~ReshapeComputeFpgaToHost() = default;
......
......@@ -29,8 +29,8 @@ void ScaleCompute::PrepareForRun() {
scale_param.output = param.output->ZynqTensor();
int channel = scale_param.input->shape().channel();
zynqmp::Tensor* scale = new zynqmp::Tensor();
zynqmp::Tensor* bias = new zynqmp::Tensor();
zynqmp::Tensor* scale = &scale_;
zynqmp::Tensor* bias = &bias_;
zynqmp::Shape shape(zynqmp::N, {channel});
float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape);
float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape);
......
......@@ -37,6 +37,8 @@ class ScaleCompute
private:
zynqmp::ScalePE pe_;
zynqmp::Tensor scale_;
zynqmp::Tensor bias_;
};
} // namespace fpga
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -19,7 +19,11 @@ cmake .. \
-DLITE_WITH_OPENMP=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-DWITH_TESTING=OFF \
-DARM_TARGET_OS=armlinux
-DARM_TARGET_OS=armlinux \
-DLITE_BUILD_EXTRA=ON \
-DLITE_WITH_PYTHON=OFF \
-DLITE_WITH_PROFILE=OFF \
-DLITE_WITH_LOG=OFF
make -j8
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册