提交 2bca7447 编写于 作者: qnqinan's avatar qnqinan 提交者: GitHub

Merge pull request #1300 from zhangyang0701/develop

 unify V1 & V2 style for FPGA track 
......@@ -71,10 +71,10 @@ const char *G_OP_TYPE_SUM = "sum";
const char *G_OP_TYPE_QUANTIZE = "quantize";
const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
extern const char *G_OP_TYPE_TANH = "tanh";
extern const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
extern const char *G_OP_TYPE_FUSION_DECONV_ADD = "fusion_deconv_add";
extern const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu";
const char *G_OP_TYPE_TANH = "tanh";
const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
const char *G_OP_TYPE_FUSION_DECONV_ADD = "fusion_deconv_add";
const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu";
std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......
......@@ -13,251 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V1/api.h"
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <algorithm>
#include <map>
#include "fpga/V1/bias_scale.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
#define FPGA_TEST_MODE
#define PADDLE_MOBILE_OS_LINUX
namespace paddle_mobile {
namespace fpga {
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
static std::map<void *, size_t> memory_map;
static inline int do_ioctl(int req, const void *arg) {
#ifdef PADDLE_MOBILE_OS_LINUX
int result = ioctl(fd, req, (uint64_t)arg);
PADDLE_MOBILE_ENFORCE(result == 0, "ioctl didn't return correctly");
return result;
#else
return -1;
#endif
}
int open_device() {
if (fd == -1) {
fd = open(device_path, O_RDWR);
}
return fd;
}
// memory management;
void *fpga_malloc(size_t size) {
static uint64_t counter = 0;
#ifdef PADDLE_MOBILE_OS_LINUX
auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
#else
auto ptr = malloc(size);
#endif
counter += size;
memory_map.insert(std::make_pair(ptr, size));
// DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
// << counter << " bytes";
return ptr;
}
void fpga_free(void *ptr) {
static uint64_t counter = 0;
size_t size = 0;
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
if (iter != memory_map.end()) {
size = iter->second;
memory_map.erase(iter);
#ifdef PADDLE_MOBILE_OS_LINUX
munmap(ptr, size);
#else
free(ptr);
#endif
counter += size;
// DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
// << counter << " bytes";
} else {
DLOG << "Invalid pointer";
}
}
void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int fpga_flush(void *address, size_t size) {
struct MemoryCacheArgs args = {nullptr};
args.address = address;
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
}
int fpga_invalidate(void *address, size_t size) {
struct MemoryCacheArgs args = {nullptr};
args.address = address;
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
}
half fp32_2_fp16(float fp32_num) {
unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT
half t = ((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
(((tmp & 0x7f800000) >> 13) - (112 << 10));
if (tmp & 0x1000) {
t++; // roundoff
}
return t;
}
float fp16_2_fp32(half fp16_num) {
int frac = (fp16_num & 0x3ff);
int exp = ((fp16_num & 0x7c00) >> 10) + 112;
int s = fp16_num & 0x8000;
int tmp = 0;
float fp32_num;
tmp = s << 16 | exp << 23 | frac << 13;
fp32_num = *(float *)&tmp; // NOLINT
return fp32_num;
}
int ComputeBasicConv(const struct ConvArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "======Compute Basic Conv======";
DLOG << " relu_enabled:" << args.relu_enabled
<< " sb_address:" << args.sb_address
<< " filter_address:" << args.filter_address
<< " filter_num:" << args.filter_num
<< " group_num:" << args.group_num;
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
int ComputeFpgaConv(const struct SplitConvArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFPGAConv===========";
DLOG << " filter_num:" << args.filter_num
<< " group_num:" << args.group_num
<< " split_num:" << args.split_num;
#endif
int split_num = args.split_num;
for (int i = 0; i < split_num; i++) {
ComputeBasicConv(args.conv_args[i]);
}
if (split_num > 1) {
ComputeFPGAConcat(args.concat_arg);
}
}
int ComputeFpgaPool(const struct PoolingArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaPool===========";
DLOG << " mode:" << args.mode
<< " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_POOLING, &args);
}
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaEWAdd===========";
DLOG << " relu_enabled:" << args.relu_enabled
<< " const0:" << fp16_2_fp32(int16_t(args.const0))
<< " const1:" << fp16_2_fp32(int16_t(args.const1));
DLOG << " image0_address:" << args.image0.address
<< " image0_scale_address:" << args.image0.scale_address
<< " image0_channels:" << args.image0.channels
<< " image0_height:" << args.image0.height
<< " image0_width:" << args.image0.width
<< " pad0_height:" << args.image0.pad_height
<< " pad0_width:" << args.image0.pad_width;
DLOG << " image1_address:" << args.image1.address
<< " image1_scale_address:" << args.image1.scale_address
<< " image1_channels:" << args.image1.channels
<< " image1_height:" << args.image1.height
<< " image1_width:" << args.image1.width
<< " pad1_height:" << args.image1.pad_height
<< " pad_width:" << args.image1.pad_width;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_EW, &args);
}
int PerformBypass(const struct BypassArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaBypass===========";
DLOG << " input_type:" << args.input_data_type
<< " output_type:" << args.output_data_type
<< " input_layout_type:" << args.input_layout_type
<< " output_layout_type:" << args.output_layout_type;
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
}
int ComputeFPGAConcat(const struct ConcatArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaConcat===========";
DLOG << " Image_num: " << args.image_num
<< " out_address:" << args.image_out
<< " out_scale_address:" << args.scale_out;
DLOG << " image_height:" << args.height << " image_width:" << args.width;
for (int i = 0; i < args.image_num; i++) {
DLOG << " " << i << "th: ";
DLOG << " channel_num:" << args.channel_num[i]
<< " image_address:" << args.images_in[i]
<< " image_scale_address:" << args.scales_in[i];
}
#endif
image::concat_images(args.images_in, args.scales_in, args.image_out,
args.scale_out, args.image_num, args.channel_num,
args.height, args.width);
return 0;
}
int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); }
void format_image(framework::Tensor *image_tensor) {
......@@ -397,7 +159,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->filter_num = (uint32_t)filter->dims()[0];
arg->output.address = out_ptr;
arg->output.scale_address = out->scale;
arg->conv_args =
arg->conv_arg =
(ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT
arg->concat_arg.image_num = arg->split_num;
......@@ -420,44 +182,44 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
for (int i = 0; i < n; i++) {
arg->conv_args[i].relu_enabled = relu_enabled;
arg->conv_args[i].group_num = (uint32_t)group_num;
arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h;
arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w;
arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
arg->conv_args[i].image.address = input_ptr;
arg->conv_args[i].image.channels = (uint32_t)input->dims()[1];
arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
arg->conv_args[i].image.scale_address = input->scale;
arg->conv_args[i].image.pad_height = (uint32_t)padding_h;
arg->conv_args[i].image.pad_width = (uint32_t)padding_w;
arg->conv_args[i].filter_scale_address = filter->scale;
arg->conv_args[i].filter_address = &(
arg->conv_arg[i].relu_enabled = relu_enabled;
arg->conv_arg[i].group_num = (uint32_t)group_num;
arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2];
arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3];
arg->conv_arg[i].image.address = input_ptr;
arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
arg->conv_arg[i].image.scale_address = input->scale;
arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
arg->conv_arg[i].filter_scale_address = filter->scale;
arg->conv_arg[i].filter_address = &(
(int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT
arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_args[i].filter_num = (uint32_t)(
arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].filter_num = (uint32_t)(
i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT
: filter_num_per_div);
if (n > 1) {
arg->conv_args[i].output.scale_address =
arg->conv_arg[i].output.scale_address =
(float *)fpga_malloc(2 * sizeof(float)); // NOLINT
arg->conv_args[i].output.address = fpga_malloc(
input->dims()[2] *
align_to_x(input->dims()[3] * arg->conv_args[i].filter_num,
IMAGE_ALIGNMENT) *
sizeof(half));
arg->conv_arg[i].output.address =
fpga_malloc(input->dims()[2] *
align_to_x(input->dims()[3] * arg->conv_arg[i].filter_num,
IMAGE_ALIGNMENT) *
sizeof(half));
} else {
arg->conv_args[i].output.scale_address = out->scale;
arg->conv_args[i].output.address = out_ptr;
arg->conv_arg[i].output.scale_address = out->scale;
arg->conv_arg[i].output.address = out_ptr;
}
arg->concat_arg.images_in[i] =
(half *)arg->conv_args[i].output.address; // NOLINT
arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address;
arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num;
(half *)arg->conv_arg[i].output.address; // NOLINT
arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
}
}
......
......@@ -14,178 +14,13 @@ limitations under the License. */
#pragma once
#include <stdint.h>
#include <cstddef>
#include <iostream>
#include <limits>
#include "fpga/common/fpga_common.h"
#include "fpga/common/pe.h"
#include "framework/tensor.h"
namespace paddle_mobile {
namespace fpga {
enum DataType {
DATA_TYPE_FP32 = 1,
DATA_TYPE_FP16 = 0,
};
enum LayoutType {
LAYOUT_CHW = 1,
LAYOUT_HWC = 0,
};
struct VersionArgs {
void* buffer;
};
struct MemoryCopyArgs {
void* src;
void* dest;
size_t size;
};
struct KernelArgs {
uint32_t width;
uint32_t height;
uint32_t stride_w;
uint32_t stride_h;
};
struct ImageInputArgs {
void* address; // input featuremap virtual address
float* scale_address; // input scale address;
uint32_t channels;
uint32_t width; // featuremap width
uint32_t height;
uint32_t pad_width; // padding width;
uint32_t pad_height;
};
struct ImageOutputArgs {
void* address; // output result address;
float* scale_address; // output scale address;
};
struct ConvArgs {
bool relu_enabled;
void* sb_address; // scale and bias are interlaced;
void* filter_address;
float* filter_scale_address;
uint32_t filter_num;
uint32_t group_num;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
};
struct ConcatArgs {
uint32_t image_num;
half** images_in;
float** scales_in;
void* image_out;
float* scale_out;
uint32_t* channel_num;
uint32_t height;
uint32_t width;
};
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct ConvArgs* conv_args;
struct ConcatArgs concat_arg;
};
struct GroupConvArgs {
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct SplitConvArgs* conv_args;
struct ConcatArgs concat_arg;
};
struct PoolingArgs {
int16_t mode; // mode: 0:max, 1:avg
half kernel_reciprocal;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
};
struct EWAddArgs {
bool relu_enabled;
uint32_t const0; // output0 = const0 x input0 + const1 x input1;
uint32_t const1;
struct ImageInputArgs image0;
struct ImageInputArgs image1;
struct ImageOutputArgs output;
};
struct BypassArgs {
enum DataType input_data_type;
enum DataType output_data_type;
enum LayoutType input_layout_type;
enum LayoutType output_layout_type;
struct ImageInputArgs image;
struct ImageOutputArgs output;
};
struct FpgaRegWriteArgs {
uint64_t address; //
uint64_t value;
};
struct FpgaRegReadArgs {
uint64_t address;
uint64_t value;
};
struct MemoryCacheArgs {
void* address;
size_t size;
};
#define IOCTL_FPGA_MAGIC 'FPGA'
#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
#define IOCTL_SEPARATOR_0 10
#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
#define IOCTL_SEPARATOR_1 20
#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
//============================== API =============================
int open_device();
int close_device();
void* fpga_malloc(size_t size);
void fpga_free(void* ptr);
void fpga_copy(void* dst, const void* src, size_t num);
int fpga_flush(void* address, size_t size);
int fpga_invalidate(void* address, size_t size);
int PerformBypass(const struct BypassArgs& args);
int ComputeFpgaConv(const struct SplitConvArgs& args);
int ComputeFpgaPool(const struct PoolingArgs& args);
int ComputeFpgaEWAdd(const struct EWAddArgs& args);
int ComputeFPGAConcat(const struct ConcatArgs& args);
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
int get_align_image_cw(int cw);
void format_image(framework::Tensor* image_tensor);
void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory
......@@ -209,8 +44,5 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float* bs_ptr);
half fp32_2_fp16(float fp32_num);
float fp16_2_fp32(half fp16_num);
} // namespace fpga
} // namespace paddle_mobile
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "fpga/V1/bias_scale.h"
#include <memory.h>
#include "fpga/V1/api.h"
#include "fpga/common/fpga_common.h"
namespace paddle_mobile {
namespace fpga {
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#include "fpga/V1/filter.h"
#include <memory.h>
#include <algorithm>
#include "fpga/V1/api.h"
#include "fpga/common/fpga_common.h"
namespace paddle_mobile {
namespace fpga {
......@@ -31,20 +31,22 @@ int calc_split_num(int num, int division_capacity) {
}
int calc_division_number(int num, int group_num, int division_capacity) {
PADDLE_MOBILE_ENFORCE(num % group_num == 0,
"Filter number should be divisible by group number");
// PADDLE_MOBILE_ENFORCE(num % group_num == 0,
// "Filter number should be divisible by group
// number");
int split_num = calc_split_num(num, division_capacity);
PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
"Split number or group number should be 1");
// PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
// "Split number or group number should be 1");
return group_num * split_num;
}
int calc_num_per_div(int num, int group_num, int division_capacity) {
PADDLE_MOBILE_ENFORCE(num % group_num == 0,
"Filter number should be divisible by group number");
// PADDLE_MOBILE_ENFORCE(num % group_num == 0,
// "Filter number should be divisible by group
// number");
int split_num = calc_split_num(num, division_capacity);
PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
"Split number or group number should be 1");
// PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
// "Split number or group number should be 1");
if (group_num == 1) {
if (num > division_capacity) {
return division_capacity;
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#include "fpga/V1/image.h"
#include <memory.h>
#include <algorithm>
#include "fpga/V1/api.h"
#include "fpga/common/fpga_common.h"
namespace paddle_mobile {
namespace fpga {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/common/pe.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
#include "fpga/common/config.h"
#include "fpga/common/driver.h"
namespace paddle_mobile {
namespace fpga {
int ComputeFpgaConv(const struct SplitConvArgs &args) {
ComputeBasicConv(args.conv_arg[0]);
}
int ComputeBasicConv(const struct ConvArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "======Compute Basic Conv======";
DLOG << " relu_enabled:" << args.relu_enabled
<< " sb_address:" << args.sb_address
<< " filter_address:" << args.filter_address
<< " filter_num:" << args.filter_num
<< " group_num:" << args.group_num;
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#endif
return 0;
}
int ComputeFpgaPool(const struct PoolingArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaPool===========";
DLOG << " mode:" << args.mode
<< " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#endif
return 0;
}
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaEWAdd===========";
DLOG << " relu_enabled:" << args.relu_enabled
<< " const0:" << fp16_2_fp32(int16_t(args.const0))
<< " const1:" << fp16_2_fp32(int16_t(args.const1));
DLOG << " image0_address:" << args.image0.address
<< " image0_scale_address:" << args.image0.scale_address
<< " image0_channels:" << args.image0.channels
<< " image0_height:" << args.image0.height
<< " image0_width:" << args.image0.width
<< " pad0_height:" << args.image0.pad_height
<< " pad0_width:" << args.image0.pad_width;
DLOG << " image1_address:" << args.image1.address
<< " image1_scale_address:" << args.image1.scale_address
<< " image1_channels:" << args.image1.channels
<< " image1_height:" << args.image1.height
<< " image1_width:" << args.image1.width
<< " pad1_height:" << args.image1.pad_height
<< " pad_width:" << args.image1.pad_width;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#endif
return 0;
}
int PerformBypass(const struct BypassArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaBypass===========";
DLOG << " input_type:" << args.input_data_type
<< " output_type:" << args.output_data_type
<< " input_layout_type:" << args.input_layout_type
<< " output_layout_type:" << args.output_layout_type;
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#endif
return 0;
}
int ComputeFPGAConcat(const struct ConcatArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaConcat===========";
DLOG << " Image_num: " << args.image_num
<< " out_address:" << args.image_out
<< " out_scale_address:" << args.scale_out
<< " out_channel:" << args.out_channel;
DLOG << " image_height:" << args.height << " image_width:" << args.width;
for (int i = 0; i < args.image_num; i++) {
DLOG << " " << i << "th: ";
DLOG << " channel_num:" << args.channel_num[i]
<< " aligned_channel_num:" << args.aligned_channel_num[i]
<< " image_address:" << args.images_in[i]
<< " image_scale_address:" << args.scales_in[i];
}
#endif
image::concat_images(args.images_in, args.scales_in, args.image_out,
args.scale_out, args.image_num, args.channel_num,
args.height, args.width);
return 0;
}
} // namespace fpga
} // namespace paddle_mobile
......@@ -13,84 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/api.h"
#include <algorithm>
#include "fpga/V2/bias_scale.h"
#include "fpga/V2/config.h"
#include "fpga/V2/driver/driver.h"
#include "fpga/V2/filter.h"
#include "fpga/V2/image.h"
namespace paddle_mobile {
namespace fpga {
static std::map<void *, size_t> memory_map;
int open_device() {
int ret = driver::open_device_driver();
return ret;
}
int close_device() {
int ret = driver::close_device_driver();
return ret;
}
void *fpga_malloc(size_t size) {
static uint64_t counter = 0;
#ifdef PADDLE_MOBILE_ZU5
auto ptr = driver::fpga_malloc_driver(size);
#else
auto ptr = malloc(size);
#endif
counter += size;
memory_map.insert(std::make_pair(ptr, size));
// DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
// << counter << " bytes";
return ptr;
}
void fpga_free(void *ptr) {
static uint64_t counter = 0;
size_t size = 0;
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
if (iter != memory_map.end()) {
size = iter->second;
memory_map.erase(iter);
#ifdef PADDLE_MOBILE_ZU5
driver::fpga_free_driver(ptr);
#else
free(ptr);
#endif
counter += size;
// DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
// << counter << " bytes";
} else {
DLOG << "Invalid pointer";
}
}
void fpga_copy(void *dest, const void *src, size_t num) {
#ifdef PADDLE_MOBILE_ZU5
driver::fpga_copy_driver(dest, src, num);
#else
memcpy(dest, src, num);
#endif
}
int fpga_flush(void *address, size_t size) {
#ifdef PADDLE_MOBILE_ZU5
return driver::fpga_flush_driver(address, size);
#else
return 0;
#endif
}
int fpga_invalidate(void *address, size_t size) {
#ifdef PADDLE_MOBILE_ZU5
return driver::fpga_invalidate_driver(address, size);
#else
return 0;
#endif
}
void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims();
auto channel = dims[1], height = dims[2], width = dims[3];
......@@ -284,8 +213,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->conv_arg[i].output.address = out_ptr;
arg->conv_arg[i].output.scale_address = out->scale;
int num_after_alignment =
filter::calc_aligned_num((int)input->dims()[1], arg->filter_num);
int num_after_alignment = filter::calc_aligned_num(
(int)input->dims()[1], arg->filter_num); // NOLINT
arg->conv_arg[i].free_space =
fpga_malloc(num_after_alignment * 2 * sizeof(half));
}
......
......@@ -14,21 +14,13 @@ limitations under the License. */
#pragma once
#include "fpga/V2/driver/pe.h"
#include "fpga/V2/fpga_common.h"
#include "fpga/common/fpga_common.h"
#include "fpga/common/pe.h"
#include "framework/tensor.h"
namespace paddle_mobile {
namespace fpga {
int open_device();
int close_device();
void* fpga_malloc(size_t size);
void fpga_free(void* ptr);
void fpga_copy(void* dest, const void* src, size_t num);
int fpga_flush(void* address, size_t size);
int fpga_invalidate(void* address, size_t size);
float filter_find_max(framework::Tensor* filter_tensor);
int get_aligned_channel_num(int channel_num);
int get_aligned_filter_num(framework::Tensor* filter_tensor);
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "fpga/V2/bias_scale.h"
#include <memory.h>
#include "fpga/V2/api.h"
#include "fpga/common/fpga_common.h"
namespace paddle_mobile {
namespace fpga {
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#include "fpga/V2/filter.h"
#include <memory.h>
#include <algorithm>
#include "fpga/V2/api.h"
#include "fpga/common/fpga_common.h"
namespace paddle_mobile {
namespace fpga {
......@@ -73,7 +73,7 @@ void convert_to_hwc(float **data_in, int num, int channel, int height,
void align_filter(float **data_in, int num, int channel, int height,
int width) {
int aligned_channel = calc_channel_parallelism(channel);
int aligned_channel = calc_aligned_channel(channel);
int hw = height * width;
int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
float *new_data = (float *)fpga_malloc(pixel_num * sizeof(float)); // NOLINT
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#include "fpga/V2/image.h"
#include <memory.h>
#include <algorithm>
#include "fpga/V2/api.h"
#include "fpga/common/fpga_common.h"
namespace paddle_mobile {
namespace fpga {
......
......@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/driver/pe.h"
#include "fpga/V2/config.h"
#include "fpga/V2/driver/driver.h"
#include "fpga/common/pe.h"
#include "fpga/V2/filter.h"
#include "fpga/V2/image.h"
#include "fpga/common/config.h"
#include "fpga/common/driver.h"
namespace paddle_mobile {
namespace fpga {
......@@ -166,53 +166,53 @@ int PerformBypass(const struct BypassArgs &args) {
return 0;
#endif
uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
uint64_t bp_enable;
int64_t length;
uint64_t pixels;
// fp32->fp16
if ((args.input_data_type) && (!args.output_data_type)) {
pixels = (args.image.channels) * (args.image.width) * (args.image.height);
length = pixels * sizeof(float);
bp_enable = 0x8800000000000000 + length;
}
// fp16->fp32
else if ((!args.input_data_type) && (args.output_data_type)) {
pixels = filter::calc_aligned_channel((args.image.channels)) *
(args.image.width) * (args.image.height);
length = pixels * sizeof(short);
length = align_to_x((int)length, 64); // NOLINT
bp_enable = 0x8a00000000000000 + length;
}
// fp16->fp16 findmax
else if ((!args.input_data_type) && (!args.output_data_type)) {
pixels = (args.image.channels) * (args.image.width) * (args.image.height);
length = pixels * sizeof(short);
bp_enable = 0x8900000000000000 + length;
} else {
return -1;
}
// start bypass
driver::reg_writeq(ifm_src_paddr, MUL8(27));
driver::reg_writeq(ifm_dst_paddr, MUL8(28));
driver::reg_writeq(0, MUL8(0));
driver::reg_writeq(bp_enable, MUL8(0));
// poll
int ret = -1;
ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
if (ret != -1) {
// clear "irq"
driver::reg_readq(MUL8(63));
}
// get max value
if ((!args.input_data_type) && (!args.output_data_type)) {
float scale = Findfp16Max();
args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT
args.output.scale_address[1] = scale;
}
// uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
// uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
// uint64_t bp_enable;
// int64_t length;
// uint64_t pixels;
//
// // fp32->fp16
// if ((args.input_data_type) && (!args.output_data_type)) {
// pixels = (args.image.channels) * (args.image.width) *
// (args.image.height); length = pixels * sizeof(float); bp_enable =
// 0x8800000000000000 + length;
// }
// // fp16->fp32
// else if ((!args.input_data_type) && (args.output_data_type)) {
// pixels = filter::calc_aligned_channel((args.image.channels)) *
// (args.image.width) * (args.image.height);
// length = pixels * sizeof(short);
// length = align_to_x((int)length, 64); // NOLINT
// bp_enable = 0x8a00000000000000 + length;
// }
// // fp16->fp16 findmax
// else if ((!args.input_data_type) && (!args.output_data_type)) {
// pixels = (args.image.channels) * (args.image.width) *
// (args.image.height); length = pixels * sizeof(short); bp_enable =
// 0x8900000000000000 + length;
// } else {
// return -1;
// }
//
// // start bypass
// driver::reg_writeq(ifm_src_paddr, MUL8(27));
// driver::reg_writeq(ifm_dst_paddr, MUL8(28));
// driver::reg_writeq(0, MUL8(0));
// driver::reg_writeq(bp_enable, MUL8(0));
// // poll
// int ret = -1;
// ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
// if (ret != -1) {
// // clear "irq"
// driver::reg_readq(MUL8(63));
// }
// // get max value
// if ((!args.input_data_type) && (!args.output_data_type)) {
// float scale = Findfp16Max();
// args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT
// args.output.scale_address[1] = scale;
// }
return ret;
}
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/driver/bitmap.h"
#include "fpga/common/bitmap.h"
namespace fpga_bitmap {
void bitmap_set(uint64_t *map, unsigned int start, int len) {
......
......@@ -28,8 +28,8 @@ limitations under the License. */
#include <iostream>
#include "common/enforce.h"
#include "fpga/V2/driver/bitmap.h"
#include "fpga/V2/driver/driver.h"
#include "fpga/common/bitmap.h"
#include "fpga/common/driver.h"
namespace paddle_mobile {
namespace fpga {
......@@ -353,7 +353,7 @@ void fpga_free_driver(void *ptr) {
}
}
static inline int do_ioctl(unsigned long req, const void *arg) {
static inline int do_ioctl(int64_t req, const void *arg) {
return ioctl(g_fpgainfo.fd_mem, req, arg);
}
......@@ -363,7 +363,7 @@ int fpga_flush_driver(void *address, size_t size) {
p_addr = vaddr_to_paddr(address);
args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);
args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
......@@ -375,7 +375,7 @@ int fpga_invalidate_driver(void *address, size_t size) {
p_addr = vaddr_to_paddr(address);
args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);
args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
......@@ -389,7 +389,7 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) {
for (i = 0; i < num; i++) {
// DLOG << "i:" << i << " val:" << *((int8_t *)src + i);
// usleep(1);
*((int8_t *)dest + i) = *((int8_t *)src + i);
*((int8_t *)dest + i) = *((int8_t *)src + i); // NOLINT
}
return;
......
......@@ -33,8 +33,6 @@ namespace driver {
#define FPGA_MEM_PHY_ADDR 0x20000000
#define FPGA_MEM_SIZE 0x20000000
#define CPU_FREQ 1000000000
#define FPGA_PAGE_SIZE (16UL * 1024UL)
// PE related macros
......@@ -53,7 +51,7 @@ struct MemoryCacheArgs {
size_t size;
};
#define IOCTL_FPGA_MAGIC 'FPGA'
#define IOCTL_FPGA_MAGIC 'F'
#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
......@@ -105,17 +103,17 @@ extern struct FPGA_INFO g_fpgainfo;
inline uint64_t reg_readq(uint32_t offset) {
// DLOG << "offset : " << offset;
uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +
offset); // NOLINT
uint64_t value =
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT
offset); // NOLINT
return value;
}
inline void reg_writeq(uint64_t value, uint32_t offset) {
// DLOG << "offset : " << offset << ", value : " << value;
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +
offset) = // NOLINT
value;
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT
offset) = value;
}
int open_device_driver();
......
......@@ -12,7 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fpga/V2/fpga_common.h>
#include "fpga/common/fpga_common.h"
#include <algorithm>
#include <map>
#include "fpga/common/config.h"
#include "fpga/common/driver.h"
namespace paddle_mobile {
namespace fpga {
......@@ -40,5 +45,73 @@ float fp16_2_fp32(int16_t fp16_num) {
return fp32_num;
}
static std::map<void *, size_t> memory_map;
int open_device() {
int ret = driver::open_device_driver();
return ret;
}
int close_device() {
int ret = driver::close_device_driver();
return ret;
}
void *fpga_malloc(size_t size) {
static uint64_t counter = 0;
#ifdef PADDLE_MOBILE_ZU5
auto ptr = driver::fpga_malloc_driver(size);
#else
auto ptr = malloc(size);
#endif
counter += size;
memory_map.insert(std::make_pair(ptr, size));
// DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
// << counter << " bytes";
return ptr;
}
void fpga_free(void *ptr) {
static uint64_t counter = 0;
size_t size = 0;
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
if (iter != memory_map.end()) {
size = iter->second;
memory_map.erase(iter);
#ifdef PADDLE_MOBILE_ZU5
driver::fpga_free_driver(ptr);
#else
free(ptr);
#endif
counter += size;
// DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
// << counter << " bytes";
} else {
DLOG << "Invalid pointer";
}
}
void fpga_copy(void *dest, const void *src, size_t num) {
#ifdef PADDLE_MOBILE_ZU5
driver::fpga_copy_driver(dest, src, num);
#else
memcpy(dest, src, num);
#endif
}
int fpga_flush(void *address, size_t size) {
#ifdef PADDLE_MOBILE_ZU5
return driver::fpga_flush_driver(address, size);
#else
return 0;
#endif
}
int fpga_invalidate(void *address, size_t size) {
#ifdef PADDLE_MOBILE_ZU5
return driver::fpga_invalidate_driver(address, size);
#else
return 0;
#endif
}
} // namespace fpga
} // namespace paddle_mobile
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <cstddef>
#include <cstdint>
namespace paddle_mobile {
......@@ -117,9 +118,19 @@ struct BypassArgs {
struct DeconvArgs {
struct ConvArgs conv_arg;
};
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
int16_t fp32_2_fp16(float fp32_num);
float fp16_2_fp32(int16_t fp16_num);
int open_device();
int close_device();
void* fpga_malloc(size_t size);
void fpga_free(void* ptr);
void fpga_copy(void* dest, const void* src, size_t num);
int fpga_flush(void* address, size_t size);
int fpga_invalidate(void* address, size_t size);
} // namespace fpga
} // namespace paddle_mobile
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "fpga/V2/fpga_common.h"
#include "fpga/common/fpga_common.h"
namespace paddle_mobile {
namespace fpga {
......
......@@ -67,9 +67,6 @@ if (CON GREATER -1)
ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet50 paddle-mobile)
ADD_EXECUTABLE(test-densebox net/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-densebox paddle-mobile)
set(FOUND_MATCH ON)
endif ()
......@@ -81,9 +78,6 @@ if (CON GREATER -1)
ADD_EXECUTABLE(test-pe fpga/test_pe.cpp)
target_link_libraries(test-pe paddle-mobile)
ADD_EXECUTABLE(test-densebox net/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-densebox paddle-mobile)
set(FOUND_MATCH ON)
endif ()
......
......@@ -102,7 +102,6 @@ if (CON GREATER -1)
set(MUL_OP ON)
set(RESHAPE_OP ON)
set(SOFTMAX_OP ON)
set(FOUND_MATCH ON)
endif()
......@@ -120,14 +119,12 @@ if (CON GREATER -1)
set(SOFTMAX_OP ON)
set(FUSION_CONVBNRELU_OP ON)
set(FUSION_CONVBN_OP ON)
set(FUSION_CONVADD_OP ON)
set(FOUND_MATCH ON)
endif()
list(FIND NET "FPGA_NET_V2" CON)
if (CON GREATER -1)
message("FPGA_NET_V2 enabled")
set(FEED_OP ON)
set(FUSION_CONVADDRELU_OP ON)
set(FUSION_ELEMENTWISEADDRELU_OP ON)
set(FUSION_FC_OP ON)
......@@ -136,8 +133,6 @@ if (CON GREATER -1)
set(FUSION_CONVBNRELU_OP ON)
set(FUSION_CONVBN_OP ON)
set(CONV_TRANSPOSE_OP ON)
set(FUSION_DECONVRELU_OP ON)
#set(SLICE_OP ON)
set(TANH_OP ON)
set(ELEMENTWISEADD_OP ON)
set(TRANSPOSE2_OP ON)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册