提交 6c21a85b 编写于 作者: E eclipsess

Merge branch 'develop' of https://github.com/Eclipsess/paddle-mobile into develop

...@@ -13,46 +13,30 @@ See the License for the specific language governing permissions and ...@@ -13,46 +13,30 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "fpga/V2/api.h" #include "fpga/V2/api.h"
#include <fcntl.h>
#include <sys/ioctl.h>
#include <algorithm> #include <algorithm>
#include <map>
#include "fpga/V2/bias_scale.h" #include "fpga/V2/bias_scale.h"
#include "fpga/V2/config.h"
#include "fpga/V2/filter.h" #include "fpga/V2/filter.h"
#include "fpga/V2/image.h" #include "fpga/V2/image.h"
#define FPGA_TEST_MODE
// #define PADDLE_MOBILE_OS_LINUX
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
static std::map<void *, size_t> memory_map; static std::map<void *, size_t> memory_map;
static inline int do_ioctl(int req, const void *arg) { int open_device() {
#ifdef PADDLE_MOBILE_OS_LINUX int ret = open_device_driver();
int result = ioctl(fd, req, (uint64_t)arg); return ret;
PADDLE_MOBILE_ENFORCE(result == 0, "ioctl didn't return correctly");
return result;
#else
return -1;
#endif
} }
int open_device() { int close_device() {
if (fd == -1) { int ret = close_device_driver();
fd = open(device_path, O_RDWR); return ret;
}
return fd;
} }
// memory management;
void *fpga_malloc(size_t size) { void *fpga_malloc(size_t size) {
static uint64_t counter = 0; static uint64_t counter = 0;
#ifdef PADDLE_MOBILE_ZU5
#ifdef PADDLE_MOBILE_OS_LINUX auto ptr = fpga_malloc_driver(size);
auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
#else #else
auto ptr = malloc(size); auto ptr = malloc(size);
#endif #endif
...@@ -66,13 +50,12 @@ void *fpga_malloc(size_t size) { ...@@ -66,13 +50,12 @@ void *fpga_malloc(size_t size) {
void fpga_free(void *ptr) { void fpga_free(void *ptr) {
static uint64_t counter = 0; static uint64_t counter = 0;
size_t size = 0; size_t size = 0;
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
if (iter != memory_map.end()) { if (iter != memory_map.end()) {
size = iter->second; size = iter->second;
memory_map.erase(iter); memory_map.erase(iter);
#ifdef PADDLE_MOBILE_OS_LINUX #ifdef PADDLE_MOBILE_ZU5
munmap(ptr, size); fpga_free_driver(ptr);
#else #else
free(ptr); free(ptr);
#endif #endif
...@@ -84,24 +67,6 @@ void fpga_free(void *ptr) { ...@@ -84,24 +67,6 @@ void fpga_free(void *ptr) {
} }
} }
void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int fpga_flush(void *address, size_t size) {
struct MemoryCacheArgs args = {nullptr};
args.address = address;
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
}
int fpga_invalidate(void *address, size_t size) {
struct MemoryCacheArgs args = {nullptr};
args.address = address;
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
}
half fp32_2_fp16(float fp32_num) { half fp32_2_fp16(float fp32_num) {
unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT
auto t = (half)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | auto t = (half)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
...@@ -123,136 +88,13 @@ float fp16_2_fp32(half fp16_num) { ...@@ -123,136 +88,13 @@ float fp16_2_fp32(half fp16_num) {
return fp32_num; return fp32_num;
} }
int ComputeBasicConv(const struct ConvArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "======Compute Basic Conv======";
DLOG << " relu_enabled:" << args.relu_enabled
<< " sb_address:" << args.sb_address
<< " filter_address:" << args.filter_address
<< " filter_num:" << args.filter_num
<< " group_num:" << args.group_num;
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
int ComputeFpgaConv(const struct SplitConvArgs &args) {
ComputeBasicConv(args.conv_args[0]);
}
int ComputeFpgaPool(const struct PoolingArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaPool===========";
DLOG << " mode:" << args.mode
<< " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_POOLING, &args);
}
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaEWAdd===========";
DLOG << " relu_enabled:" << args.relu_enabled
<< " const0:" << fp16_2_fp32(int16_t(args.const0))
<< " const1:" << fp16_2_fp32(int16_t(args.const1));
DLOG << " image0_address:" << args.image0.address
<< " image0_scale_address:" << args.image0.scale_address
<< " image0_channels:" << args.image0.channels
<< " image0_height:" << args.image0.height
<< " image0_width:" << args.image0.width
<< " pad0_height:" << args.image0.pad_height
<< " pad0_width:" << args.image0.pad_width;
DLOG << " image1_address:" << args.image1.address
<< " image1_scale_address:" << args.image1.scale_address
<< " image1_channels:" << args.image1.channels
<< " image1_height:" << args.image1.height
<< " image1_width:" << args.image1.width
<< " pad1_height:" << args.image1.pad_height
<< " pad_width:" << args.image1.pad_width;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_EW, &args);
}
int PerformBypass(const struct BypassArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaBypass===========";
DLOG << " input_type:" << args.input_data_type
<< " output_type:" << args.output_data_type
<< " input_layout_type:" << args.input_layout_type
<< " output_layout_type:" << args.output_layout_type;
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
}
int ComputeFPGAConcat(const struct ConcatArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaConcat===========";
DLOG << " Image_num: " << args.image_num
<< " out_address:" << args.image_out
<< " out_scale_address:" << args.scale_out
<< " out_channel:" << args.out_channel;
DLOG << " image_height:" << args.height << " image_width:" << args.width;
for (int i = 0; i < args.image_num; i++) {
DLOG << " " << i << "th: ";
DLOG << " channel_num:" << args.channel_num[i]
<< " aligned_channel_num:" << args.aligned_channel_num[i]
<< " image_address:" << args.images_in[i]
<< " image_scale_address:" << args.scales_in[i];
}
#endif
image::concat_images(args.images_in, args.scales_in, args.image_out,
args.scale_out, args.image_num, args.channel_num,
args.height, args.width, args.aligned_channel_num,
args.out_channel);
return 0;
}
void format_image(framework::Tensor *image_tensor) { void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims(); auto dims = image_tensor->dims();
auto channel = dims[1], height = dims[2], width = dims[3]; auto channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = image_tensor->data<float>(); auto data_ptr = image_tensor->data<float>();
size_t memory_size = channel * height * width * sizeof(float); size_t memory_size = channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
fpga_copy(new_data, data_ptr, memory_size); memcpy(new_data, data_ptr, memory_size);
int aligned_channel = filter::calc_aligned_channel((int)channel); // NOLINT int aligned_channel = filter::calc_aligned_channel((int)channel); // NOLINT
image::format_image(&new_data, (int)channel, (int)height, // NOLINT image::format_image(&new_data, (int)channel, (int)height, // NOLINT
(int)width, // NOLINT (int)width, // NOLINT
...@@ -265,7 +107,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, int aligned_channel) { ...@@ -265,7 +107,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
size_t memory_size = 0; size_t memory_size = 0;
if (dims.size() == 4) { if (dims.size() == 4) {
auto height = dims[2], width = dims[3]; auto height = dims[2], width = dims[3];
memory_size = height * width * aligned_channel * sizeof(half); memory_size = (height + 1) / 2 * 2 * width * aligned_channel * sizeof(half);
} else if (dims.size() == 2) { } else if (dims.size() == 2) {
memory_size = aligned_channel * sizeof(half); memory_size = aligned_channel * sizeof(half);
} else { } else {
...@@ -319,7 +161,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value, ...@@ -319,7 +161,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
auto data_ptr = filter_tensor->data<float>(); auto data_ptr = filter_tensor->data<float>();
size_t memory_size = num * channel * height * width * sizeof(float); size_t memory_size = num * channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
fpga_copy(new_data, data_ptr, memory_size); memcpy(new_data, data_ptr, memory_size);
filter::format_filter(&new_data, (int)num, (int)channel, // NOLINT filter::format_filter(&new_data, (int)num, (int)channel, // NOLINT
(int)height, // NOLINT (int)height, // NOLINT
(int)width, group_num, max_value); // NOLINT (int)width, group_num, max_value); // NOLINT
...@@ -334,7 +176,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { ...@@ -334,7 +176,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
auto data_ptr = filter_tensor->data<float>(); auto data_ptr = filter_tensor->data<float>();
size_t memory_size = num * channel * height * width * sizeof(float); size_t memory_size = num * channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
fpga_copy(new_data, data_ptr, memory_size); memcpy(new_data, data_ptr, memory_size);
filter::format_fc_filter(&new_data, (int)num, (int)channel, // NOLINT filter::format_fc_filter(&new_data, (int)num, (int)channel, // NOLINT
(int)height, // NOLINT (int)height, // NOLINT
(int)width, 1, max_value); // NOLINT (int)width, 1, max_value); // NOLINT
......
...@@ -18,6 +18,8 @@ limitations under the License. */ ...@@ -18,6 +18,8 @@ limitations under the License. */
#include <cstddef> #include <cstddef>
#include <iostream> #include <iostream>
#include <limits> #include <limits>
#include "fpga/V2/driver/driver.h"
#include "fpga/V2/driver/pe.h"
#include "framework/tensor.h" #include "framework/tensor.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -33,16 +35,6 @@ enum LayoutType { ...@@ -33,16 +35,6 @@ enum LayoutType {
LAYOUT_HWC = 0, LAYOUT_HWC = 0,
}; };
struct VersionArgs {
void* buffer;
};
struct MemoryCopyArgs {
void* src;
void* dest;
size_t size;
};
struct KernelArgs { struct KernelArgs {
uint32_t width; uint32_t width;
uint32_t height; uint32_t height;
...@@ -128,56 +120,10 @@ struct BypassArgs { ...@@ -128,56 +120,10 @@ struct BypassArgs {
struct ImageOutputArgs output; struct ImageOutputArgs output;
}; };
struct FpgaRegWriteArgs {
uint64_t address; //
uint64_t value;
};
struct FpgaRegReadArgs {
uint64_t address;
uint64_t value;
};
struct MemoryCacheArgs {
void* address;
size_t size;
};
#define IOCTL_FPGA_MAGIC 'FPGA'
#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
#define IOCTL_SEPARATOR_0 10
#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
#define IOCTL_SEPARATOR_1 20
#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
//============================== API =============================
int open_device(); int open_device();
int close_device(); int close_device();
void* fpga_malloc(size_t size); void* fpga_malloc(size_t size);
void fpga_free(void* ptr); void fpga_free(void* ptr);
void fpga_copy(void* dst, const void* src, size_t num);
int fpga_flush(void* address, size_t size);
int fpga_invalidate(void* address, size_t size);
int PerformBypass(const struct BypassArgs& args);
int ComputeFpgaConv(const struct SplitConvArgs& args);
int ComputeFpgaPool(const struct PoolingArgs& args);
int ComputeFpgaEWAdd(const struct EWAddArgs& args);
int ComputeFPGAConcat(const struct ConcatArgs& args);
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
......
...@@ -39,7 +39,6 @@ void align_element(float **data_in, int num, int num_after_alignment) { ...@@ -39,7 +39,6 @@ void align_element(float **data_in, int num, int num_after_alignment) {
void format_bias_scale_array(float **data_in, int num, void format_bias_scale_array(float **data_in, int num,
int num_after_alignment) { int num_after_alignment) {
align_element(data_in, num, num_after_alignment); align_element(data_in, num, num_after_alignment);
fpga_flush(*data_in, 2 * num_after_alignment * sizeof(float));
} }
} // namespace bias_scale } // namespace bias_scale
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// When defined, the fpga_malloc/fpga_free wrappers go through
// fpga_malloc_driver/fpga_free_driver instead of plain malloc/free.
#define PADDLE_MOBILE_ZU5
// When defined, the PE entry points (e.g. ComputeBasicConv) dump their
// arguments through DLOG before running.
#define FPGA_PRINT_MODE
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/driver/bitmap.h"
namespace fpga_bitmap {
// Sets `len` consecutive bits of `map`, beginning at bit index `start`.
// The range is handled as: a (possibly partial) first word, any number of
// whole 64-bit words, then a (possibly partial) last word.
void bitmap_set(uint64_t *map, unsigned int start, int len) {
  const unsigned int end_bit = start + len;
  uint64_t *word = &map[BIT_WORD(start)];
  uint64_t mask = BITMAP_FIRST_WORD_MASK(start);
  // Number of bits between `start` and the end of its word.
  int chunk = BITS_PER_LONG - (start % BITS_PER_LONG);

  for (; len - chunk >= 0; ++word) {
    *word |= mask;
    len -= chunk;
    // Every following word is covered entirely.
    chunk = BITS_PER_LONG;
    mask = ~0UL;
  }

  if (len != 0) {
    // Trim the mask so bits at or beyond `end_bit` stay untouched.
    *word |= (mask & BITMAP_LAST_WORD_MASK(end_bit));
  }
}
// Clears `len` consecutive bits of `map`, beginning at bit index `start`.
// Mirrors bitmap_set: partial first word, whole words, partial last word.
void bitmap_clear(uint64_t *map, unsigned int start, int len) {
  const unsigned int end_bit = start + len;
  uint64_t *word = &map[BIT_WORD(start)];
  uint64_t mask = BITMAP_FIRST_WORD_MASK(start);
  // Number of bits between `start` and the end of its word.
  int chunk = BITS_PER_LONG - (start % BITS_PER_LONG);

  for (; len - chunk >= 0; ++word) {
    *word &= ~mask;
    len -= chunk;
    // Every following word is covered entirely.
    chunk = BITS_PER_LONG;
    mask = ~0UL;
  }

  if (len != 0) {
    // Trim the mask so bits at or beyond `end_bit` stay untouched.
    *word &= ~(mask & BITMAP_LAST_WORD_MASK(end_bit));
  }
}
// Returns the zero-based index of the least significant set bit of `data`.
// Returns 0 when `data` is 0; callers in this file only pass non-zero words.
//
// The scan covers all 64 bit positions and uses a 64-bit shift operand; the
// previous version stopped after sizeof(data) == 8 positions and shifted an
// `int`, so any word whose lowest set bit was above bit 7 was reported as 0.
static uint64_t ffs(uint64_t data) {
  const uint64_t bit_count = sizeof(data) * 8;
  for (uint64_t i = 0; i < bit_count; ++i) {
    if (data & (static_cast<uint64_t>(1) << i)) {
      return i;
    }
  }
  return 0;
}
// Finds the first set bit at index >= `start` in the `nbits`-bit bitmap
// `addr`, after XOR-ing every word with `invert` (pass all-ones to search for
// clear bits instead). Returns `nbits` when no such bit exists.
static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits,
                               uint64_t start, uint64_t invert) {
  if (!nbits || start >= nbits) return nbits;

  uint64_t word = addr[start / BITS_PER_LONG] ^ invert;

  /* Handle 1st word: ignore the bits below `start`. */
  word &= BITMAP_FIRST_WORD_MASK(start);
  // Align `start` down to the first bit of its word. This is computed directly
  // rather than through round_down(), which as defined in the header yields
  // the remainder instead of the aligned value and made the scan resume from
  // the wrong word.
  start -= start % BITS_PER_LONG;

  while (!word) {
    start += BITS_PER_LONG;
    if (start >= nbits) return nbits;
    word = addr[start / BITS_PER_LONG] ^ invert;
  }

  // The last word may carry bits beyond `nbits`; clamp the result.
  const uint64_t found = start + ffs(word);
  return found < nbits ? found : nbits;
}
// Returns the index of the first clear bit at or after `offset` in the
// `size`-bit bitmap `addr`, or `size` if every remaining bit is set.
uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size,
                            uint64_t offset) {
  // Flipping every word turns "find a clear bit" into "find a set bit".
  const uint64_t flip_all = ~0UL;
  return _find_next_bit(addr, size, offset, flip_all);
}
// Returns the index of the first set bit at or after `offset` in the
// `size`-bit bitmap `addr`, or `size` if every remaining bit is clear.
uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) {
  // No inversion: words are searched as stored.
  const uint64_t flip_none = 0UL;
  return _find_next_bit(addr, size, offset, flip_none);
}
// Searches `map` (holding `size` bits) from bit `start` for a run of `nr`
// clear bits whose first index, offset by `align_offset`, is aligned according
// to `align_mask`.
// Returns the first index of the run, or a value greater than `size` when no
// such run fits.
uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size,
                                        uint64_t start, unsigned int nr,
                                        uint64_t align_mask,
                                        uint64_t align_offset) {
  for (;;) {
    uint64_t candidate = find_next_zero_bit(map, size, start);

    /* Align allocation */
    candidate =
        __ALIGN_MASK(candidate + align_offset, align_mask) - align_offset;

    const uint64_t limit = candidate + nr;
    if (limit > size) {
      return limit;
    }

    // Any set bit inside [candidate, limit) disqualifies this run.
    const uint64_t busy = find_next_bit(map, limit, candidate);
    if (busy >= limit) {
      return candidate;
    }

    // Resume the search just past the blocking bit.
    start = busy + 1;
  }
}
// Convenience wrapper around bitmap_find_next_zero_area_off with an alignment
// offset of zero.
uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
                                    uint64_t start, unsigned int nr,
                                    uint64_t align_mask) {
  const uint64_t no_align_offset = 0;
  return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask,
                                        no_align_offset);
}
} // namespace fpga_bitmap
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <stdio.h>
// Number of bits in one bitmap word (the bitmaps here are arrays of uint64_t).
#define BITS_PER_LONG 64
// Index of the bitmap word that holds bit `nr`.
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
// Mask selecting bit (start % 64) and every higher bit of a word.
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
// Mask selecting the bits of the last word that lie below bit index `nbits`.
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
// Rounds x up so that the bits set in `mask` are clear in the result.
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
// Rounds x down to a multiple of y; y must be a power of two.
// The low bits have to be cleared (& ~(y-1)); the previous definition kept
// them (& (y-1)) and therefore produced x % y instead of the aligned value.
#define round_down(x, y) ((x) & ~((y)-1))
namespace fpga_bitmap {
// Sets `len` consecutive bits of `map` starting at bit index `start`.
void bitmap_set(uint64_t *map, unsigned int start, int len);
// Clears `len` consecutive bits of `map` starting at bit index `start`.
void bitmap_clear(uint64_t *map, unsigned int start, int len);
// Searches the `size`-bit bitmap `map`, from bit `start`, for `nr` consecutive
// clear bits aligned according to `align_mask`. Returns the first index of the
// run, or a value greater than `size` when no run fits.
uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
uint64_t start, unsigned int nr,
uint64_t align_mask);
} // namespace fpga_bitmap
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include "common/enforce.h"
#include "fpga/V2/driver/bitmap.h"
#include "fpga/V2/driver/driver.h"
namespace paddle_mobile {
namespace fpga {
// Process-wide driver state; its fields are (re)initialised by
// open_device_driver().
struct FPGA_INFO g_fpgainfo;
// Opens the driver device node on first use and caches the descriptor in
// g_fpgainfo.fd_drv. Returns the cached descriptor (-1 if open() failed).
int open_drvdevice() {
  if (g_fpgainfo.fd_drv != -1) {
    return g_fpgainfo.fd_drv;
  }
  g_fpgainfo.fd_drv = open(g_fpgainfo.drvdevice_path, O_RDWR);
  return g_fpgainfo.fd_drv;
}
// Opens the memory device node on first use (read/write, O_DSYNC) and caches
// the descriptor in g_fpgainfo.fd_mem. Returns the cached descriptor (-1 if
// open() failed).
int open_memdevice() {
  if (g_fpgainfo.fd_mem != -1) {
    return g_fpgainfo.fd_mem;
  }
  g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
  return g_fpgainfo.fd_mem;
}
// Placeholder for resetting the programmable logic. The register write that
// would trigger the reset is currently disabled (kept below for reference), so
// this only waits 100 ms.
void pl_reset() {
  // DLOG << "PL RESET";
  // reg_writeq(0x5a, REG_FPGA_RESET);
  const unsigned int settle_delay_us = 100 * 1000;
  usleep(settle_delay_us);
}
// Resets `pe` to an idle, zeroed state, labels it with `type_name`, and
// registers it in slot `pe_idx` of `pe_data`, bumping the PE count.
void setup_pe(struct pe_data_s *pe_data, struct fpga_pe *pe,
              char const *type_name, int pe_idx) {
  memset(pe, 0, sizeof(*pe));

  // snprintf truncates names that do not fit and always NUL-terminates.
  snprintf(pe->type_name, MAX_TYPE_NAME_LENTH, "%s", type_name);
  pe->outer = pe_data;
  pe->interrupt_cnt = 0;
  pe->status = IDLE;

  pe_data->pes[pe_idx] = pe;
  pe_data->pe_num += 1;
}
// Resets the programmable logic, then allocates the zero-initialised PE
// bookkeeping structure, registers the four processing elements in it and
// publishes it through g_fpgainfo.pe_data. On allocation failure it logs and
// leaves g_fpgainfo.pe_data untouched.
void pl_init() {
  pl_reset();

  struct pe_data_s *bookkeeping =
      static_cast<struct pe_data_s *>(calloc(1, sizeof(struct pe_data_s)));
  if (bookkeeping == nullptr) {
    DLOG << "pe_data malloc error!";
    return;
  }

  pthread_mutex_init(&bookkeeping->mutex, 0);

  setup_pe(bookkeeping, &bookkeeping->pe_conv, "CONV", PE_IDX_CONV);
  setup_pe(bookkeeping, &bookkeeping->pe_pooling, "POOLING", PE_IDX_POOLING);
  setup_pe(bookkeeping, &bookkeeping->pe_ew, "EW", PE_IDX_EW);
  setup_pe(bookkeeping, &bookkeeping->pe_bypass, "BYPASS", PE_IDX_BYPASS);

  g_fpgainfo.pe_data = bookkeeping;
}
// Releases the PE bookkeeping structure created by pl_init().
// Safe to call when pl_init() failed or was never run (pe_data is null), and
// clears the global pointer so later code cannot use the freed structure.
void pl_destroy() {
  struct pe_data_s *pe_data = g_fpgainfo.pe_data;
  if (pe_data == nullptr) {
    return;
  }
  pthread_mutex_destroy(&pe_data->mutex);
  free(pe_data);
  g_fpgainfo.pe_data = nullptr;
}
// Releases the PE bookkeeping mutex (counterpart of pl_stop()).
void pl_start() { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); }
// Acquires the PE bookkeeping mutex (counterpart of pl_start()).
void pl_stop() { pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); }
// Resets the programmable logic and returns every registered PE to the idle
// state with a cleared interrupt counter.
//
// The whole sequence runs between one pl_stop()/pl_start() pair: the PE state
// is rewritten while the mutex is held, and the mutex is released exactly
// once. (Previously pl_start() was called twice, unlocking a mutex that was no
// longer held, and the PE state was modified outside the lock.)
void pl_reinit() {
  struct pe_data_s *pe_data = g_fpgainfo.pe_data;

  pl_stop();
  pl_reset();
  for (int i = 0; i < pe_data->pe_num; i++) {
    struct fpga_pe *pe = pe_data->pes[i];
    pe->status = IDLE;
    pe->interrupt_cnt = 0;
  }
  pl_start();
}
// Status query stub: currently always reports 0.
int pl_get_status() {
  const int status = 0;
  return status;
}
/*
 * Busy-polls register `reg` until it reads back `val`.
 *
 * reg:  register offset handed to reg_readq().
 * val:  value to wait for.
 * time: polling budget in microseconds; it is converted into a loop iteration
 *       count using CPU_FREQ (accuracy of this conversion is still to be
 *       confirmed).
 *
 * Returns 0 as soon as the register matches, -1 if the budget is exhausted
 * first. A non-positive `time` performs no reads and returns -1.
 */
int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
  // Widen before multiplying: `time * CPU_FREQ` overflows int for time > 2.
  const int64_t timeout =
      time > 0 ? static_cast<int64_t>(time) * CPU_FREQ / 1000000 : 0;

  for (int64_t i = 0; i < timeout; i++) {
    if (val == reg_readq(reg)) {
      return 0;
    }
  }
  // The loop ran out without a match. The earlier `i <= timeout` check was
  // true in this case as well, so a timeout was never reported.
  return -1;
}
/* Memory management */

// Reserves enough contiguous FPGA pages to hold `size` bytes.
// On success stores the start address of the reservation in `*addr`, records
// the page count in memory->nr[first_page] and returns 0. Returns -ENOMEM and
// leaves `*addr` untouched when no sufficiently large free run exists.
int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
  const uint64_t nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
  // Reject oversized requests before the page count is narrowed to the
  // unsigned int the bitmap helpers take.
  if (nr > memory->page_num) {
    return -ENOMEM;
  }

  int ret = 0;
  pthread_mutex_lock(&memory->mutex);

  const uint64_t pos = fpga_bitmap::bitmap_find_next_zero_area(
      memory->bitmap, memory->page_num, 0, (unsigned int)nr, 0);  // NOLINT
  // Valid page indices are [0, page_num). The search returns a value above
  // page_num on failure, and can return exactly page_num for an empty request
  // on a full bitmap; neither may be used to index memory->nr.
  if (pos < memory->page_num) {
    fpga_bitmap::bitmap_set(memory->bitmap, (unsigned int)pos,  // NOLINT
                            (int)nr);                           // NOLINT
    memory->nr[pos] = (unsigned int)nr;  // NOLINT
    *addr = memory->mem_start + pos * FPGA_PAGE_SIZE;
  } else {
    ret = -ENOMEM;
  }

  pthread_mutex_unlock(&memory->mutex);
  return ret;
}
// Marks every page of the pool as free by clearing the whole bitmap while
// holding the pool mutex.
void memory_release(struct fpga_memory *memory) {
  pthread_mutex_t *const guard = &memory->mutex;

  pthread_mutex_lock(guard);
  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
  pthread_mutex_unlock(guard);
}
// Sizes the page allocator for a pool of `memory_size` bytes and allocates its
// two tables: the page bitmap (one bit per page) and the per-page run-length
// array `nr` (zero-initialised).
// Returns 0 on success, -EFAULT if either allocation fails.
//
// Only one pool is ever created, so building the structure needs no locking.
int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
  memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE);  // NOLINT
  memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG);

  uint64_t *bitmap_words =
      (uint64_t *)malloc(sizeof(int64_t) * memory->page_num_long);  // NOLINT
  if (bitmap_words == nullptr) {
    return -EFAULT;
  }
  memory->bitmap = bitmap_words;

  unsigned int *run_lengths =
      (unsigned int *)calloc(memory->page_num, sizeof(unsigned int));  // NOLINT
  if (run_lengths == nullptr) {
    free(bitmap_words);
    return -EFAULT;
  }
  memory->nr = run_lengths;

  memory->mem_start = FPGA_MEM_PHY_ADDR;
  // NOTE(review): mem_end receives the pool size, not mem_start + size -
  // confirm this is intended.
  memory->mem_end = FPGA_MEM_SIZE;

  return 0;
}
// Allocates a page-allocator descriptor for the FPGA memory pool, initialises
// its mutex and builds its tables.
// On success stores the descriptor in `*memory_info` and returns 0.
// On failure returns -EFAULT and leaves `*memory_info` set to nullptr, so the
// caller never sees a pointer to freed memory.
int create_fpga_memory(struct fpga_memory **memory_info) {
  struct fpga_memory *memory =
      (struct fpga_memory *)malloc(sizeof(struct fpga_memory));  // NOLINT
  *memory_info = memory;
  if (memory == nullptr) {
    return -EFAULT;
  }

  pthread_mutex_init(&memory->mutex, nullptr);

  const int rc = create_fpga_memory_inner(memory, FPGA_MEM_SIZE);
  if (rc) {
    // Undo everything done above before reporting the failure.
    pthread_mutex_destroy(&memory->mutex);
    free(memory);
    *memory_info = nullptr;
  }
  return rc;
}
// Puts the page bitmap into its initial state: every page free except page 0,
// which is reserved. Returns -EFAULT for a null descriptor, 0 otherwise.
int init_fpga_memory(struct fpga_memory *memory) {
  if (memory == nullptr) {
    return -EFAULT;
  }

  uint64_t *const pages = memory->bitmap;
  fpga_bitmap::bitmap_clear(pages, 0, memory->page_num);
  fpga_bitmap::bitmap_set(pages, 0, 1);  // NOTE reserve fpga page 0.

  return 0;
}
// Releases a descriptor produced by create_fpga_memory(): its mutex, both
// tables and the descriptor itself. A null pointer is ignored.
void destroy_fpga_memory(struct fpga_memory *memory) {
  if (memory == nullptr) {
    return;
  }
  // create_fpga_memory() initialises the mutex, so it is destroyed here
  // before its storage is freed.
  pthread_mutex_destroy(&memory->mutex);
  free(memory->nr);
  free(memory->bitmap);
  free(memory);
}
// Creates and initialises the global FPGA page allocator
// (g_fpgainfo.memory_info).
// Returns 0 on success or the negative error code of the failing step; on
// failure g_fpgainfo.memory_info is reset to nullptr so that no pointer to
// freed memory stays reachable.
int fpga_memory_add() {
  int rc = create_fpga_memory(&g_fpgainfo.memory_info);
  if (rc) {
    g_fpgainfo.memory_info = nullptr;
    return rc;
  }

  rc = init_fpga_memory(g_fpgainfo.memory_info);
  if (rc) {
    destroy_fpga_memory(g_fpgainfo.memory_info);
    g_fpgainfo.memory_info = nullptr;
    return rc;
  }

  return 0;
}
// Looks up the physical address recorded for the mapped pointer `address`.
// Logs "Invalid pointer" and returns 0 when the pointer is not registered.
uint64_t vaddr_to_paddr(void *address) {
  const auto &table = g_fpgainfo.fpga_vaddr2paddr_map;
  const auto entry = table.find(address);
  if (entry == table.end()) {
    DLOG << "Invalid pointer";
    return 0;
  }
  return entry->second;
}
// Maps `size` bytes of the register window (offset FPGA_REG_PHY_ADDR on the
// driver device) read/write and shared, records the mapping size for
// fpga_free_driver(), and returns the mapped address.
// The mmap64 result is not checked here (the check is disabled below).
void *fpga_reg_malloc(size_t size) {
  void *const reg_base = mmap64(nullptr, size, PROT_READ | PROT_WRITE,
                                MAP_SHARED, g_fpgainfo.fd_drv,
                                FPGA_REG_PHY_ADDR);
  // PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");
  g_fpgainfo.fpga_addr2size_map.emplace(reg_base, size);
  return reg_base;
}
// Reserves pages for `size` bytes in the FPGA memory pool and maps them into
// the process through the memory device.
// Returns the mapped virtual address. Both the page reservation and the
// mapping are enforced with PADDLE_MOBILE_ENFORCE.
void *fpga_malloc_driver(size_t size) {
  uint64_t phy_addr = 0;

  // Without this check a failed reservation left phy_addr at 0 and the code
  // went on to map offset 0 of the memory device.
  const int rc = memory_request(g_fpgainfo.memory_info, size, &phy_addr);
  PADDLE_MOBILE_ENFORCE(rc == 0, "FPGA memory request failed");

  void *ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                     g_fpgainfo.fd_mem, phy_addr);
  PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");  // NOLINT

  // Assign rather than insert: the kernel may hand out a virtual address that
  // was used by an earlier mapping, and insert() would silently keep that
  // mapping's stale physical address/size.
  g_fpgainfo.fpga_vaddr2paddr_map[ret] = phy_addr;
  g_fpgainfo.fpga_addr2size_map[ret] = size;
  return ret;
}
void fpga_free_driver(void *ptr) {
size_t size = 0;
auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
if (iter != g_fpgainfo.fpga_addr2size_map.end()) {
size = iter->second;
g_fpgainfo.fpga_addr2size_map.erase(iter);
munmap(ptr, size);
} else {
DLOG << "Invalid pointer";
}
}
// Initialises the global driver state: resets g_fpgainfo, opens the driver and
// memory device nodes, maps the register window, builds the page allocator and
// sets up the PE bookkeeping.
//
// Returns the bitwise OR of the two file descriptors, which is -1 if either
// open() failed, and -1 as well if the page allocator could not be created
// (previously that failure was ignored). Initialisation still runs to the end
// in every case, as before.
int open_device_driver() {
  g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR;
  g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR;
  g_fpgainfo.FpgaRegVirAddr = nullptr;
  g_fpgainfo.pe_data = nullptr;
  g_fpgainfo.drvdevice_path = "/dev/fpgadrv0";
  g_fpgainfo.memdevice_path = "/dev/fpgamem0";
  g_fpgainfo.fd_drv = -1;
  g_fpgainfo.fd_mem = -1;

  int ret = open_drvdevice();
  ret |= open_memdevice();

  g_fpgainfo.FpgaRegVirAddr =
      (uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE);  // NOLINT
  if (fpga_memory_add() != 0) {
    // Without a page allocator no FPGA buffer can be handed out.
    ret = -1;
  }

  pl_init();

  return ret;
}
// Tears down what open_device_driver() set up: PE bookkeeping, the register
// mapping, the page allocator and both device file descriptors. Pointers and
// descriptors in g_fpgainfo are reset so stale handles cannot be reused.
// Always returns 0.
int close_device_driver() {
  pl_destroy();

  fpga_free_driver(g_fpgainfo.FpgaRegVirAddr);
  g_fpgainfo.FpgaRegVirAddr = nullptr;

  // The allocator may be absent if its creation failed during open.
  if (g_fpgainfo.memory_info != nullptr) {
    memory_release(g_fpgainfo.memory_info);
    destroy_fpga_memory(g_fpgainfo.memory_info);
    g_fpgainfo.memory_info = nullptr;
  }

  // The descriptors were previously never closed.
  if (g_fpgainfo.fd_drv != -1) {
    close(g_fpgainfo.fd_drv);
    g_fpgainfo.fd_drv = -1;
  }
  if (g_fpgainfo.fd_mem != -1) {
    close(g_fpgainfo.fd_mem);
    g_fpgainfo.fd_mem = -1;
  }

  return 0;
}
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <cstring>
#include <map>
#include "common/log.h"
namespace paddle_mobile {
namespace fpga {
// Integer division of n by d, rounded up.
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
// Offset passed to mmap64 on the driver device to map the register window.
#define FPGA_REG_PHY_ADDR 0xa0000000
// Length in bytes of the register mapping.
#define FPGA_REG_SIZE 0x1000
// Start address of the FPGA memory pool (stored in fpga_memory::mem_start).
#define FPGA_MEM_PHY_ADDR 0x20000000
// Size in bytes of the FPGA memory pool managed by the page bitmap.
#define FPGA_MEM_SIZE 0x20000000
// Scale factor used by fpga_regpoll to turn microseconds into loop iterations.
#define CPU_FREQ 1000000000
// Allocation granularity of the FPGA memory pool: 16 KiB.
#define FPGA_PAGE_SIZE (16UL * 1024UL)
// PE (processing element) related constants
// Capacity of pe_data_s::pes.
const int MAX_NUM_PES = 6;
// Size limit given to snprintf when a PE name is stored (fpga_pe::type_name).
const size_t MAX_TYPE_NAME_LENTH = 8;
// Slots of pe_data_s::pes used for each processing element.
const int PE_IDX_CONV = 0;
const int PE_IDX_POOLING = 1;
const int PE_IDX_EW = 2;
const int PE_IDX_BYPASS = 3;
// Activity state of a processing element.
enum pe_status { IDLE = 0, BUSY = 1 };
// Bookkeeping for one processing element; filled in by setup_pe().
struct fpga_pe {
// NUL-terminated label such as "CONV" or "POOLING".
char type_name[MAX_TYPE_NAME_LENTH + 1];
// Back-pointer to the pe_data_s that contains this PE.
struct pe_data_s *outer;
pe_status status; // IDLE or BUSY (see pe_status)
// Reset to 0 by setup_pe() and pl_reinit().
uint64_t interrupt_cnt;
};
// Bookkeeping for all processing elements; allocated by pl_init().
struct pe_data_s {
// Locked by pl_stop() and unlocked by pl_start().
pthread_mutex_t mutex;
struct fpga_pe pe_conv;
struct fpga_pe pe_pooling;
struct fpga_pe pe_ew;
struct fpga_pe pe_bypass;
// Registered PEs, indexed by the PE_IDX_* constants.
struct fpga_pe *pes[MAX_NUM_PES];
// Number of PEs registered through setup_pe().
int pe_num;
};
// Page allocator state for the FPGA memory pool.
struct fpga_memory {
// Held while the bitmap is searched or modified in memory_request() and
// memory_release().
pthread_mutex_t mutex;
// One bit per page; a set bit marks the page as in use.
uint64_t *bitmap;
// nr[p] is the page count of the allocation that starts at page p.
unsigned int *nr;
// Number of FPGA_PAGE_SIZE pages in the pool.
unsigned int page_num;
// Number of 64-bit words in `bitmap`.
unsigned int page_num_long;
// Address of page 0 (set to FPGA_MEM_PHY_ADDR).
uint64_t mem_start;
// Set to FPGA_MEM_SIZE when the pool is created.
uint64_t mem_end;
};
// Global driver state, populated by open_device_driver().
struct FPGA_INFO {
// Copies of FPGA_REG_PHY_ADDR and FPGA_MEM_PHY_ADDR.
uint64_t FpgaRegPhyAddr;
uint64_t FpgaMemPhyAddr;
pthread_t poll_pid;
// Base of the mmap'ed register window; used by reg_readq()/reg_writeq().
void *FpgaRegVirAddr;
// PE bookkeeping created by pl_init().
struct pe_data_s *pe_data;
// Mapped address -> mapping size, consulted by fpga_free_driver() for munmap.
std::map<void *, size_t> fpga_addr2size_map;
// Mapped address -> physical address, consulted by vaddr_to_paddr().
std::map<void *, uint64_t> fpga_vaddr2paddr_map;
// Device node paths and the page allocator built by fpga_memory_add().
const char *drvdevice_path;
const char *memdevice_path;
struct fpga_memory *memory_info;
// Descriptors of the two device nodes; -1 while not open.
int fd_drv;
int fd_mem;
};
// The single instance of the driver state.
extern struct FPGA_INFO g_fpgainfo;
// Reads the 64-bit register located `offset` bytes into the mapped register
// window (g_fpgainfo.FpgaRegVirAddr).
// The access goes through a volatile pointer so that every call performs a
// real load; otherwise the compiler may cache the value and turn a polling
// loop such as fpga_regpoll() into a single read.
inline uint64_t reg_readq(uint32_t offset) {
  // DLOG << "offset : " << offset;
  volatile uint64_t *reg = reinterpret_cast<volatile uint64_t *>(
      static_cast<uint8_t *>(g_fpgainfo.FpgaRegVirAddr) + offset);
  return *reg;
}
// Writes `value` to the 64-bit register located `offset` bytes into the mapped
// register window (g_fpgainfo.FpgaRegVirAddr).
// The access goes through a volatile pointer so the store is always emitted
// and is not merged with or reordered against other volatile register
// accesses by the compiler.
inline void reg_writeq(uint64_t value, uint32_t offset) {
  // DLOG << "offset : " << offset << ", value : " << value;
  volatile uint64_t *reg = reinterpret_cast<volatile uint64_t *>(
      static_cast<uint8_t *>(g_fpgainfo.FpgaRegVirAddr) + offset);
  *reg = value;
}
// Opens the device nodes, maps the register window and sets up the page
// allocator and PE bookkeeping in g_fpgainfo.
int open_device_driver();
// Releases the PE bookkeeping, the register mapping and the page allocator.
int close_device_driver();
// Reserves FPGA pages for `size` bytes and returns a mapped pointer to them.
void *fpga_malloc_driver(size_t size);
// Unmaps a pointer returned by the driver's mapping functions.
void fpga_free_driver(void *ptr);
/* PE helpers */
// Physical address recorded for a mapped pointer; 0 for an unknown pointer.
uint64_t vaddr_to_paddr(void *address);
// Busy-polls register `reg` for value `val`; `time` is the budget in
// microseconds.
int fpga_regpoll(uint64_t reg, uint64_t val, int time);
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/driver/pe.h"
#include "fpga/V2/config.h"
#include "fpga/V2/driver/driver.h"
#include "fpga/V2/filter.h"
#include "fpga/V2/image.h"
namespace paddle_mobile {
namespace fpga {
// Multiplies x by 8; used to pass register numbers to reg_readq() as offsets.
// The argument is parenthesised so expressions such as MUL8(a + b) expand
// correctly.
#define MUL8(x) ((x) * 8)
// NOTE(review): presumably the register value that signals a finished bypass
// operation - confirm against its users.
#define BYPASS_DONE 1
// Returns the largest of the 16 values reported by the FPGA in registers
// 49..52, converted with fp16_2_fp32() and divided by 127.
//
// Each 64-bit register carries four 16-bit lanes; the registers are read once
// each, in ascending order.
// NOTE(review): only the low 7 bits (0x7f) of every lane are kept before the
// comparison. That is what the code has always done, but it looks narrow for
// an fp16 magnitude -- confirm against the hardware register layout.
float Findfp16Max() {
  uint16_t largest = 0;
  for (uint32_t reg = 49; reg <= 52; ++reg) {
    const uint64_t packed = reg_readq(MUL8(reg));
    for (int shift = 0; shift < 64; shift += 16) {
      const uint16_t lane =
          static_cast<uint16_t>(0x0000007f & (packed >> shift));
      if (largest < lane) {
        largest = lane;
      }
    }
  }
  return fp16_2_fp32(largest) / 127.0f;
}
// Runs a convolution described by `args` and returns the status reported by
// ComputeBasicConv().
//
// The function previously fell off its end without a return statement, which
// is undefined behaviour for a non-void function; the status of the basic
// convolution is now propagated to the caller.
// NOTE(review): only args.conv_args[0] is executed; any further entries of a
// split convolution are ignored here -- confirm this is intended.
int ComputeFpgaConv(const struct SplitConvArgs &args) {
  return ComputeBasicConv(args.conv_args[0]);
}
// Basic (non-split) convolution entry point.
//
// With FPGA_PRINT_MODE defined the full argument set is dumped through DLOG.
// No hardware work is issued yet: the function returns 0 whether or not
// PADDLE_MOBILE_ZU5 is defined.
int ComputeBasicConv(const struct ConvArgs &args) {
#ifdef FPGA_PRINT_MODE
  const auto &image = args.image;
  const auto &kernel = args.kernel;
  const auto &output = args.output;
  DLOG << "======Compute Basic Conv======";
  DLOG << " relu_enabled:" << args.relu_enabled
       << " sb_address:" << args.sb_address
       << " filter_address:" << args.filter_address
       << " filter_num:" << args.filter_num
       << " group_num:" << args.group_num;
  DLOG << " image_address:" << image.address
       << " image_scale_address:" << image.scale_address
       << " image_channels:" << image.channels
       << " image_height:" << image.height
       << " image_width:" << image.width
       << " pad_height:" << image.pad_height
       << " pad_width:" << image.pad_width;
  DLOG << " kernel_height:" << kernel.height
       << " kernel_width:" << kernel.width
       << " stride_h:" << kernel.stride_h
       << " stride_w:" << kernel.stride_w;
  DLOG << " out_address:" << output.address
       << " out_scale_address:" << output.scale_address;
#endif
  // Same result on every platform until the ZU5 path is implemented.
  return 0;
}
// Pooling entry point.
//
// With FPGA_PRINT_MODE defined the full argument set is dumped through DLOG.
// No hardware work is issued yet: the function returns 0 whether or not
// PADDLE_MOBILE_ZU5 is defined.
int ComputeFpgaPool(const struct PoolingArgs &args) {
#ifdef FPGA_PRINT_MODE
  const auto &image = args.image;
  const auto &kernel = args.kernel;
  const auto &output = args.output;
  DLOG << "=============ComputeFpgaPool===========";
  DLOG << " mode:" << args.mode
       << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
  DLOG << " image_address:" << image.address
       << " image_scale_address:" << image.scale_address
       << " image_channels:" << image.channels
       << " image_height:" << image.height
       << " image_width:" << image.width
       << " pad_height:" << image.pad_height
       << " pad_width:" << image.pad_width;
  DLOG << " kernel_height:" << kernel.height
       << " kernel_width:" << kernel.width
       << " stride_h:" << kernel.stride_h
       << " stride_w:" << kernel.stride_w;
  DLOG << " out_address:" << output.address
       << " out_scale_address:" << output.scale_address;
#endif
  // Same result on every platform until the ZU5 path is implemented.
  return 0;
}
// Element-wise add entry point.
//
// With FPGA_PRINT_MODE defined the full argument set is dumped through DLOG.
// No hardware work is issued yet: the function returns 0 whether or not
// PADDLE_MOBILE_ZU5 is defined.
//
// The debug dump labels the second image's pad width as "pad1_width" so it
// matches "pad1_height" and the "pad0_*" labels of the first image; it was
// previously printed as "pad_width".
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeFpgaEWAdd===========";
  DLOG << " relu_enabled:" << args.relu_enabled
       << " const0:" << fp16_2_fp32(int16_t(args.const0))
       << " const1:" << fp16_2_fp32(int16_t(args.const1));
  DLOG << " image0_address:" << args.image0.address
       << " image0_scale_address:" << args.image0.scale_address
       << " image0_channels:" << args.image0.channels
       << " image0_height:" << args.image0.height
       << " image0_width:" << args.image0.width
       << " pad0_height:" << args.image0.pad_height
       << " pad0_width:" << args.image0.pad_width;
  DLOG << " image1_address:" << args.image1.address
       << " image1_scale_address:" << args.image1.scale_address
       << " image1_channels:" << args.image1.channels
       << " image1_height:" << args.image1.height
       << " image1_width:" << args.image1.width
       << " pad1_height:" << args.image1.pad_height
       << " pad1_width:" << args.image1.pad_width;
  DLOG << " out_address:" << args.output.address
       << " out_scale_address:" << args.output.scale_address;
#endif
  // Same result on every platform until the ZU5 path is implemented.
  return 0;
}
int PerformBypass(const struct BypassArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaBypass===========";
DLOG << " input_type:" << args.input_data_type
<< " output_type:" << args.output_data_type
<< " input_layout_type:" << args.input_layout_type
<< " output_layout_type:" << args.output_layout_type;
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#endif
uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
uint64_t bp_enable;
int64_t length;
uint64_t pixels;
// fp32->fp16
if ((args.input_data_type) && (!args.output_data_type)) {
pixels = (args.image.channels) * (args.image.width) * (args.image.height);
length = pixels * sizeof(float);
bp_enable = 0x8800000000000000 + length;
}
// fp16->fp32
else if ((!args.input_data_type) && (args.output_data_type)) {
pixels = filter::calc_aligned_channel((args.image.channels)) *
(args.image.width) * (args.image.height);
length = pixels * sizeof(short);
length = align_to_x((int)length, 64); // NOLINT
bp_enable = 0x8a00000000000000 + length;
}
// fp16->fp16 findmax
else if ((!args.input_data_type) && (!args.output_data_type)) {
pixels = (args.image.channels) * (args.image.width) * (args.image.height);
length = pixels * sizeof(short);
bp_enable = 0x8900000000000000 + length;
} else {
return -1;
}
// start bypass
reg_writeq(ifm_src_paddr, MUL8(27));
reg_writeq(ifm_dst_paddr, MUL8(28));
reg_writeq(0, MUL8(0));
reg_writeq(bp_enable, MUL8(0));
// poll
int ret = -1;
ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
if (ret != -1) {
// clear "irq"
reg_readq(MUL8(63));
}
// get max value
if ((!args.input_data_type) && (!args.output_data_type)) {
float scale = Findfp16Max();
args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT
args.output.scale_address[1] = scale;
}
return ret;
}
// Concatenates the input images described by `args` by delegating to
// image::concat_images(). Always returns 0.
//
// With FPGA_PRINT_MODE defined the arguments, including one entry per input
// image, are dumped through DLOG first.
int ComputeFPGAConcat(const struct ConcatArgs &args) {
#ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeFpgaConcat===========";
  DLOG << " Image_num: " << args.image_num
       << " out_address:" << args.image_out
       << " out_scale_address:" << args.scale_out
       << " out_channel:" << args.out_channel;
  DLOG << " image_height:" << args.height << " image_width:" << args.width;
  for (int idx = 0; idx < args.image_num; ++idx) {
    DLOG << " " << idx << "th: ";
    DLOG << " channel_num:" << args.channel_num[idx]
         << " aligned_channel_num:" << args.aligned_channel_num[idx]
         << " image_address:" << args.images_in[idx]
         << " image_scale_address:" << args.scales_in[idx];
  }
#endif

  image::concat_images(args.images_in, args.scales_in, args.image_out,
                       args.scale_out, args.image_num, args.channel_num,
                       args.height, args.width, args.aligned_channel_num,
                       args.out_channel);
  return 0;
}
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "fpga/V2/api.h"
namespace paddle_mobile {
namespace fpga {
// Processing-element entry points. Each takes its argument block by const
// reference and returns an int status.

// Data-type conversion / copy through the FPGA; returns -1 for an unsupported
// type combination or when polling for completion fails.
int PerformBypass(const struct BypassArgs& args);
// Single convolution described by one ConvArgs.
int ComputeBasicConv(const struct ConvArgs& args);
// Pooling.
int ComputeFpgaPool(const struct PoolingArgs& args);
// Element-wise add of two images.
int ComputeFpgaEWAdd(const struct EWAddArgs& args);
// Convolution described by SplitConvArgs; runs ComputeBasicConv on
// conv_args[0].
int ComputeFpgaConv(const struct SplitConvArgs& args);
// Channel concatenation via image::concat_images(); returns 0.
int ComputeFPGAConcat(const struct ConcatArgs& args);
} // namespace fpga
} // namespace paddle_mobile
...@@ -94,7 +94,6 @@ void format_filter(float **data_in, int num, int channel, int height, int width, ...@@ -94,7 +94,6 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
convert_to_hwc(data_in, num, channel, height, width); convert_to_hwc(data_in, num, channel, height, width);
align_filter(data_in, num, channel, height, width); align_filter(data_in, num, channel, height, width);
int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
fpga_flush(*data_in, pixel_num * sizeof(float));
} }
void convert_fc_filter(float **data_in, int num, int chw) { void convert_fc_filter(float **data_in, int num, int chw) {
...@@ -114,8 +113,6 @@ void format_fc_filter(float **data_in, int num, int channel, int height, ...@@ -114,8 +113,6 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
int chw = channel * height * width; int chw = channel * height * width;
convert_fc_filter(data_in, num, chw); convert_fc_filter(data_in, num, chw);
align_filter(data_in, num, channel, height, width); align_filter(data_in, num, channel, height, width);
int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
fpga_flush(*data_in, pixel_num * sizeof(float));
} }
float find_max(float *data_in, int data_size) { float find_max(float *data_in, int data_size) {
......
...@@ -58,7 +58,6 @@ void format_image(float **data_in, int channel, int height, int width, ...@@ -58,7 +58,6 @@ void format_image(float **data_in, int channel, int height, int width,
int aligned_channel) { int aligned_channel) {
convert_to_hwc(data_in, channel, height, width); convert_to_hwc(data_in, channel, height, width);
align_image(data_in, channel, height, width, aligned_channel); align_image(data_in, channel, height, width, aligned_channel);
fpga_flush(*data_in, aligned_channel * height * width * sizeof(float));
} }
void concat_images(int16_t **images_in, float **scales_in, void *image_out, void concat_images(int16_t **images_in, float **scales_in, void *image_out,
...@@ -70,8 +69,6 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, ...@@ -70,8 +69,6 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
scale_out[1] = 0.0; scale_out[1] = 0.0;
for (int i = 0; i < image_num; i++) { for (int i = 0; i < image_num; i++) {
scale_out[0] = std::max(*scale_out, scales_in[i][0]); scale_out[0] = std::max(*scale_out, scales_in[i][0]);
fpga_invalidate(images_in[i],
height * width * aligned_channel_num[i] * sizeof(int16_t));
} }
scale_out[1] = 1 / scale_out[0]; scale_out[1] = 1 / scale_out[0];
...@@ -86,8 +83,6 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, ...@@ -86,8 +83,6 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
tmp_channel_sum += channel_num[i]; tmp_channel_sum += channel_num[i];
} }
} }
fpga_flush(image_out, hw * out_channel * sizeof(int16_t));
} }
} // namespace image } // namespace image
......
...@@ -90,8 +90,10 @@ class CLEngine { ...@@ -90,8 +90,10 @@ class CLEngine {
bool BuildProgram(cl_program program) { bool BuildProgram(cl_program program) {
cl_int status; cl_int status;
status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math -I cl_kernel", std::string path = "-cl-fast-relaxed-math -I " +
0, 0); CLEngine::Instance()->GetCLPath() + "/cl_kernel";
status = clBuildProgram(program, 0, 0, path.c_str(), 0, 0);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
......
...@@ -704,7 +704,7 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() { ...@@ -704,7 +704,7 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
} }
} }
if (self_alloc) { if (self_alloc) {
delete origin_data; delete data;
} }
LOG(kLOG_INFO) << " end init combine memory "; LOG(kLOG_INFO) << " end init combine memory ";
} }
......
...@@ -40,4 +40,8 @@ REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp); ...@@ -40,4 +40,8 @@ REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp); REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(elementwise_add, ops::ElementwiseAddOp);
#endif
#endif #endif
...@@ -17,6 +17,14 @@ limitations under the License. */ ...@@ -17,6 +17,14 @@ limitations under the License. */
#include "operators/kernel/slice_kernel.h" #include "operators/kernel/slice_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators {} namespace operators {
template <>
bool SliceKernel<CPU, float>::Init(SliceParam<CPU>* param) {
return true;
}
template <>
void SliceKernel<CPU, float>::Compute(const SliceParam<CPU>& param) {}
} // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef CONCAT_OP #ifdef CONCAT_OP
#include "operators/kernel/concat_kernel.h" #include "operators/kernel/concat_kernel.h"
#include "fpga/V2/api.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -68,7 +69,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) { ...@@ -68,7 +69,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
template <> template <>
void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) { void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) {
ComputeFPGAConcat(param.FpgaArgs()); fpga::ComputeFPGAConcat(param.FpgaArgs());
} }
template class ConcatKernel<FPGA, float>; template class ConcatKernel<FPGA, float>;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#include "operators/kernel/elementwise_add_kernel.h"
namespace paddle_mobile {
namespace operators {
// Prepares the FPGA argument block for an element-wise add (without ReLU).
//
// The output tensor is formatted with fpga::format_fp16_ofm() using the
// aligned channel count of input X before its buffer is requested. Both
// inputs are described by their data pointer, scale and dims()[1..3]
// (stored as channels / height / width), with no padding. Both constants are
// set to 0x3c00 (= 1). The finished block is stored on `param`.
// Always returns true.
template <>
bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
  auto *x = const_cast<LoDTensor *>(param->InputX());
  auto *y = const_cast<LoDTensor *>(param->InputY());
  auto *result = param->Out();

  auto x_data = x->data<float>();
  auto y_data = y->data<float>();

  // Format the output first, then fetch its buffer.
  const int aligned_channels = fpga::get_aligned_channel_num(x->dims()[1]);
  fpga::format_fp16_ofm(result, aligned_channels);
  auto result_data = result->mutable_data<float>();

  fpga::EWAddArgs add_args = {0};
  add_args.relu_enabled = false;
  add_args.const0 = 0x3c00;  // =1
  add_args.const1 = 0x3c00;  // =1

  auto &lhs = add_args.image0;
  lhs.address = x_data;
  lhs.channels = (uint32_t)x->dims()[1];
  lhs.scale_address = x->scale;
  lhs.height = (uint32_t)x->dims()[2];
  lhs.width = (uint32_t)x->dims()[3];
  lhs.pad_height = 0;
  lhs.pad_width = 0;

  auto &rhs = add_args.image1;
  rhs.address = y_data;
  rhs.channels = (uint32_t)y->dims()[1];
  rhs.scale_address = y->scale;
  rhs.height = (uint32_t)y->dims()[2];
  rhs.width = (uint32_t)y->dims()[3];
  rhs.pad_height = 0;
  rhs.pad_width = 0;

  add_args.output.scale_address = result->scale;
  add_args.output.address = result_data;

  param->SetFpgaArgs(add_args);
  return true;
}
// Executes the element-wise add using the argument block that Init() stored
// on `param`.
template <>
void ElementwiseAddKernel<FPGA, float>::Compute(
    const ElementwiseAddParam<FPGA> &param) {
  const auto &ew_args = param.FpgaArgs();
  fpga::ComputeFpgaEWAdd(ew_args);
}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -21,7 +21,7 @@ namespace operators { ...@@ -21,7 +21,7 @@ namespace operators {
template <> template <>
bool ElementwiseAddReluKernel<FPGA, float>::Init( bool ElementwiseAddReluKernel<FPGA, float>::Init(
ElementwiseAddReluParam<FPGA> *param) { ElementwiseAddReluParam<FPGA> *param) {
bool relu_enabled = true; bool relu_enabled = false;
auto *input_x = const_cast<LoDTensor *>(param->InputX()); auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY()); auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out(); auto *out = param->Out();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SLICE_OP
#include "operators/kernel/slice_kernel.h"
namespace paddle_mobile {
namespace operators {
// FPGA slice kernel initialisation: a stub that performs no setup and always
// reports success.
template <>
bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
return true;
}
// FPGA slice kernel execution: intentionally empty, no work is performed.
template <>
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -49,12 +49,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) { ...@@ -49,12 +49,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
Tensor *out = param.Out(); Tensor *out = param.Out();
fpga::PerformBypass(param.FpgaArgs()); fpga::PerformBypass(param.FpgaArgs());
fpga::fpga_invalidate(
(void *)in_x->data<float>(), // NOLINT
fpga::get_aligned_channel_num((int)in_x->dims()[1]) * // NOLINT
sizeof(float));
math::SoftmaxFuntor<CPU, float>()(in_x, out); math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size());
} }
} // namespace operators } // namespace operators
......
...@@ -24,7 +24,8 @@ template <typename DeviceType, typename T> ...@@ -24,7 +24,8 @@ template <typename DeviceType, typename T>
class SliceKernel class SliceKernel
: public framework::OpKernelBase<DeviceType, SliceParam<DeviceType>> { : public framework::OpKernelBase<DeviceType, SliceParam<DeviceType>> {
public: public:
void Compute(const SliceParam<DeviceType>& param) {} void Compute(const SliceParam<DeviceType>& param);
bool Init(SliceParam<DeviceType>* param);
}; };
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -436,6 +436,16 @@ class ConvParam : public OpParam { ...@@ -436,6 +436,16 @@ class ConvParam : public OpParam {
#ifdef PADDLE_MOBILE_CL #ifdef PADDLE_MOBILE_CL
int offset_; int offset_;
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SplitConvArgs fpga_conv_args;
public:
const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif
}; };
template <typename Dtype> template <typename Dtype>
Print &operator<<(Print &printer, const ConvParam<Dtype> &conv_param); Print &operator<<(Print &printer, const ConvParam<Dtype> &conv_param);
...@@ -580,15 +590,6 @@ class MulParam : OpParam { ...@@ -580,15 +590,6 @@ class MulParam : OpParam {
GType *out_; GType *out_;
int x_num_col_dims_; int x_num_col_dims_;
int y_num_col_dims_; int y_num_col_dims_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SplitConvArgs fpga_conv_args;
public:
const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif
}; };
#endif #endif
...@@ -1641,15 +1642,6 @@ class FusionConvAddParam : public ConvParam<Dtype> { ...@@ -1641,15 +1642,6 @@ class FusionConvAddParam : public ConvParam<Dtype> {
RType *bias_; RType *bias_;
int axis_; int axis_;
RType *output_; RType *output_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SplitConvArgs fpga_conv_args;
public:
const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif
}; };
template <typename Dtype> template <typename Dtype>
...@@ -1696,15 +1688,6 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> { ...@@ -1696,15 +1688,6 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> {
RType *output_; RType *output_;
RType *alpha_; RType *alpha_;
std::string mode_; std::string mode_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SplitConvArgs fpga_conv_args;
public:
const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif
}; };
#endif #endif
...@@ -1754,15 +1737,6 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> { ...@@ -1754,15 +1737,6 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> {
std::string keyOutput_; std::string keyOutput_;
std::string keyX1_; std::string keyX1_;
std::string keyY1_; std::string keyY1_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SplitConvArgs fpga_conv_args;
public:
const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif
}; };
#endif #endif
...@@ -1829,16 +1803,6 @@ class FusionConvAddBNReluParam : public ConvParam<Dtype> { ...@@ -1829,16 +1803,6 @@ class FusionConvAddBNReluParam : public ConvParam<Dtype> {
bool is_test_; bool is_test_;
RType *new_bias_; RType *new_bias_;
RType *new_scale_; RType *new_scale_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SplitConvArgs fpga_conv_args;
public:
const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif
}; };
#endif #endif
...@@ -1916,15 +1880,6 @@ class FusionConvBNAddReluParam : public ConvParam<Dtype> { ...@@ -1916,15 +1880,6 @@ class FusionConvBNAddReluParam : public ConvParam<Dtype> {
std::string keyBNY_; std::string keyBNY_;
std::string keyX_; std::string keyX_;
std::string keyY_; std::string keyY_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SplitConvArgs fpga_conv_args;
public:
const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif
}; };
#endif #endif
...@@ -1983,15 +1938,6 @@ class FusionConvBNParam : public ConvParam<Dtype> { ...@@ -1983,15 +1938,6 @@ class FusionConvBNParam : public ConvParam<Dtype> {
bool is_test_; bool is_test_;
RType *new_bias_; RType *new_bias_;
RType *new_scale_; RType *new_scale_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SplitConvArgs fpga_conv_args;
public:
const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif
}; };
#endif #endif
...@@ -2058,15 +2004,6 @@ class FusionConvAddBNParam : public ConvParam<Dtype> { ...@@ -2058,15 +2004,6 @@ class FusionConvAddBNParam : public ConvParam<Dtype> {
bool is_test_; bool is_test_;
RType *new_bias_; RType *new_bias_;
RType *new_scale_; RType *new_scale_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SplitConvArgs fpga_conv_args;
public:
const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif
}; };
#endif #endif
...@@ -2184,15 +2121,6 @@ class FusionConvBNReluParam : public ConvParam<Dtype> { ...@@ -2184,15 +2121,6 @@ class FusionConvBNReluParam : public ConvParam<Dtype> {
bool is_test_; bool is_test_;
RType *new_bias_; RType *new_bias_;
RType *new_scale_; RType *new_scale_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SplitConvArgs fpga_conv_args;
public:
const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif
}; };
#endif #endif
......
...@@ -34,5 +34,7 @@ REGISTER_OPERATOR_CPU(slice, ops::SliceOp); ...@@ -34,5 +34,7 @@ REGISTER_OPERATOR_CPU(slice, ops::SliceOp);
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp); REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(slice, ops::SliceOp);
#endif
#endif #endif
...@@ -130,10 +130,12 @@ if (CON GREATER -1) ...@@ -130,10 +130,12 @@ if (CON GREATER -1)
set(FUSION_ELEMENTWISEADDRELU_OP ON) set(FUSION_ELEMENTWISEADDRELU_OP ON)
set(FUSION_FC_OP ON) set(FUSION_FC_OP ON)
set(POOL_OP ON) set(POOL_OP ON)
set(CONCAT_OP ON)
set(SOFTMAX_OP ON) set(SOFTMAX_OP ON)
set(FUSION_CONVBNRELU_OP ON) set(FUSION_CONVBNRELU_OP ON)
set(FUSION_CONVBN_OP ON) set(FUSION_CONVBN_OP ON)
# set(CONV_TRANSPOSE_OP ON)
# set(SLICE_OP ON)
# set(ELEMENTWISEADD_OP ON)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif() endif()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册