Commit 202d5f29 authored by qnqinan

add FPGA support op

Parent 8c4a035f
@@ -73,6 +73,8 @@ const char *G_OP_TYPE_QUANTIZE = "quantize";
 const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
 extern const char *G_OP_TYPE_TANH = "tanh";
 extern const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
+extern const char *G_OP_TYPE_FUSION_DECONV_ADD = "fusion_deconv_add";
+extern const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu";
 std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
@@ -133,5 +135,7 @@ std::unordered_map<
     {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
     {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
     {G_OP_TYPE_TANH, {{"X"}, {"Out"}}},
-    {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}}};
+    {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}},
+    {G_OP_TYPE_FUSION_DECONV_ADD, {{"Input"}, {"Out"}}},
+    {G_OP_TYPE_FUSION_DECONV_ADD_RELU, {{"Input"}, {"Out"}}}};
 }  // namespace paddle_mobile
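The two new fusion ops are registered with {"Input"} -> {"Out"} keys like the existing deconv-relu entry. As an illustration of how such registry entries are consumed (a self-contained sketch, not code from this commit), a lookup against a map shaped like op_input_output_key resolves an op type to its tensor names:

// Sketch only: the table contents mirror the two entries added above.
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
  std::unordered_map<
      std::string,
      std::pair<std::vector<std::string>, std::vector<std::string>>>
      op_io = {{"fusion_deconv_add", {{"Input"}, {"Out"}}},
               {"fusion_deconv_add_relu", {{"Input"}, {"Out"}}}};
  auto it = op_io.find("fusion_deconv_add");
  if (it != op_io.end()) {
    std::cout << it->second.first[0] << " -> " << it->second.second[0] << "\n";
  }
  return 0;
}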
@@ -142,6 +142,9 @@ extern const char *G_OP_TYPE_DEQUANTIZE;
 extern const char *G_OP_TYPE_TANH;
 extern const char *G_OP_TYPE_FUSION_DECONV_RELU;
+extern const char *G_OP_TYPE_FUSION_DECONV_ADD;
+extern const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU;
 extern std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
     op_input_output_key;
...
@@ -16,47 +16,29 @@ limitations under the License. */
 #include <algorithm>
 #include "fpga/V2/bias_scale.h"
 #include "fpga/V2/config.h"
-<<<<<<< HEAD
-=======
 #include "fpga/V2/driver/driver.h"
->>>>>>> upstream/develop
 #include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
 namespace paddle_mobile {
 namespace fpga {
-<<<<<<< HEAD
-static std::map<void *, size_t> memory_map;
-int open_device() {
-  int ret = open_device_driver();
-=======
 static std::map<void *, size_t> memory_map;
 int open_device() {
   int ret = driver::open_device_driver();
->>>>>>> upstream/develop
   return ret;
 }
 int close_device() {
-<<<<<<< HEAD
-  int ret = close_device_driver();
-=======
   int ret = driver::close_device_driver();
->>>>>>> upstream/develop
   return ret;
 }
 void *fpga_malloc(size_t size) {
   static uint64_t counter = 0;
 #ifdef PADDLE_MOBILE_ZU5
-<<<<<<< HEAD
-  auto ptr = fpga_malloc_driver(size);
-=======
   auto ptr = driver::fpga_malloc_driver(size);
->>>>>>> upstream/develop
 #else
   auto ptr = malloc(size);
 #endif
@@ -75,11 +57,7 @@ void fpga_free(void *ptr) {
     size = iter->second;
     memory_map.erase(iter);
 #ifdef PADDLE_MOBILE_ZU5
-<<<<<<< HEAD
-    fpga_free_driver(ptr);
-=======
     driver::fpga_free_driver(ptr);
->>>>>>> upstream/develop
 #else
     free(ptr);
 #endif
@@ -90,28 +68,6 @@ void fpga_free(void *ptr) {
     DLOG << "Invalid pointer";
   }
 }
-<<<<<<< HEAD
-half fp32_2_fp16(float fp32_num) {
-  unsigned long tmp = *(unsigned long *)(&fp32_num);  // NOLINT
-  auto t = (half)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
-                  (((tmp & 0x7f800000) >> 13) - (112 << 10)));
-  if (tmp & 0x1000) {
-    t++;  // roundoff
-  }
-  return t;
-}
-float fp16_2_fp32(half fp16_num) {
-  int frac = (fp16_num & 0x3ff);
-  int exp = ((fp16_num & 0x7c00) >> 10) + 112;
-  int s = fp16_num & 0x8000;
-  int tmp = 0;
-  float fp32_num;
-  tmp = s << 16 | exp << 23 | frac << 13;
-  fp32_num = *(float *)&tmp;  // NOLINT
-  return fp32_num;
-=======
 void fpga_copy(void *dest, const void *src, size_t num) {
 #ifdef PADDLE_MOBILE_ZU5
   driver::fpga_copy_driver(dest, src, num);
@@ -133,7 +89,6 @@ int fpga_invalidate(void *address, size_t size) {
 #else
   return 0;
 #endif
->>>>>>> upstream/develop
 }
 void format_image(framework::Tensor *image_tensor) {
@@ -288,11 +243,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
   arg->filter_num = (uint32_t)filter->dims()[0];
   arg->output.address = out_ptr;
   arg->output.scale_address = out->scale;
-<<<<<<< HEAD
-  arg->conv_args =
-=======
   arg->conv_arg =
->>>>>>> upstream/develop
       (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT
   arg->concat_arg.image_num = arg->split_num;
@@ -310,30 +261,6 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
       (uint32_t *)fpga_malloc(n * sizeof(uint32_t));  // NOLINT
   for (int i = 0; i < n; i++) {
-<<<<<<< HEAD
-    arg->conv_args[i].relu_enabled = relu_enabled;
-    arg->conv_args[i].sb_address = bs_ptr;
-    arg->conv_args[i].filter_address = (int8_t *)filter_ptr;  // NOLINT
-    arg->conv_args[i].filter_scale_address = filter->scale;
-    arg->conv_args[i].filter_num = arg->filter_num;
-    arg->conv_args[i].group_num = (uint32_t)group_num;
-    arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h;
-    arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w;
-    arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
-    arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
-    arg->conv_args[i].image.address = input_ptr;
-    arg->conv_args[i].image.scale_address = input->scale;
-    arg->conv_args[i].image.channels = (uint32_t)input->dims()[1];
-    arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
-    arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
-    arg->conv_args[i].image.pad_height = (uint32_t)padding_h;
-    arg->conv_args[i].image.pad_width = (uint32_t)padding_w;
-    arg->conv_args[i].output.address = out_ptr;
-    arg->conv_args[i].output.scale_address = out->scale;
-=======
     arg->conv_arg[i].relu_enabled = relu_enabled;
     arg->conv_arg[i].sb_address = bs_ptr;
     arg->conv_arg[i].filter_address = (int8_t *)filter_ptr;  // NOLINT
@@ -361,7 +288,6 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
         filter::calc_aligned_num((int)input->dims()[1], arg->filter_num);
     arg->conv_arg[i].free_space =
         fpga_malloc(num_after_alignment * 2 * sizeof(half));
->>>>>>> upstream/develop
   }
 }
...
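The resolved fpga_malloc/fpga_free above keep a pointer-to-size map so the free path knows how many bytes to return to the driver. A minimal standalone sketch of that bookkeeping pattern (names are illustrative, not from the diff):

#include <cstdlib>
#include <map>

static std::map<void *, size_t> g_sizes;  // pointer -> allocation size

void *tracked_malloc(size_t size) {
  void *p = malloc(size);
  if (p != nullptr) g_sizes[p] = size;
  return p;
}

void tracked_free(void *p) {
  auto it = g_sizes.find(p);
  if (it == g_sizes.end()) return;  // unknown pointer: the real code DLOGs here
  g_sizes.erase(it);
  free(p);
}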
@@ -14,132 +14,20 @@ limitations under the License. */
 #pragma once
-<<<<<<< HEAD
-#include <stdint.h>
-#include <cstddef>
-#include <iostream>
-#include <limits>
-#include "fpga/V2/driver/driver.h"
-#include "fpga/V2/driver/pe.h"
-=======
 #include "fpga/V2/driver/pe.h"
 #include "fpga/V2/fpga_common.h"
->>>>>>> upstream/develop
 #include "framework/tensor.h"
 namespace paddle_mobile {
 namespace fpga {
-<<<<<<< HEAD
-enum DataType {
-  DATA_TYPE_FP32 = 1,
-  DATA_TYPE_FP16 = 0,
-};
-enum LayoutType {
-  LAYOUT_CHW = 1,
-  LAYOUT_HWC = 0,
-};
-struct KernelArgs {
-  uint32_t width;
-  uint32_t height;
-  uint32_t stride_w;
-  uint32_t stride_h;
-};
-struct ImageInputArgs {
-  void* address;         // input featuremap virtual address
-  float* scale_address;  // input scale address;
-  uint32_t channels;
-  uint32_t width;  // featuremap width
-  uint32_t height;
-  uint32_t pad_width;  // padding width;
-  uint32_t pad_height;
-};
-struct ImageOutputArgs {
-  void* address;         // output result address;
-  float* scale_address;  // output scale address;
-  uint64_t timer_cnt;    // time counter for FPGA computation
-};
-struct ConvArgs {
-  bool relu_enabled;
-  void* sb_address;  // scale and bias are interlaced;
-  void* filter_address;
-  float* filter_scale_address;
-  uint32_t filter_num;
-  uint32_t group_num;
-  struct KernelArgs kernel;
-  struct ImageInputArgs image;  // input image;
-  struct ImageOutputArgs output;
-};
-struct ConcatArgs {
-  uint32_t image_num;
-  half** images_in;
-  float** scales_in;
-  void* image_out;
-  float* scale_out;
-  uint32_t* channel_num;
-  uint32_t* aligned_channel_num;
-  uint32_t out_channel;
-  uint32_t height;
-  uint32_t width;
-};
-struct SplitConvArgs {
-  uint32_t split_num;
-  uint32_t group_num;
-  uint32_t filter_num;
-  struct ImageOutputArgs output;
-  struct ConvArgs* conv_args;
-  struct ConcatArgs concat_arg;
-};
-struct PoolingArgs {
-  int16_t mode;  // mode: 0:max, 1:avg
-  half kernel_reciprocal;
-  struct KernelArgs kernel;
-  struct ImageInputArgs image;  // input image;
-  struct ImageOutputArgs output;
-};
-struct EWAddArgs {
-  bool relu_enabled;
-  uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
-  uint32_t const1;
-  struct ImageInputArgs image0;
-  struct ImageInputArgs image1;
-  struct ImageOutputArgs output;
-};
-struct BypassArgs {
-  enum DataType input_data_type;
-  enum DataType output_data_type;
-  enum LayoutType input_layout_type;
-  enum LayoutType output_layout_type;
-  struct ImageInputArgs image;
-  struct ImageOutputArgs output;
-};
-=======
->>>>>>> upstream/develop
 int open_device();
 int close_device();
 void* fpga_malloc(size_t size);
 void fpga_free(void* ptr);
-<<<<<<< HEAD
-static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
-=======
 void fpga_copy(void* dest, const void* src, size_t num);
 int fpga_flush(void* address, size_t size);
 int fpga_invalidate(void* address, size_t size);
->>>>>>> upstream/develop
 float filter_find_max(framework::Tensor* filter_tensor);
 int get_aligned_channel_num(int channel_num);
@@ -167,11 +55,5 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
                     bool relu_enabled, int group_num, int stride_h,
                     int stride_w, int padding_h, int padding_w, float* bs_ptr);
-<<<<<<< HEAD
-half fp32_2_fp16(float fp32_num);
-float fp16_2_fp32(half fp16_num);
-=======
->>>>>>> upstream/develop
 }  // namespace fpga
 }  // namespace paddle_mobile
@@ -27,11 +27,7 @@ void align_element(float **data_in, int num, int num_after_alignment) {
       (float *)fpga_malloc(total_element * sizeof(float));  // NOLINT
   memset(ptr_aligned, 0, total_element * sizeof(float));
-<<<<<<< HEAD
-  for (int i = 1; i < num; i++) {
-=======
   for (int i = 0; i < num; i++) {
->>>>>>> upstream/develop
     ptr_aligned[i * 2 + 0] = ptr_unaligned[i];
     ptr_aligned[i * 2 + 1] = ptr_unaligned[i + num];
   }
@@ -43,10 +39,7 @@ void align_element(float **data_in, int num, int num_after_alignment) {
 void format_bias_scale_array(float **data_in, int num,
                              int num_after_alignment) {
   align_element(data_in, num, num_after_alignment);
-<<<<<<< HEAD
-=======
   fpga_flush(*data_in, 2 * num_after_alignment * sizeof(float));
->>>>>>> upstream/develop
 }
 }  // namespace bias_scale
...
@@ -57,13 +57,8 @@ static uint64_t ffs(uint64_t data) {
   uint64_t bit = 0;
   int i = 0;
-<<<<<<< HEAD
-  for (i = 0; i < sizeof(data); i++) {
-    if (data & (1 << i)) {
-=======
   for (i = 0; i < sizeof(data) * 8; i++) {
     if (data & (1UL << i)) {
->>>>>>> upstream/develop
       bit = i;
       break;
     }
...
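The kept loop bounds matter here: sizeof(data) is 8, so the old code scanned only bits 0-7, and `1 << i` shifts an int, which is undefined behavior for i >= 32. A standalone sketch of the corrected scan (using 1ULL to stay 64-bit even on platforms where long is 32 bits):

#include <cstdint>
#include <cstdio>

// Return the index of the lowest set bit, scanning all 64 bits.
static uint64_t find_first_set(uint64_t data) {
  for (unsigned i = 0; i < sizeof(data) * 8; i++) {
    if (data & (1ULL << i)) return i;
  }
  return 0;
}

int main() {
  // The old 8-bit scan with a 32-bit shift would never find this bit.
  printf("%llu\n", (unsigned long long)find_first_set(1ULL << 40));  // 40
  return 0;
}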
@@ -25,11 +25,7 @@ limitations under the License. */
 #define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
 #define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
-<<<<<<< HEAD
-#define round_down(x, y) ((x) & ((y)-1))
-=======
 #define round_down(x, y) ((x) & ~((y)-1))
->>>>>>> upstream/develop
 namespace fpga_bitmap {
 void bitmap_set(uint64_t *map, unsigned int start, int len);
...
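For a power-of-two y, the kept definition clears the low-order bits and rounds x down to a multiple of y, whereas the old `(x) & ((y)-1)` computed the remainder x % y instead. A small self-contained check:

#include <cassert>

// Fixed definition, as kept above: requires y to be a power of two.
#define round_down(x, y) ((x) & ~((y)-1))

int main() {
  assert(round_down(13, 8) == 8);   // old macro would have yielded 13 % 8 == 5
  assert(round_down(16, 8) == 16);  // already aligned values are unchanged
  return 0;
}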
@@ -17,10 +17,7 @@ limitations under the License. */
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
-<<<<<<< HEAD
-=======
 #include <sys/ioctl.h>
->>>>>>> upstream/develop
 #include <sys/mman.h>
 #include <unistd.h>
 #include <algorithm>
@@ -36,10 +33,7 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace fpga {
-<<<<<<< HEAD
-=======
 namespace driver {
->>>>>>> upstream/develop
 struct FPGA_INFO g_fpgainfo;
 int open_drvdevice() {
@@ -51,12 +45,8 @@ int open_drvdevice() {
 int open_memdevice() {
   if (g_fpgainfo.fd_mem == -1) {
-<<<<<<< HEAD
-    g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
-=======
     // g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
     g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR);
->>>>>>> upstream/develop
   }
   return g_fpgainfo.fd_mem;
 }
@@ -64,10 +54,6 @@ int open_memdevice() {
 void pl_reset() {
   // DLOG << "PL RESET";
-<<<<<<< HEAD
-  // reg_writeq(0x5a, REG_FPGA_RESET);
-=======
->>>>>>> upstream/develop
   usleep(100 * 1000);
 }
@@ -147,11 +133,7 @@ int pl_get_status() { return 0; }
 int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
   uint64_t i = 0;
   /* timeout accuracy still to be confirmed */
-<<<<<<< HEAD
-  int64_t timeout = time * CPU_FREQ / 1000000;
-=======
   int64_t timeout = time * 6;
->>>>>>> upstream/develop
   for (i = 0; i < timeout; i++) {
     if (val == reg_readq(reg)) {
@@ -193,11 +175,6 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
 }
 void memory_release(struct fpga_memory *memory) {
-<<<<<<< HEAD
-  pthread_mutex_lock(&memory->mutex);
-  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
-  pthread_mutex_unlock(&memory->mutex);
-=======
   void *ptr = nullptr;
   /* unmap memory */
@@ -206,7 +183,6 @@ void memory_release(struct fpga_memory *memory) {
   for (iter = map.begin(); iter != map.end(); iter++) {
     fpga_free_driver(ptr);
   }
->>>>>>> upstream/develop
 }
 int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
@@ -269,10 +245,6 @@ int init_fpga_memory(struct fpga_memory *memory) {
     return rc;
   }
-<<<<<<< HEAD
-  // spin_lock_init(&memory->spin);
-=======
->>>>>>> upstream/develop
   fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
   fpga_bitmap::bitmap_set(memory->bitmap, 0, 1);  // NOTE reserve fpga page 0.
@@ -327,11 +299,6 @@ void *fpga_reg_malloc(size_t size) {
   return ret;
 }
-<<<<<<< HEAD
-void *fpga_malloc_driver(size_t size) {
-  void *ret = nullptr;
-  uint64_t phy_addr = 0;
-=======
 void *fpga_reg_free(void *ptr) {
   size_t size = 0;
@@ -349,7 +316,6 @@ void *fpga_malloc_driver(size_t size) {
   void *ret = nullptr;
   uint64_t phy_addr = 0;
   int i = 0;
->>>>>>> upstream/develop
   memory_request(g_fpgainfo.memory_info, size, &phy_addr);
@@ -365,19 +331,14 @@ void *fpga_malloc_driver(size_t size) {
 void fpga_free_driver(void *ptr) {
   size_t size = 0;
-<<<<<<< HEAD
-=======
   uint32_t pos = 0;
   uint64_t p_addr = 0;
->>>>>>> upstream/develop
   auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
   if (iter != g_fpgainfo.fpga_addr2size_map.end()) {
     size = iter->second;
     g_fpgainfo.fpga_addr2size_map.erase(iter);
     munmap(ptr, size);
-<<<<<<< HEAD
-=======
     p_addr = vaddr_to_paddr(ptr);
     pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
@@ -387,14 +348,11 @@ void fpga_free_driver(void *ptr) {
     fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos,
                               g_fpgainfo.memory_info->nr[pos]);
     pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);
->>>>>>> upstream/develop
   } else {
     DLOG << "Invalid pointer";
   }
 }
-<<<<<<< HEAD
-=======
 static inline int do_ioctl(unsigned long req, const void *arg) {
   return ioctl(g_fpgainfo.fd_mem, req, arg);
 }
@@ -437,7 +395,6 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) {
   return;
 }
->>>>>>> upstream/develop
 int open_device_driver() {
   g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR;
   g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR;
@@ -463,20 +420,13 @@ int open_device_driver() {
 int close_device_driver() {
   pl_destroy();
-<<<<<<< HEAD
-  fpga_free_driver(g_fpgainfo.FpgaRegVirAddr);
-=======
   fpga_reg_free(g_fpgainfo.FpgaRegVirAddr);
->>>>>>> upstream/develop
   memory_release(g_fpgainfo.memory_info);
   destroy_fpga_memory(g_fpgainfo.memory_info);
   return 0;
 }
-<<<<<<< HEAD
-=======
 }  // namespace driver
->>>>>>> upstream/develop
 }  // namespace fpga
 }  // namespace paddle_mobile
@@ -24,10 +24,7 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace fpga {
-<<<<<<< HEAD
-=======
 namespace driver {
->>>>>>> upstream/develop
 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
@@ -51,8 +48,6 @@ const int PE_IDX_BYPASS = 3;
 enum pe_status { IDLE = 0, BUSY = 1 };
-<<<<<<< HEAD
-=======
 struct MemoryCacheArgs {
   void *offset;
   size_t size;
@@ -62,7 +57,6 @@ struct MemoryCacheArgs {
 #define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
 #define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
->>>>>>> upstream/develop
 struct fpga_pe {
   char type_name[MAX_TYPE_NAME_LENTH + 1];
   struct pe_data_s *outer;
@@ -111,39 +105,20 @@ extern struct FPGA_INFO g_fpgainfo;
 inline uint64_t reg_readq(uint32_t offset) {
   // DLOG << "offset : " << offset;
-<<<<<<< HEAD
-  uint64_t value =
-      *(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset);  // NOLINT
-=======
   uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +
                                           offset);  // NOLINT
->>>>>>> upstream/develop
   return value;
 }
 inline void reg_writeq(uint64_t value, uint32_t offset) {
   // DLOG << "offset : " << offset << ", value : " << value;
-<<<<<<< HEAD
-  *(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset) =  // NOLINT
-=======
   *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +
                          offset) =  // NOLINT
->>>>>>> upstream/develop
       value;
 }
 int open_device_driver();
-<<<<<<< HEAD
-int close_device_driver();
-void *fpga_malloc_driver(size_t size);
-void fpga_free_driver(void *ptr);
-/*pe*/
-uint64_t vaddr_to_paddr(void *address);
-int fpga_regpoll(uint64_t reg, uint64_t val, int time);
-=======
 int close_device_driver();
@@ -164,6 +139,5 @@ uint64_t vaddr_to_paddr(void *address);
 int fpga_regpoll(uint64_t reg, uint64_t val, int time);
 }  // namespace driver
->>>>>>> upstream/develop
 }  // namespace fpga
 }  // namespace paddle_mobile
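The resolution keeps the volatile-qualified accessors. A minimal sketch of why that matters for memory-mapped I/O (helper names here are illustrative, not the repo's API): without volatile, the compiler may legally cache, reorder, or elide reads and writes to the register window.

#include <cstdint>

// Each access must hit the device, so the cast goes through volatile.
static inline uint64_t mmio_read64(void *base, uint32_t offset) {
  return *(volatile uint64_t *)((uint8_t *)base + offset);
}

static inline void mmio_write64(void *base, uint32_t offset, uint64_t value) {
  *(volatile uint64_t *)((uint8_t *)base + offset) = value;
}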
@@ -20,49 +20,29 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace fpga {
-<<<<<<< HEAD
-#define MUL8(x) (x * 8)
-=======
 #define MUL8(x) ((x)*8)
->>>>>>> upstream/develop
 #define BYPASS_DONE 1
 float Findfp16Max() {
   uint16_t abs_vals[16];
   uint64_t max_fp16;
-<<<<<<< HEAD
-  max_fp16 = reg_readq(MUL8(49));
-=======
   max_fp16 = driver::reg_readq(MUL8(49));
->>>>>>> upstream/develop
   abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
   abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
   abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
   abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-<<<<<<< HEAD
-  max_fp16 = reg_readq(MUL8(50));
-=======
   max_fp16 = driver::reg_readq(MUL8(50));
->>>>>>> upstream/develop
   abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
   abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
   abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
   abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-<<<<<<< HEAD
-  max_fp16 = reg_readq(MUL8(51));
-=======
   max_fp16 = driver::reg_readq(MUL8(51));
->>>>>>> upstream/develop
   abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16));         // NOLINT
   abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16));   // NOLINT
   abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
   abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-<<<<<<< HEAD
-  max_fp16 = reg_readq(MUL8(52));
-=======
   max_fp16 = driver::reg_readq(MUL8(52));
->>>>>>> upstream/develop
   abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16));
   abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
   abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
@@ -78,11 +58,7 @@ float Findfp16Max() {
 }
 int ComputeFpgaConv(const struct SplitConvArgs &args) {
-<<<<<<< HEAD
-  ComputeBasicConv(args.conv_args[0]);
-=======
   ComputeBasicConv(args.conv_arg[0]);
->>>>>>> upstream/develop
 }
 int ComputeBasicConv(const struct ConvArgs &args) {
@@ -190,13 +166,8 @@ int PerformBypass(const struct BypassArgs &args) {
   return 0;
 #endif
-<<<<<<< HEAD
-  uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
-  uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
-=======
   uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
   uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
->>>>>>> upstream/develop
   uint64_t bp_enable;
   int64_t length;
   uint64_t pixels;
@@ -225,18 +196,6 @@ int PerformBypass(const struct BypassArgs &args) {
   }
   // start bypass
-<<<<<<< HEAD
-  reg_writeq(ifm_src_paddr, MUL8(27));
-  reg_writeq(ifm_dst_paddr, MUL8(28));
-  reg_writeq(0, MUL8(0));
-  reg_writeq(bp_enable, MUL8(0));
-  // poll
-  int ret = -1;
-  ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
-  if (ret != -1) {
-    // clear "irq"
-    reg_readq(MUL8(63));
-=======
   driver::reg_writeq(ifm_src_paddr, MUL8(27));
   driver::reg_writeq(ifm_dst_paddr, MUL8(28));
   driver::reg_writeq(0, MUL8(0));
@@ -247,7 +206,6 @@ int PerformBypass(const struct BypassArgs &args) {
   if (ret != -1) {
     // clear "irq"
     driver::reg_readq(MUL8(63));
->>>>>>> upstream/develop
   }
   // get max value
   if ((!args.input_data_type) && (!args.output_data_type)) {
...
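For orientation, the bypass path above follows a write-kick-poll-clear register protocol. The sketch below restates it against a stub register file; the register indices (27, 28, 0, 48, 63) come from the kept code, while everything else is illustrative and not runnable against real hardware.

#include <cstdint>

#define MUL8(x) ((x)*8)
#define BYPASS_DONE 1

// Stub register file standing in for the FPGA's MMIO window.
static uint64_t regs[64];
static uint64_t reg_readq(uint32_t off) { return regs[off / 8]; }
static void reg_writeq(uint64_t v, uint32_t off) { regs[off / 8] = v; }

int bypass_handshake(uint64_t src_paddr, uint64_t dst_paddr, uint64_t enable) {
  reg_writeq(src_paddr, MUL8(27));  // input feature map physical address
  reg_writeq(dst_paddr, MUL8(28));  // output physical address
  reg_writeq(0, MUL8(0));           // clear the command register
  reg_writeq(enable, MUL8(0));      // kick off the bypass
  regs[48] = BYPASS_DONE;           // stub: the hardware would set this bit
  while (reg_readq(MUL8(48)) != BYPASS_DONE) {  // real code bounds this poll
  }
  reg_readq(MUL8(63));              // read-to-clear the "irq" status
  return 0;
}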
@@ -12,12 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-<<<<<<< HEAD
-#include "fpga/V2/api.h"
-=======
 #include "fpga/V2/fpga_common.h"
->>>>>>> upstream/develop
 namespace paddle_mobile {
 namespace fpga {
...
@@ -94,10 +94,7 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
   convert_to_hwc(data_in, num, channel, height, width);
   align_filter(data_in, num, channel, height, width);
   int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
-<<<<<<< HEAD
-=======
   fpga_flush(*data_in, pixel_num * sizeof(float));
->>>>>>> upstream/develop
 }
 void convert_fc_filter(float **data_in, int num, int chw) {
@@ -117,11 +114,8 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
   int chw = channel * height * width;
   convert_fc_filter(data_in, num, chw);
   align_filter(data_in, num, channel, height, width);
-<<<<<<< HEAD
-=======
   int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
   fpga_flush(*data_in, pixel_num * sizeof(float));
->>>>>>> upstream/develop
 }
 float find_max(float *data_in, int data_size) {
...
@@ -58,10 +58,7 @@ void format_image(float **data_in, int channel, int height, int width,
                   int aligned_channel) {
   convert_to_hwc(data_in, channel, height, width);
   align_image(data_in, channel, height, width, aligned_channel);
-<<<<<<< HEAD
-=======
   fpga_flush(*data_in, aligned_channel * height * width * sizeof(float));
->>>>>>> upstream/develop
 }
 void concat_images(int16_t **images_in, float **scales_in, void *image_out,
@@ -73,11 +70,8 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
   scale_out[1] = 0.0;
   for (int i = 0; i < image_num; i++) {
     scale_out[0] = std::max(*scale_out, scales_in[i][0]);
-<<<<<<< HEAD
-=======
     fpga_invalidate(images_in[i],
                     height * width * aligned_channel_num[i] * sizeof(int16_t));
->>>>>>> upstream/develop
   }
   scale_out[1] = 1 / scale_out[0];
@@ -92,10 +86,7 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
       tmp_channel_sum += channel_num[i];
     }
   }
-<<<<<<< HEAD
-=======
   fpga_flush(image_out, hw * out_channel * sizeof(int16_t));
->>>>>>> upstream/develop
 }
 }  // namespace image
...
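The develop side consistently adds fpga_flush after CPU-side writes and fpga_invalidate before CPU-side reads. A sketch of that cache-coherence discipline (the stubs stand in for the real fpga_flush/fpga_invalidate, which issue the IOCTL_MEMCACHE_FLUSH / IOCTL_MEMCACHE_INVAL calls seen in driver.h above):

#include <cstddef>

// Stubs for illustration only; on hardware these are cache-maintenance ioctls.
static int fpga_flush(void *, size_t) { return 0; }
static int fpga_invalidate(void *, size_t) { return 0; }

void produce_for_fpga(void *buf, size_t len) {
  // ... CPU fills buf ...
  fpga_flush(buf, len);  // push dirty cache lines to DRAM before the FPGA reads
}

void consume_from_fpga(void *buf, size_t len) {
  fpga_invalidate(buf, len);  // drop stale cache lines before the CPU reads
  // ... CPU reads results the FPGA wrote behind the cache's back ...
}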
@@ -26,10 +26,7 @@ limitations under the License. */
 #include "framework/program/var_desc.h"
 #include "framework/scope.h"
 #include "framework/tensor.h"
-<<<<<<< HEAD:src/framework/executor.cpp
-=======
 #include "memory/t_malloc.h"
->>>>>>> upstream/develop:src/framework/executor.cpp
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
 #include <queue>
...
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "operators/math/gemm.h"
 namespace paddle_mobile {
-static std::mutex lc;
 template <typename Dtype, Precision P>
 void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
 #ifdef _OPENMP
@@ -203,10 +202,7 @@ void PaddleMobile<Dtype, P>::Predict_To(int end) {
 #endif
 #ifdef PADDLE_MOBILE_CL
-<<<<<<< HEAD
-=======
 static std::mutex lc;
->>>>>>> upstream/develop
 template <typename Dtype, Precision P>
 void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
   std::lock_guard<std::mutex> lock(lc);
@@ -214,8 +210,6 @@ void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
     framework::CLEngine::Instance()->setClPath(path);
   }
 }
-<<<<<<< HEAD
-=======
 template <>
 double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
   cl_int status;
@@ -418,7 +412,6 @@ int PaddleMobile<Dtype, P>::readText(
   return size + 1;
 }
->>>>>>> upstream/develop
 #endif
 template class PaddleMobile<CPU, Precision::FP32>;
...
@@ -81,11 +81,8 @@ class PaddleMobile {
 #ifdef PADDLE_MOBILE_CL
  public:
  void SetCLPath(std::string cl_path);
-<<<<<<< HEAD
-=======
  int readText(const char *kernelPath,
               char **pcode);  // read a text file into pcode; returns the string length
->>>>>>> upstream/develop
 #endif
  private:
...
@@ -61,5 +61,7 @@ REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp);
 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(fusion_conv_add, ops::FusionConvAddOp);
 #endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(fusion_conv_add, ops::FusionConvAddOp);
+#endif
 #endif
@@ -29,8 +29,9 @@ namespace operators {
 class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher {
  public:
  FusionConvAddReluOpMatcher() {
-    // node_ = framework::Node(G_OP_TYPE_FUSION_CONV_ADD);
-    // node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+    node_ = framework::Node(G_OP_TYPE_CONV);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
+        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
  }
  void FolderNodes(
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADD_OP
#include "operators/fusion_deconv_add_op.h"
namespace paddle_mobile {
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_deconv_add, ops::FusionDeconvAddMatcher);
#ifdef PADDLE_MOBILE_CPU
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_deconv_add, ops::FusionDeconvAddOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADD_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/deconv_add_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionDeconvAddMatcher : public framework::FusionOpMatcher {
public:
FusionDeconvAddMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD; }
};
template <typename DeviceType, typename T>
class FusionDeconvAddOp : public framework::OperatorWithKernel<
DeviceType, FusionDeconvAddParam<DeviceType>,
operators::DeconvAddKernel<DeviceType, T>> {
public:
FusionDeconvAddOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvAddParam<DeviceType>,
operators::DeconvAddKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
void InferShape() const {
auto input = this->param_.Input();
auto in_dims = input->dims();
auto filter = this->param_.Filter();
auto filter_dims = filter->dims();
std::vector<int> strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
std::vector<int> dilations = this->param_.Dilations();
int groups = this->param_.Groups();
PADDLE_MOBILE_ENFORCE(
in_dims.size() == 4 || in_dims.size() == 5,
"ConvTransposeOp intput should be 4-D or 5-D tensor.");
PADDLE_MOBILE_ENFORCE(
in_dims.size() == filter_dims.size(),
"ConvTransposeOp input dimension and filter dimension "
"should be the same.");
PADDLE_MOBILE_ENFORCE(
in_dims.size() - strides.size() == 2U,
"ConvTransposeOp input dimension and strides dimension should "
"be consistent.");
PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
"ConvTransposeOp paddings dimension and strides "
"dimension should be the same.");
PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
"ConvTransposeOp paddings dimension and dilations "
"dimension should be the same.");
PADDLE_MOBILE_ENFORCE(
in_dims[1] == filter_dims[0],
"In ConvTransposeOp, The number of input channels should "
"be equal to the number of filter's channels.");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
for (size_t i = 0; i < strides.size(); ++i) {
auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
2 * paddings[i] + filter_extent);
}
this->param_.Output()->Resize(framework::make_ddim(output_shape));
}
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif  // FUSION_DECONVADD_OP
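InferShape above applies the usual transposed-convolution output-size formula per spatial dimension: out = (in - 1) * stride - 2 * pad + dilation * (k - 1) + 1, where the last three terms are the filter_extent computed in the loop. A worked numeric check (values chosen for illustration):

#include <cstdio>

int main() {
  int in = 7, stride = 2, pad = 1, dilation = 1, k = 4;
  int filter_extent = dilation * (k - 1) + 1;            // 4
  int out = (in - 1) * stride - 2 * pad + filter_extent;  // 12 - 2 + 4
  printf("%d\n", out);  // 14: a stride-2, 4x4-kernel deconv doubles 7 -> 14
  return 0;
}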
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDRELU_OP
#include "operators/fusion_deconv_add_relu_op.h"
namespace paddle_mobile {
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_deconv_add_relu, ops::FusionDeconvAddReluMatcher);
#ifdef PADDLE_MOBILE_CPU
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_deconv_add_relu, ops::FusionDeconvAddReluOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/deconv_add_relu_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionDeconvAddReluMatcher : public framework::FusionOpMatcher {
public:
FusionDeconvAddReluMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_RELU; }
};
template <typename DeviceType, typename T>
class FusionDeconvAddReluOp : public framework::OperatorWithKernel<
DeviceType, FusionDeconvAddReluParam<DeviceType>,
operators::DeconvAddReluKernel<DeviceType, T>> {
public:
FusionDeconvAddReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvAddReluParam<DeviceType>,
operators::DeconvAddReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
void InferShape() const {
auto input = this->param_.Input();
auto in_dims = input->dims();
auto filter = this->param_.Filter();
auto filter_dims = filter->dims();
std::vector<int> strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
std::vector<int> dilations = this->param_.Dilations();
int groups = this->param_.Groups();
PADDLE_MOBILE_ENFORCE(
in_dims.size() == 4 || in_dims.size() == 5,
"ConvTransposeOp intput should be 4-D or 5-D tensor.");
PADDLE_MOBILE_ENFORCE(
in_dims.size() == filter_dims.size(),
"ConvTransposeOp input dimension and filter dimension "
"should be the same.");
PADDLE_MOBILE_ENFORCE(
in_dims.size() - strides.size() == 2U,
"ConvTransposeOp input dimension and strides dimension should "
"be consistent.");
PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
"ConvTransposeOp paddings dimension and strides "
"dimension should be the same.");
PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
"ConvTransposeOp paddings dimension and dilations "
"dimension should be the same.");
PADDLE_MOBILE_ENFORCE(
in_dims[1] == filter_dims[0],
"In ConvTransposeOp, The number of input channels should "
"be equal to the number of filter's channels.");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
for (size_t i = 0; i < strides.size(); ++i) {
auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
2 * paddings[i] + filter_extent);
}
this->param_.Output()->Resize(framework::make_ddim(output_shape));
}
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif  // FUSION_DECONVADDRELU_OP
@@ -8,10 +8,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-<<<<<<< HEAD
-#ifdef FUSION_CONVADD_OP
-=======
->>>>>>> upstream/develop
 #include "operators/kernel/fetch_kernel.h"
 namespace paddle_mobile {
 namespace operators {
@@ -26,7 +22,3 @@ void FetchKernel<CPU, float>::Compute(const FetchParam<CPU> &param) {
 template class FetchKernel<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
-<<<<<<< HEAD
-#endif
-=======
->>>>>>> upstream/develop
@@ -13,20 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-<<<<<<< HEAD
-__kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w)
-=======
 __kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w,int c)
->>>>>>> upstream/develop
 {
   int i = get_global_id(0);
   int j = get_global_id(1);
   half4 pixel;
   pixel.x = convert_half(in[(i * w + j)]);
-<<<<<<< HEAD
-  pixel.y = convert_half(in[h * w + (i * w + j)]);
-  pixel.z = convert_half(in[2 * h * w + (i * w + j)]);
-=======
   if(c>=2){
     pixel.y = convert_half(in[h * w + (i * w + j)]);
   }else{
@@ -37,7 +29,6 @@ __kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,
   }else{
     pixel.z = 0.0;
   }
->>>>>>> upstream/develop
   pixel.w = 0.0;
   int2 coords;
   coords.x = j;
...
@@ -36,10 +36,7 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
   const float *input_data = input->data<float>();
   int numel = input->numel();
   cl_mem cl_image = output->GetCLImage();
-<<<<<<< HEAD
-=======
   int c = input->dims()[1];
->>>>>>> upstream/develop
   int height = output->dims()[2];
   int width = output->dims()[3];
   CLTensor input_cl_tensor(this->cl_helper_.CLContext(),
@@ -55,11 +52,8 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
   CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 3, sizeof(cl_int), &height);
   CL_CHECK_ERRORS(status);
-<<<<<<< HEAD
-=======
   status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c);
   CL_CHECK_ERRORS(status);
->>>>>>> upstream/develop
   size_t global_work_size[2] = {width, height};
...
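The extra `c` kernel argument exists so the feed kernel stops reading channel planes that a 1- or 2-channel CHW input does not have. A host-side sketch of the guarded indexing (illustrative, not part of the commit):

#include <cstdio>
#include <vector>

// Plane k of a CHW buffer exists only when k < c; otherwise zero-fill,
// exactly the guard the updated OpenCL kernel applies to pixel.y / pixel.z.
static float read_plane(const std::vector<float> &in, int c, int h, int w,
                        int k, int i, int j) {
  return (k < c) ? in[k * h * w + i * w + j] : 0.0f;
}

int main() {
  std::vector<float> gray(1 * 2 * 2, 0.5f);  // c = 1: only plane 0 exists
  printf("%.1f %.1f\n", read_plane(gray, 1, 2, 2, 0, 0, 0),
         read_plane(gray, 1, 2, 2, 2, 0, 0));  // 0.5 0.0
  return 0;
}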
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADD_OP
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class DeconvAddKernel
: public OpKernelBase<DeviceType, FusionDeconvAddParam<DeviceType>> {
public:
void Compute(const FusionDeconvAddParam<DeviceType> &param);
bool Init(FusionDeconvAddParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDRELU_OP
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class DeconvAddReluKernel
: public OpKernelBase<DeviceType, FusionDeconvAddReluParam<DeviceType>> {
public:
void Compute(const FusionDeconvAddReluParam<DeviceType> &param);
bool Init(FusionDeconvAddReluParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/conv_add_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
bool relu_enabled = false;
auto input = const_cast<Tensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1;
bs_ptr[i] = bias_ptr[i];
}
fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
template <>
void ConvAddKernel<FPGA, float>::Compute(
const FusionConvAddParam<FPGA> &param) {
fpga::ComputeFpgaConv(param.FpgaArgs());
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADD_OP
#include "operators/kernel/deconv_add_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
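// Init()/Compute() below are stubs: this commit wires the fused deconv+add
// op up on FPGA, with the device-side implementation presumably to follow.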
template <>
bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
return true;
}
template <>
void DeconvAddKernel<FPGA, float>::Compute(
const FusionDeconvAddParam<FPGA> &param) {}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDRELU_OP
#include "operators/kernel/deconv_add_relu_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
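// Stub registration mirroring DeconvAddKernel; the FPGA implementation of
// the fused deconv+add+relu is presumably filled in later.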
template <>
bool DeconvAddReluKernel<FPGA, float>::Init(
    FusionDeconvAddReluParam<FPGA> *param) {
return true;
}
template <>
void DeconvAddReluKernel<FPGA, float>::Compute(
const FusionDeconvAddReluParam<FPGA> &param) {}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -24,10 +24,7 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) { ...@@ -24,10 +24,7 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
} }
template <> template <>
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {} void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {}
<<<<<<< HEAD
=======
>>>>>>> upstream/develop
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif
...@@ -49,16 +49,12 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) { ...@@ -49,16 +49,12 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
Tensor *out = param.Out(); Tensor *out = param.Out();
fpga::PerformBypass(param.FpgaArgs()); fpga::PerformBypass(param.FpgaArgs());
<<<<<<< HEAD
math::SoftmaxFuntor<CPU, float>()(in_x, out);
=======
fpga::fpga_invalidate( fpga::fpga_invalidate(
(void *)in_x->data<float>(), // NOLINT (void *)in_x->data<float>(), // NOLINT
fpga::get_aligned_channel_num((int)in_x->dims()[1]) * // NOLINT fpga::get_aligned_channel_num((int)in_x->dims()[1]) * // NOLINT
sizeof(float)); sizeof(float));
math::SoftmaxFuntor<CPU, float>()(in_x, out); math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size()); fpga::fpga_flush(out->data<float>(), out->memory_size());
>>>>>>> upstream/develop
} }
} // namespace operators } // namespace operators
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SPLIT_OP
#include "operators/kernel/split_kernel.h"
namespace paddle_mobile {
namespace operators {
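// Stub: registers split on FPGA so that models containing the op can be
// loaded; no device-side compute is performed yet.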
template <>
bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA>* param) {
return true;
}
template <>
void SplitKernel<FPGA, float>::Compute(const SplitParam<FPGA>& param) {}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE2_OP
#include "operators/kernel/transpose2_kernel.h"
#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
namespace paddle_mobile {
namespace operators {
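// Stub: transpose2 is registered for FPGA without a device-side
// implementation; the CPU fallback is left commented out below.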
template <>
bool Transpose2Kernel<FPGA, float>::Init(Transpose2Param<FPGA> *param) {
return true;
}
template <>
void Transpose2Kernel<FPGA, float>::Compute(
    const Transpose2Param<FPGA> &param) {
  // Transpose2Compute<float>(param);
}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -1532,8 +1532,6 @@ class ReluParam<GPU_CL> : public ReluParamBase<GPU_CL> { ...@@ -1532,8 +1532,6 @@ class ReluParam<GPU_CL> : public ReluParamBase<GPU_CL> {
}; };
#endif #endif
<<<<<<< HEAD
=======
#endif #endif
#ifdef TANH_OP #ifdef TANH_OP
...@@ -1555,7 +1553,6 @@ class TanhParam : public OpParam { ...@@ -1555,7 +1553,6 @@ class TanhParam : public OpParam {
RType *input_x_; RType *input_x_;
RType *out_; RType *out_;
}; };
>>>>>>> upstream/develop
#endif #endif
#ifdef PRELU_OP #ifdef PRELU_OP
...@@ -2224,7 +2221,10 @@ class ConvTransposeParam : public OpParam { ...@@ -2224,7 +2221,10 @@ class ConvTransposeParam : public OpParam {
const Scope &scope) { const Scope &scope) {
filter_ = FilterFrom<GType>(inputs, scope); filter_ = FilterFrom<GType>(inputs, scope);
input_ = InputFrom<GType>(inputs, scope); input_ = InputFrom<GType>(inputs, scope);
output_ = OutputFrom<GType>(outputs, scope); //output_ = OutputFrom<GType>(outputs, scope);
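// Fusion subclasses such as FusionDeconvAddParam expose "Out" rather than
// "Output", so the base constructor only reads "Output" when it exists.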
if (outputs.count("Output")) {
output_ = OpParam::OutputFrom<GType>(outputs, scope);
}
strides_ = GetAttr<vector<int>>("strides", attrs); strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs); paddings_ = GetAttr<vector<int>>("paddings", attrs);
dilations_ = GetAttr<vector<int>>("dilations", attrs); dilations_ = GetAttr<vector<int>>("dilations", attrs);
...@@ -2265,6 +2265,37 @@ class ConvTransposeParam : public OpParam { ...@@ -2265,6 +2265,37 @@ class ConvTransposeParam : public OpParam {
#endif #endif
}; };
#endif #endif
#ifdef FUSION_DECONVADD_OP
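// Parameters for the fused deconv + elementwise-add op: reuses the deconv
// attributes from ConvTransposeParam and adds the add's inputs, i.e. the
// bias tensor "Y", the broadcast axis, and the fused output read from "Out".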
template <typename Dtype>
class FusionDeconvAddParam : public ConvTransposeParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionDeconvAddParam(const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
const Scope &scope)
      : ConvTransposeParam<Dtype>(inputs, outputs, attrs, scope) {
bias_ = OpParam::InputYFrom<GType>(inputs, scope);
axis_ = OpParam::GetAttr<int>("axis", attrs);
output_ = OpParam::OutFrom<GType>(outputs, scope);
}
RType *Bias() const { return bias_; }
const int &Axis() const { return axis_; }
RType *Output() const { return output_; }
protected:
RType *bias_;
int axis_;
RType *output_;
};
#endif
#ifdef FUSION_DECONVADDRELU_OP
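// The relu-fused variant takes the same inputs and attributes, so its param
// type is just an alias; the ReLU itself belongs to the kernel.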
template <typename Dtype>
using FusionDeconvAddReluParam = FusionDeconvAddParam<Dtype>;
#endif
#ifdef FUSION_DECONVRELU_OP #ifdef FUSION_DECONVRELU_OP
template <typename Dtype> template <typename Dtype>
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef SPLIT_OP #ifdef SPLIT_OP
#include <vector>
#include "operators/split_op.h" #include "operators/split_op.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -83,5 +83,8 @@ namespace ops = paddle_mobile::operators; ...@@ -83,5 +83,8 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(split, ops::SplitOp); REGISTER_OPERATOR_CPU(split, ops::SplitOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(split, ops::SplitOp);
#endif
#endif // SPLIT_OP #endif // SPLIT_OP
...@@ -29,7 +29,7 @@ void TanhOp<DeviceType, T>::InferShape() const { ...@@ -29,7 +29,7 @@ void TanhOp<DeviceType, T>::InferShape() const {
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
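// The registered name must match the op type string in the model ("tanh");
// the capitalized "Tanh" would presumably never be matched during lookup.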
REGISTER_OPERATOR_FPGA(Tanh, ops::TanhOp); REGISTER_OPERATOR_FPGA(tanh, ops::TanhOp);
#endif #endif
#endif #endif
...@@ -60,5 +60,8 @@ namespace ops = paddle_mobile::operators; ...@@ -60,5 +60,8 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(transpose2, ops::Transpose2Op); REGISTER_OPERATOR_CPU(transpose2, ops::Transpose2Op);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(transpose2, ops::Transpose2Op);
#endif
#endif // TRANSPOSE_OP #endif // TRANSPOSE_OP
...@@ -66,15 +66,12 @@ list(FIND NET "FPGA_NET_V1" CON) ...@@ -66,15 +66,12 @@ list(FIND NET "FPGA_NET_V1" CON)
if (CON GREATER -1) if (CON GREATER -1)
ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet50 paddle-mobile) target_link_libraries(test-resnet50 paddle-mobile)
ADD_EXECUTABLE(test-densebox net/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-densebox paddle-mobile)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif () endif ()
<<<<<<< HEAD
list(FIND NET "FPGA_NET_V2" CON)
if (CON GREATER -1)
ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet50 paddle-mobile)
=======
list(FIND NET "FPGA_NET_V2" CON) list(FIND NET "FPGA_NET_V2" CON)
if (CON GREATER -1) if (CON GREATER -1)
...@@ -83,7 +80,10 @@ if (CON GREATER -1) ...@@ -83,7 +80,10 @@ if (CON GREATER -1)
ADD_EXECUTABLE(test-pe fpga/test_pe.cpp) ADD_EXECUTABLE(test-pe fpga/test_pe.cpp)
target_link_libraries(test-pe paddle-mobile) target_link_libraries(test-pe paddle-mobile)
>>>>>>> upstream/develop
ADD_EXECUTABLE(test-densebox net/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-densebox paddle-mobile)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif () endif ()
......
...@@ -13,19 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,19 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <iostream> #include <iostream>
<<<<<<< HEAD
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile;
// paddle_mobile.SetThreadNum(4);
auto time1 = paddle_mobile::time();
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
=======
#include <thread> #include <thread>
#include "../../src/common/types.h" #include "../../src/common/types.h"
#include "../../src/io/paddle_test_inference_api.h" #include "../../src/io/paddle_test_inference_api.h"
...@@ -99,7 +86,6 @@ void t2() { ...@@ -99,7 +86,6 @@ void t2() {
std::string(g_yolo_mul) + "/params", true); std::string(g_yolo_mul) + "/params", true);
// auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true); // auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
>>>>>>> upstream/develop
if (isok) { if (isok) {
auto time2 = paddle_mobile::time(); auto time2 = paddle_mobile::time();
std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
...@@ -138,8 +124,6 @@ void t2() { ...@@ -138,8 +124,6 @@ void t2() {
// std::cout << i << std::endl; // std::cout << i << std::endl;
// } // }
} }
<<<<<<< HEAD
=======
} }
void t3() { void t3() {
...@@ -201,6 +185,5 @@ int main() { ...@@ -201,6 +185,5 @@ int main() {
// th2.join(); // th2.join();
// th3.join(); // th3.join();
th1.join(); th1.join();
>>>>>>> upstream/develop
return 0; return 0;
} }