Commit 3f1e8f7d authored by Chon, committed by Houjiang Chen

FPGA refactor #1560 (#1561)

* FPGA refactor

* disable FPGAKD

* recover misdeleted code segment
Parent 54f13158
@@ -14,6 +14,7 @@ option(FPGA "build with FPGA support" OFF)
if(FPGA)
option(FPGAV1 "build with fpga v1 support" ON)
option(FPGAV2 "build with fpga v2 support" OFF)
option(FPGAKD "build with fpga KD support" OFF)
endif()
project(paddle-mobile)
@@ -97,8 +98,7 @@ else()
endif()
if(FPGA)
add_definitions(-DPADDLE_MOBILE_FPGA)
file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/kernel/fpga/*.cc)
file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/math/*.cc src/operators/kernel/fpga/*.cc)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
@@ -110,6 +110,7 @@ if(FPGA)
list(APPEND PADDLE_MOBILE_h src/operators/math/softmax.h)
list(APPEND PADDLE_MOBILE_h src/operators/math/math_func_neon.h)
if(FPGAV1)
add_definitions(-DPADDLE_MOBILE_FPGA)
message("FPGA_V1 enabled")
add_definitions(-DPADDLE_MOBILE_FPGA_V1)
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.cpp src/fpga/V2/*.cpp)
@@ -120,8 +121,18 @@ if(FPGA)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/KD/*.cpp src/fpga/KD/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/KD/*.h src/operators/kernel/fpga/KD/*.hpp
src/fpga/KD/*.h src/fpga/KD/*.hpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if(FPGAV2)
add_definitions(-DPADDLE_MOBILE_FPGA)
message("FPGA_V2 enabled")
add_definitions(-DPADDLE_MOBILE_FPGA_V2)
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.cpp src/fpga/V1/*.cpp)
@@ -132,8 +143,46 @@ if(FPGA)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/KD/*.cpp src/fpga/KD/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/KD/*.h src/operators/kernel/fpga/KD/*.hpp
src/fpga/KD/*.h src/fpga/KD/*.hpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if(FPGAKD)
message("FPGAKD enabled")
add_definitions(-DPADDLE_MOBILE_FPGA_KD)
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.cpp src/fpga/V1/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.cpp src/fpga/V2/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/central-arm-func/*.h)
foreach(f ${_tmp_list})
list(APPEND PADDLE_MOBILE_H ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/central-arm-func/*.cpp)
foreach(f ${_tmp_list})
list(APPEND PADDLE_MOBILE_CC ${f})
endforeach()
endif()
else()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
foreach(f ${_tmp_list})
@@ -183,6 +232,8 @@ if(FPGAV1)
set(NET "FPGA_NET_V1" CACHE STRING "select net type")
elseif(FPGAV2)
set(NET "FPGA_NET_V2" CACHE STRING "select net type")
elseif(FPGAKD)
set(NET "FPGA_OPS_KD" CACHE STRING "select net type")
else()
set(NET "default" CACHE STRING "select net type")
endif()
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef alignment_h
#define alignment_h
#include <stdio.h>
#include "llapi/zynqmp_api.h"
namespace paddle_mobile {
namespace zynqmp {
inline int align_image(int wc) { return align_to_x(wc, IMAGE_ALIGNMENT); }
} // namespace zynqmp
} // namespace paddle_mobile
#endif /* alignment_h */
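
A quick sanity check of the helper above: align_to_x(num, x) (from llapi/zynqmp_api.h) rounds num up to the next multiple of x, so align_image pads a width*channel row to the 16-element image alignment. A minimal standalone sketch; kImageAlignment and the asserts are illustrative, not part of the source:

// Standalone sketch of the round-up arithmetic used throughout the KD backend.
#include <cassert>

constexpr int kImageAlignment = 16;  // stands in for IMAGE_ALIGNMENT in llapi/zynqmp_api.h

constexpr int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
constexpr int align_image(int wc) { return align_to_x(wc, kImageAlignment); }

int main() {
  assert(align_image(16) == 16);  // already aligned
  assert(align_image(17) == 32);  // rounds up to the next multiple of 16
  assert(align_image(90) == 96);  // e.g. a 30-pixel-wide RGB row
  return 0;
}
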
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef Context_hpp
#define Context_hpp
#include <stdio.h>
#include "pe.hpp"
#include "pes/conv_pe.hpp"
#include "pes/depthwise_conv_pe.hpp"
#include "pes/fully_connected_pe.hpp"
#include "pes/input_pe.hpp"
#include "pes/output_pe.hpp"
#include "pes/pooling_pe.hpp"
#include "pes/softmax_pe.hpp"
namespace paddle_mobile {
namespace zynqmp {
class Context {
public:
template <typename Ptype>
Ptype& pe() {
if (pe_ == nullptr) {
pe_ = new Ptype();
}
return static_cast<Ptype&>(*pe_);
}
~Context() {
if (pe_ != nullptr) {
delete pe_;
}
}
private:
PE* pe_ = nullptr;
};
} // namespace zynqmp
} // namespace paddle_mobile
#endif /* Context_hpp */
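
Context lazily news a single PE on first request and returns it by reference afterwards. Note that the cast in pe<Ptype>() trusts the caller: asking one Context for two different PE types would static_cast the same object to the wrong type, so the intended pattern is one Context per kernel. A hedged usage sketch (the function name is illustrative):

// Sketch: one Context owns exactly one processing element.
using namespace paddle_mobile::zynqmp;

void context_example() {
  Context context;
  ConvPE& pe = context.pe<ConvPE>();    // first call constructs the ConvPE
  ConvPE& same = context.pe<ConvPE>();  // later calls return the same object (&pe == &same)
}  // the PE is deleted in ~Context()
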
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "dl_engine.hpp"
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdio.h>
namespace paddle_mobile {
namespace zynqmp {
class DLEngine {
public:
static DLEngine& get_instance() {
static DLEngine s_instance;
return s_instance;
}
private:
DLEngine();
};
} // namespace zynqmp
} // namespace paddle_mobile
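
DLEngine is a Meyers singleton; its private constructor (defined in dl_engine.cpp above) is presumably where one-time device bring-up lives. Access is simply:

// Constructed on first use; thread-safe under C++11 static-local rules.
paddle_mobile::zynqmp::DLEngine& engine =
    paddle_mobile::zynqmp::DLEngine::get_instance();
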
This diff is collapsed.
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "fpga/KD/alignment.h"
namespace paddle_mobile {
namespace zynqmp {
enum LayoutType {
N,
NC,
NCHW,
NHWC,
NHW,
};
class Layout {
public:
virtual int numIndex() = 0;
virtual int channelIndex() { return -1; }
virtual int heightIndex() { return -1; }
virtual int widthIndex() { return -1; }
virtual int alignedElementCount(const std::vector<int>& dims) = 0;
virtual int elementCount(const std::vector<int>& dims) = 0;
};
struct NCHW : Layout {
int numIndex() { return 0; }
int channelIndex() { return 1; }
int heightIndex() { return 2; }
int widthIndex() { return 3; }
int alignedElementCount(const std::vector<int>& dims) {
return dims[0] * dims[2] * align_image(dims[1] * dims[3]);
}
virtual int elementCount(const std::vector<int>& dims) {
return dims[0] * dims[1] * dims[2] * dims[3];
}
};
struct NHWC : Layout {
int numIndex() { return 0; }
int heightIndex() { return 1; }
int widthIndex() { return 2; }
int channelIndex() { return 3; }
int alignedElementCount(const std::vector<int>& dims) {
return dims[0] * dims[1] * align_image(dims[2] * dims[3]);
}
virtual int elementCount(const std::vector<int>& dims) {
return dims[0] * dims[1] * dims[2] * dims[3];
}
};
struct NC : Layout {
int numIndex() { return 0; }
int channelIndex() { return 1; }
int alignedElementCount(const std::vector<int>& dims) {
return dims[0] * dims[1];
}
virtual int elementCount(const std::vector<int>& dims) {
return dims[0] * dims[1];
}
};
struct N : Layout {
int numIndex() { return 0; }
int alignedElementCount(const std::vector<int>& dims) { return dims[0]; }
virtual int elementCount(const std::vector<int>& dims) { return dims[0]; }
};
struct NHW : Layout {
int numIndex() { return 0; }
int heightIndex() { return 1; }
int widthIndex() { return 2; }
int alignedElementCount(const std::vector<int>& dims) {
// TODO(chonwhite) align it;
return dims[0] * dims[1] * dims[2];
}
virtual int elementCount(const std::vector<int>& dims) {
return dims[0] * dims[1] * dims[2];
}
};
} // namespace zynqmp
} // namespace paddle_mobile
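
The dims vector is ordered as the layout name, so for NHWC the per-row span that gets padded is W*C. A worked sketch (function name illustrative; note the elaborated struct form, since enum LayoutType declares the same identifiers):

// Worked example: raw vs. aligned element counts for a 1x16x30x3 NHWC tensor.
void layout_example() {
  std::vector<int> dims = {1, 16, 30, 3};  // N=1, H=16, W=30, C=3
  struct paddle_mobile::zynqmp::NHWC nhwc;
  int raw = nhwc.elementCount(dims);             // 1*16*30*3 = 1440
  int aligned = nhwc.alignedElementCount(dims);  // 1*16*align_image(30*3) = 16*96 = 1536
}
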
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory.h>
#include "fpga/KD/llapi/bias_scale.h"
#include "fpga/KD/llapi/zynqmp_api.h"
namespace paddle_mobile {
namespace zynqmp {
namespace bias_scale {
void align_element(float **data_in, int num_per_div_before_alignment, int num) {
int copynum = 0;
float *ptr_unaligned = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
int num_element =
2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned =
(float *)fpga_malloc(num_element * sizeof(float)); // NOLINT
memset(ptr_aligned, 0, num_element * sizeof(float));
for (int i = 0; i < div_num; i++) {
if (i == div_num - 1) {
copynum = (num_per_div_after_alignment * div_num > num)
? (num % num_per_div_after_alignment)
: (num_per_div_before_alignment);
} else {
copynum = num_per_div_before_alignment;
}
memcpy(ptr_aligned + i * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i,
copynum * sizeof(float));
memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i + num,
copynum * sizeof(float));
}
fpga_free(ptr_unaligned);
*data_in = ptr_aligned;
}
void interleave(float **data_in, int num_after_alignment) {
float *ptr_uninterleaved = *data_in;
float *ptr_interleaved =
(float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT
int num = num_after_alignment / 4;
for (int i = 0; i < num; i++) {
memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i,
4 * sizeof(float));
memcpy(ptr_interleaved + 8 * i + 4,
ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float));
}
fpga_free(ptr_uninterleaved);
*data_in = ptr_interleaved;
}
void format_bias_scale_array(float **bias_scale_array,
int element_num_per_division, int num) {
align_element(bias_scale_array, element_num_per_division, num);
int div_num = (num + element_num_per_division - 1) / element_num_per_division;
int element_num_after_division =
align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
interleave(bias_scale_array, div_num * element_num_after_division);
fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float));
}
void format_bias_array(float **bias_array, int num) {
float *ptr_unaligned = *bias_array;
int num_before_align = num;
int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT);
int16_t *ptr_aligned =
(int16_t *)fpga_malloc(num_after_align * sizeof(int16_t)); // NOLINT
memset(ptr_aligned, 0, num_after_align * sizeof(int16_t));
for (int i = 0; i < num_before_align; i++) {
ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]);  // convert each bias value to fp16
}
*bias_array = (float *)ptr_aligned; // NOLINT
fpga_free(ptr_unaligned);
}
} // namespace bias_scale
} // namespace zynqmp
} // namespace paddle_mobile
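
Concrete numbers make the bookkeeping above easier to follow. With num = 20 filters in a single division and BS_NUM_ALIGNMENT = 8, a hedged walk-through of format_bias_scale_array (arithmetic only; the real functions also move data through fpga_malloc):

using namespace paddle_mobile::zynqmp;

void bias_scale_numbers() {
  int num = 20;
  int num_per_div_before_alignment = 20;
  int div_num =
      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;  // 1
  int num_per_div_after_alignment =
      align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);  // 24
  int num_element = 2 * div_num * num_per_div_after_alignment;  // 48 floats: biases, then scales
  // align_element zero-pads each block from 20 to 24 entries; interleave then emits
  // 4 bias values followed by 4 scale values, repeating, so hardware fetches both per group.
}
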
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle_mobile {
namespace zynqmp {
namespace bias_scale {
void align_element(float** data_in, int num_per_div_before_alignment, int num);
void interleave(float** data_in, int num_after_alignment);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
void format_bias_array(float** bias_array, int num);
} // namespace bias_scale
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define PADDLE_MOBILE_ZU5
#define FPGA_PRINT_MODE
#define PADDLE_MOBILE_PROFILE
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/KD/llapi/filter.h"
#include <memory.h>
#include <algorithm>
#include "fpga/KD/float16.hpp"
#include "fpga/KD/llapi/zynqmp_api.h"
namespace paddle_mobile {
namespace zynqmp {
namespace filter {
int calc_division_capacity(int chw) {
int n = 2048 / ((chw + 15) / 16) * 32;
return n < 2048 ? n : 2048;
}
int calc_split_num(int num, int division_capacity) {
return (num + division_capacity - 1) / division_capacity;
}
int calc_division_number(int num, int group_num, int division_capacity) {
int split_num = calc_split_num(num, division_capacity);
// PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
// "Split number or group number should be 1");
return group_num * split_num;
}
int calc_num_per_div(int num, int group_num, int division_capacity) {
if (group_num == 1) {
if (num > division_capacity) {
return division_capacity;
} else {
return num;
}
} else {
return (num + group_num - 1) / group_num;
}
}
void convert_to_hwc(char **data_in, int num, int channel, int height,
int width) {
char *tmp = *data_in;
int chw = channel * height * width;
char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT
for (int n = 0; n < num; n++) {
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + n * chw + offset_height + w * channel + c) =
*((*data_in)++);
}
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
float find_max(float *data_in, int data_size) {
float max = 0.0;
for (int i = 0; i < data_size; ++i) {
float value = data_in[i];
float abs = value > 0 ? value : -value;
max = std::max(max, abs);
}
return max;
}
signed char float_to_int8(float fdata) {
if (fdata < 0.0) {
fdata -= 0.5;
} else {
fdata += 0.5;
}
return (signed char)fdata;
}
void quantize(float **data_in, int data_size, float max) {
float *tmp = *data_in;
float fix_range = 127;
float scale = fix_range / max;
signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
for (int i = 0; i < data_size; i++) {
tmp_data[i] = float_to_int8(
(*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale);
}
*data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp);
}
void align_element(char **data_in, int num, int chw) {
int j = 0;
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
if (align_chw != chw) {
char *tmp = *data_in;
char *data_tmp =
(char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT
memset(data_tmp, 0, num * align_chw);
for (j = 0; j < num; j++) {
memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw);
}
*data_in = data_tmp;
fpga_free(tmp);
}
}
void align_num(char **data_in, int num_per_div_before_alignment, int num,
int chw) {
int i = 0;
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
char *tmp = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * align_chw;
char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(char));
for (i = 0; i < div_num - 1; i++) {
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
num_per_div_before_alignment * align_chw);
}
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
(num - (div_num - 1) * num_per_div_before_alignment) * align_chw);
*data_in = data_tmp;
fpga_free(tmp);
}
void reorder(char **data_in, int num_after_alignment, int chw) {
int index = 0;
int new_index = 0;
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
char *tmp = *data_in;
for (index = 0; index < num_after_alignment; index++) {
new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) +
(index / 16 % 2 * 4);
memcpy(data_tmp + index * chw_align, *data_in + new_index * chw_align,
chw_align);
}
*data_in = data_tmp;
fpga_free(tmp);
}
size_t interleave(char **data_in, int num_after_alignment, int chw) {
int i = 0;
int j = 0;
int k = 0;
int interleave_per_num = 16;
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
std::cout << "interleave size:" << chw_align * num_after_alignment
<< std::endl;
char *tmp = *data_in;
int interleave_num = chw_align * 2 / interleave_per_num;
for (i = 0; i < num_after_alignment; i += 2) {
for (j = 0, k = 0; j < interleave_num; j += 2, k++) {
memcpy(data_tmp + i * chw_align + interleave_per_num * j,
*data_in + i * chw_align + interleave_per_num * k,
interleave_per_num);
memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1),
*data_in + (i + 1) * chw_align + interleave_per_num * k,
interleave_per_num);
}
}
*data_in = data_tmp;
fpga_free(tmp);
return chw_align * num_after_alignment;
}
size_t format_filter(float **data_in, int num, int channel, int height,
int width, int group_num, float max) {
int data_size = channel * height * width * num;
int chw = channel * height * width;
int division_capacity = calc_division_capacity(chw);
int num_per_div_before_alignment =
calc_num_per_div(num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw);
if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw);
}
reorder(quantize_data, num_after_alignment, chw);
size_t mem_size = interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
num_after_alignment * sizeof(char));
return mem_size;
}
void convert_fc_filter(char **data_in, int num, int chw) {
char *tmp = *data_in;
char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT
for (int n = 0; n < num; n++) {
for (int c = 0; c < chw; c++) {
data_tmp[n * chw + c] = (*data_in)[num * c + n];
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void format_fc_filter(float **data_in, int num, int channel, int height,
int width, int group_num, float max) {
int data_size = channel * height * width * num;
int chw = channel * height * width;
int division_capacity = calc_division_capacity(chw);
int num_per_div_before_alignment =
calc_num_per_div(num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_fc_filter(quantize_data, num, chw);
align_element(quantize_data, num, chw);
if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw);
}
reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
num_after_alignment * sizeof(char));
}
void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
int16_t *tmp = *data_in;
int16_t *data_tmp =
(int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
*(data_tmp + h * width * num + w * num + n) = *((*data_in)++);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void align_element_n(int16_t **data_in, int num, int height, int width) {
int unalign_n = num;
int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT);
if (unalign_n == align_n) {
return;
} else {
int16_t *tmp = *data_in;
int num_element = height * width * align_n;
int16_t *data_tmp =
(int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(int16_t));
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int offset_unalign = h * width * unalign_n + w * unalign_n;
int offset_align = h * width * align_n + w * align_n;
for (int n = 0; n < unalign_n; n++) {
data_tmp[offset_align + n] = *((*data_in) + offset_unalign + n);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);  // the buffer came from fpga_malloc, so pair it with fpga_free
}
}
void quantize_to_fp16(float **data_in, int num, int height, int width,
float *scale_ptr) {
float *tmp = *data_in;
int size = num * height * width;
float16 *tmp_data = (float16 *)fpga_malloc(size * sizeof(float16)); // NOLINT
for (int n = 0; n < num; n++) {
float scale_val = scale_ptr[n];
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int index = n * height * width + h * width + w;
float value = tmp[index] * scale_val;
tmp_data[index] = float_to_half(value);
}
}
}
fpga_flush(tmp_data, size * sizeof(int16_t));
*data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp);
}
void format_dwconv_filter(float **data_in, int num, int height, int width,
float *scale_ptr) {
quantize_to_fp16(data_in, num, height, width, scale_ptr);
int16_t **quantize_data = (int16_t **)data_in; // NOLINT
convert_to_hwn(quantize_data, num, height, width);
align_element_n(quantize_data, num, height, width);
fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
height * width * sizeof(int16_t));
}
} // namespace filter
} // namespace zynqmp
} // namespace paddle_mobile
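
The int8 path in quantize()/format_filter() is symmetric quantization: scale = 127/max, with float_to_int8 rounding half away from zero. A self-contained sketch of just that step (to_int8 mirrors float_to_int8 above; the asserts are illustrative):

// Sketch of the symmetric int8 quantization used for filters.
#include <cassert>

signed char to_int8(float fdata) {
  if (fdata < 0.0f) fdata -= 0.5f; else fdata += 0.5f;  // round half away from zero
  return static_cast<signed char>(fdata);
}

int main() {
  float max = 2.0f;            // from find_max() over the filter weights
  float scale = 127.0f / max;  // fix_range / max
  assert(to_int8(2.0f * scale) == 127);    // the extreme values map to +/-127
  assert(to_int8(-2.0f * scale) == -127);
  assert(to_int8(0.001f) == 0);
  return 0;
}
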
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
#include <cstdlib>
#include <cwchar>
namespace paddle_mobile {
namespace zynqmp {
namespace filter {
int calc_division_capacity(int chw);
int calc_split_num(int num, int division_capacity);
int calc_division_number(int num, int group_num, int division_capacity);
int calc_num_per_div(int num, int group_num, int division_capacity);
void convert_to_hwc(char** data_in, int num, int channel, int height,
int width);
float find_max(float* data_in, int data_size);
void quantize(float** data_in, int data_size, float max);
void align_element(char** data_in, int num, int chw);
void align_num(char** data_in, int num_per_div_before_alignment, int num,
int chw);
void reorder(char** data_in, int num_after_alignment, int chw);
size_t interleave(char** data_in, int num_after_alignment, int chw);
size_t format_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max);
void convert_fc_filter(char** data_in, int num, int chw);
void format_fc_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max);
void convert_to_hwn(int16_t** data_in, int num, int height, int width);
void align_element_n(int16_t** data_in, int num, int height, int width);
void quantize_to_fp16(float** data_in, int num, int height, int width,
float* scale_ptr);
void format_dwconv_filter(float** data_in, int num, int height, int width,
float* scale_ptr);
} // namespace filter
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory.h>
#include <algorithm>
#include "fpga/KD/llapi/image.h"
#include "fpga/KD/llapi/zynqmp_api.h"
namespace paddle_mobile {
namespace zynqmp {
namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + offset_height + w * channel + c) = *((*data_in)++);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void align_element_conv(float **data_in, int height, int cw) {
int h = 0;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT
memset(data_tmp, 0, height * align_cw * sizeof(float));
for (h = 0; h < height; h++) {
memcpy((void *)(data_tmp + h * align_cw), // NOLINT
(void *)(*data_in + h * cw), // NOLINT
cw * sizeof(float));
}
*data_in = data_tmp;
fpga_free(tmp);
}
}
void format_image(float **data_in, int channel, int height, int width) {
// convert_to_hwc(data_in, channel, height, width);
align_element_conv(data_in, height, channel * width);
fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
sizeof(float));
}
void concat_images(int16_t **images_in, float **scales_in, void *image_out,
float *scale_out, int image_num, uint32_t *channel_num,
int height, int width) {
int i = 0;
int j = 0;
int k = 0;
int each_out_line_channel = 0;
int align_each_out_area_cw = 0;
int align_each_in_area_cw = 0;
int align_each_out_area_cw_differ = 0;
int tmp_channel = 0;
scale_out[0] = 0.0;
scale_out[1] = 0.0;
for (i = 0; i < image_num; i++) {
each_out_line_channel += channel_num[i];
scale_out[0] = std::max(*scale_out, scales_in[i][0]);
// fpga_invalidate(images_in[i],
// height *
// align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) *
// sizeof(int16_t));
}
scale_out[1] = 1 / scale_out[0];
align_each_out_area_cw =
align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT);
align_each_out_area_cw_differ =
align_each_out_area_cw - each_out_line_channel * width;
for (k = 0; k < height; k++) {
for (j = 0; j < width; j++) {
for (i = 0; i < image_num; i++) {
align_each_in_area_cw =
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
memcpy((int16_t *)image_out + tmp_channel + // NOLINT
k * align_each_out_area_cw_differ,
images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
channel_num[i] * sizeof(int16_t));
tmp_channel += channel_num[i];
}
}
}
fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t));
}
void split_image(int16_t *image_in, const float *scale_in, void **images_out,
float **scales_out, int image_num,
const uint32_t *channel_nums, int height, int width) {
int total_channel = 0;
for (int i = 0; i < image_num; i++) {
scales_out[i][0] = scale_in[0];
scales_out[i][1] = scale_in[1];
total_channel += channel_nums[i];
}
int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT);
fpga_invalidate(image_in, element_num * sizeof(int16_t));
int src_offset = 0;
int des_offset = 0;
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT) +
w * total_channel;
for (int i = 0; i < image_num; i++) {
des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) +
w * channel_nums[i];
memcpy(reinterpret_cast<int16_t *>(images_out[i]) + des_offset,  // cast first: des_offset counts int16_t elements
image_in + src_offset, channel_nums[i] * sizeof(int16_t));
src_offset += channel_nums[i];
}
}
}
for (int i = 0; i < image_num; i++) {
element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT);
fpga_flush(images_out[i], element_num * sizeof(int16_t));
}
}
} // namespace image
} // namespace zynqmp
} // namespace paddle_mobile
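
Since the convert_to_hwc call in format_image is commented out, callers are expected to hand in HWC data already; format_image then only zero-pads each channel*width row up to IMAGE_ALIGNMENT and flushes. A quick footprint sketch (function name illustrative):

using namespace paddle_mobile::zynqmp;

void image_footprint_example() {
  int channel = 3, height = 224, width = 100;
  int cw = channel * width;                        // 300 floats per row
  int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);  // 304: each row gains 4 zero floats
  // fpga_flush then covers height * align_cw * sizeof(float) bytes (224 * 304 * 4).
}
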
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
namespace paddle_mobile {
namespace zynqmp {
namespace image {
void convert_to_hwc(float** data_in, int channel, int height, int width);
void align_element_conv(float** data_in, int height, int cw);
void format_image(float** data_in, int channel, int height, int width);
// Concat featuremaps along channel direction
void concat_images(int16_t** images_in, float** scales_in, void* image_out,
float* scale_out, int image_num, uint32_t* channel_num,
int height, int width);
// Split featuremap along channel direction
void split_image(int16_t* image_in, const float* scale_in, void** images_out,
float** scales_out, int image_num,
const uint32_t* channel_nums, int height, int width);
} // namespace image
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <cstring>
#include <map>
#include "fpga/KD/llapi/config.h"
#include "fpga/KD/llapi/zynqmp_api.h"
namespace paddle_mobile {
namespace zynqmp {
#define PADDLE_MOBILE_OS_LINUX
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
static std::map<void *, size_t> memory_map;
static size_t memory_size_max = 0;
static size_t memory_size = 0;
static inline int do_ioctl(uint64_t req, const void *arg) {
#ifdef PADDLE_MOBILE_OS_LINUX
return ioctl(fd, req, arg);
#else
return -1;
#endif
}
int open_device() {
std::cout << "open_device" << std::endl;
if (fd == -1) {
fd = open(device_path, O_RDWR);
}
std::cout << "open_device fd:" << fd << std::endl;
return fd;
}
void close_device() { close(fd); }
void reset_device() {
FpgaResetArgs args;
do_ioctl(IOCTL_FPGA_RESET, &args);
}
// memory management;
void *fpga_malloc(size_t size) {
// std::cout << "fpga malloc: 0x" << std::hex << size << std::dec << " (" <<
// size << ") - ";
#ifdef ENABLE_DEBUG
// std::cout << "fpga_malloc:" << size << std::endl;
#endif
#ifdef PADDLE_MOBILE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
if (ptr == MAP_FAILED) {  // mmap64 signals failure with MAP_FAILED, not NULL
std::cout << "not enough memory !";
exit(-1);
}
// std::cout << std::hex << ptr << std::dec << std::endl;
memory_map.insert(std::make_pair(ptr, size));
memory_size += size;
if (memory_size > memory_size_max) {
memory_size_max = memory_size;
}
return ptr;
#else
return malloc(size);
#endif
}
size_t fpga_get_memory_size(void *ptr) { return memory_map[ptr]; }
size_t fpga_get_memory_size_max() { return memory_size_max; }
size_t fpga_diagnose_memory(int detailed) {
size_t total = 0;
// size_t size = 0;
// int i = 0;
auto iter = memory_map.begin(); // std::map<void *, size_t>::iterator
while (iter != memory_map.end()) {
total += iter->second;
iter++;
}
return total;
}
void fpga_free(void *ptr) {
size_t size = 0;
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
if (iter != memory_map.end()) {
size = iter->second;
memory_map.erase(iter);
}
memory_size -= size;
#ifdef PADDLE_MOBILE_OS_LINUX
munmap(ptr, size);
#else
free(ptr);
#endif
}
void fpga_copy(void *dst, const void *src, int size) { memcpy(dst, src, size); }
int fpga_flush(void *address, size_t size) {
struct MemoryCacheArgs args;
args.address = address;
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
}
int fpga_invalidate(void *address, size_t size) {
// std::cout <<
// "=================================================================================="
// << std::endl;
struct MemoryCacheArgs args;
args.address = address;
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
}
int invalidate_cache(void *addr, int size) {
struct MemoryCacheArgs args;
args.address = addr;
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
}
int flush_cache(void *addr, int size) {
struct MemoryCacheArgs args;
args.address = addr;
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
}
void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int ioctl_conv(const struct ConvArgs &args) {
#ifdef ENABLE_DEBUG
// std::cout << "======Compute Basic Conv======";
// std::cout << " relu_enabled:" << args.relu_enabled
// << " sb_address:" << args.sb_address
// << " filter_address:" << args.filter_address
// << " filter_num:" << args.filter_num
// << " group_num:" << args.group_num;
// std::cout << " image_address:" << args.image.address
// << " image_scale_address:" << args.image.scale_address
// << " image_channels:" << args.image.channels
// << " image_height:" << args.image.height
// << " image_width:" << args.image.width
// << " pad_height:" << args.image.pad_height
// << " pad_width:" << args.image.pad_width;
// std::cout << " kernel_height:" << args.kernel.height
// << " kernel_width:" << args.kernel.width
// << " stride_h:" << args.kernel.stride_h
// << " stride_w:" << args.kernel.stride_w;
// std::cout << " out_address:" << args.output.address
// << " out_scale_address:" << args.output.scale_address;
//
// float* in_scale = (float*)args.image.scale_address;
// std::cout << "inv_scale:" << in_scale[0] << "," << in_scale[1] <<
// std::endl;
#endif
return do_ioctl(IOCTL_CONFIG_CONV, &args);
// return 0;
}
int compute_fpga_conv_basic(const struct ConvArgs &args) {
#ifdef ENABLE_DEBUG
// std::cout << "======Compute Basic Conv======";
// std::cout << " relu_enabled:" << args.relu_enabled
// << " sb_address:" << args.sb_address
// << " filter_address:" << args.filter_address
// << " filter_num:" << args.filter_num
// << " group_num:" << args.group_num;
// std::cout << " image_address:" << args.image.address
// << " image_scale_address:" << args.image.scale_address
// << " image_channels:" << args.image.channels
// << " image_height:" << args.image.height
// << " image_width:" << args.image.width
// << " pad_height:" << args.image.pad_height
// << " pad_width:" << args.image.pad_width;
// std::cout << " kernel_height:" << args.kernel.height
// << " kernel_width:" << args.kernel.width
// << " stride_h:" << args.kernel.stride_h
// << " stride_w:" << args.kernel.stride_w;
// std::cout << " out_address:" << args.output.address
// << " out_scale_address:" << args.output.scale_address;
// float *in_scale = (float *)args.image.scale_address;
// std::cout << " scale:" << in_scale[0] << "," << in_scale[1] <<
// std::endl;
// float *filter_scale = (float *)args.filter_scale_address;
// std::cout << " filter scale:" << filter_scale[0] << "," <<
// filter_scale[1] << std::endl;
#endif
return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
int compute_fpga_conv(const struct SplitConvArgs &args) {
// return do_ioctl(IOCTL_CONFIG_CONV, &args);
int split_num = args.split_num;
int ret = -1;
for (int i = 0; i < split_num; i++) {
// ComputeBasicConv(args.conv_args[i]);
ret = compute_fpga_conv_basic(args.conv_arg[i]);
}
if (split_num > 1) {
std::cout << "Split num > 1 !!!!!!!!!!!!!!!!!!" << std::endl;
exit(-1);
}
return ret;
}
int compute_fpga_pool(const struct PoolingArgs &args) {
return do_ioctl(IOCTL_CONFIG_POOLING, &args);
}
int compute_fpga_ewadd(const struct EWAddArgs &args) {
return do_ioctl(IOCTL_CONFIG_EW, &args);
}
int perform_bypass(const struct BypassArgs &args) {
int size = args.image.channels * args.image.width * args.image.height;
int max_size = 1 << 21;
float times = 1.0 * size / max_size;
int count = static_cast<int>(times);
void *input_address = args.image.address;
int type_size =
args.input_data_type == DATA_TYPE_FP32 ? sizeof(float) : sizeof(int16_t);
void *output_address = args.output.address;
int out_type_size =
args.output_data_type == DATA_TYPE_FP32 ? sizeof(float) : sizeof(int16_t);
struct BypassArgs bypassArgs = args;
bypassArgs.image.width = 1;
bypassArgs.image.height = 1;
// std::cout << "times:" << times << " count:" << count << std::endl;
for (int i = 0; i < count; ++i) {
bypassArgs.image.channels = max_size;
bypassArgs.image.address =  // cast before offsetting: arithmetic on void* is non-standard
reinterpret_cast<char *>(input_address) + i * max_size * type_size;
bypassArgs.output.address =
reinterpret_cast<char *>(output_address) + i * max_size * out_type_size;
int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
if (ret != 0) {
return ret;
}
// std::cout << "@:" << i << " ret:" << ret << std::endl;
}
int remainder = size - max_size * count;
// std::cout << "remainder:" << remainder << std::endl;
bypassArgs.image.channels = remainder;
bypassArgs.image.address =
reinterpret_cast<char *>(input_address) + count * max_size * type_size;
bypassArgs.output.address =
reinterpret_cast<char *>(output_address) + count * max_size * out_type_size;
return do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
}
int compute_fpga_concat(const struct ConcatArgs &args) { return -1; }
int compute_fpga_scale(const struct ScaleArgs &args) {
#ifdef ENABLE_DEBUG
std::cout << "======Compute Scale======";
std::cout << "scale_address:" << args.scale_address << std::endl;
std::cout << "bias_address:" << args.bias_address << std::endl;
std::cout << "wc_alignment:" << args.wc_alignment << std::endl;
std::cout << "channel_alignment:" << args.channel_alignment << std::endl;
std::cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_SCALE, &args);
}
int compute_fpga_dwconv(const struct DWconvArgs &args) {
std::cout << "======Compute DWConv======";
std::cout << " relu_enabled:" << args.relu_enabled
<< " filter_address:" << args.filter_address;
std::cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
std::cout << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
// float *in_scale = (float *)args.image.scale_address;
// std::cout << "inv_scale:" << in_scale[0] << "," << in_scale[1] <<
// std::endl;
return do_ioctl(IOCTL_CONFIG_DWCONV, &args);
}
// int config_power(const struct PowerArgs& args) {
// return do_ioctl(IOCTL_CONFIG_POWER, &args);
// }
int config_inplace(const struct InplaceArgs &args) {
return do_ioctl(IOCTL_CONFIG_INPLACE, &args);
}
// uint64_t vaddr_to_paddr(void *address) {
// return 0;
// }
int16_t fp32_2_fp16(float fp32_num) {
uint32_t tmp = 0;
memcpy(&tmp, &fp32_num, sizeof(tmp));  // copy exactly 4 bytes; *(unsigned long *) would read 8 on LP64
auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
(((tmp & 0x7f800000) >> 13) - (112 << 10)));
if (tmp & 0x1000) {
t++; // roundoff
}
return t;
}
float fp16_2_fp32(int16_t fp16_num) {
if (0 == fp16_num) {
return 0;
}
int frac = (fp16_num & 0x3ff);
int exp = ((fp16_num & 0x7c00) >> 10) + 112;
int s = fp16_num & 0x8000;
int tmp = 0;
float fp32_num = 0;
tmp = s << 16 | exp << 23 | frac << 13;
fp32_num = *(float *)&tmp; // NOLINT
return fp32_num;
}
} // namespace zynqmp
} // namespace paddle_mobile
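
fp32_2_fp16/fp16_2_fp32 are a plain IEEE binary16 conversion: rebias the exponent by 112 (127 - 15), shift the mantissa by 13 bits, and round on the highest dropped bit. A round-trip sketch (function name illustrative):

void fp16_roundtrip_example() {
  float a = 1.5f;  // 1-bit mantissa, well within fp16's 10 bits
  int16_t h = paddle_mobile::zynqmp::fp32_2_fp16(a);  // 0x3E00
  float b = paddle_mobile::zynqmp::fp16_2_fp32(h);    // 1.5f again
  // No clamping: magnitudes beyond ~65504, NaN/Inf, and subnormals are outside this fast path.
}
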
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#define PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#include <stdint.h>
#include <cstddef>
#include <iostream>
#include <limits>
namespace paddle_mobile {
namespace zynqmp {
typedef int16_t half;
#define IMAGE_ALIGNMENT 16 // Aligned to 16
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT 8
#define BIAS_NUM_ALIGNMENT 16
enum DDataType {
DATA_TYPE_FP32 = 1,
DATA_TYPE_FP16 = 0,
};
enum DLayoutType {
LAYOUT_CHW = 1,
LAYOUT_HWC = 0,
};
struct VersionArgs {
void* buffer;
};
struct MemoryCopyArgs {
void* src;
void* dest;
size_t size;
};
struct MemoryCacheArgs {
void* address;
size_t size;
};
struct MemoryBarrierArgs {};
struct BNArgs {
bool enabled;
void* bias_address;
void* scale_address;
};
/**
Conv and Pooling kernel
*/
struct KernelArgs {
uint32_t width;
uint32_t height;
uint32_t stride_w;
uint32_t stride_h;
};
struct ImageInputArgs {
void* address; // input featuremap virtual address
void* scale_address; // input scale address;
uint32_t channels;
uint32_t width; // featuremap width
uint32_t height;
uint32_t pad_width; // padding width;
uint32_t pad_height;
};
struct ImageOutputArgs {
void* address; // output result address;
float* scale_address; // output scale address;
};
struct ConvArgs {
bool relu_enabled;
void* sb_address; // scale and bias are interlaced;
void* filter_address;
void* filter_scale_address;
uint32_t filter_num;
uint32_t group_num;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
};
struct DWconvArgs {
bool relu_enabled;
void* bias_address;
void* filter_address;
struct KernelArgs kernel;
struct ImageInputArgs image;
struct ImageOutputArgs output;
uint16_t out_width;
uint16_t out_height;
uint16_t sub_conv_num;
};
struct PoolingArgs {
uint16_t mode;
uint16_t kernel_reciprocal;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
uint16_t out_width;
uint16_t out_height;
};
// elementwise add arguments
struct EWAddArgs {
bool relu_enabled;
uint32_t const0; // output0 = const0 x input0 + const1 x input1;
uint32_t const1;
struct ImageInputArgs image0;
struct ImageInputArgs image1;
struct ImageOutputArgs output;
};
struct BypassArgs {
enum DDataType input_data_type;
enum DDataType output_data_type;
enum DLayoutType input_layout_type;
enum DLayoutType output_layout_type;
struct ImageInputArgs image;
struct ImageOutputArgs output;
};
struct ScaleArgs {
void* scale_address;
void* bias_address;
uint32_t wc_alignment;
uint32_t channel_alignment;
struct ImageInputArgs image;
struct ImageOutputArgs output;
};
struct NormalizeArgs {
void* input_image_address;
void* output_image_address;
uint32_t image_width;
uint32_t image_height;
uint32_t image_channel;
uint32_t* output_scale_address;
};
struct PowerParameterArgs {
uint16_t shift;
uint16_t scale;
uint16_t power;
};
struct NormalizeParameterArgs {
uint32_t channel;
uint32_t hight_width;
};
struct InplaceArgs {
bool relu_enable;
bool power_enable;
bool normalize_enable;
};
struct FpgaRegWriteArgs {
uint64_t address; //
uint64_t value;
};
struct FpgaRegReadArgs {
uint64_t address;
uint64_t value;
};
struct FpgaResetArgs {};
#define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4)
#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
#define IOCTL_SEPARATOR_0 10
#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
#define IOCTL_MEMORY_BARRIER \
_IOW(IOCTL_FPGA_MAGIC, 14, struct MemoryBarrierArgs)
#define IOCTL_SEPARATOR_1 20
#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs)
#define IOCTL_CONFIG_SCALE _IOW(IOCTL_FPGA_MAGIC, 25, struct ScaleArgs)
#define IOCTL_CONFIG_NORMALIZE _IOW(IOCTL_FPGA_MAGIC, 26, struct NormalizeArgs)
#define IOCTL_CONFIG_DWCONV _IOW(IOCTL_FPGA_MAGIC, 31, struct DWconvArgs)
#define IOCTL_CONFIG_INPLACE _IOW(IOCTL_FPGA_MAGIC, 40, struct InplaceArgs)
#define IOCTL_CONFIG_POWER_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 41, struct PowerParameterArgs)
#define IOCTL_CONFIG_NORMALIZE_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs)
#define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs)
//============================== API =============================
// struct DWconvArgs {
// bool relu_enabled;
// void* bias_address;
// void* filter_address;
// struct KernelArgs kernel;
// struct ImageInputArgs image;
// struct ImageOutputArgs output;
// };
struct DeconvArgs {
uint32_t sub_conv_num;
uint32_t group_num;
uint32_t filter_num;
uint32_t omit_size;
uint32_t sub_output_width;
uint32_t sub_output_height;
struct ImageOutputArgs output;
struct SplitConvArgs* split_conv_args;
};
struct SplitArgs {
uint32_t image_num;
int16_t* image_in;
float* scale_in;
void** images_out;
float** scales_out;
uint32_t* out_channel_nums;
uint32_t height;
uint32_t width;
};
struct ConcatArgs {
uint32_t image_num;
half** images_in;
float** scales_in;
void* image_out;
float* scale_out;
uint32_t* channel_num;
uint32_t height;
uint32_t width;
};
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct ConvArgs* conv_arg;
struct ConcatArgs concat_arg;
};
struct GroupConvArgs {
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct SplitConvArgs* conv_args;
struct ConcatArgs concat_arg;
};
inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
int open_device();
void close_device();
void reset_device();
void* fpga_malloc(size_t size);
void fpga_free(void* ptr);
size_t fpga_get_memory_size(void* ptr);
size_t fpga_get_memory_size_max();
size_t fpga_diagnose_memory(int detailed);
void fpga_copy(void* dst, const void* src, int size);
int fpga_flush(void* address, size_t size);
int fpga_invalidate(void* address, size_t size);
int perform_bypass(const struct BypassArgs& args);
int compute_fpga_conv_basic(const struct ConvArgs& args);
int compute_fpga_conv(const struct SplitConvArgs& args);
int compute_fpga_pool(const struct PoolingArgs& args);
int compute_fpga_ewadd(const struct EWAddArgs& args);
int compute_fpga_scale(const struct ScaleArgs& args);
int compute_fpga_concat(const struct ConcatArgs& args);
int config_power(const struct PowerArgs& args);
int compute_fpga_dwconv(const struct DWconvArgs& args);
// int config_relu(const struct ReluArgs& args);
int config_inplace(const struct InplaceArgs& args);
int flush_cache(void* addr, int size);
int invalidate_cache(void* addr, int size);
int16_t fp32_2_fp16(float fp32_num);
float fp16_2_fp32(int16_t fp16_num);
} // namespace zynqmp
} // namespace paddle_mobile
#endif // PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
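
Taken together, the llapi discipline is: fpga_malloc maps driver memory, fpga_flush pushes CPU writes out before the hardware reads, the compute_* calls issue the matching ioctl, and fpga_invalidate drops stale CPU cache lines before results are read back. A hedged end-to-end sketch (args left zero-filled here; real callers format filters and scales first, and the function name is illustrative):

// Usage sketch of the low-level memory/cache protocol.
using namespace paddle_mobile::zynqmp;

void pooling_example() {
  open_device();  // opens /dev/fpgadrv0
  size_t bytes = 32 * 32 * 16 * sizeof(int16_t);
  void* in = fpga_malloc(bytes);
  void* out = fpga_malloc(bytes);
  // ... fill `in` on the CPU ...
  fpga_flush(in, bytes);        // make CPU writes visible to the FPGA
  PoolingArgs args = {};        // fill mode, kernel, image, output per the struct above
  compute_fpga_pool(args);      // IOCTL_CONFIG_POOLING
  fpga_invalidate(out, bytes);  // refresh the CPU view before reading results
  fpga_free(in);
  fpga_free(out);
  close_device();
}
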
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef PE_hpp
#define PE_hpp
#include <stdio.h>
#include <iostream>
#include "pe_params.hpp"
#include "tensor_util.hpp"
namespace paddle_mobile {
namespace zynqmp {
class PE {
public:
virtual bool init() { return false; }
virtual void apply() {}
virtual bool dispatch() {
std::cout << "pe dispatch \n";
return false;
}
virtual ~PE() {}
};
} // namespace zynqmp
} // namespace paddle_mobile
#endif /* PE_hpp */
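
Every KD operator below follows this three-phase contract: init() once, apply() to pre-format weights and build the hardware args, dispatch() per inference. A minimal subclass sketch (IdentityPE is illustrative, not part of the source):

// Minimal PE illustrating the init/apply/dispatch contract.
class IdentityPE : public paddle_mobile::zynqmp::PE {
 public:
  bool init() override { return true; }      // validate params, claim resources
  void apply() override {}                   // one-time weight formatting goes here
  bool dispatch() override { return true; }  // per-inference work; true on success
};
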
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef PEParams_hpp
#define PEParams_hpp
#include <stdio.h>
#include <vector>
#include "llapi/zynqmp_api.h"
#include "tensor.hpp"
namespace paddle_mobile {
namespace zynqmp {
struct PEParam {};
struct InputParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* output = nullptr;
};
struct OutputParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* output = nullptr;
};
struct ReLUParam : PEParam {
public:
bool enabled = false;
};
struct BatchnormParam : PEParam {
public:
Tensor* bias = nullptr;
Tensor* scale = nullptr;
Tensor* mean = nullptr;
Tensor* variance = nullptr;
float epsilon = 0;
};
struct BasicConvParam {
Tensor output;
Tensor filter;
Tensor scaleBias;
ConvArgs args;
};
struct ConvParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* output = nullptr;
Tensor* filter = nullptr;
BatchnormParam* batchnorm = nullptr;
ReLUParam relu;
int groups = 1;
std::vector<int> strides;
std::vector<int> paddings;
std::vector<int> kernelSize;
std::vector<int> dilations;
Tensor* scale() { return scale_; }
Tensor* bias() { return bias_; }
// Tensor* quantizedFilter() {
// return quantizedFilter_;
// }
std::vector<BasicConvParam*>& splitParams() { return splitParams_; }
protected:
std::vector<BasicConvParam*> splitParams_;
// Tensor* quantizedFilter_ = new Tensor();
Tensor* scale_ = new Tensor();
Tensor* bias_ = new Tensor();
};
struct DepthwiseConvParam : ConvParam {
public:
Tensor* quantizedFilter() { return quantizedFilter_; }
DWconvArgs args;
protected:
Tensor* quantizedFilter_ = new Tensor();
};
enum PoolingType : int {
MAX = 0,
AVERAGE = 1,
};
struct PoolingParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* output = nullptr;
PoolingType type = PoolingType::MAX;
bool globalPooling = false;
std::vector<int> kernelSize;
std::vector<int> strides;
std::vector<int> paddings;
PoolingArgs poolingArgs = {0};
};
struct ConcatParam : PEParam {
public:
std::vector<Tensor*> inputs;
Tensor* output = nullptr;
int axis = 0;
};
struct ElementwiseAddParam : PEParam {
public:
std::vector<Tensor*> inputs;
Tensor* output = nullptr;
int axis = 0;
ReLUParam relu;
EWAddArgs ewargs;
};
struct FullyConnectedParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* filter = nullptr;
Tensor* bias = nullptr;
Tensor* output = nullptr;
Tensor* quantizedFilter() { return quantizedFilter_; }
Tensor* biasScale() { return biasScale_; }
SplitConvArgs convArgs;
protected:
Tensor* quantizedFilter_ = new Tensor();
Tensor* biasScale_ = new Tensor();
};
struct SoftmaxParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* output = nullptr;
private:
Tensor* floatInput = nullptr;
};
struct NormParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* output = nullptr;
private:
Tensor* floatInput = nullptr;
};
} // namespace zynqmp
} // namespace paddle_mobile
#endif /* PEParams_hpp */
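Concrete PEs hand out their parameter struct by reference, so callers fill in the tensor pointers before init(). A hedged sketch using the InputPE defined later in this patch (feed_input and the two tensor arguments are illustrative names):

inline void feed_input(paddle_mobile::zynqmp::Tensor& cpu_tensor,
                       paddle_mobile::zynqmp::Tensor& fpga_tensor) {
  paddle_mobile::zynqmp::InputPE pe;
  paddle_mobile::zynqmp::InputParam& p = pe.param();
  p.input = &cpu_tensor;    // FP32 data coming from the framework
  p.output = &fpga_tensor;  // aligned FP16 image for the device
  pe.init();
  pe.dispatch();            // converts to FP16 and aligns the image
}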
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "../pe.hpp"
#include "../pe_params.hpp"
namespace paddle_mobile {
namespace zynqmp {
class ConcatPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
return true;
}
void apply() {}
bool dispatch() {
Tensor* output = param_.output;
Shape& output_shape = output->shape();
float16* out_data = param_.output->data<float16>();
int channel_sum = 0;
int out_channel = output_shape.channel();
float scale = 0;
for (int n = 0; n < param_.inputs.size(); n++) {
Tensor* input = param_.inputs[n];
input->invalidate();
scale = std::max(scale, input->scale()[0]);
Shape& input_shape = input->shape();
int wh = output_shape.width() * output_shape.height();
for (int j = 0; j < wh; j++) {
float16* src = input->data<float16>() + j * input_shape.channel();
memcpy(out_data + j * out_channel + channel_sum, src,
input_shape.channel() * sizeof(float16));
}
channel_sum += input_shape.channel();
}
output->scale()[0] = scale;
output->scale()[1] = 1.0f / scale;
std::cout << "conv scale::" << scale << std::endl;
output->flush();
return true;
}
ConcatParam& param() { return param_; }
private:
ConcatParam param_;
};
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "../llapi/image.h"
#include "../pe.hpp"
#include "../pe_params.hpp"
#include "concat_pe.hpp"
#include "conv_pe.hpp"
#include "conv_process.hpp"
namespace paddle_mobile {
namespace zynqmp {
class ConvPE : public PE {
public:
bool init() {
std::cout << "Conv init" << std::endl;
return true;
}
void apply() {
// process scale and bias;
BatchnormParam* bn = param_.batchnorm;
int channel = param_.output->shape().channel();
Shape sb_shape(N, {channel});
float* new_scale_ptr = param_.scale()->mutableData<float>(FP32, sb_shape);
float* new_bias_ptr = param_.bias()->mutableData<float>(FP32, sb_shape);
if (bn != nullptr) {
float* bn_scale_ptr = bn->scale->data<float>();
float* bn_bias_ptr = bn->bias->data<float>();
float* bn_var_ptr = bn->variance->data<float>();
float* bn_mean_ptr = bn->mean->data<float>();
float epsilon = bn->epsilon;
for (int i = 0; i < channel; i++) {
float new_scale =
bn_scale_ptr[i] /
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_scale_ptr[i] = new_scale;
new_bias_ptr[i] =
bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
}
} else {
for (int i = 0; i < channel; i++) {
new_scale_ptr[i] = 1.0f;
new_bias_ptr[i] = 0.0f;
}
}
fill_split_arg(param_);
if (param_.splitParams().size() > 1) {
ConcatParam& concat_param = concatPE_.param();
for (auto conv_param : param_.splitParams()) {
concat_param.inputs.push_back(&conv_param->output);
}
concat_param.output = param_.output;
concatPE_.init();
concatPE_.apply();
}
}
bool dispatch() {
std::vector<BasicConvParam*>& params = param_.splitParams();
int ret = 0;
for (auto conv_param : params) {
ret |= compute_fpga_conv_basic(conv_param->args);
}
size_t size = params.size();
if (ret == 0 && size > 1) {
concatPE_.dispatch();
}
return ret == 0;
}
ConvParam& param() { return param_; }
private:
ConvParam param_;
ConcatPE concatPE_;
};
} // namespace zynqmp
} // namespace paddle_mobile
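The scale/bias loop in ConvPE::apply() above is the standard fold of inference-time batch normalization into a per-channel affine term. With bn->scale = \gamma, bn->bias = \beta, running mean \mu and variance \sigma^2, the code computes

s_c = \frac{\gamma_c}{\sqrt{\sigma_c^2 + \varepsilon}}, \qquad b_c = \beta_c - \mu_c \, s_c

so applying s_c x + b_c after the convolution reproduces \gamma_c (x - \mu_c)/\sqrt{\sigma_c^2 + \varepsilon} + \beta_c exactly; without batch norm the loop degenerates to the identity (s_c = 1, b_c = 0).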
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef conv_process_hpp
#define conv_process_hpp
#include <string.h>
#include <cmath>
#include <vector>
#include "../float16.hpp"
#include "../llapi/bias_scale.h"
#include "../llapi/filter.h"
#include "../llapi/image.h"
#include "../tensor.hpp"
namespace paddle_mobile {
namespace zynqmp {
inline int get_aligned_filter_element_num(int chw) {
return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
}
inline int get_filter_num_per_div(Tensor* filter, int group_num) {
auto chw = filter->shape().channel() * filter->shape().height() *
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
return filter::calc_num_per_div(num, group_num, div_capacity);
}
inline int get_split_num(Tensor* filter) {
auto chw = filter->shape().channel() * filter->shape().height() *
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
return filter::calc_split_num(num, div_capacity);
}
inline void format_scale_bias(Tensor* scale, Tensor* bias, Tensor* filter,
Tensor* scale_bias, int group) {
float* scale_data = nullptr;
float* bias_data = nullptr;
if (scale != nullptr) {
scale_data = scale->data<float>();
}
if (bias != nullptr) {
bias_data = bias->data<float>();
}
int channel = filter->shape().num();
Shape bias_scale_shape(N, {2 * channel});
float* bs_data = scale_bias->mutableData<float>(FP32, bias_scale_shape);
for (int i = 0; i < channel; i++) {
float scale_value = scale_data == nullptr ? 1 : scale_data[i];
float bias_value = bias_data == nullptr ? 0 : bias_data[i];
bs_data[i + channel] = scale_value;
bs_data[i] = bias_value;
}
int element_num_per_div = get_filter_num_per_div(filter, group);
bias_scale::format_bias_scale_array(&bs_data, element_num_per_div, channel);
}
inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group) {
float max_value = find_max(*filter);
Shape& filter_shape = filter->shape();
quantized_filter->setAligned(true);
quantized_filter->mutableData<int8_t>(INT8, filter->shape());
quantized_filter->scale()[0] = max_value / 127.0f;
quantized_filter->scale()[1] = 127.0f / max_value;
auto memory_size = filter->shape().memorySize(sizeof(float));
auto new_data = reinterpret_cast<float*>(fpga_malloc(memory_size));
memcpy(new_data, filter->data<float>(), memory_size);
size_t mem_size = filter::format_filter(
&new_data, filter_shape.num(), filter_shape.channel(),
filter_shape.height(), filter_shape.width(), group, max_value);
int8_t* src = quantized_filter->mutableData<int8_t>(INT8, filter->shape());
memcpy(src, new_data, mem_size);
fpga_free(new_data);
quantized_filter->flush();
}
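// Scalar sketch of the symmetric int8 mapping implied by the scales above
// (scale[0] = max/127 dequantizes, scale[1] = 127/max quantizes). The exact
// rounding/clipping policy and the hardware layout live in
// filter::format_filter, so treat this as an illustration, not the
// authoritative implementation.
inline int8_t quantize_weight_sketch(float v, float max_value) {
  float q = roundf(v * 127.0f / max_value);  // float -> int8 steps
  q = q > 127.0f ? 127.0f : (q < -127.0f ? -127.0f : q);
  return static_cast<int8_t>(q);  // recover with: v ≈ q * max_value / 127
}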
inline void format_dw_filter(Tensor* filter, Tensor* quantized_filter,
float* scale) {
int num = filter->shape().num();
int height = filter->shape().height();
int width = filter->shape().width();
auto memory_size = filter->shape().memorySize(sizeof(float));
auto new_data = (float*)fpga_malloc(memory_size); // NOLINT
memcpy(new_data, filter->data<float>(), memory_size);
filter::format_dwconv_filter(&new_data, num, height, width, scale);
float16* src = quantized_filter->mutableData<float16>(FP16, filter->shape());
memcpy(src, new_data, quantized_filter->shape().memorySize(sizeof(float16)));
quantized_filter->flush();
fpga_free(new_data);
}
inline void format_fc_filter(Tensor* filter, Tensor* quantized_filter) {
float max_value = find_max(*filter);
Shape& filter_shape = filter->shape();
quantized_filter->setAligned(true);
quantized_filter->mutableData<int8_t>(INT8, filter->shape());
quantized_filter->scale()[0] = max_value / 127.0f;
quantized_filter->scale()[1] = 127.0f / max_value;
size_t memory_size = filter->shape().memorySize(sizeof(float));
auto new_data = (float*)fpga_malloc(memory_size); // NOLINT
memcpy(new_data, filter->data<float>(), memory_size);
filter::format_fc_filter(&new_data, filter_shape.num(),
filter_shape.channel(), filter_shape.height(),
filter_shape.width(), 1, max_value);
int8_t* src = quantized_filter->mutableData<int8_t>(INT8, filter->shape());
memcpy(src, new_data, quantized_filter->shape().memorySize(sizeof(int8_t)));
quantized_filter->flush();
fpga_free(new_data);
}
inline void fill_split_arg(const ConvParam& c_param) {
ConvParam& param = const_cast<ConvParam&>(c_param);
Tensor* input = param.input;
Tensor* out = param.output;
Tensor* filter = param.filter;
auto channel = out->shape().channel();
int split_num = param.groups == 1 ? get_split_num(param.filter) : 1;
int filter_num_per_div = get_filter_num_per_div(filter, param.groups);
Shape& out_shape = out->shape();
for (int i = 0; i < split_num; i++) {
BasicConvParam* conv_param = new BasicConvParam();
int filter_num = filter->shape().num();
float16* out_address = nullptr;
int8_t* filter_address = nullptr;
float* sb_address = nullptr;
float* out_scale_address = nullptr;
ConvArgs& args = conv_param->args;
if (split_num == 1) {
out_address = out->data<float16>();
out_scale_address = out->scale();
}
filter_num = i == split_num - 1
? channel - (split_num - 1) * filter_num_per_div // NOLINT
: filter_num_per_div;
if (split_num != 1) {
Shape shape(NHWC, {1, out_shape.height(), out_shape.width(), filter_num});
out_address = conv_param->output.mutableData<float16>(FP16, shape);
out_scale_address = conv_param->output.scale();
}
Shape f_shape(NCHW, {filter_num, filter->shape().channel(),
filter->shape().height(), filter->shape().width()});
Tensor new_filter;
float* new_filter_data = new_filter.mutableData<float>(FP32, f_shape);
int filter_hwc = filter->shape().height() * filter->shape().width() *
filter->shape().channel();
memcpy(new_filter_data,
filter->data<float>() + i * filter_num_per_div * filter_hwc,
filter_num * filter_hwc * sizeof(float));
new_filter.flush();
conv_param->filter.mutableData<float>(FP32, f_shape);
format_filter(&new_filter, &(conv_param->filter), param.groups);
filter_address = conv_param->filter.data<int8_t>();
std::cout << "filter scale:" << conv_param->filter.scale()[0] << std::endl;
args.filter_scale_address = conv_param->filter.scale();
int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT);
Tensor scale;
Tensor bias;
int channel_start = i * filter_num_per_div;
Shape s_shape(N, {filter_num});
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int j = 0; j < filter_num; j++) {
scale_data[j] = param.scale()->data<float>()[j + channel_start];
bias_data[j] = param.bias()->data<float>()[j + channel_start];
}
Shape sb_shape(N, {sb_num});
format_scale_bias(&scale, &bias, &conv_param->filter,
&conv_param->scaleBias, param.groups);
sb_address = conv_param->scaleBias.mutableData<float>(FP32, sb_shape);
args.group_num = param.groups;
args.relu_enabled = param.relu.enabled;
args.sb_address = sb_address;
args.kernel.stride_h = param.strides[1];
args.kernel.stride_w = param.strides[0];
args.kernel.height = new_filter.shape().height();
args.kernel.width = new_filter.shape().width();
args.filter_address = filter_address;
args.filter_num = filter_num;
args.image.address = input->data<void>();
args.image.scale_address = input->scale();
args.image.channels = input->shape().channel();
args.image.width = input->shape().width();
args.image.height = input->shape().height();
args.image.pad_width = param.paddings[0];
args.image.pad_height = param.paddings[1];
args.output.address = out_address;
args.output.scale_address = out_scale_address;
param.splitParams().push_back(conv_param);
}
}
inline void fill_split_arg(struct SplitConvArgs* arg, Tensor* input,
Tensor* out, Tensor* filter, bool relu_enabled,
int group_num, int stride_h, int stride_w,
int padding_h, int padding_w, float* bs_ptr) {
auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>();
auto out_ptr = out->data<float>();
arg->group_num = (uint32_t)group_num;
arg->split_num = group_num == 1 ? get_split_num(filter) : 1;
arg->filter_num = filter->shape().num();
arg->output.address = out_ptr;
arg->output.scale_address = out->scale();
arg->conv_arg =
(ConvArgs*)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT
memset(arg->conv_arg, 0, arg->split_num * sizeof(struct ConvArgs));
arg->concat_arg.image_num = arg->split_num;
arg->concat_arg.image_out = out_ptr;
arg->concat_arg.scale_out = out->scale();
arg->concat_arg.height = out->shape().height();
arg->concat_arg.width = out->shape().width();
int n = arg->split_num;
arg->concat_arg.images_in = (half**)fpga_malloc(n * sizeof(int*)); // NOLINT
arg->concat_arg.scales_in =
(float**)fpga_malloc(n * sizeof(float*)); // NOLINT
arg->concat_arg.channel_num =
(uint32_t*)fpga_malloc(n * sizeof(uint32_t)); // NOLINT
auto channel = out->shape().channel();
int filter_num_per_div = get_filter_num_per_div(filter, group_num);
int element_num = get_aligned_filter_element_num(filter->shape().channel() *
filter->shape().height() *
filter->shape().width());
for (int i = 0; i < n; i++) {
arg->conv_arg[i].relu_enabled = relu_enabled;
arg->conv_arg[i].group_num = (uint32_t)group_num;
arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
arg->conv_arg[i].kernel.height = filter->shape().height();
arg->conv_arg[i].kernel.width = filter->shape().width();
arg->conv_arg[i].image.address = input_ptr;
arg->conv_arg[i].image.channels = input->shape().channel();
arg->conv_arg[i].image.height = input->shape().height();
arg->conv_arg[i].image.width = input->shape().width();
arg->conv_arg[i].image.scale_address = input->scale();
arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
arg->conv_arg[i].filter_scale_address = filter->scale();
arg->conv_arg[i].filter_num = (uint32_t)(
i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT
: filter_num_per_div);
size_t filter_size =
element_num *
align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) *
sizeof(int8_t);
auto filter_head =
&((int8_t*)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT
arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
fpga_flush(arg->conv_arg[i].filter_address, filter_size);
size_t bs_size = 2 *
align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) *
sizeof(float);
auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
fpga_flush(arg->conv_arg[i].sb_address, bs_size);
if (n > 1) {
arg->conv_arg[i].output.scale_address =
(float*)fpga_malloc(2 * sizeof(float)); // NOLINT
arg->conv_arg[i].output.address = fpga_malloc(
out->shape().height() *
align_to_x(out->shape().width() * arg->conv_arg[i].filter_num,
IMAGE_ALIGNMENT) *
sizeof(half));
} else {
arg->conv_arg[i].output.scale_address = out->scale();
arg->conv_arg[i].output.address = out_ptr;
}
arg->concat_arg.images_in[i] =
(half*)arg->conv_arg[i].output.address; // NOLINT
arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
}
}
inline int do_concat(const struct ConcatArgs& args) {
image::concat_images(args.images_in, args.scales_in, args.image_out,
args.scale_out, args.image_num, args.channel_num,
args.height, args.width);
return 0;
}
inline bool compute_conv(const ConvParam& c_conv_params) {
ConvParam& conv_params = const_cast<ConvParam&>(c_conv_params);
std::vector<BasicConvParam*>& params = conv_params.splitParams();
int ret = 0;
for (auto conv_param : params) {
ret |= compute_fpga_conv_basic(conv_param->args);
}
size_t size = params.size();
if (ret == 0 && size > 1) {
Tensor* output = conv_params.output;
Tensor& img = params[0]->output;
// Debug: dump the first split's output values.
for (int i = 0; i < img.shape().numel(); i++) {
float value = half_to_float(img.data<float16>()[i]);
std::cout << "value:" << value << std::endl;
}
}
return ret == 0;
}
inline bool compute_conv(const SplitConvArgs& args) {
int ret = 0;
int split_num = args.split_num;
for (int i = 0; i < split_num; i++) {
ret |= compute_fpga_conv_basic(args.conv_arg[i]);
}
if (split_num > 1) {
do_concat(args.concat_arg);
}
return ret == 0;
}
} // namespace zynqmp
} // namespace paddle_mobile
#endif /* conv_process_hpp */
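Both fill_split_arg variants above use the same scheme: when one pass cannot cover all output channels (get_split_num > 1), the convolution is split along the output-channel axis and the partial images are concatenated channel-wise afterwards. The per-split channel count, extracted as a sketch:

// Every split takes filter_num_per_div output channels except the last,
// which takes the remainder (mirrors the ternaries in both variants above).
inline int split_channels(int i, int split_num, int total, int per_div) {
  return i == split_num - 1 ? total - (split_num - 1) * per_div : per_div;
}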
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../float16.hpp"
#include "../pe.hpp"
#include "../pe_params.hpp"
#include "conv_process.hpp"
namespace paddle_mobile {
namespace zynqmp {
class DepthwiseConvPE : public PE {
public:
bool init() {
std::cout << "DWConv init" << std::endl;
return true;
}
void apply() {
DepthwiseConvParam& param = param_;
Tensor* input = param.input;
Tensor* output = param.output;
int channel = output->shape().channel();
Tensor* new_scale = param.scale();
Tensor* new_bias = param.bias();
Shape shape(NC, {channel, 1});
float* new_scale_data = new_scale->mutableData<float>(FP32, shape);
float16* new_bias_data = new_bias->mutableData<float16>(FP16, shape);
BatchnormParam* batchnorm = param.batchnorm;
memset(new_scale_data, 0, new_scale->shape().memorySize(sizeof(float)));
memset(new_bias_data, 0, new_bias->shape().memorySize(sizeof(float16)));
if (batchnorm != nullptr) {
for (int i = 0; i < channel; i++) {
// TODO(chonwhite) combine;
}
} else {
float16 zero = float_to_half(0.0f);
for (int i = 0; i < channel; i++) {
new_bias_data[i] = zero;
new_scale_data[i] = 1.0f;
}
}
Tensor* quantized_filter = param.quantizedFilter();
quantized_filter->mutableData<float16>(FP16, param.filter->shape());
format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
DWconvArgs args = {0};
void* filter_address = quantized_filter->data<void>();
std::cout << "filter:" << filter_address << std::endl;
args.bias_address = new_bias_data;
args.filter_address = param.quantizedFilter()->data<void>();
args.kernel.width = param.kernelSize[0];
args.kernel.height = param.kernelSize[1];
args.kernel.stride_w = param.strides[0];
args.kernel.stride_h = param.strides[1];
args.image.address = input->data<void>();
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
args.image.pad_width = param.paddings[0];
args.image.pad_height = param.paddings[1];
args.image.scale_address = input->scale();
args.output.address = output->data<void>();
args.output.scale_address = output->scale();
args.out_width = param.output->shape().width();
args.out_height = param.output->shape().height();
args.sub_conv_num = 1;
param.args = args;
}
bool dispatch() { return compute_fpga_dwconv(param_.args) == 0; }
DepthwiseConvParam& param() { return param_; }
private:
DepthwiseConvParam param_;
};
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../pe.hpp"
#include "../pe_params.hpp"
namespace paddle_mobile {
namespace zynqmp {
class ElementwiseAddPE : public PE {
public:
bool init() { return true; }
void apply() {
Tensor* input0 = param_.inputs[0];
Tensor* input1 = param_.inputs[1];
Tensor* output = param_.output;
EWAddArgs args = {0};
args.const0 = 0x3c00; // fp16 1.0: coefficient for input0
args.const1 = 0x3c00; // fp16 1.0: coefficient for input1, i.e. out = x + y
args.image0.address = input0->data<float16>();
args.image0.channels = input0->shape().channel();
args.image0.scale_address = input0->scale();
args.image0.height = input0->shape().height();
args.image0.width = input0->shape().width();
args.image0.pad_height = 0;
args.image0.pad_width = 0;
args.image1.address = input1->data<float16>();
args.image1.channels = input1->shape().channel();
args.image1.scale_address = input1->scale();
args.image1.height = input1->shape().height();
args.image1.width = input1->shape().width();
args.image1.pad_height = 0;
args.image1.pad_width = 0;
args.output.scale_address = output->scale();
args.output.address = output->data<float16>();
param_.ewargs = args;
}
bool dispatch() {
InplaceArgs inplace_args = {0};
if (param_.relu.enabled) {
inplace_args.relu_enable = true;
config_inplace(inplace_args);
}
compute_fpga_ewadd(param_.ewargs);
if (param_.relu.enabled) {
inplace_args.relu_enable = false;
config_inplace(inplace_args);
}
return true;
}
ElementwiseAddParam& param() { return param_; }
private:
ElementwiseAddParam param_;
};
} // namespace zynqmp
} // namespace paddle_mobile
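For reference, 0x3c00 is the IEEE-754 half-precision encoding of 1.0 (sign 0, exponent 01111, mantissa 0), so with both constants set the engine computes a plain x + y. A one-line sanity sketch using the float_to_half helper this patch already relies on:

#include <cassert>
inline void check_fp16_one() {
  assert(float_to_half(1.0f) == 0x3c00);  // fp16(1.0) == 0x3c00
}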
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "../pe.hpp"
#include "../pe_params.hpp"
#include "conv_process.hpp"
namespace paddle_mobile {
namespace zynqmp {
class FullyConnectedPE : public PE {
public:
bool init() { return true; }
void apply() {
Tensor* input = param_.input;
Tensor* output = param_.output;
convParam_.input = param_.input;
convParam_.output = param_.output;
// convParam_.relu = param_.relu;
convParam_.groups = 1;
convParam_.strides = {1, 1};
convParam_.paddings = {0, 0};
convParam_.kernelSize = {input->shape().width(), input->shape().height()};
convParam_.dilations = {1, 1};
int num = param_.filter->shape().channel();
int chw = param_.filter->shape().num();
int height = param_.input->shape().height();
int width = param_.input->shape().width();
int filter_channel = chw / height / width;
int channel = param_.output->shape().channel();
Shape shape(NCHW, {num, filter_channel, height, width});
Tensor* conv_filter = new Tensor();
float* new_filter_data = conv_filter->mutableData<float>(FP32, shape);
float* filter_data = param_.filter->data<float>();
// Transpose FC weights from [chw, num] into conv layout [num, chw].
for (int i = 0; i < num; i++) {
for (int j = 0; j < chw; j++) {
new_filter_data[i * chw + j] = filter_data[j * num + i];
}
}
conv_filter->flush();
convParam_.filter = conv_filter;
Shape sb_shape(N, {channel});
float* scale_data = convParam_.scale()->mutableData<float>(FP32, sb_shape);
float* bias_data = convParam_.bias()->mutableData<float>(FP32, sb_shape);
for (int i = 0; i < channel; i++) {
scale_data[i] = 1.0f;
bias_data[i] = param_.bias->data<float>()[i];
}
fill_split_arg(convParam_);
}
bool dispatch() {
int ret = 0;
std::vector<BasicConvParam*>& params = convParam_.splitParams();
for (auto conv_param : params) {
std::cout << "conv basic \n";
ret |= compute_fpga_conv_basic(conv_param->args);
}
return ret == 0;
}
FullyConnectedParam& param() { return param_; }
private:
FullyConnectedParam param_;
ConvParam convParam_;
};
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../pe.hpp"
#include "../pe_params.hpp"
namespace paddle_mobile {
namespace zynqmp {
class InputPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
return true;
}
bool dispatch() {
std::cout << "InputPE dispatch \n";
Tensor* input = param_.input;
Tensor* output = param_.output;
Tensor* src = input;
Tensor half_tensor;
if (input->dataType() == DataType::FP32) {
half_tensor.mutableData<float16>(DataType::FP16, input->shape());
half_tensor.copyFrom(input);
src = &half_tensor;
}
output->mutableData<void>();
src->alignImage(output, true);
return true;
}
InputParam& param() { return param_; }
private:
InputParam param_;
};
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/* NEON implementation of sin, cos, exp and log
*
* Inspired by Intel Approximate Math library, and based on the
* corresponding algorithms of the cephes math library
*/
/* Copyright (C) 2011 Julien Pommier
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* (this is the zlib license)
*/
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#pragma once
#include <arm_neon.h>
static const int32_t c_inv_mant_mask = ~0x7f800000u;
static const float c_cephes_SQRTHF = 0.707106781186547524;
static const float c_cephes_log_p0 = 7.0376836292E-2;
static const float c_cephes_log_p1 = -1.1514610310E-1;
static const float c_cephes_log_p2 = 1.1676998740E-1;
static const float c_cephes_log_p3 = -1.2420140846E-1;
static const float c_cephes_log_p4 = +1.4249322787E-1;
static const float c_cephes_log_p5 = -1.6668057665E-1;
static const float c_cephes_log_p6 = +2.0000714765E-1;
static const float c_cephes_log_p7 = -2.4999993993E-1;
static const float c_cephes_log_p8 = +3.3333331174E-1;
static const float c_cephes_log_q1 = -2.12194440e-4;
static const float c_cephes_log_q2 = 0.693359375;
/* natural logarithm computed for 4 simultaneous floats
* returns NaN for x <= 0
*/
static inline float32x4_t log_ps(float32x4_t x) {
float32x4_t one = vdupq_n_f32(1);
x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
int32x4_t ux = vreinterpretq_s32_f32(x);
int32x4_t emm0 = vshrq_n_s32(ux, 23);
/* keep only the fractional part */
ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask));
ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
x = vreinterpretq_f32_s32(ux);
emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f));
float32x4_t e = vcvtq_f32_s32(emm0);
e = vaddq_f32(e, one);
/* part2:
* if( x < SQRTHF ) {
* e -= 1;
* x = x + x - 1.0;
* } else { x = x - 1.0; }
*/
uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
float32x4_t tmp =
vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
x = vsubq_f32(x, one);
e = vsubq_f32(
e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask)));
x = vaddq_f32(x, tmp);
float32x4_t z = vmulq_f32(x, x);
float32x4_t y = vdupq_n_f32(c_cephes_log_p0);
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8));
y = vmulq_f32(y, x);
y = vmulq_f32(y, z);
tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1));
y = vaddq_f32(y, tmp);
tmp = vmulq_f32(z, vdupq_n_f32(0.5f));
y = vsubq_f32(y, tmp);
tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
x = vaddq_f32(x, y);
x = vaddq_f32(x, tmp);
x = vreinterpretq_f32_u32(vorrq_u32(
vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
return x;
}
static const float c_exp_hi = 88.3762626647949f;
static const float c_exp_lo = -88.3762626647949f;
static const float c_cephes_LOG2EF = 1.44269504088896341;
static const float c_cephes_exp_C1 = 0.693359375;
static const float c_cephes_exp_C2 = -2.12194440e-4;
static const float c_cephes_exp_p0 = 1.9875691500E-4;
static const float c_cephes_exp_p1 = 1.3981999507E-3;
static const float c_cephes_exp_p2 = 8.3334519073E-3;
static const float c_cephes_exp_p3 = 4.1665795894E-2;
static const float c_cephes_exp_p4 = 1.6666665459E-1;
static const float c_cephes_exp_p5 = 5.0000001201E-1;
/* exp() computed for 4 floats at once */
static inline float32x4_t exp_ps(float32x4_t x) {
float32x4_t tmp, fx;
float32x4_t one = vdupq_n_f32(1);
x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
/* express exp(x) as exp(g + n*log(2)) */
fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
/* perform a floorf */
tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
/* if greater, subtract 1 */
uint32x4_t mask = vcgtq_f32(tmp, fx);
mask = vandq_u32(mask, vreinterpretq_u32_f32(one));
fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
x = vsubq_f32(x, tmp);
x = vsubq_f32(x, z);
static const float cephes_exp_p[6] = {c_cephes_exp_p0, c_cephes_exp_p1,
c_cephes_exp_p2, c_cephes_exp_p3,
c_cephes_exp_p4, c_cephes_exp_p5};
float32x4_t y = vld1q_dup_f32(cephes_exp_p + 0);
float32x4_t c1 = vld1q_dup_f32(cephes_exp_p + 1);
float32x4_t c2 = vld1q_dup_f32(cephes_exp_p + 2);
float32x4_t c3 = vld1q_dup_f32(cephes_exp_p + 3);
float32x4_t c4 = vld1q_dup_f32(cephes_exp_p + 4);
float32x4_t c5 = vld1q_dup_f32(cephes_exp_p + 5);
y = vmulq_f32(y, x);
z = vmulq_f32(x, x);
y = vaddq_f32(y, c1);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c2);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c3);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c4);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c5);
y = vmulq_f32(y, z);
y = vaddq_f32(y, x);
y = vaddq_f32(y, one);
/* build 2^n */
int32x4_t mm;
mm = vcvtq_s32_f32(fx);
mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
mm = vshlq_n_s32(mm, 23);
float32x4_t pow2n = vreinterpretq_f32_s32(mm);
y = vmulq_f32(y, pow2n);
return y;
}
static const float c_minus_cephes_DP1 = -0.78515625;
static const float c_minus_cephes_DP2 = -2.4187564849853515625e-4;
static const float c_minus_cephes_DP3 = -3.77489497744594108e-8;
static const float c_sincof_p0 = -1.9515295891E-4;
static const float c_sincof_p1 = 8.3321608736E-3;
static const float c_sincof_p2 = -1.6666654611E-1;
static const float c_coscof_p0 = 2.443315711809948E-005;
static const float c_coscof_p1 = -1.388731625493765E-003;
static const float c_coscof_p2 = 4.166664568298827E-002;
static const float c_cephes_FOPI = 1.27323954473516; // 4 / M_PI
/* evaluation of 4 sines & cosines at once.
*
* The code is the exact rewriting of the cephes sinf function.
* Precision is excellent as long as x < 8192 (I did not bother to
* take into account the special handling they have for greater values
* -- it does not return garbage for arguments over 8192, though, but
* the extra precision is missing).
*
* Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
* surprising but correct result.
*
* Note also that when you compute sin(x), cos(x) is available at
* almost no extra cost, so both sin_ps and cos_ps make use of
* sincos_ps.
*/
static inline void sincos_ps(float32x4_t x, float32x4_t *ysin,
float32x4_t *ycos) {
// any x
float32x4_t xmm1, xmm2, xmm3, y;
uint32x4_t emm2;
uint32x4_t sign_mask_sin, sign_mask_cos;
sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
x = vabsq_f32(x);
/* scale by 4/Pi */
y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));
/* store the integer part of y in mm0 */
emm2 = vcvtq_u32_f32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
y = vcvtq_f32_u32(emm2);
/* get the polynomial selection mask
* there is one polynomial for 0 <= x <= Pi/4
* and another one for Pi/4 < x <= Pi/2
*
* Both branches will be computed.
*/
uint32x4_t poly_mask = vtstq_u32(emm2, vdupq_n_u32(2));
/* The magic pass: "Extended precision modular arithmetic"
* x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1);
xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2);
xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3);
x = vaddq_f32(x, xmm1);
x = vaddq_f32(x, xmm2);
x = vaddq_f32(x, xmm3);
sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));
/* Evaluate the first polynomial (0 <= x <= Pi/4) in y1,
* and the second polynomial (Pi/4 <= x <= Pi/2) in y2 */
float32x4_t z = vmulq_f32(x, x);
float32x4_t y1, y2;
y1 = vmulq_n_f32(z, c_coscof_p0);
y2 = vmulq_n_f32(z, c_sincof_p0);
y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1));
y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1));
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, z);
y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2));
y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2));
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, z);
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, x);
y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f)));
y2 = vaddq_f32(y2, x);
y1 = vaddq_f32(y1, vdupq_n_f32(1));
/* select the correct result from the two polynomials */
float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
*ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
*ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
}
static inline float32x4_t sin_ps(float32x4_t x) {
float32x4_t ysin, ycos;
sincos_ps(x, &ysin, &ycos);
return ysin;
}
static inline float32x4_t cos_ps(float32x4_t x) {
float32x4_t ysin, ycos;
sincos_ps(x, &ysin, &ycos);
return ycos;
}
static inline float32x4_t div_ps(float32x4_t a, float32x4_t b) {
float32x4_t reciprocal = vrecpeq_f32(b);
reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
return vmulq_f32(a, reciprocal);
}
static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) {
return exp_ps(vmulq_f32(b, log_ps(a)));
}
#endif // __ARM_NEON__
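Each routine above evaluates four lanes per call; a minimal usage sketch, assuming NEON is available:

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
// exp_ps evaluates four exponentials at once; out[i] ≈ expf(in[i]).
static inline void exp4(const float in[4], float out[4]) {
  vst1q_f32(out, exp_ps(vld1q_f32(in)));
}
#endif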
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../pe.hpp"
#include "../pe_params.hpp"
namespace paddle_mobile {
namespace zynqmp {
class OutputPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(false);
return true;
}
bool dispatch() {
Tensor* input = param_.input;
Tensor* output = param_.output;
Tensor* src_tensor = input;
Tensor float_tensor;
input->invalidate();
float_tensor.mutableData<float>(DataType::FP32, input->shape());
if (input->dataType() == DataType::FP16) {
float_tensor.copyFrom(input);
src_tensor = &float_tensor;
}
src_tensor->unalignImage(output, true);
return true;
}
OutputParam& param() { return param_; }
private:
OutputParam param_;
};
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../pe.hpp"
#include "../pe_params.hpp"
namespace paddle_mobile {
namespace zynqmp {
class PoolingPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
return true;
}
void apply() {
Tensor* input = param_.input;
Tensor* output = param_.output;
uint32_t k_width = param_.kernelSize[0];
uint32_t k_height = param_.kernelSize[1];
if (param_.globalPooling) {
k_width = input->shape().width();
k_height = input->shape().height();
}
PoolingArgs args = {0};
args.mode = param_.type;
args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
args.image.address = input->data<float16>();
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
args.image.pad_height = param_.paddings[0];
args.image.pad_width = param_.paddings[1];
args.image.scale_address = input->scale();
args.output.address = output->mutableData<float16>();
args.output.scale_address = output->scale();
args.kernel.height = k_height;
args.kernel.width = k_width;
args.kernel.stride_h = param_.strides[0];
args.kernel.stride_w = param_.strides[1];
args.out_height = output->shape().height();
args.out_width = output->shape().width();
param_.poolingArgs = args;
}
bool dispatch() { return compute_fpga_pool(param_.poolingArgs) == 0; }
PoolingParam& param() { return param_; }
private:
PoolingParam param_;
};
} // namespace zynqmp
} // namespace paddle_mobile
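For AVERAGE pooling, the kernel_reciprocal precomputed above (1/(k_w k_h) in fp16) is presumably what lets the engine turn the window sum into a mean:

\text{out}_{n,c,x,y} = \frac{1}{k_w k_h} \sum_{(i,j) \in \text{window}(x,y)} \text{in}_{n,c,i,j}

with the window widened to the whole feature map when globalPooling is set.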
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "softmax_pe.hpp"
#include <vector>
namespace paddle_mobile {
namespace zynqmp {
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#ifndef __aarch64__
static inline float32_t vmaxvq_f32(const float32x4_t &r) {
float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r));
return vget_lane_f32(vpmax_f32(v, v), 0);
}
static inline float32_t vaddvq_f32(const float32x4_t &r) {
float32x2_t v = vadd_f32(vget_high_f32(r), vget_low_f32(r));
return vget_lane_f32(vpadd_f32(v, v), 0);
}
#endif // __aarch64__
#endif // __ARM_NEON__
static float find_max(const float *input, const int num_classes) {
int remain = num_classes;
float max = -std::numeric_limits<float>::max();
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
int loop = num_classes >> 3;
remain = num_classes & 0x7;
float32x4_t __max = vdupq_n_f32(max);
for (int i = 0; i < loop; ++i, input += 8) {
float32x4_t x0 = vld1q_f32(input);
float32x4_t x1 = vld1q_f32(input + 4);
__max = vmaxq_f32(x0, __max);
__max = vmaxq_f32(x1, __max);
}
max = vmaxvq_f32(__max);
#endif
for (int i = 0; i < remain; ++i) {
max = std::max(max, input[i]);
}
return max;
}
static void softmax(Tensor *X, Tensor *Y) {
std::vector<int> dims = X->shape().dims();
int batch_size = X->shape().num();
int num_classes = dims[X->shape().dimSize() - 1];
int channels = X->shape().numel() / batch_size / num_classes;
float *x = X->data<float>();
float *y = Y->mutableData<float>();
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int channel = 0; channel < channels; ++channel) {
size_t offset = (batch * channels + channel) * num_classes;
const float *input = x + offset;
float *output = y + offset;
// find max
float max = find_max(input, num_classes);
// exp(x - max)
int remain = num_classes;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
int loop = num_classes >> 3;
remain = num_classes & 0x7;
float32x4_t __max = vdupq_n_f32(max);
for (int i = 0; i < loop; ++i, input += 8, output += 8) {
float32x4_t x0 = vld1q_f32(input);
float32x4_t x1 = vld1q_f32(input + 4);
x0 = vsubq_f32(x0, __max);
x1 = vsubq_f32(x1, __max);
x0 = exp_ps(x0);
x1 = exp_ps(x1);
vst1q_f32(output, x0);
vst1q_f32(output + 4, x1);
}
#endif // __ARM_NEON__
for (int i = 0; i < remain; ++i) {
output[i] = expf(input[i] - max);
}
// sum(exp(x - max))
float sum = 0.f;
output = y + offset;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
float32x4_t __sum = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i, output += 8) {
float32x4_t x0 = vld1q_f32(output);
float32x4_t x1 = vld1q_f32(output + 4);
__sum = vaddq_f32(x0, __sum);
__sum = vaddq_f32(x1, __sum);
}
sum += vaddvq_f32(__sum);
#endif // __ARM_NEON__
for (int i = 0; i < remain; ++i) {
sum += output[i];
}
// exp(x - max) / sum
float inv_sum = 1.f / sum;
output = y + offset;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
float32x4_t __inv_sum = vdupq_n_f32(inv_sum);
for (int i = 0; i < loop; ++i, output += 8) {
float32x4_t x0 = vld1q_f32(output);
float32x4_t x1 = vld1q_f32(output + 4);
x0 = vmulq_f32(x0, __inv_sum);
x1 = vmulq_f32(x1, __inv_sum);
vst1q_f32(output, x0);
vst1q_f32(output + 4, x1);
}
#endif
for (int i = 0; i < remain; ++i) {
output[i] *= inv_sum;
}
}
}
}
bool SoftmaxPE::init() {
Tensor *output = param_.output;
output->setAligned(false);
return true;
}
bool SoftmaxPE::dispatch() {
Tensor *input = param_.input;
Tensor *output = param_.output;
input->invalidate();
Tensor float_input;
Tensor float_output;
float_input.mutableData<float>(DataType::FP32, input->shape());
float_input.copyFrom(input);
float_input.unalignImage();
float_output.mutableData<float>(DataType::FP32, input->shape());
softmax(&float_input, &float_output);
float_output.flush();
output->copyFrom(&float_output);
return true;
}
SoftmaxParam &SoftmaxPE::param() { return param_; }
} // namespace zynqmp
} // namespace paddle_mobile
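The kernel above implements the numerically stable three-pass softmax: find the row maximum, exponentiate the shifted values, then normalize by their sum. Subtracting the maximum changes nothing mathematically but keeps expf within range:

\mathrm{softmax}(x)_i = \frac{e^{x_i - m}}{\sum_j e^{x_j - m}}, \qquad m = \max_j x_j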
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h>
#include <algorithm>
#include <limits>
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#include "fpga/KD/pes/math_func_neon.h"
#endif
#include "../pe.hpp"
#include "../pe_params.hpp"
namespace paddle_mobile {
namespace zynqmp {
class SoftmaxPE : public PE {
public:
bool init();
bool dispatch();
SoftmaxParam& param();
private:
SoftmaxParam param_;
};
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdio.h>
#include <vector>
#include "fpga/KD/alignment.h"
#include "fpga/KD/layout.hpp"
namespace paddle_mobile {
namespace zynqmp {
static struct NCHW nchw_;
static struct NHWC nhwc_;
static struct NC nc_;
static struct NHW nhw_;
static struct N n_;
class Shape {
public:
explicit Shape(std::vector<int> dims) { dims_ = dims; }
Shape(LayoutType type, std::vector<int> dims) {
dims_ = dims;
setLayoutType(type);
}
Shape(const Shape& src) {
dims_ = src.dims_;
setLayoutType(src.layoutType_);
}
bool shouldAlign() {
return layout_->alignedElementCount(dims_) != layout_->elementCount(dims_);
}
int num() {
int index = layout_->numIndex();
return index == -1 ? 1 : dims_[index];
}
int channel() {
int index = layout_->channelIndex();
return index == -1 ? 1 : dims_[index];
}
int height() {
int index = layout_->heightIndex();
return index == -1 ? 1 : dims_[index];
}
int width() {
int index = layout_->widthIndex();
return index == -1 ? 1 : dims_[index];
}
int dimSize() { return dims_.size(); }
std::vector<int> dims() { return dims_; }
size_t memorySize(int cellSize) {
return layout_->alignedElementCount(dims_) * cellSize;
}
int numel() { return layout_->elementCount(dims_); }
void setLayoutType(LayoutType layout) {
this->layoutType_ = layout;
switch (layout) {
case NCHW:
layout_ = &nchw_;
break;
case NHWC:
layout_ = &nhwc_;
break;
case NC:
layout_ = &nc_;
break;
case NHW:
layout_ = &nhw_;
break;
case N:
layout_ = &n_;
break;
default:
break;
}
}
int operator[](int index) { return dims_[index]; }
private:
LayoutType layoutType_;
Layout* layout_ = &nhwc_;
std::vector<int> dims_;
};
} // namespace zynqmp
} // namespace paddle_mobile
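A short usage sketch of Shape (assuming the NHWC layout in layout.hpp, not shown in this patch, maps channel to the last axis; absent axes report index -1 and hence size 1):

inline void shape_demo() {
  paddle_mobile::zynqmp::Shape s(paddle_mobile::zynqmp::NHWC, {1, 224, 224, 3});
  int c = s.channel();             // 3: channel axis under NHWC
  int n = s.numel();               // 1 * 224 * 224 * 3 logical elements
  size_t bytes = s.memorySize(2);  // aligned element count * 2-byte fp16 cells
  (void)c; (void)n; (void)bytes;
}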
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdio.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "float16.hpp"
#include "llapi/zynqmp_api.h"
#include "shape.hpp"
namespace paddle_mobile {
namespace zynqmp {
enum DataType : int {
FP32 = 0,
FP16 = 1,
INT8 = 2,
};
typedef uint16_t float16;
inline int CellSize(DataType type) {
switch (type) {
case FP32:
return sizeof(float);
case FP16:
return sizeof(float16);
case INT8:
return sizeof(int8_t);
default:
return 0;
}
return 0;
}
class PlaceHolder {
public:
explicit PlaceHolder(size_t size) {
size_ = size;
data_ = fpga_malloc(size_);
}
void* data() { return data_; }
size_t memorySize() { return size_; }
~PlaceHolder() {
std::cout << "place holder dealloc";
fpga_free(data_);
}
private:
void* data_ = nullptr;
size_t size_ = 0;
};
class Tensor {
public:
int id() { return id_; }
template <typename Dtype>
Dtype* data() {
if (placeHolder_ == nullptr) {
return nullptr;
}
return reinterpret_cast<Dtype*>(this->placeHolder_->data());
}
template <typename Dtype>
Dtype* mutableData(DataType dataType, const Shape& shape) {
if (this->shape_ != nullptr) {
delete shape_;
}
this->shape_ = new Shape(shape);
this->dataType_ = dataType;
return mutableData<Dtype>();
}
template <typename Dtype>
Dtype* mutableData() {
size_t memorySize = shape_->memorySize(CellSize(dataType_));
if (placeHolder_ != nullptr) {
if (memorySize > placeHolder_->memorySize()) {
delete placeHolder_;
placeHolder_ = new PlaceHolder(memorySize);
}
} else {
placeHolder_ = new PlaceHolder(memorySize);
}
return reinterpret_cast<Dtype*>(placeHolder_->data());
}
void setDataType(DataType dataType) { this->dataType_ = dataType; }
DataType dataType() { return this->dataType_; }
Shape& shape() { return *shape_; }
bool aligned() { return this->aligned_; }
void setAligned(bool aligned) { this->aligned_ = aligned; }
float* scale() { return scale_; }
void alignImage(Tensor* dst = nullptr, bool copy = false) {
if (shape_->shouldAlign()) {
int cell_size = CellSize(this->dataType_);
char* dst_data = nullptr;
size_t mem_size = shape_->memorySize(cell_size);
if (dst == nullptr) {
dst_data = reinterpret_cast<char*>(fpga_malloc(mem_size));
} else {
dst_data = dst->data<char>();
}
int wc = shape_->width() * shape_->channel();
int wc_aligned = align_image(wc);
int remainder = wc_aligned - wc;
char* src_start = data<char>();
char* dst_start = dst_data;
for (int n = 0; n < shape_->num(); n++) {
for (int h = 0; h < shape_->height(); h++) {
memcpy(dst_start, src_start, wc * cell_size);
memset(dst_start + wc * cell_size, 0, remainder * cell_size); // zero the alignment padding
src_start += wc * cell_size;
dst_start += wc_aligned * cell_size;
}
}
if (dst == nullptr) {
memcpy(data<void>(), dst_data, mem_size);
flush();
fpga_free(dst_data);
} else {
dst->flush();
}
} else {
if (copy) {
dst->copyFrom(this);
} else {
// TODO(chonwhite) share data.
}
}
}
void unalignImage(Tensor* dst = nullptr, bool copy = false) {
if (shape_->shouldAlign()) {
// TODO(chonwhite): the unalign path (inverse of alignImage above, copying
// wc_aligned-strided rows back to contiguous wc rows) is not implemented yet.
} else {
if (copy) {
dst->copyFrom(this);
} else {
// TODO(chonwhite) share data.
}
}
}
void copyFrom(Tensor* src) {
BypassArgs args;
args.input_data_type =
src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
args.output_data_type = dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
args.input_layout_type = LAYOUT_HWC;
args.output_layout_type = LAYOUT_HWC;
args.image = {.address = src->data<void>(),
.scale_address = src->scale(),
.channels = (uint32_t)src->shape().channel(),
.width = (uint32_t)src->shape().width(),
.height = (uint32_t)src->shape().height(),
.pad_width = 0u,
.pad_height = 0u};
args.output = {
.address = data<void>(),
.scale_address = scale(),
};
src->flush();
perform_bypass(args);
this->invalidate();
}
void flush() { fpga_flush(placeHolder_->data(), placeHolder_->memorySize()); }
void invalidate() {
fpga_invalidate(placeHolder_->data(), placeHolder_->memorySize());
}
void print() {
int count = shape_->numel();
for (int i = 0; i < count; i++) {
float value = dataType_ == FP32 ? data<float>()[i]
: half_to_float(data<float16>()[i]);
std::cout << value << '\n';
}
}
void saveToFile() {
std::string path = std::to_string(id_) + ".txt";
saveToFile(path);
}
void saveToFile(std::string path) {
std::ofstream ofs;
static int counter = 0;
std::string npath = std::to_string(counter) + "_" + path;
counter++;
ofs.open(npath);
for (int i = 0; i < shape_->numel(); i++) {
float value = 0;
if (dataType_ == FP32) {
value = data<float>()[i];
} else {
value = half_to_float(data<float16>()[i]);
}
ofs << value << std::endl;
}
ofs.close();
}
private:
float scale_[2];
Shape* shape_ = nullptr;
DataType dataType_ = FP32;
bool aligned_ = false;
static int generateID() {
static int sID = 0;
int id = sID++;
return id;
}
int id_ = generateID();
PlaceHolder* placeHolder_ = nullptr;
};
} // namespace zynqmp
} // namespace paddle_mobile
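A typical round-trip with the Tensor above, as a sketch (assumes the FPGA allocator is initialized; flush/invalidate wrap the fpga_flush/fpga_invalidate cache-maintenance calls):

inline void tensor_demo() {
  using namespace paddle_mobile::zynqmp;
  Tensor t;
  Shape shape(NHWC, {1, 8, 8, 16});
  float* p = t.mutableData<float>(FP32, shape);  // allocates via fpga_malloc
  p[0] = 1.0f;
  t.flush();       // push CPU writes to device-visible memory
  t.invalidate();  // pull device writes back before reading on the CPU
}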
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <cmath>
#include "tensor_util.hpp"
namespace paddle_mobile {
namespace zynqmp {
float find_max(const Tensor& tensor) {
float max = 0;
Tensor& t = const_cast<Tensor&>(tensor);
float* data = t.data<float>();
for (int i = 0; i < t.shape().numel(); i++) {
max = std::max(std::fabs(data[i]), max); // the quantization scale must cover negative values too
}
return max;
}
} // namespace zynqmp
} // namespace paddle_mobile
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdio.h>
#include "tensor.hpp"
namespace paddle_mobile {
namespace zynqmp {
float find_max(const Tensor& tensor);
} // namespace zynqmp
} // namespace paddle_mobile
......@@ -62,7 +62,8 @@ Executor<Device, T>::Executor(const Program<Device> &program,
use_optimize_ ? program_.optimizeProgram : program_.originProgram;
PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
"program_desc_ should not be nullptr");
#if !defined(PADDLE_MOBILE_FPGA) && !defined(PADDLE_MOBILE_CL)
#if !defined(PADDLE_MOBILE_FPGA) && !defined(PADDLE_MOBILE_FPGA_KD) && \
!defined(PADDLE_MOBILE_CL)
pass::MemoryOptPass()(program_desc_.get(), program_.scope.get());
#endif
// resize feed and fetch list
......@@ -230,20 +231,6 @@ void Executor<Device, T>::InitMemory() {
}
}
static void ClearNoPersistableTensorArray(const framework::ProgramDesc *program,
framework::Scope *scope) {
for (const auto &block : program->Blocks()) {
for (const auto &var_desc : block->Vars()) {
if (!var_desc->Persistable() &&
var_desc->Type() == VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY) {
auto var = scope->Var(var_desc->Name());
auto array = var->template GetMutable<framework::LoDTensorArray>();
array->resize(1);
}
}
}
}
template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
char *origin_data = nullptr;
......@@ -281,6 +268,20 @@ void Executor<Device, T>::InitCombineMemory() {
LOG(kLOG_INFO) << "init combine memory finish";
}
static void ClearNoPersistableTensorArray(const framework::ProgramDesc *program,
framework::Scope *scope) {
for (const auto &block : program->Blocks()) {
for (const auto &var_desc : block->Vars()) {
if (!var_desc->Persistable() &&
var_desc->Type() == VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY) {
auto var = scope->Var(var_desc->Name());
auto array = var->template GetMutable<framework::LoDTensorArray>();
array->resize(1);
}
}
}
}
template <typename Device, typename T>
void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
for (const auto &block : program_desc_->Blocks()) {
......
......@@ -26,6 +26,12 @@ limitations under the License. */
#include "framework/tensor_base.h"
#include "memory/t_malloc.h"
#ifdef PADDLE_MOBILE_FPGA_KD
#include "framework/zynqmp/ztensor.hpp"
#endif
#ifndef PADDLE_MOBILE_FPGA_KD
namespace paddle_mobile {
namespace framework {
......@@ -290,3 +296,5 @@ inline Tensor ReshapeToMatrix(const Tensor &src, int num_col_dims) {
} // namespace framework
} // namespace paddle_mobile
#endif
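Together with the #ifdef above it, this guard makes the header select a Tensor implementation at compile time; the net effect is equivalent to this sketch (condensed, not the literal patch text):

#ifdef PADDLE_MOBILE_FPGA_KD
#include "framework/zynqmp/ztensor.hpp"  // KD build: zynqmp-backed Tensor
#else
// ... the original framework::Tensor definition, compiled out for KD ...
#endif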
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
#include <cstring>
#include <fstream>
#include <memory>
#include <string>
#include <vector>
#include "common/enforce.h"
#include "framework/data_layout.h"
#include "framework/tensor_base.h"
#include "memory/t_malloc.h"
#ifdef PADDLE_MOBILE_FPGA_KD
#include "fpga/KD/tensor.hpp"
namespace paddle_mobile {
namespace framework {
class LoDTensor;
class Tensor : public TensorBase {
public:
Tensor() {}
template <typename T>
Tensor(std::vector<T> input, DDim ddim) {
PADDLE_MOBILE_ENFORCE(
input.size() == framework::product(ddim),
"input vector'length should be equal to tensor's length");
auto input_ptr = mutable_data<T>(ddim);
for (int i = 0; i < input.size(); ++i) {
input_ptr[i] = input[i];
}
}
Tensor(const Tensor &inTensor) {
this->dims_ = inTensor.dims_;
this->holder_ = inTensor.holder_;
this->offset_ = inTensor.offset_;
}
/*! Resize the dimensions of the memory block. */
inline Tensor &Resize(const DDim &dims) {
dims_ = dims;
// TODO(chonwhite) resize holder?
return *this;
}
/*! The internal of two tensors share the same memory block. */
inline Tensor &ShareDataWith(const Tensor &src) {
src.check_memory_size();
if (holder_.get() != src.holder_.get()) {
*this = src;
}
return *this;
}
/*! The internal of two tensors share the same memory block. */
inline Tensor &ShareHolderWith(const Tensor &src) {
src.check_memory_size();
if (holder_.get() != src.holder_.get()) {
holder_ = src.holder_;
}
return *this;
}
inline zynqmp::Tensor *zynqmpTensor() const {
PlaceholderImpl *holder = static_cast<PlaceholderImpl *>(holder_.get());
// mutable_data(holder->type());
return holder->tensor_;
}
inline void *mutable_data(const kTypeId_t type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must be >= 0.")
int64_t size = numel() * SizeOfType(type);
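// reallocate only when there is no holder yet or the existing one is too small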
if (holder_ == nullptr || holder_->size() < size + offset_) {
PlaceholderImpl *impl = nullptr;
if (holder_ == nullptr) {
std::cout << "holder null" << std::endl;
impl = new PlaceholderImpl(dims_, type);
holder_.reset(impl);
} else {
impl = static_cast<PlaceholderImpl *>(holder_.get());
std::cout << "holder reize" << std::endl;
// holder_->resize(size);
}
impl->resize(dims_, type);
offset_ = 0;
}
return reinterpret_cast<void *>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
/**
* @brief Return a pointer to mutable memory block.
* @note If not exist, then allocation.
*/
template <typename T>
inline T *mutable_data() {
static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T *>(mutable_data(type_id<T>().hash_code()));
}
/**
* @brief Return a pointer to mutable memory block.
*
* @param[in] dims The dimensions of the memory block.
* @param[in] place The place of the memory block.
*
* @note If not exist, then allocation.
*/
template <typename T>
inline T *mutable_data(DDim dims) {
static_assert(std::is_pod<T>::value, "T must be POD");
Resize(dims);
return mutable_data<T>();
}
/**
* @brief Return a sub-tensor of the given tensor.
*
* @param[in] begin_idx The index of the start row(inclusive) to
* slice.
* The index number begins from 0.
* @param[in] end_idx The index of the end row(exclusive) to
* slice.
* The index number begins from 0.
*/
inline Tensor Slice(int begin_idx, int end_idx) const {
check_memory_size();
PADDLE_MOBILE_ENFORCE(begin_idx >= 0,
"The start row index must be greater than 0.")
PADDLE_MOBILE_ENFORCE(end_idx <= dims_[0],
"The end row index is out of bound.")
PADDLE_MOBILE_ENFORCE(
begin_idx < end_idx,
"The start row index must be lesser than the end row index")
if (dims_[0] == 1) {
return *this;
} else {
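// number of elements in one slice along dimension 0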
size_t base = numel() / dims_[0];
Tensor dst;
dst.holder_ = holder_;
DDim dst_dims = dims_;
dst_dims[0] = end_idx - begin_idx;
dst.Resize(dst_dims);
dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
return dst;
}
}
/*! Return a pointer to mutable memory block. */
template <typename T>
inline T *data() {
check_memory_size();
PADDLE_MOBILE_ENFORCE(
(std::is_same<T, void>::value ||
holder_->type() == type_id<T>().hash_code()),
"Tensor holds the wrong type, it holds %d, requested %d",
this->holder_->type(), type_id<T>().hash_code());
return reinterpret_cast<T *>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
/*! Return a pointer to constant memory block. */
template <typename T>
inline const T *data() const {
check_memory_size();
PADDLE_MOBILE_ENFORCE(
(std::is_same<T, void>::value ||
holder_->type() == type_id<T>().hash_code()),
"Tensor holds the wrong type, it holds %d, requested %d",
this->holder_->type(), type_id<T>().hash_code());
return reinterpret_cast<const T *>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
private:
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(DDim ddim, const kTypeId_t type) {
tensor_ = new zynqmp::Tensor();
type_ = type;
std::vector<int> v = framework::vectorize2int(ddim);
zynqmp::LayoutType layout_type = zynqmp::NCHW;
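// map tensor rank (1-4) onto the corresponding zynqmp layout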
switch (v.size()) {
case 1:
layout_type = zynqmp::N;
break;
case 2:
layout_type = zynqmp::NC;
break;
case 3:
layout_type = zynqmp::NHW;
break;
case 4:
layout_type = zynqmp::NCHW;
break;
}
zynqmp::Shape input_shape(layout_type, v);
// for (int i = 0; i < v.size(); i++) {
// std::cout << ":" << v[i] << std::endl;
// }
zynqmp::DataType dtype = type == _float ? zynqmp::FP32 : zynqmp::FP16;
tensor_->mutableData<float>(dtype, input_shape);
}
virtual size_t size() const { return size_; }
virtual void *ptr() const {
void *ptr = tensor_->data<void *>();
return ptr;
}
virtual kTypeId_t type() const { return type_; }
virtual void set_type(const kTypeId_t type) { type_ = type; }
virtual void resize(size_t size) {
if (size > capacity_) {
capacity_ = size;
// TODO(chonwhite) implement;
}
size_ = size;
}
void resize(DDim ddim, const kTypeId_t type) {
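// rebuild the underlying zynqmp tensor for the new shape; mirrors the constructor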
std::vector<int> v = framework::vectorize2int(ddim);
zynqmp::LayoutType layout_type = zynqmp::NCHW;
switch (v.size()) {
case 1:
layout_type = zynqmp::N;
break;
case 2:
layout_type = zynqmp::NC;
break;
case 3:
layout_type = zynqmp::NHW;
break;
case 4:
layout_type = zynqmp::NCHW;
break;
}
zynqmp::Shape input_shape(layout_type, v);
zynqmp::DataType dtype = type == _float ? zynqmp::FP32 : zynqmp::FP16;
tensor_->mutableData<float>(dtype, input_shape);
}
/*! the size of memory block. */
size_t size_ = 0;
size_t capacity_ = 0;
/* the current type of memory */
kTypeId_t type_;
zynqmp::Tensor *tensor_;
// zynqmp::Shape* shape_;
};
};
#ifdef PADDLE_MOBILE_DEBUG
inline Print &operator<<(Print &printer, const Tensor &tensor) {
printer << " dims: " << tensor.dims() << "\n";
int stride = tensor.numel() / 20;
stride = stride > 0 ? stride : 1;
return printer;
}
#endif
inline Tensor ReshapeToMatrix(const Tensor &src, int num_col_dims) {
Tensor res;
res.ShareDataWith(src);
res.Resize(flatten_to_2d(src.dims(), num_col_dims));
return res;
}
} // namespace framework
} // namespace paddle_mobile
#endif
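A minimal sketch of driving this KD-backed Tensor (framework::make_ddim is assumed to be available as in the rest of paddle-mobile; the dims are arbitrary):

framework::Tensor t;
t.Resize(framework::make_ddim({1, 3, 8, 8}));
float* out = t.mutable_data<float>();   // allocates the zynqmp-backed holder
zynqmp::Tensor* zt = t.zynqmpTensor();  // view the FPGA-side tensor
zt->flush();                            // sync CPU writes before FPGA use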
......@@ -24,6 +24,10 @@ limitations under the License. */
#include "fpga/V2/api.h"
#endif
#ifdef PADDLE_MOBILE_FPGA_KD
#include "fpga/KD/llapi/zynqmp_api.h"
#endif
namespace paddle_mobile {
namespace memory {
const int MALLOC_ALIGN = 64;
......@@ -43,7 +47,21 @@ void Free(void *ptr) {
}
}
#elif defined(PADDLE_MOBILE_FPGA_KD)
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
}
void *Alloc(size_t size) { return zynqmp::fpga_malloc(size); }
void Free(void *ptr) {
if (ptr) {
zynqmp::fpga_free(ptr);
}
}
#else
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
}
......@@ -57,11 +75,9 @@ void *Alloc(size_t size) {
void *r = reinterpret_cast<void *>(reinterpret_cast<size_t>(p + offset) &
(~(MALLOC_ALIGN - 1)));
static_cast<void **>(r)[-1] = p;
return r; // if necessary, you need initialize memory by yourself (developer)
return r;
}
// if you use this ptr again after Free
// you need to assign `ptr` as nullptr yourself (developer)
void Free(void *ptr) {
if (ptr) {
free(static_cast<void **>(ptr)[-1]);
......
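For reference, the aligned-allocation scheme in the CPU path above works by over-allocating by sizeof(void*) + MALLOC_ALIGN - 1, rounding the pointer to the alignment boundary, and stashing the raw malloc pointer in the slot just below the returned address so Free can recover it. A self-contained sketch of the same trick (not code from this patch):

#include <cstdlib>

static const size_t kMallocAlign = 64;

void* AlignedAlloc(size_t size) {
  size_t offset = sizeof(void*) + kMallocAlign - 1;
  char* p = static_cast<char*>(malloc(offset + size));
  if (p == nullptr) return nullptr;
  // p + offset rounded down to a 64-byte boundary still lies at least
  // sizeof(void*) past p, leaving room for the stash slot at r[-1]
  void* r = reinterpret_cast<void*>(reinterpret_cast<size_t>(p + offset) &
                                    (~(kMallocAlign - 1)));
  static_cast<void**>(r)[-1] = p;  // remember what malloc actually returned
  return r;
}

void AlignedFree(void* ptr) {
  if (ptr) free(static_cast<void**>(ptr)[-1]);
}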
......@@ -37,7 +37,7 @@ REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(elementwise_add, ops::ElementwiseAddOp);
#endif
......
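Each operator registration below receives the same one-line guard change so that kernels register in both the classic FPGA and the new KD builds; in isolation the pattern is (my_op and ops::MyOp are placeholder names):

#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(my_op, ops::MyOp);  // placeholder operator
#endif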
......@@ -38,7 +38,7 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_CL
......
......@@ -31,7 +31,7 @@ namespace ops = paddle_mobile::operators;
REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_CL
......
......@@ -55,7 +55,7 @@ REGISTER_FUSION_MATCHER(fusion_conv_add_bn_relu,
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
#endif
#ifdef PADDLE_MOBILE_CL
......
......@@ -58,7 +58,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp);
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_conv_add, ops::FusionConvAddOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_conv_add, ops::FusionConvAddOp);
#endif
#endif
......@@ -53,7 +53,7 @@ REGISTER_FUSION_MATCHER(fusion_conv_add_relu, ops::FusionConvAddReluOpMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif
#ifdef PADDLE_MOBILE_CL
......
......@@ -58,7 +58,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif
......
......@@ -54,7 +54,7 @@ REGISTER_FUSION_MATCHER(fusion_conv_bn, ops::FusionConvBNMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn, ops::FusionConvBNOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_conv_bn, ops::FusionConvBNOp);
#endif
......
......@@ -57,7 +57,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
#endif
......
......@@ -36,7 +36,7 @@ REGISTER_FUSION_MATCHER(fusion_elementwise_add_relu,
// REGISTER_OPERATOR_CPU(fusion_elementwise_add_relu,
// ops::FusionElementwiseAddReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_elementwise_add_relu,
ops::FusionElementwiseAddReluOp);
#endif
......
......@@ -63,7 +63,7 @@ REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp);
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_fc, ops::FusionFcOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_fc, ops::FusionFcOp);
#endif
......
......@@ -60,7 +60,7 @@ REGISTER_FUSION_MATCHER(fusion_fc_relu, ops::FusionFcReluMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_fc_relu, ops::FusionFcReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(fusion_fc_relu, ops::FusionFcReluOp);
#endif
......
(11 file diffs collapsed and not shown.)
......@@ -37,7 +37,7 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(pad2d, ops::Pad2DOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#if defined(PADDLE_MOBILE_FPGA) || defined(PADDLE_MOBILE_FPGA_KD)
REGISTER_OPERATOR_FPGA(pad2d, ops::Pad2DOp);
#endif
......
(4 file diffs collapsed and not shown.)