Commit 2e2c9d4b authored by MyPandaShaoxiang

fix: update backend fpga patch

Parent 41826a31
@@ -3,13 +3,35 @@ if (NOT LITE_WITH_FPGA)
endif()

set(LITE_FPGA_KD_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD")
set(LITE_FPGA_KD_LLAPI_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD/llapi")
set(LITE_FPGA_KD_PE_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD/pes")
set(LITE_FPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga")

message("fpga_kd_path ${LITE_FPGA_KD_PATH}")
message("fpga_path ${LITE_FPGA_PATH}")

file(GLOB KD_CPP "${LITE_FPGA_KD_PATH}/*.cpp")
file(GLOB PE_CPP "${LITE_FPGA_KD_PE_PATH}/*.cpp")
file(GLOB LLAPI_CPP "${LITE_FPGA_KD_LLAPI_PATH}/*.cpp")
file(GLOB FPGA_CPP "${LITE_FPGA_PATH}/*.cc")

set(FPGA_ALL_CPP "")
FOREACH(FILE_PATH ${KD_CPP})
STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
list(APPEND FPGA_ALL_CPP KD/${FILE_NAME})
ENDFOREACH(FILE_PATH)
FOREACH(FILE_PATH ${PE_CPP})
STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
list(APPEND FPGA_ALL_CPP KD/pes/${FILE_NAME})
ENDFOREACH(FILE_PATH)
FOREACH(FILE_PATH ${LLAPI_CPP})
STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
list(APPEND FPGA_ALL_CPP KD/llapi/${FILE_NAME})
ENDFOREACH(FILE_PATH)
FOREACH(FILE_PATH ${FPGA_CPP})
STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
list(APPEND FPGA_ALL_CPP ${FILE_NAME})
ENDFOREACH(FILE_PATH)
message("fpga kd: ${FPGA_ALL_CPP}")
cc_library(kernel_fpga SRCS ${FPGA_ALL_CPP})
#cc_library(kernel_fpga SRCS ${KD_CPP} ${FPGA_CPP})
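# Note: the FOREACH loops above rebuild each absolute GLOB result as a path
# relative to this directory (KD/..., KD/pes/..., KD/llapi/...), since
# cc_library resolves its SRCS against the current source directory.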
cc_library(lite_tensor_fpga SRCS lite_tensor.cc DEPS memory)
cc_library(fpga_target_wrapper SRCS target_wrapper.cc DEPS kernel_fpga)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>

// #include "lite/backends/fpga/lite_tensor.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
#define FPGA_PRINT_TENSOR
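// Process-wide debug helper: a lazily constructed singleton that can dump
// per-op output tensors (the actual dump calls below are commented out).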
class Debugger {
 public:
  static Debugger& get_instance() {
    static Debugger s_instance;
    return s_instance;
  }

  void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
    // tensor->printScale();
    if (op_type != "conv") {
      // tensor->saveToFile(op_type, true);
    }
  }

 private:
  std::unordered_map<std::string, bool> op_config;
  Debugger() {
    op_config["concat"] = true;
    op_config["conv"] = true;
    op_config["crop"] = true;
  }
};
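// Typical (illustrative) usage from a kernel:
//   Debugger::get_instance().registerOutput("conv", output_tensor);

// chw_to_hwc: copies an N(C,H,W) tensor's data into NHWC order. Height and
// width default to 1, so 2-D (NC) tensors pass through unchanged.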
inline void chw_to_hwc(Tensor* t, float* dst) {
  int num = t->dims()[0];
  int channel = t->dims()[1];

  int height = 1;
  int width = 1;
  if (t->dims().size() > 2) {
    height = t->dims()[2];
  }
  if (t->dims().size() > 3) {
    width = t->dims()[3];
  }
  // int width = t->dims()[3];
  const float* chw_data = t->data<float>();
  float* hwc_data = dst;

  int chw = channel * height * width;
  int wc = width * channel;
  int index = 0;
  for (int n = 0; n < num; n++) {
    for (int c = 0; c < channel; c++) {
      for (int h = 0; h < height; h++) {
        for (int w = 0; w < width; w++) {
          hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index];
          index++;
        }
      }
    }
  }
}
inline void read_from_file(lite::Tensor* t, const std::string& path) {
  std::ifstream file_stream;
  file_stream.open(path);
  if (!file_stream) {
    return;
  }
  float* data = t->mutable_data<float>();
  int num = t->numel();
  for (int i = 0; i < num; ++i) {
    float value = 0;
    file_stream >> value;
    data[i] = value;
  }
  // flush();
}
inline void save_float(float* data, const std::string& name, int len) {
  // return;
  static int counter = 0;
  std::string old_string = std::to_string(counter);
  std::string new_string =
      std::string(3 - old_string.length(), '0') + old_string;
  std::string file = "arm_" + new_string + name;
  counter++;

  std::cout
      << "-------------------------- saving file: --------------------------"
      << file << std::endl;
  std::ofstream ofs;
  ofs.open(file);
  // float* data = dst;
  for (int i = 0; i < len; i++) {
    float value = data[i];
    ofs << value << std::endl;
  }
  ofs.close();
}
inline void save_tensor(lite::Tensor* t,
                        const std::string& name,
                        bool convert = true) {
  float* data = const_cast<float*>(t->data<float>());
  float* dst = new float[t->numel()];
  if (convert) {
    chw_to_hwc(t, dst);
    data = dst;
  }

  save_float(data, name, t->numel());
  delete[] dst;
}

inline void save_tensor(const lite::Tensor* t,
                        const std::string& name,
                        bool convert = true) {
  // return;
  float* data = const_cast<float*>(t->data<float>());
  float* dst = new float[t->numel()];
  if (convert) {
    chw_to_hwc(const_cast<lite::Tensor*>(t), dst);
    data = dst;
  }

  save_float(data, name, t->numel());
  delete[] dst;
}
} // namespace lite
} // namespace paddle
@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "lite/backends/fpga/KD/dl_engine.hpp"
namespace paddle {
namespace zynqmp {

DLEngine::DLEngine() {
  open_device();
  int ret = get_device_info(info_);
  filter::set_filter_capacity(info_.filter_cap);
  filter::set_colunm(info_.colunm);
}

}  // namespace zynqmp
...
@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once

#include <stdio.h>
#include "lite/backends/fpga/KD/llapi/filter.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
@@ -29,8 +28,15 @@ class DLEngine {
    return s_instance;
  }

  DeviceInfo& deviceInfo();

  bool isZU3() { return info_.device_type / 100 == 3; }

  float* out_data = nullptr;

 private:
  DLEngine();

  DeviceInfo info_;
};

}  // namespace zynqmp
}  // namespace paddle
@@ -22,6 +22,7 @@ namespace paddle {
namespace zynqmp {

enum LayoutType {
  None,
  N,
  NC,
  NCHW,
@@ -39,6 +40,15 @@ class Layout {
  virtual int elementCount(const std::vector<int>& dims) = 0;
};

struct None : Layout {
  int numIndex() { return -1; }
  int channelIndex() { return -1; }
  int heightIndex() { return -1; }
  int widthIndex() { return -1; }
  int alignedElementCount(const std::vector<int>& dims) { return 16; }
  virtual int elementCount(const std::vector<int>& dims) { return 1; }
};

struct NCHW : Layout {
  int numIndex() { return 0; }
  int channelIndex() { return 1; }
...
@@ -14,6 +14,7 @@ limitations under the License. */
#include <memory.h>

#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/bias_scale.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
@@ -54,7 +55,7 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
  *data_in = ptr_aligned;
}

size_t interleave(float **data_in, int num_after_alignment) {
  float *ptr_uninterleaved = *data_in;
  float *ptr_interleaved =
      (float *)fpga_malloc(2 * num_after_alignment * sizeof(float));  // NOLINT
@@ -69,6 +70,7 @@ void interleave(float **data_in, int num_after_alignment) {
  fpga_free(ptr_uninterleaved);
  *data_in = ptr_interleaved;
  return 2 * num_after_alignment * sizeof(float);
}

void format_bias_scale_array(float **bias_scale_array,
@@ -78,8 +80,9 @@ void format_bias_scale_array(float **bias_scale_array,
  int div_num = (num + element_num_per_division - 1) / element_num_per_division;
  int element_num_after_division =
      align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
  size_t mem =
      interleave(bias_scale_array, div_num * element_num_after_division);
  fpga_flush(*bias_scale_array, mem);
}

void format_bias_array(float **bias_array, int num) {
  float *ptr_unaligned = *bias_array;
...
@@ -19,7 +19,7 @@ namespace zynqmp {
namespace bias_scale {

void align_element(float** data_in, int num_per_div_before_alignment, int num);
size_t interleave(float** data_in, int num_after_alignment);
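// interleave() returns the byte size of the interleaved buffer, so that
// format_bias_scale_array() can flush exactly the region that was written.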
void format_bias_scale_array(float** bias_scale_array,
                             int element_num_per_division,
                             int num);
...
@@ -15,6 +15,8 @@ limitations under the License. */
#include "lite/backends/fpga/KD/llapi/filter.h"
#include <memory.h>
#include <algorithm>
#include <fstream>
#include <string>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
@@ -23,11 +25,42 @@ namespace zynqmp {
namespace filter {

static int FILTER_SIZE = 2048;
static int COLUMN = 4;
void saveToFile(std::string name, void* data_in, int size) {
  // std::ofstream ofs;
  // ofs.open(name);
  // int8_t* data = (int8_t*)data_in;
  // for (int i = 0; i < size; i++) {
  //   float value = data[i];
  //   ofs << value << std::endl;
  // }
  // ofs.close();
}

void saveFloatToFile(std::string name, float* data_in, int size) {
  // std::ofstream ofs;
  // ofs.open(name);
  // for (int i = 0; i < size; i++) {
  //   float value = data_in[i];
  //   ofs << value << std::endl;
  // }
  // ofs.close();
}
void set_filter_capacity(uint32_t cap) { FILTER_SIZE = cap; }
void set_colunm(uint32_t column) { COLUMN = column; }
// replace zynqmp_api.h #define FILTER_NUM_ALIGNMENT
int get_filter_num_alignment() { return COLUMN * 4; }
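// e.g. with the default COLUMN == 4 the filter-number alignment is 16.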
int calc_division_capacity(int chw) {
  // int n = FILTER_SIZE / ((chw + 15) / 16) * 32;
  int filter_num_alignment = get_filter_num_alignment();
  int n = FILTER_SIZE / ((chw + 15) / 16) * filter_num_alignment;
  return n < FILTER_SIZE ? n : FILTER_SIZE;
}
@@ -52,28 +85,28 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
  }
}

void convert_to_hwc(int8_t* chw_data,
                    int8_t* hwc_data,
                    int num,
                    int channel,
                    int height,
                    int width) {
  int chw = channel * height * width;
  int wc = width * channel;
  int index = 0;
  for (int n = 0; n < num; n++) {
    for (int c = 0; c < channel; c++) {
      for (int h = 0; h < height; h++) {
        for (int w = 0; w < width; w++) {
          hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index];
          index++;
        }
      }
    }
  }
}
float find_max(float* data_in, int data_size) {
  float max = 0.0;
  for (int i = 0; i < data_size; ++i) {
    float value = data_in[i];
@@ -83,166 +116,194 @@ float find_max(float *data_in, int data_size) {
  return max;
}

int8_t float_to_int8(float fdata) {
  if (fdata < 0.0) {
    fdata -= 0.5;
  } else {
    fdata += 0.5;
  }
  return (int8_t)fdata;
}
void quantize(float* src, int8_t* dst, int len, float max) {
  float fix_range = 127;
  float scale = fix_range / max;
  for (size_t i = 0; i < len; i++) {
    dst[i] = float_to_int8(src[i] * scale);
  }
}
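// Worked example: with max == 2.0f the scale is 127 / 2.0f = 63.5, so an
// input of 1.0f maps to float_to_int8(63.5f) == 64 and -2.0f maps to -127
// (round half away from zero).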
bool should_align_chw(int chw) {
  int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  return align_chw != chw;
}

void align_chw(int8_t* src, int8_t* dst, int num, int chw) {
  int aligned_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  memset(dst, 0, num * aligned_chw);
  for (int j = 0; j < num; j++) {
    memcpy((dst + j * aligned_chw), (src + j * chw), chw);
  }
}
void align_num(int8_t* src,
               int8_t* dst,
               int num_per_div_before_alignment,
               int num,
               int align_chw) {
  int filter_num_alignment = get_filter_num_alignment();
  int num_per_div_after_alignment =
      align_to_x(num_per_div_before_alignment, filter_num_alignment);

  int div_num =
      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
  int num_element = div_num * num_per_div_after_alignment * align_chw;

  memset(dst, 0, num_element * sizeof(int8_t));
  int i = 0;
  for (i = 0; i < div_num - 1; i++) {
    memcpy(dst + num_per_div_after_alignment * align_chw * i,
           src + num_per_div_before_alignment * align_chw * i,
           num_per_div_before_alignment * align_chw);
  }
  memcpy(dst + num_per_div_after_alignment * align_chw * i,
         src + num_per_div_before_alignment * align_chw * i,
         (num - (div_num - 1) * num_per_div_before_alignment) * align_chw);
}
void reorder(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
  int index = 0;
  int new_index = 0;
  int filter_num_alignment = get_filter_num_alignment();
  int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  for (index = 0; index < num_after_alignment; index++) {
    new_index = index / filter_num_alignment * filter_num_alignment +
                (index % (filter_num_alignment / 2) / 4 * 8) +
                (index % (filter_num_alignment / 2) % 4) +
                (index / (filter_num_alignment / 2) % 2 * 4);
    memcpy((dst + index * chw_align), (src + new_index * chw_align), chw_align);
  }
}
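// Worked example with the default filter_num_alignment == 16: inside each
// 16-filter block, destination slots 4..7 read source filters 8..11 and
// slots 8..11 read filters 4..7, while 0..3 and 12..15 stay in place,
// presumably to match the accelerator's column layout.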
void interleave(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
  int interleave_per_num = 16;
  int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  int interleave_num = chw_align * 2 / interleave_per_num;
  for (int i = 0; i < num_after_alignment; i += 2) {
    for (int j = 0, k = 0; j < interleave_num; j += 2, k++) {
      memcpy(dst + i * chw_align + interleave_per_num * j,
             src + i * chw_align + interleave_per_num * k,
             interleave_per_num);
      memcpy(dst + i * chw_align + interleave_per_num * (j + 1),
             src + (i + 1) * chw_align + interleave_per_num * k,
             interleave_per_num);
    }
  }
}
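// interleave() merges each adjacent pair of filters 16 bytes at a time, so
// the output alternates 16-byte rows of filter i and filter i + 1.

// format_filter() below chains the whole preparation pipeline:
// quantize -> convert_to_hwc -> align_chw (if needed) -> align_num (if
// needed) -> reorder -> interleave -> fpga_flush, and reports the final
// buffer size through mem_size_a.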
int8_t* format_filter(float* data_in,
                      int& mem_size_a,  // NOLINT
                      int num,
                      int channel,
                      int height,
                      int width,
                      int group_num,
                      float max,
                      std::vector<float>& filter_max) {  // NOLINT
  int data_size = channel * height * width * num;
  int chw = channel * height * width;

  int division_capacity = calc_division_capacity(chw);
  int filter_num_alignment = get_filter_num_alignment();
  int num_per_div_before_alignment =
      calc_num_per_div(num, group_num, division_capacity);
  int num_per_div_after_alignment =
      align_to_x(num_per_div_before_alignment, filter_num_alignment);
  int div_num =
      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
  // int num_after_alignment = num_per_div_after_alignment * div_num;
  int residual = num % num_per_div_before_alignment;
  int num_after_alignment = num_per_div_after_alignment *
                                ((residual == 0) ? div_num : (div_num - 1)) +
                            align_to_x(residual, filter_num_alignment);

  // saveFloatToFile("quantize_before", data_in, data_size);

  int8_t* quantized_data =
      reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));

  for (int n = 0; n < num; n++) {
    float* filter_start = data_in + n * chw;
    float f_max = find_max(filter_start, chw);
    int8_t* quantized_start = quantized_data + n * chw;
    // quantize(filter_start, quantized_start, chw, f_max);
    quantize(filter_start, quantized_start, chw, max);
    // filter_max.push_back(f_max);
    filter_max.push_back(max);
  }
  // saveToFile("chw.txt", quantized_data, data_size);

  int8_t* hwc_data =
      reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
  convert_to_hwc(quantized_data, hwc_data, num, channel, height, width);
  fpga_free(quantized_data);
  // saveToFile("hwc.txt", hwc_data, data_size);

  int8_t* temp_data = hwc_data;  // NOLINT
  int chw_aligned = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  if (should_align_chw(chw)) {
    int8_t* hwc_aligned_data = reinterpret_cast<int8_t*>(
        fpga_malloc(num * chw_aligned * sizeof(int8_t)));
    align_chw(hwc_data, hwc_aligned_data, num, chw);
    // saveToFile("align_el.txt", hwc_aligned_data, data_size * 2);
    temp_data = hwc_aligned_data;
    fpga_free(hwc_data);
  }
  if (num_after_alignment != num) {
    int filter_num_alignment = get_filter_num_alignment();
    int num_per_div_after_alignment =
        align_to_x(num_per_div_before_alignment, filter_num_alignment);
    // int div_num =
    //     (num + num_per_div_before_alignment - 1) /
    //     num_per_div_before_alignment;
    int num_element = div_num * num_per_div_after_alignment * chw_aligned;
    int8_t* num_aligned_data =
        reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
    align_num(temp_data,
              num_aligned_data,
              num_per_div_before_alignment,
              num,
              chw_aligned);
    // saveToFile("align_num.txt", num_aligned_data, data_size * 8);
    fpga_free(temp_data);
    temp_data = num_aligned_data;
  }
  int8_t* aligned_data =
      reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
  reorder(temp_data, aligned_data, num_after_alignment, chw);
  // saveToFile("reorder.txt", aligned_data, data_size * 8);
  fpga_free(temp_data);

  int8_t* interleaved_data =
      reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
  interleave(aligned_data, interleaved_data, num_after_alignment, chw);
  // saveToFile("interleave.txt", interleaved_data, data_size * 8);
  fpga_free(aligned_data);
  fpga_flush(interleaved_data,
             align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment *
                 sizeof(char));
  mem_size_a = num_after_alignment * chw_aligned;
  return interleaved_data;
}
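// Alignment arithmetic, for reference (assuming FILTER_ELEMENT_ALIGNMENT ==
// 16): a 3x3x3 filter (chw = 27) is padded to chw_aligned = 32 bytes per
// filter before reorder/interleave.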
void convert_to_hwn(int16_t** data_in, int num, int height, int width) {
  int16_t* tmp = *data_in;
  int16_t* data_tmp =
      (int16_t*)fpga_malloc(height * width * num * sizeof(int16_t));  // NOLINT
  for (int n = 0; n < num; n++) {
    for (int h = 0; h < height; h++) {
      for (int w = 0; w < width; w++) {
@@ -254,16 +315,16 @@ void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
  fpga_free(tmp);
}
size_t align_element_n(int16_t** data_in, int num, int height, int width) {
  int unalign_n = num;
  int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT);
  int num_element = height * width * align_n;
  if (unalign_n != align_n) {
    int16_t* tmp = *data_in;

    int num_element = height * width * align_n;
    int16_t* data_tmp =
        (int16_t*)fpga_malloc(num_element * sizeof(int16_t));  // NOLINT

    memset(data_tmp, 0, num_element * sizeof(int16_t));
    for (int h = 0; h < height; h++) {
@@ -276,17 +337,37 @@ size_t align_element_n(int16_t **data_in, int num, int height, int width) {
      }
    }
    *data_in = data_tmp;
    fpga_free(tmp);
  }
  return num_element * sizeof(int16_t);
}
void to_fp16(float* src,
             float16* dst,
             int num,
             int height,
             int width,
             float* scale_ptr) {
  int size = num * height * width;
  for (int n = 0; n < num; n++) {
    float scale_val = scale_ptr[n];
    for (int h = 0; h < height; h++) {
      for (int w = 0; w < width; w++) {
        int index = n * height * width + h * width + w;
        float value = src[index] * scale_val;
        dst[index] = float_to_half(value);
      }
    }
  }
  fpga_flush(dst, size * sizeof(int16_t));
}
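// to_fp16() is the buffer-to-buffer variant of quantize_to_fp16() below: it
// applies the same per-channel scaling but writes into a caller-provided
// destination instead of reallocating *data_in.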
void quantize_to_fp16(
    float** data_in, int num, int height, int width, float* scale_ptr) {
  float* tmp = *data_in;
  int size = num * height * width;

  float16* tmp_data = (float16*)fpga_malloc(size * sizeof(float16));  // NOLINT
  for (int n = 0; n < num; n++) {
    float scale_val = scale_ptr[n];
    for (int h = 0; h < height; h++) {
@@ -298,13 +379,14 @@ void quantize_to_fp16(
    }
  }
  fpga_flush(tmp_data, size * sizeof(int16_t));

  *data_in = (float*)tmp_data;  // NOLINT
  fpga_free(tmp);
}
size_t format_dwconv_filter(
    float** data_in, int num, int height, int width, float* scale_ptr) {
  quantize_to_fp16(data_in, num, height, width, scale_ptr);
  int16_t** quantize_data = reinterpret_cast<int16_t**>(data_in);
  convert_to_hwn(quantize_data, num, height, width);
  size_t size = align_element_n(quantize_data, num, height, width);
  fpga_flush(*quantize_data,
...
@@ -18,38 +18,35 @@ limitations under the License. */
#include <cstdlib>
#include <cwchar>
#include <vector>

namespace paddle {
namespace zynqmp {
namespace filter {

void set_filter_capacity(uint32_t cap);
void set_colunm(uint32_t column);
int get_filter_num_alignment();

int calc_division_capacity(int chw);
int calc_split_num(int num, int division_capacity);
int calc_division_number(int num, int group_num, int division_capacity);
int calc_num_per_div(int num, int group_num, int division_capacity);
float find_max(float* data_in, int data_size);
int8_t* format_filter(float* data_in,
                      int& mem_size,  // NOLINT
                      int num,
                      int channel,
                      int height,
                      int width,
                      int group_num,
                      float max,
                      std::vector<float>& filter_max);  // NOLINT

void convert_to_hwn(int16_t** data_in, int num, int height, int width);
size_t align_element_n(int16_t** data_in, int num, int height, int width);
// void quantize_to_fp16(float** data_in, int num, int height, int width,
//                       float* scale_ptr);
size_t format_dwconv_filter(
    float** data_in, int num, int height, int width, float* scale_ptr);
...
@@ -23,13 +23,12 @@ limitations under the License. */
#include <map>
#include <utility>

#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"

namespace paddle {
namespace zynqmp {

#define PADDLE_OS_LINUX

static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
@@ -39,14 +38,10 @@ static size_t memory_size_max = 0;
static size_t memory_size = 0;

static inline int do_ioctl(uint64_t req, const void *arg) {
#ifdef PADDLE_OS_LINUX
  return ioctl(fd, req, arg);
#else
  return -1;
#endif
}
@@ -66,7 +61,10 @@ void reset_device() {
// memory management;
void *fpga_malloc(size_t size) {
#ifdef ENABLE_DEBUG
  // std::cout << "fpga_malloc:" << size << std::endl;
#endif
#ifdef PADDLE_OS_LINUX
  void *ptr = reinterpret_cast<void *>(
      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
  if (ptr == NULL) {
@@ -105,11 +103,8 @@ void fpga_free(void *ptr) {
    size = iter->second;
    memory_map.erase(iter);
  }
  memory_size -= size;
#ifdef PADDLE_OS_LINUX
  munmap(ptr, size);
#else
  free(ptr);
@@ -150,6 +145,11 @@ void fpga_copy(void *dest, const void *src, size_t num) {
  memcpy(dest, src, num);
}
int fpga_reset() {
  struct FpgaResetArgs args;
  return do_ioctl(IOCTL_FPGA_RESET, &args);
}
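// fpga_reset() forwards an FpgaResetArgs (its val field is left unset here)
// to the driver; see IOCTL_FPGA_RESET in zynqmp_api.h.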
int ioctl_conv(const struct ConvArgs &args) {
  return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
@@ -166,7 +166,6 @@ int compute_fpga_conv(const struct SplitConvArgs &args) {
  }

  if (split_num > 1) {
    exit(-1);
  }
  return ret;
@@ -186,6 +185,7 @@ int get_device_info(const struct DeviceInfo &args) {
}

int perform_bypass(const struct BypassArgs &args) {
  int ret = -1;
  int size = args.image.channels * args.image.width * args.image.height;
  int max_size = 1 << 21;
@@ -213,7 +213,7 @@ int perform_bypass(const struct BypassArgs &args) {
        reinterpret_cast<char *>(input_address + i * max_size * type_size);
    bypassArgs.output.address =
        reinterpret_cast<char *>(output_address + i * max_size * out_type_size);
    ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
    scale = std::max(scale, scales[0]);

    if (ret != 0) {
@@ -222,13 +222,15 @@ int perform_bypass(const struct BypassArgs &args) {
  }

  int remainder = size - max_size * count;
  if (remainder > 0) {
    bypassArgs.image.channels = remainder;
    bypassArgs.image.address =
        reinterpret_cast<char *>(input_address + count * max_size * type_size);
    bypassArgs.output.address = reinterpret_cast<char *>(
        output_address + count * max_size * out_type_size);
    ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
    scale = std::max(scale, scales[0]);
  }
  args.output.scale_address[0] = scale;
  args.output.scale_address[1] = 1.0f / scale;
  return ret;
@@ -237,52 +239,21 @@ int perform_bypass(const struct BypassArgs &args) {
int compute_fpga_concat(const struct ConcatArgs &args) { return -1; }

int compute_fpga_scale(const struct ScaleArgs &args) {
  return do_ioctl(IOCTL_CONFIG_SCALE, &args);
}

int compute_fpga_dwconv(const struct DWconvArgs &args) {
  return do_ioctl(IOCTL_CONFIG_DWCONV, &args);
}

int config_activation(const struct ActiveParamterArgs &args) {
  return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args);
}

// int config_power(const struct PowerArgs& args) {
//   return do_ioctl(IOCTL_CONFIG_POWER, &args);
// }

int config_inplace(const struct InplaceArgs &args) {
  return do_ioctl(IOCTL_CONFIG_INPLACE, &args);
}
...
@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once

#ifndef PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#define PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H

#include <stdint.h>
#include <cstddef>
#include <iostream>
@@ -40,6 +43,13 @@ enum DLayoutType {
  LAYOUT_HWC = 0,
};

enum ActiveType {
  TYPE_RELU = 0,
  TYPE_RELU6 = 1,
  TYPE_LEAK_RELU = 2,
  TYPE_SIGMOID = 3,
};

struct VersionArgs {
  void* buffer;
};
@@ -48,7 +58,7 @@ struct DeviceInfo {
  uint32_t filter_cap;
  uint32_t version;
  uint16_t device_type;
  uint32_t colunm;
  uint32_t reserved1;
  uint32_t reserved2;
  uint32_t reserved3;
@@ -108,6 +118,7 @@ struct ConvArgs {
  void* filter_scale_address;
  uint32_t filter_num;
  uint32_t group_num;
  uint32_t dilation;

  struct KernelArgs kernel;
  struct ImageInputArgs image;  // input image;
@@ -199,9 +210,16 @@ struct NormalizeParameterArgs {
  uint32_t hight_width;
};

struct ActiveParamterArgs {
  ActiveType type;
  uint16_t leaky_relu_factor;
};

struct InplaceArgs {
  bool leaky_relu_enable;
  bool relu_enable;
  bool sigmoid_enable;
  bool relu6_enable;
  bool power_enable;
  bool normalize_enable;
};
@@ -216,7 +234,9 @@ struct FpgaRegReadArgs {
  uint64_t value;
};

struct FpgaResetArgs {
  uint32_t val;
};

#define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4)
@@ -248,6 +268,8 @@ struct FpgaResetArgs {};
    _IOW(IOCTL_FPGA_MAGIC, 41, struct PowerParameterArgs)
#define IOCTL_CONFIG_NORMALIZE_PARAMETER \
    _IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs)
#define IOCTL_CONFIG_ACTIVATION_PARAMETER \
    _IOW(IOCTL_FPGA_MAGIC, 43, struct ActiveParamterArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs)
#define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs)
@@ -331,6 +353,7 @@ int compute_fpga_scale(const struct ScaleArgs& args);
int compute_fpga_concat(const struct ConcatArgs& args);
int compute_fpga_resize(const struct ResizeArgs& args);

int config_activation(const struct ActiveParamterArgs& args);
int config_power(const struct PowerArgs& args);
int compute_fpga_dwconv(const struct DWconvArgs& args);
int config_norm_param(const struct NormalizeParameterArgs& args);
@@ -341,7 +364,11 @@ int config_inplace(const struct InplaceArgs& args);
int flush_cache(void* addr, int size);
int invalidate_cache(void* addr, int size);

int fpga_reset();

int16_t fp32_2_fp16(float fp32_num);
float fp16_2_fp32(int16_t fp16_num);
}  // namespace zynqmp
}  // namespace paddle
#endif // PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
@@ -32,6 +32,5 @@ class PE {
  virtual ~PE() {}
};

}  // namespace zynqmp
}  // namespace paddle
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once

#include <stdio.h>
#include <string>
#include <vector>

#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
@@ -26,6 +27,7 @@ namespace zynqmp {
struct ReLUParam {
 public:
  bool enabled = false;
  float leaky_relu_factor = 0.0f;
};

struct PEParam {
@@ -98,6 +100,24 @@ struct DepthwiseConvParam : ConvParam {
  Tensor* quantizedFilter_ = new Tensor();
};

struct GRUParam : PEParam {
 public:
  Tensor* input = nullptr;
  Tensor* h0 = nullptr;
  Tensor* weight = nullptr;
  Tensor* bias = nullptr;

  Tensor* batch_gate = nullptr;
  Tensor* batch_reset_hidden_prev = nullptr;
  Tensor* batch_hidden = nullptr;
  Tensor* hidden = nullptr;

  std::string gate_activation = "sigmoid";
  std::string activation = "tanh";
  bool is_reverse = false;
  bool origin_mode = false;
};

enum PoolingType : int {
  MAX = 0,
  AVERAGE = 1,
@@ -133,6 +153,12 @@ struct ElementwiseAddParam : PEParam {
  EWAddArgs ewargs;
};

struct ElementwiseMulParam : PEParam {
 public:
  std::vector<Tensor*> inputs;
  Tensor* output = nullptr;
};

struct FullyConnectedParam : PEParam {
 public:
  Tensor* input = nullptr;
...
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once

#include <arm_neon.h>
#include <algorithm>
#include <vector>

#include "lite/backends/fpga/KD/pe.hpp"
@@ -49,7 +50,111 @@ class ConvPE : public PE {
      concatPE_.init();
      concatPE_.apply();
    }
    if (DLEngine::get_instance().isZU3() &&
        param_.input->shape().dimSize() == 4 &&
        param_.input->shape().width() == 1 &&
        param_.input->shape().width() >= 2048) {
      use_cpu_ = true;
    }
    if (param_.filter->shape().width() == 1 &&
        param_.filter->shape().height() == 1) {
      // use_cpu_ = true;
    }
    if (!use_cpu_) {
      // param_.filter->releaseData();
    }
  }
  void cpu_conv_hwc() {
    Tensor* input = param_.input;
    Tensor* output = param_.output;
    input->syncToCPU();

    Tensor float_input;
    Tensor float_output;
    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
    float_input.copyFrom(input);
    float_input.syncToCPU();

    float* out = float_output.mutableData<float>(FP32, output->shape());
    int out_width = output->shape().width();
    int out_channel = output->shape().channel();
    int in_channel = input->shape().channel();

    float* filter_data = param_.filter->data<float>();

    int image_height = input->shape().height();
    int image_width = input->shape().width();
    int image_channels = input->shape().channel();
    int image_pad_h = param_.paddings[0];
    int image_pad_w = param_.paddings[1];
    int kernel_height = param_.filter->shape().height();
    int kernel_width = param_.filter->shape().width();
    int kernel_step_h = param_.strides[0];
    int kernel_step_w = param_.strides[1];
    // int out_channel = param_.strides[1];
    int pooled_height_ = output->shape().height();
    int pooled_width_ = out_width;
    int filter_chw = image_channels * kernel_height * kernel_width;

    float max = 0;

    for (int ph = 0; ph < pooled_height_; ph++) {
      for (int pw = 0; pw < pooled_width_; pw++) {
        int hstart = ph * kernel_step_h - image_pad_h;
        int wstart = pw * kernel_step_w - image_pad_w;
        int hend =
            std::min(hstart + kernel_height, static_cast<int>(image_height));
        int wend =
            std::min(wstart + kernel_width, static_cast<int>(image_width));
        hstart = std::max(hstart, static_cast<int>(0));
        wstart = std::max(wstart, static_cast<int>(0));
        for (int oc = 0; oc < out_channel; oc++) {
          float sum = 0.0f;
          const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
          for (int c = 0; c < image_channels; c++) {
            for (int h = hstart; h < hend; h++) {
              int hi = 0;
              if (ph == 0) {
                hi = h - hstart + image_pad_h;
              } else {
                hi = h - hstart;
              }
              for (int w = wstart; w < wend; w++) {
                int wi = 0;
                if (pw == 0) {
                  wi = w - wstart + image_pad_w;
                } else {
                  wi = w - wstart;
                }
                const int index = (h * image_width + w) * image_channels + c;
                int weight_index = oc * filter_chw +
                                   kernel_width * kernel_height * c +
                                   kernel_width * hi + wi;
                float value = image_addr[index] * filter_data[weight_index];
                sum += value;
              }
            }
          }
          if (param_.relu.enabled && sum < 0) {
            sum = -sum;
          }
          if (sum > max) {
            max = sum;
          }
          out[pool_index] = sum;
        }
      }
    }
    float_output.flush();
    output->copyFrom(&float_output);
    output->scale()[0] = max / 127;
    output->scale()[1] = 127 / max;
  }
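  // Note: like the FPGA path, this CPU fallback records the dynamic range of
  // its output: scale[0] = max / 127 and scale[1] = 127 / max mirror the
  // int8 quantization convention used elsewhere in this backend.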
  void cpu_compute() {
    Tensor* input = param_.input;
    Tensor* output = param_.output;
@@ -59,43 +164,79 @@ class ConvPE : public PE {
    Tensor float_output;
    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
    float_input.copyFrom(input);
    float_input.syncToCPU();

    float* out = float_output.mutableData<float>(FP32, output->shape());
    float* bias_data = param_.bias()->data<float>();
    int out_width = output->shape().width();
    int out_channel = output->shape().channel();
    int in_channel = input->shape().channel();

    float* filter_data = param_.filter->data<float>();
    float* mi = new float[in_channel];
    float max = 0;
    int out_index = 0;
    for (int i = 0; i < out_channel; i++) {
      float* image = image_addr;
      float* filter_ptr = filter_data + i * in_channel;
      float* out_ptr = mi;
      for (int h = 0; h < output->shape().height(); h++) {
        for (int w = 0; w < output->shape().width(); w++) {
          float sum = 0;
          // #pragma omp parallel for
          for (int j = 0; j < in_channel; j++) {
            int image_index = h * out_width * in_channel + w * in_channel + j;
            float value = image_addr[image_index] * filter_ptr[j];
            sum += value;
          }
          sum += bias_data[i];
          if (param_.relu.enabled && sum < 0) {
            sum = 0;
          }
          if (sum > max) {
            max = sum;
          }
          out_index = h * out_width * out_channel + w * out_channel + i;
          out[out_index] = sum;
          // out_index++;
        }
      }
    }
    delete[] mi;
    float_output.flush();
    output->copyFrom(&float_output);
    output->scale()[0] = max / 127;
    output->scale()[1] = 127 / max;
  }
  bool dispatch() {
    if (use_cpu_) {
      cpu_compute();
      return true;
    }
    inplace_.leaky_relu_enable =
        (param_.relu.leaky_relu_factor != 0) ? true : false;
    inplace_.relu_enable =
        inplace_.leaky_relu_enable ? false : param_.relu.enabled;

    inplace_.power_enable = false;
    inplace_.normalize_enable = false;

    if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
      config_inplace(inplace_);
      if (inplace_.leaky_relu_enable) {
        activeParamterArgs.type = TYPE_LEAK_RELU;
        activeParamterArgs.leaky_relu_factor =
            fp32_2_fp16(param_.relu.leaky_relu_factor);
        config_activation(activeParamterArgs);
      }
    }

    std::vector<BasicConvParam*>& params = param_.splitParams();
@@ -104,9 +245,16 @@ class ConvPE : public PE {
      ret |= compute_fpga_conv_basic(conv_param->args);
    }

    if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
      inplace_.relu_enable = false;
      inplace_.leaky_relu_enable = false;
      config_inplace(inplace_);
      if (inplace_.leaky_relu_enable) {
        activeParamterArgs.type = TYPE_LEAK_RELU;
        activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0);
        config_activation(activeParamterArgs);
      }
    }

    size_t size = params.size();
@@ -127,11 +275,13 @@ class ConvPE : public PE {
  ConvParam& param() { return param_; }

 private:
  bool use_cpu_ = false;
  ConvParam param_;
  ConcatPE concatPE_;
  ElementwiseAddPE addPE_;
  int split_axis = 0;
  InplaceArgs inplace_ = {0};
  ActiveParamterArgs activeParamterArgs;
};

}  // namespace zynqmp
...
...@@ -14,6 +14,9 @@ limitations under the License. */ ...@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#ifndef conv_process_hpp
#define conv_process_hpp
#include <string.h> #include <string.h>
#include <cmath> #include <cmath>
#include <vector> #include <vector>
...@@ -45,7 +48,9 @@ inline int get_split_num(Tensor* filter) { ...@@ -45,7 +48,9 @@ inline int get_split_num(Tensor* filter) {
filter->shape().width(); filter->shape().width();
auto num = filter->shape().num(); auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw); int div_capacity = filter::calc_division_capacity(chw);
return filter::calc_split_num(num, div_capacity); int filter_num_alignment = filter::get_filter_num_alignment();
int aligned_num = align_to_x(num, filter_num_alignment);
return filter::calc_split_num(aligned_num, div_capacity);
} }
inline void fill_scale_bias_const(ConvParam* param_) { inline void fill_scale_bias_const(ConvParam* param_) {
...@@ -126,41 +131,87 @@ inline void format_scale_bias(Tensor* scale, ...@@ -126,41 +131,87 @@ inline void format_scale_bias(Tensor* scale,
bias_data = bias->data<float>(); bias_data = bias->data<float>();
} }
int channel = filter->shape().num(); int channel = filter->shape().num();
Shape bias_scale_shape(N, {2 * channel}); int scale_bias_len = align_to_x(channel / group, BS_NUM_ALIGNMENT) * group;
int c_per_group = channel / group;
int aligned_c_per_group = align_to_x(channel / group, BS_NUM_ALIGNMENT);
Shape bias_scale_shape(N, {2 * scale_bias_len});
float* bs_data = scale_bias->mutableData<float>(FP32, bias_scale_shape); float* bs_data = scale_bias->mutableData<float>(FP32, bias_scale_shape);
for (int i = 0; i < channel; i++) { float* temp_data =
float scale_value = scale_data == nullptr ? 1 : scale_data[i]; reinterpret_cast<float*>(fpga_malloc(2 * scale_bias_len * sizeof(float)));
float bias_value = bias_data == nullptr ? 0 : bias_data[i]; memset(temp_data, 0, 2 * scale_bias_len * sizeof(float));
bs_data[i + channel] = scale_value;
bs_data[i] = bias_value; std::vector<float> scales;
if (scale_data != nullptr) {
for (int i = 0; i < channel; ++i) {
scales.push_back(scale_data[i]);
}
for (int i = 0; i < scale_bias_len - channel; i++) {
scales.push_back(1);
}
} else {
for (int i = 0; i < scale_bias_len; i++) {
scales.push_back(1);
}
}
for (int i = 0; i < scale_bias_len; ++i) {
temp_data[i + scale_bias_len] = 1;
temp_data[i] = 0;
}
for (int g = 0; g < group; g++) {
for (int c = 0; c < c_per_group; c++) {
int src_index = g * c_per_group + c;
int dst_index = g * aligned_c_per_group + c;
float scale_value = scales[src_index];
float bias_value = bias_data == nullptr ? 0 : bias_data[src_index];
temp_data[dst_index + scale_bias_len] = scale_value;
temp_data[dst_index] = bias_value;
}
} }
int element_num_per_div = get_filter_num_per_div(filter, group); // int element_num_per_div = get_filter_num_per_div(filter, group);
bias_scale::format_bias_scale_array(&bs_data, element_num_per_div, channel); // int scale_bias_len = align_to_x(channel / group, 8) * group;
bias_scale::format_bias_scale_array(
&temp_data, scale_bias_len / group, scale_bias_len);
memcpy(bs_data, temp_data, 2 * scale_bias_len * sizeof(float));
fpga_free(temp_data);
} }
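
The rewritten format_scale_bias packs biases into the first scale_bias_len floats and scales into the second half, padding each group's channels out to BS_NUM_ALIGNMENT. A sketch of the indexing under illustrative numbers (group = 2, 3 channels per group, alignment 8; these values are not taken from the patch):

  constexpr int kGroup = 2;
  constexpr int kCPerGroup = 3;
  constexpr int kAlignedCPerGroup = 8;                       // align_to_x(3, 8)
  constexpr int kScaleBiasLen = kGroup * kAlignedCPerGroup;  // 16

  // The bias for (group g, channel c) lands in the front half of the
  // buffer; the matching scale sits exactly kScaleBiasLen entries later.
  inline int bias_index(int g, int c) { return g * kAlignedCPerGroup + c; }
  inline int scale_index(int g, int c) { return kScaleBiasLen + bias_index(g, c); }
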
inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group) { inline void format_filter(Tensor* filter,
Tensor* quantized_filter,
int group,
std::vector<float>& scales) { // NOLINT
float max_value = find_max(*filter); float max_value = find_max(*filter);
Shape& filter_shape = filter->shape(); Shape& filter_shape = filter->shape();
quantized_filter->setAligned(true);
quantized_filter->mutableData<int8_t>(INT8, filter->shape());
quantized_filter->scale()[0] = max_value / 127.0f;
quantized_filter->scale()[1] = 127.0f / max_value;
auto memory_size = filter->shape().memorySize(sizeof(float)); int mem_size;
auto new_data = reinterpret_cast<float*>(fpga_malloc(memory_size)); std::vector<float> max_values;
memcpy(new_data, filter->data<float>(), memory_size); int8_t* quantized_data = filter::format_filter(filter->data<float>(),
size_t mem_size = filter::format_filter(&new_data, mem_size,
filter_shape.num(), filter_shape.num(),
filter_shape.channel(), filter_shape.channel(),
filter_shape.height(), filter_shape.height(),
filter_shape.width(), filter_shape.width(),
group, group,
max_value); max_value,
max_values);
float mem_factor = mem_size * 1.0f / filter->shape().numel();
quantized_filter->setMemScale(mem_factor);
quantized_filter->setAligned(true);
int8_t* src = quantized_filter->mutableData<int8_t>(INT8, filter->shape()); int8_t* src = quantized_filter->mutableData<int8_t>(INT8, filter->shape());
memcpy(src, new_data, mem_size); quantized_filter->scale()[0] = max_value / 127.0f;
fpga_free(new_data); quantized_filter->scale()[1] = 127.0f / max_value;
memcpy(src, quantized_data, mem_size);
quantized_filter->flush(); quantized_filter->flush();
for (size_t i = 0; i < max_values.size(); i++) {
scales.push_back(max_values[i] / max_value);
}
} }
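
format_filter now returns one max per hardware division alongside the global max; the pushed scales[i] = max_values[i] / max_value lets callers re-normalize each division. A hedged sketch of the symmetric int8 mapping implied by scale()[0] = max/127 and scale()[1] = 127/max:

  #include <cmath>
  #include <cstdint>

  // Quantize one weight against the tensor-wide |max|;
  // dequantize with q * (max_value / 127).
  int8_t quantize(float v, float max_value) {
    return static_cast<int8_t>(std::round(v * 127.0f / max_value));
  }
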
inline void format_dw_filter(Tensor* filter, inline void format_dw_filter(Tensor* filter,
...@@ -207,10 +258,20 @@ inline void split_filter_num(const ConvParam& c_param) { ...@@ -207,10 +258,20 @@ inline void split_filter_num(const ConvParam& c_param) {
Tensor* out = param.output; Tensor* out = param.output;
Tensor* filter = param.filter; Tensor* filter = param.filter;
auto channel = out->shape().channel(); auto channel = out->shape().channel();
int split_num = param.groups == 1 ? get_split_num(param.filter) : 1; int split_num = param.groups == 1 ? get_split_num(param.filter) : 1;
int filter_num_per_div = get_filter_num_per_div(filter, param.groups); int filter_num_per_div = get_filter_num_per_div(filter, param.groups);
auto chw = filter->shape().channel() * filter->shape().height() *
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
int filter_num_alignment = filter::get_filter_num_alignment();
int aligned_num =
align_to_x(num / param.groups, filter_num_alignment) * param.groups;
// int aligned_num = align_to_x(num / param.groups ,FILTER_NUM_ALIGNMENT) *
// param.groups;
split_num = filter::calc_split_num(aligned_num, div_capacity);
Shape& out_shape = out->shape(); Shape& out_shape = out->shape();
for (int i = 0; i < split_num; i++) { for (int i = 0; i < split_num; i++) {
BasicConvParam* conv_param = new BasicConvParam(); BasicConvParam* conv_param = new BasicConvParam();
...@@ -251,9 +312,17 @@ inline void split_filter_num(const ConvParam& c_param) { ...@@ -251,9 +312,17 @@ inline void split_filter_num(const ConvParam& c_param) {
filter->data<float>() + i * filter_num_per_div * filter_hwc, filter->data<float>() + i * filter_num_per_div * filter_hwc,
filter_num * filter_hwc * sizeof(float)); filter_num * filter_hwc * sizeof(float));
new_filter.flush(); new_filter.flush();
conv_param->filter.mutableData<float>(FP32, f_shape); conv_param->filter.mutableData<float>(FP32, f_shape);
format_filter(&new_filter, &(conv_param->filter), param.groups);
if (param.groups != 1) {
int mem_factor =
32 / filter_num_per_div; // TODO(chonwhite): change 32 to param;
conv_param->filter.setMemScale(mem_factor);
}
std::vector<float> v; // TODO(chonwhite): change local variable name
format_filter(&new_filter, &(conv_param->filter), param.groups, v);
conv_param->filter.setDataType(INT8);
int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT); int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT);
Tensor scale; Tensor scale;
...@@ -265,7 +334,7 @@ inline void split_filter_num(const ConvParam& c_param) { ...@@ -265,7 +334,7 @@ inline void split_filter_num(const ConvParam& c_param) {
float* scale_data = scale.mutableData<float>(FP32, s_shape); float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape); float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_num; n++) { for (int n = 0; n < filter_num; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start]; scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
} }
for (int n = 0; n < filter_num; n++) { for (int n = 0; n < filter_num; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start]; bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
...@@ -276,11 +345,14 @@ inline void split_filter_num(const ConvParam& c_param) { ...@@ -276,11 +345,14 @@ inline void split_filter_num(const ConvParam& c_param) {
&conv_param->filter, &conv_param->filter,
&conv_param->scaleBias, &conv_param->scaleBias,
param.groups); param.groups);
conv_param->scaleBias.flush(); conv_param->scaleBias.flush();
float* bs_data = conv_param->scaleBias.data<float>();
args.group_num = param.groups; args.group_num = param.groups;
args.relu_enabled = param.relu.enabled; args.relu_enabled = param.relu.enabled;
args.sb_address = conv_param->scaleBias.data<float>(); args.sb_address = conv_param->scaleBias.data<float>();
args.sb_address = bs_data;
args.kernel.stride_h = param.strides[1]; args.kernel.stride_h = param.strides[1];
args.kernel.stride_w = param.strides[0]; args.kernel.stride_w = param.strides[0];
args.kernel.height = new_filter.shape().height(); args.kernel.height = new_filter.shape().height();
...@@ -294,17 +366,13 @@ inline void split_filter_num(const ConvParam& c_param) { ...@@ -294,17 +366,13 @@ inline void split_filter_num(const ConvParam& c_param) {
args.image.channels = input->shape().channel(); args.image.channels = input->shape().channel();
args.image.width = input->shape().width(); args.image.width = input->shape().width();
args.image.height = input->shape().height(); args.image.height = input->shape().height();
auto paddings = *param.padding; args.image.pad_width = param.paddings[1];
args.image.pad_width = param.paddings[2];
args.image.pad_height = param.paddings[0]; args.image.pad_height = param.paddings[0];
// dilations[0] = dilations[1] ;
args.dilation = param.dilations[0];
args.output.address = out_address; args.output.address = out_address;
args.output.scale_address = out_scale_address; args.output.scale_address = out_scale_address;
bool pad_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
if (!pad_equal) {
LOG(FATA) << "This pad not support ! " << paddings[0] << ", "
<< paddings[1] << ", " << paddings[2] << ", " << paddings[3];
}
param.splitParams().push_back(conv_param); param.splitParams().push_back(conv_param);
} }
} }
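
In the hunk above, each split's batch-norm scale is multiplied by v[n], the per-division factor produced by format_filter. The folding, reduced to one line (names are illustrative):

  // Effective per-channel scale after quantizing the filter against the
  // global max: bn_scale * (division_max / global_max).
  float effective_scale(float bn_scale, float division_max, float global_max) {
    return bn_scale * (division_max / global_max);
  }
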
...@@ -317,7 +385,7 @@ inline void split_channel(const ConvParam& c_param) { ...@@ -317,7 +385,7 @@ inline void split_channel(const ConvParam& c_param) {
int num = ceil(input->shape().channel() * 1.0f / 2047); int num = ceil(input->shape().channel() * 1.0f / 2047);
int channel = input->shape().channel() / num; int channel = input->shape().channel() / num;
std::cout << "channel::" << channel << "num::" << num << std::endl;
Shape bs_shape(N, {channel}); Shape bs_shape(N, {channel});
for (int i = 0; i < num; i++) { for (int i = 0; i < num; i++) {
...@@ -331,6 +399,7 @@ inline void split_channel(const ConvParam& c_param) { ...@@ -331,6 +399,7 @@ inline void split_channel(const ConvParam& c_param) {
// filter transformation; // filter transformation;
Shape f_shape(NCHW, {param.filter->shape().num(), channel, 1, 1}); Shape f_shape(NCHW, {param.filter->shape().num(), channel, 1, 1});
Tensor new_filter; Tensor new_filter;
float* dst = new_filter.mutableData<float>(FP32, f_shape); float* dst = new_filter.mutableData<float>(FP32, f_shape);
...@@ -341,7 +410,8 @@ inline void split_channel(const ConvParam& c_param) { ...@@ -341,7 +410,8 @@ inline void split_channel(const ConvParam& c_param) {
src += param.filter->shape().channel(); src += param.filter->shape().channel();
} }
new_filter.flush(); new_filter.flush();
format_filter(&new_filter, &(conv_param->filter), param.groups); std::vector<float> scales;
format_filter(&new_filter, &(conv_param->filter), param.groups, scales);
Tensor bias; Tensor bias;
Tensor scale; Tensor scale;
...@@ -354,6 +424,7 @@ inline void split_channel(const ConvParam& c_param) { ...@@ -354,6 +424,7 @@ inline void split_channel(const ConvParam& c_param) {
} }
scale.flush(); scale.flush();
bias.flush(); bias.flush();
// Shape sb_shape(N, {2 * channel});
format_scale_bias(&scale, format_scale_bias(&scale,
&bias, &bias,
&conv_param->filter, &conv_param->filter,
...@@ -379,18 +450,12 @@ inline void split_channel(const ConvParam& c_param) { ...@@ -379,18 +450,12 @@ inline void split_channel(const ConvParam& c_param) {
args.image.channels = conv_param->input.shape().channel(); args.image.channels = conv_param->input.shape().channel();
args.image.width = conv_param->input.shape().width(); args.image.width = conv_param->input.shape().width();
args.image.height = conv_param->input.shape().height(); args.image.height = conv_param->input.shape().height();
auto paddings = *param.paddings; args.image.pad_width = param.paddings[1];
args.image.pad_width = paddings[2]; args.image.pad_height = param.paddings[0];
args.image.pad_height = paddings[0]; // dilations[0] = dilations[1]
args.dilation = param.dilations[0];
args.output.address = conv_param->output.mutableData<void>(); args.output.address = conv_param->output.mutableData<void>();
args.output.scale_address = conv_param->output.scale(); args.output.scale_address = conv_param->output.scale();
bool pad_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
if (!pad_equal) {
LOG(FATA) << "This pad not support ! " << paddings[0] << ", "
<< paddings[1] << ", " << paddings[2] << ", " << paddings[3];
}
param.splitParams().push_back(conv_param); param.splitParams().push_back(conv_param);
} }
} }
...@@ -418,11 +483,11 @@ inline bool compute_conv(const ConvParam& c_conv_params) { ...@@ -418,11 +483,11 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
} }
size_t size = params.size(); size_t size = params.size();
if (ret == 0 && size > 1) { if (ret == 0 && size > 1) {
// Tensor* output = conv_params.output;
Tensor& img = params[0]->output; Tensor& img = params[0]->output;
for (int i = 0; i < 1; i++) { for (int i = 0; i < 1; i++) {
for (int i = 0; i < img.shape().numel(); i++) { for (int i = 0; i < img.shape().numel(); i++) {
float value = half_to_float(img.data<float16>()[i]); float value = half_to_float(img.data<float16>()[i]);
std::cout << "value:" << value << std::endl;
} }
} }
} }
...@@ -431,3 +496,5 @@ inline bool compute_conv(const ConvParam& c_conv_params) { ...@@ -431,3 +496,5 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
} // namespace zynqmp } // namespace zynqmp
} // namespace paddle } // namespace paddle
#endif /* conv_process_hpp */
...@@ -14,8 +14,6 @@ limitations under the License. */ ...@@ -14,8 +14,6 @@ limitations under the License. */
#include "lite/backends/fpga/KD/pes/crop_pe.hpp" #include "lite/backends/fpga/KD/pes/crop_pe.hpp"
#include <vector>
namespace paddle { namespace paddle {
namespace zynqmp { namespace zynqmp {
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <cstring> #include <cstring>
#include <vector> #include <vector>
......
...@@ -37,19 +37,38 @@ class DepthwiseConvPE : public PE { ...@@ -37,19 +37,38 @@ class DepthwiseConvPE : public PE {
Tensor* output = param.output; Tensor* output = param.output;
int channel = output->shape().channel(); int channel = output->shape().channel();
float* new_scale_data = param_.scale()->data<float>();
float* new_bias_data = param_.bias()->data<float>();
float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape()); float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
if (param_.bias()->dataType() == FP32) {
float* new_bias_data = param_.bias()->data<float>();
// convert the bias from float to float16
for (int i = 0; i < channel; i++) { for (int i = 0; i < channel; i++) {
b_data[i] = float_to_half(new_bias_data[i]); b_data[i] = float_to_half(new_bias_data[i]);
} }
bias_.flush(); bias_.flush();
} else {
float16* new_bias_data = param_.bias()->data<float16>();
memcpy(b_data, new_bias_data, channel * sizeof(float16));
bias_.flush();
}
if (param_.scale()->dataType() == FP32) {
float* new_scale_data = param_.scale()->data<float>();
Tensor* quantized_filter = param.quantizedFilter(); Tensor* quantized_filter = param.quantizedFilter();
quantized_filter->mutableData<float16>(FP16, param.filter->shape()); quantized_filter->mutableData<float16>(FP16, param.filter->shape());
format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data); format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
} else {
// when the filter is all ones and the channel count is already aligned
float16* scale_data = param_.scale()->data<float16>();
float16* filter_data = param.quantizedFilter()->mutableData<float16>(
FP16, param.filter->shape());
// memcpy(filter_data, scale_data, channel * sizeof(float16));
memcpy(filter_data,
scale_data,
param.filter->shape().numel() * sizeof(float16));
param.quantizedFilter()->flush();
}
DWconvArgs args = {0}; DWconvArgs args = {0};
args.bias_address = b_data; args.bias_address = b_data;
args.filter_address = param.quantizedFilter()->data<void>(); args.filter_address = param.quantizedFilter()->data<void>();
...@@ -61,21 +80,14 @@ class DepthwiseConvPE : public PE { ...@@ -61,21 +80,14 @@ class DepthwiseConvPE : public PE {
args.image.channels = input->shape().channel(); args.image.channels = input->shape().channel();
args.image.height = input->shape().height(); args.image.height = input->shape().height();
args.image.width = input->shape().width(); args.image.width = input->shape().width();
auto paddings = *param.paddings; args.image.pad_width = param.paddings[0];
args.image.pad_width = param.paddings[2]; args.image.pad_height = param.paddings[1];
args.image.pad_height = param.paddings[0];
args.image.scale_address = input->scale(); args.image.scale_address = input->scale();
args.output.address = output->data<void>(); args.output.address = output->data<void>();
args.output.scale_address = output->scale(); args.output.scale_address = output->scale();
args.out_width = param.output->shape().width(); args.out_width = param.output->shape().width();
args.out_height = param.output->shape().height(); args.out_height = param.output->shape().height();
args.sub_conv_num = 1; args.sub_conv_num = 1;
bool pad_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
if (!pad_equal) {
LOG(FATA) << "This pad not support ! " << paddings[0] << ", "
<< paddings[1] << ", " << paddings[2] << ", " << paddings[3];
}
param.args = args; param.args = args;
inplace_.relu_enable = param_.relu.enabled; inplace_.relu_enable = param_.relu.enabled;
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
namespace paddle {
namespace zynqmp {
class ElementwiseMulPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
output->setDataLocation(Device);
return true;
}
void apply() {
Tensor* input = param_.inputs[0];
Tensor* output = param_.output;
int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
Shape s(N, {wc_aligned});
float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
memset(bias_data, 0, wc_aligned * sizeof(float16));
ScaleArgs& args = args_;
args.scale_address = param_.inputs[1]->data<void>();
args.bias_address = bias_tensor.data<void>();
args.wc_alignment = wc_aligned;
args.channel_alignment = wc_aligned;
args.image.address = input->data<void>();
args.image.scale_address = input->scale();
args.image.channels = wc_aligned;
args.image.height = 1;
args.image.width = 1;
args.image.pad_width = 0;
args.image.pad_height = 0;
args.output.address = output->data<void>();
args.output.scale_address = output->scale();
}
void updateInput(Tensor* t, int index) {
if (index == 0) {
args_.scale_address = t->data<void>(); // replace inputs?
}
}
bool dispatch() {
return compute_fpga_scale(args_) == 0;
}
ElementwiseMulParam& param() { return param_; }
private:
ElementwiseMulParam param_;
ScaleArgs args_ = {0};
Tensor bias_tensor;
};
} // namespace zynqmp
} // namespace paddle
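
ElementwiseMulPE drives the FPGA scale op by flattening the whole tensor into a 1x1 "image" whose channel count is the element count rounded up to 32. A sketch of that rounding, under the same align_to_x assumption as above:

  inline int align32(int numel) { return ((numel + 31) / 32) * 32; }
  // e.g. a 100-element tensor is processed as 128 aligned "channels".
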
...@@ -37,7 +37,10 @@ class FullyConnectedPE : public PE { ...@@ -37,7 +37,10 @@ class FullyConnectedPE : public PE {
ConvParam& convParam_ = convPE_.param(); ConvParam& convParam_ = convPE_.param();
Tensor* input = param_.input; Tensor* input = param_.input;
convParam_.input = param_.input; convParam_.input = param_.input;
num_ = param_.input->shape().num();
convParam_.output = param_.output; convParam_.output = param_.output;
convParam_.groups = 1; convParam_.groups = 1;
convParam_.strides = {1, 1}; convParam_.strides = {1, 1};
convParam_.paddings = {0, 0}; convParam_.paddings = {0, 0};
...@@ -63,7 +66,6 @@ class FullyConnectedPE : public PE { ...@@ -63,7 +66,6 @@ class FullyConnectedPE : public PE {
new_filter_data[i * chw + j] = scale; new_filter_data[i * chw + j] = scale;
} }
} }
conv_filter->flush(); conv_filter->flush();
convParam_.filter = conv_filter; convParam_.filter = conv_filter;
...@@ -89,6 +91,8 @@ class FullyConnectedPE : public PE { ...@@ -89,6 +91,8 @@ class FullyConnectedPE : public PE {
private: private:
FullyConnectedParam param_; FullyConnectedParam param_;
ConvPE convPE_; ConvPE convPE_;
Tensor tempOut_;
int num_ = 1;
}; };
} // namespace zynqmp } // namespace zynqmp
} // namespace paddle } // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "lite/backends/arm/math/sgemm.h"
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp"
#include "lite/backends/fpga/KD/pes/fully_connected_pe.hpp"
#include "lite/backends/fpga/KD/pes/relu_pe.hpp"
#include "lite/api/paddle_place.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace zynqmp {
struct GRUTensors {
Tensor* gate;
Tensor* pre_output;
Tensor* output;
Tensor* reset_output;
};
class GRUPE : public PE {
public:
bool init() {
// Tensor* output = param_.output;
// output->setAligned(true);
// output->setDataLocation(Device);
return true;
}
void apply() {
auto hidden = param_.hidden;
// auto hidden_dims = hidden->dims();
int frame_size = hidden->shape().channel();
zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
float16* prev_hidden_data =
prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
// set previous hidden data to 0;
memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
// copy 2/3 weight from param.weight;
zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memset(weight_data, 0, weight_shape.numel() * sizeof(float));
weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memcpy(weight_data,
param_.weight->data<float>(),
weight_shape.numel() * sizeof(float));
Shape gate_shape(zynqmp::NC, {1, frame_size * 2});
gate_ping_.mutableData<void>(FP32, gate_shape);
gate_pong_.mutableData<void>(FP16, gate_shape);
zynqmp::FullyConnectedParam& pre_out_param = pre_out_pe_.param();
pre_out_param.input = &prev_hidden_;
pre_out_param.output = &gate_pong_;
pre_out_param.filter = &weight_;
pre_out_param.bias = &gate_ping_;
pre_out_pe_.init();
pre_out_pe_.apply();
// // ============= C
// ElementwiseAddParam& bias_add_param = bias_ew_pe_.param();
// bias_add_param.inputs = {&pre_output_, &pre_input_};
// bias_add_param.output = &pre_input_;
// bias_ew_pe_.init();
// bias_ew_pe_.apply();
// // ====================
// Shape state_weight_shape(NC,{frame_size, frame_size});
// float* state_weight_data = state_weight_.mutableData<float>(FP32,
// state_weight_shape);
// memcpy(state_weight_data, weight_data + 2 * frame_size * frame_size,
// state_weight_shape.numel() * sizeof(float));
// FullyConnectedParam& reset_out_param = reset_out_pe_.param();
// reset_out_param.input = &prev_hidden;
// reset_out_param.output = &gate_ping;
// reset_out_param.filter = &state_weight_;
// // ============== unit reset;
// update_gate_.mutableData<void>(FP16, pre_input_shape);
// InputParam& relu_param = update_relu_pe_.param();
// relu_param.input = &tempTensor;
// relu_param.output = &update_gate_;
// update_relu_pe_.init();
// update_relu_pe_.apply();
reset_gate_.mutableData<void>(FP16, hidden_shape);
prev_hidden_.mutableData<void>(FP16, hidden_shape);
reset_hidden_.mutableData<void>(FP16, hidden_shape);
// InputParam& reset_param = reset_relu_pe_.param();
// reset_param.input = &tempTensor;
// reset_param.output = &reset_gate_;
// reset_relu_pe_.init();
// reset_relu_pe_.apply();
// float16* prev_data = prev_.mutableData<float16>(FP16, pre_input_shape);
// memset(prev_data, 0, (pre_input_shape.numel() + 32) * sizeof(float16));
// // TODO
// reset_hidden_prev_.mutableData<float16>(FP16, pre_input_shape);
ElementwiseMulParam& mul_param = mul_pe_.param();
mul_param.inputs = {&reset_gate_, &prev_hidden_};
mul_param.output = &reset_hidden_;
mul_pe_.init();
mul_pe_.apply();
// ==============
}
bool dispatch() { return true; }
void gru_unit_reset_act(const lite_api::ActivationType active_gate,
GRUTensors& value, // NOLINT
int frame_size,
int batch_size) {
int stride_update = 3 * frame_size;
int stride_cell_state = 3 * frame_size;
int stride_hidden_prev = frame_size;
int stride_hidden = frame_size;
// Tensor* gate = value.gate;
// value.gate->saveToFile("value_input.txt");
float* update_gate_data = gate_ping_.data<float>();
float* reset_gate_data = update_gate_data + frame_size;
for (int b = 0; b < batch_size; b++) {
// memcpy(tempTensor.data<void>(), reset_gate_data, gate->shape().numel()
// * sizeof(float));
// tempTensor.flush();
Tensor tmp;
Shape s(NC, {1, frame_size});
float* tmp_data = tmp.mutableData<float>(FP32, s);
for (int i = 0; i < frame_size; i++) {
// f(x) = x / (1 + abs(x))?
update_gate_data[i] =
lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(
update_gate_data[i]);
reset_gate_data[i] =
lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(
reset_gate_data[i]);
}
memcpy(tmp_data, reset_gate_data, frame_size * sizeof(float));
tmp.flush();
reset_gate_.copyFrom(&tmp);
// reset_gate_.copyFrom(&tempTensor);
Tensor* hidden_prev = value.pre_output;
if (hidden_prev) {
// memcpy(prev_data, )
// TODO(chonwhite): change to pre_out;
prev_hidden_.copyFrom(value.pre_output);
prev_hidden_.saveToFile("prev_.txt");
}
// // 4.0 reset_date * hidden_prev;
// // reset_hidden_prev[i] = reset_gate[i] * prev;
mul_pe_.dispatch();
reset_hidden_.saveToFile("reset_hidden_.txt");
update_gate_data += stride_update;
reset_gate_data += stride_update;
// reset_hidden_prev += stride_hidden;// TODO
}
}
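
gru_unit_reset_act applies a sigmoid to both the update and reset gates, then multiplies the reset gate element-wise into the previous hidden state (the multiply runs on the FPGA via mul_pe_ in the code above). A CPU sketch of the same math:

  #include <cmath>

  inline float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

  void gru_reset(float* update, float* reset, const float* prev_hidden,
                 float* reset_hidden, int frame_size) {
    for (int i = 0; i < frame_size; ++i) {
      update[i] = sigmoid(update[i]);
      reset[i] = sigmoid(reset[i]);
      reset_hidden[i] = reset[i] * prev_hidden[i];  // r * h_prev
    }
  }
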
void gru_unit_out_act(const lite_api::ActivationType active_node,
bool origin_mode,
GRUTensors& value, // NOLINT
int frame_size,
int batch_size) {
// int stride_update = 3 * frame_size;
// int stride_cell_state = 3 * frame_size;
// int stride_hidden_prev = frame_size;
// int stride_hidden = frame_size;
// Tensor* hidden = value.output_value;
// float* hidden_prev = nullptr;
// if (hidden) {
// hidden_prev = hidden->data<float>();
// }
// float* cell_state = value.gate->data<float>() + 2 * frame_size;
// float* updata_gate = value.gate->data<float>();
// // float* reset_gate_data = update_gate_data + frame_size;
// float prev = 0.0f;
// for (int b = 0; b < batch_size; ++b) {
// if (origin_mode) {
// // for (int i = 0; i < frame_size; i++) {
// // float prev = 0;
// // if (hidden_prev) {
// // prev = hidden_prev[i];
// // }
// // cell_state[i] =
// lite::arm::math::active_f32<kSigmoid>(cell_state[i]);
// // hidden[i] =
// // cell_state[i] * (1.f - updata_gate[i]) + updata_gate[i] *
// prev;
// // }
// } else {
// for (int i = 0; i < frame_size; ++i) {
// cell_state[i] =
// lite::arm::math::active_f32<lite_api::ActivationType::kRelu>(cell_state[i]);
// if (hidden_prev) {
// prev = hidden_prev[i];
// }
// float hidden_value =
// prev * (1.f - updata_gate[i]) + updata_gate[i] * cell_state[i];
// hidden_prev[i] = hidden_value;
// std::cout << "hidden_value::" << hidden_value << std::endl;
// }
// }
// updata_gate += stride_update;
// cell_state += stride_cell_state;
// hidden_prev += frame_size;
// }
}
void copy_input(GRUTensors& value) { // NOLINT
float max = find_max(*(value.gate));
gate_ping_.mutableData<void>(FP32, value.gate->shape());
gate_ping_.copyFrom(value.gate);
// update input pointer?
// gate_.readFromFile("input/in.txt");
// // pre_input_.saveToFile("pppp_in.txt");
// gate_.scale()[0] = max / 127;
// gate_.scale()[1] = 127 / max;
// gate_.printScale("pre_input_");
// gate_.saveToFile("pre_input_.txt");
// pre_out_pe_.dispatch();
// pre_output_.saveToFile("pp_out.txt");
}
void GRUCOmpute(GRUTensors& value, // NOLINT
int frame_size,
int batch_size,
const lite_api::ActivationType active_node,
const lite_api::ActivationType active_gate,
bool origin_mode) {
copy_input(value);
if (value.pre_output) {
// copy by batch;
pre_out_pe_.dispatch();
gate_ping_.copyFrom(&gate_pong_);
}
gru_unit_reset_act(active_gate, value, frame_size, batch_size);
// if (value.pre_output) {
// // state weight;
// reset_out_pe_.dispatch();
// }
// gru_unit_out_act(active_node, origin_mode, value, frame_size,
// batch_size);
}
GRUParam& param() { return param_; }
// Tensor* preOutput() {
// return &pre_output_;
// }
// Tensor* gate() {
// return &gate_;
// }
Tensor* updateGate() { return &update_gate_; }
Tensor* resetGate() { return &reset_gate_; }
private:
GRUParam param_;
zynqmp::Tensor gate_ping_;
zynqmp::Tensor gate_pong_;
zynqmp::Tensor bias_;
zynqmp::Tensor weight_;
zynqmp::Tensor state_weight_;
// =================================
zynqmp::Tensor update_gate_;
zynqmp::Tensor reset_gate_;
zynqmp::Tensor cell_state_;
zynqmp::Tensor prev_hidden_;
zynqmp::Tensor reset_hidden_;
Tensor tempTensor;
// =================================
ReluPE update_relu_pe_;
ReluPE reset_relu_pe_;
zynqmp::ElementwiseMulPE mul_pe_;
zynqmp::FullyConnectedPE pre_out_pe_;
zynqmp::FullyConnectedPE reset_out_pe_;
zynqmp::ElementwiseAddPE bias_ew_pe_;
};
} // namespace zynqmp
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/backends/arm/math/gru_utils.h"
namespace paddle {
namespace lite {
namespace fpga {}
}  // namespace lite
}  // namespace paddle
...@@ -25,6 +25,8 @@ class OutputPE : public PE { ...@@ -25,6 +25,8 @@ class OutputPE : public PE {
bool init() { bool init() {
Tensor* output = param_.output; Tensor* output = param_.output;
output->setAligned(false); output->setAligned(false);
DLEngine::get_instance().out_data = reinterpret_cast<float*>(
fpga_malloc(output->shape().numel() * sizeof(float)));
return true; return true;
} }
...@@ -41,6 +43,15 @@ class OutputPE : public PE { ...@@ -41,6 +43,15 @@ class OutputPE : public PE {
} else { } else {
output->copyFrom(input); output->copyFrom(input);
} }
//
output->syncToCPU();
if (DLEngine::get_instance().out_data == nullptr) {
DLEngine::get_instance().out_data = reinterpret_cast<float*>(
fpga_malloc(output->shape().numel() * sizeof(float)));
}
memcpy(DLEngine::get_instance().out_data,
output->data<void>(),
output->shape().numel() * sizeof(float));
return true; return true;
} }
......
...@@ -35,24 +35,25 @@ class PoolingPE : public PE { ...@@ -35,24 +35,25 @@ class PoolingPE : public PE {
Tensor* input = param_.input; Tensor* input = param_.input;
Tensor* output = param_.output; Tensor* output = param_.output;
uint32_t k_width = param_.kernelSize[0]; uint32_t k_height = param_.kernelSize[0];
uint32_t k_height = param_.kernelSize[1]; uint32_t k_width = param_.kernelSize[1];
if (param_.globalPooling) { if (param_.globalPooling) {
k_width = input->shape().width(); k_width = input->shape().width();
k_height = input->shape().height(); k_height = input->shape().height();
param_.kernelSize[0] = k_height;
param_.kernelSize[1] = k_width;
} }
PoolingArgs args = {0}; PoolingArgs args = {0};
args.mode = param_.type; args.mode = param_.type;
auto paddings = *param_.paddings;
args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height)); args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
args.image.address = input->data<float16>(); args.image.address = input->data<float16>();
args.image.channels = input->shape().channel(); args.image.channels = input->shape().channel();
args.image.height = input->shape().height(); args.image.height = input->shape().height();
args.image.width = input->shape().width(); args.image.width = input->shape().width();
args.image.pad_height = paddings[0]; args.image.pad_height = param_.paddings[0];
args.image.pad_width = paddings[2]; args.image.pad_width = param_.paddings[1];
args.image.scale_address = input->scale(); args.image.scale_address = input->scale();
args.output.address = output->mutableData<float16>(); args.output.address = output->mutableData<float16>();
args.output.scale_address = output->scale(); args.output.scale_address = output->scale();
...@@ -66,6 +67,10 @@ class PoolingPE : public PE { ...@@ -66,6 +67,10 @@ class PoolingPE : public PE {
use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 && use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
(k_width > 7 || k_height > 7); (k_width > 7 || k_height > 7);
// use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
// &&
// (k_width > 255 || k_height > 255);
use_cpu_ = param_.type == AVERAGE;
} }
void compute() { void compute() {
...@@ -74,16 +79,16 @@ class PoolingPE : public PE { ...@@ -74,16 +79,16 @@ class PoolingPE : public PE {
input->syncToCPU(); input->syncToCPU();
Tensor float_input; Tensor float_input;
// Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape()); float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input); float_input.copyFrom(input);
float16* data_out = output->data<float16>(); float16* data_out = output->data<float16>();
auto paddings = *param_.paddings;
int image_height = input->shape().height(); int image_height = input->shape().height();
int image_width = input->shape().width(); int image_width = input->shape().width();
int image_channels = input->shape().channel(); int image_channels = input->shape().channel();
int image_pad_h = paddings[0]; int image_pad_h = param_.paddings[0];
int image_pad_w = paddings[2]; int image_pad_w = param_.paddings[1];
int kernel_height = param_.kernelSize[1]; int kernel_height = param_.kernelSize[1];
int kernel_width = param_.kernelSize[0]; int kernel_width = param_.kernelSize[0];
int kernel_step_h = param_.strides[0]; int kernel_step_h = param_.strides[0];
...@@ -129,7 +134,7 @@ class PoolingPE : public PE { ...@@ -129,7 +134,7 @@ class PoolingPE : public PE {
output->flush(); output->flush();
} }
void cpu_compute() { void cpu_compute1() {
Tensor* input = param_.input; Tensor* input = param_.input;
Tensor* output = param_.output; Tensor* output = param_.output;
input->syncToCPU(); input->syncToCPU();
...@@ -138,7 +143,6 @@ class PoolingPE : public PE { ...@@ -138,7 +143,6 @@ class PoolingPE : public PE {
float_input.mutableData<float>(FP32, input->shape()); float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input); float_input.copyFrom(input);
float16* data_out = output->data<float16>(); float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1]; int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
float scale_max = 0; float scale_max = 0;
...@@ -154,13 +158,43 @@ class PoolingPE : public PE { ...@@ -154,13 +158,43 @@ class PoolingPE : public PE {
} }
output->scale()[0] = scale_max / 127.0f; output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max; output->scale()[1] = 127.0f / scale_max;
std::cout << "pool scale:" << scale_max / 127.0f << std::endl; output->flush();
}
void cpu_compute() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
Tensor float_input;
float* float_input_data =
float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
float scale_max = 0;
for (int i = 0; i < output->shape().channel(); i++) {
float sum = 0;
for (int j = 0; j < kernel_hw; j++) {
sum += float_input_data[i * kernel_hw + j];
}
float value = sum / kernel_hw;
data_out[i] = float_to_half(value);
scale_max = std::max(scale_max, std::abs(value));
}
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush(); output->flush();
} }
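
The new cpu_compute above handles the forced-CPU average path: one mean per output channel over the kernel window, with the running |max| feeding the FP16 output's dynamic scale. The per-channel reduction in isolation:

  // Mean over one channel's kernel window, using the flat layout of
  // float_input_data in the function above.
  float channel_mean(const float* in, int c, int kernel_hw) {
    float sum = 0.0f;
    for (int j = 0; j < kernel_hw; ++j) sum += in[c * kernel_hw + j];
    return sum / kernel_hw;
  }
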
bool dispatch() { bool dispatch() {
if (use_cpu_) { if (use_cpu_) {
// cpu_compute();
compute(); compute();
return true; return true;
} }
param_.input->syncToDevice(); param_.input->syncToDevice();
......
...@@ -253,9 +253,8 @@ bool PriorBoxPE::dispatch() { ...@@ -253,9 +253,8 @@ bool PriorBoxPE::dispatch() {
if (cachedBoxes_ == nullptr) { if (cachedBoxes_ == nullptr) {
cachedBoxes_ = new Tensor(); cachedBoxes_ = new Tensor();
cachedVariances_ = new Tensor(); cachedVariances_ = new Tensor();
cachedBoxes_->mutableData<float16>(FP16, param_.outputBoxes->shape()); cachedBoxes_->mutableData<float>(FP32, param_.outputBoxes->shape());
cachedVariances_->mutableData<float16>(FP16, cachedVariances_->mutableData<float>(FP32, param_.outputVariances->shape());
param_.outputVariances->shape());
cachedBoxes_->setDataLocation(CPU); cachedBoxes_->setDataLocation(CPU);
cachedVariances_->setDataLocation(CPU); cachedVariances_->setDataLocation(CPU);
compute_prior_box(); compute_prior_box();
......
...@@ -14,11 +14,16 @@ limitations under the License. */ ...@@ -14,11 +14,16 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include "lite/backends/fpga/KD/pe.hpp" #include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp" #include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp"
#include "lite/backends/fpga/KD/tensor.hpp"
namespace paddle { namespace paddle {
namespace zynqmp { namespace zynqmp {
class ScalePE : public PE { class ScalePE : public PE {
public: public:
inline int gcd(int a, int b) { inline int gcd(int a, int b) {
...@@ -42,6 +47,8 @@ class ScalePE : public PE { ...@@ -42,6 +47,8 @@ class ScalePE : public PE {
Tensor* input = param_.input; Tensor* input = param_.input;
Tensor* output = param_.output; Tensor* output = param_.output;
Shape& input_shape = input->shape(); Shape& input_shape = input->shape();
DepthwiseConvParam& dw_param = dw_pe_.param();
int channel = input_shape.channel(); int channel = input_shape.channel();
int repeat = 1; int repeat = 1;
int alignment = 16; int alignment = 16;
...@@ -51,13 +58,24 @@ class ScalePE : public PE { ...@@ -51,13 +58,24 @@ class ScalePE : public PE {
int c_lcm = lcm(channel, alignment); int c_lcm = lcm(channel, alignment);
repeat = c_lcm / (channel); repeat = c_lcm / (channel);
} }
// FPGA limits: fall back to the CPU implementation when H > 2047, W > 1023, or W*C > 65536
Shape shape(N, {channel * repeat}); Shape shape(N, {channel * repeat});
param_.alignedBias()->mutableData<float16>(FP16, shape);
param_.alignedScale()->mutableData<float16>(FP16, shape);
float16* bias_data = param_.alignedBias()->data<float16>(); float* filter_data = filter.mutableData<float>(FP32, shape);
float16* scale_data = param_.alignedScale()->data<float16>(); std::fill_n(filter_data, input->shape().channel(), 1.0f);
Tensor* scale = dw_param.scale();
float16* scale_data = scale->mutableData<float16>(FP16, shape);
// memcpy(scale_data, param_.scale->data<float>(), input->shape().channel()
// * sizeof(float));
Tensor* bias = dw_param.bias();
float16* bias_data = bias->mutableData<float16>(FP16, shape);
std::fill_n(bias_data, input->shape().channel(), 0);
if (param_.scale->dataType() == FP32) {
// std::cout << "scale dataType FP32:" << std::endl;
if (param_.bias != nullptr) { if (param_.bias != nullptr) {
float* bias_data_float = param_.bias->data<float>(); float* bias_data_float = param_.bias->data<float>();
for (int i = 0; i < repeat; i++) { for (int i = 0; i < repeat; i++) {
...@@ -82,39 +100,112 @@ class ScalePE : public PE { ...@@ -82,39 +100,112 @@ class ScalePE : public PE {
scale_data[i * length + j] = value; scale_data[i * length + j] = value;
} }
} }
} else {
if (param_.bias != nullptr) {
float16* bias_data_float = param_.bias->data<float16>();
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
float16 value = bias_data_float[j];
bias_data[i * length + j] = value;
}
}
} else {
float16 zero = float_to_half(0.0f);
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
bias_data[i * length + j] = zero;
}
}
}
param_.alignedScale()->flush(); float16* scale_data_float = param_.scale->data<float16>();
param_.alignedBias()->flush(); for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
float16 value = scale_data_float[j];
scale_data[i * length + j] = value;
}
}
}
// if (param_.bias != nullptr) {
// memcpy(bias_data, param_.bias->data<float>(), input->shape().channel()
// * sizeof(float));
// }
int wc = input_shape.width() * input_shape.channel(); dw_param.input = param_.input;
int wc_aligned = align_image(wc); dw_param.output = param_.output;
dw_param.filter = &filter;
ScaleArgs& args = param_.args; dw_param.strides = {1, 1};
args.scale_address = param_.alignedScale()->data<void>(); dw_param.paddings = {0, 0};
args.bias_address = param_.alignedBias()->data<void>(); dw_param.kernelSize = {1, 1};
args.wc_alignment = wc_aligned; dw_param.dilations = {1, 1};
args.channel_alignment = channel * repeat;
args.image.address = input->data<void>(); dw_pe_.init();
args.image.scale_address = input->scale(); dw_pe_.apply();
args.image.channels = channel; }
args.image.height = input_shape.height();
args.image.width = input_shape.width(); void cpu_compute() {
args.image.pad_width = 0; Tensor* input = param_.input;
args.image.pad_height = 0; Tensor* output = param_.output;
args.output.address = output->data<void>(); Tensor float_input;
args.output.scale_address = output->scale(); float* image_addr = float_input.mutableData<float>(FP32, input->shape());
input->syncToCPU();
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
float* scale_data = param_.scale->data<float>();
int wh = input->shape().width() * input->shape().height();
float16* in_data = input->data<float16>();
float max = 0;
for (int i = 0; i < wh; i++) {
for (int c = 0; c < input->shape().channel(); c++) {
int index = i * input->shape().channel() + c;
float value = half_to_float(in_data[index]) * scale_data[c];
data_out[index] = float_to_half(value);
if (value < 0) {
value = -value;
}
if (value > max) {
max = value;
}
}
}
output->flush();
output->scale()[0] = max / 127.0f;
output->scale()[1] = 127.0f / max;
} }
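
cpu_compute walks the tensor in NHWC order, applies one FP32 scale per channel, and records the result's dynamic range as max/127. A self-contained sketch (raw pointers stand in for the Tensor API):

  #include <algorithm>
  #include <cmath>

  void scale_nhwc(const float* in, const float* scale, float* out,
                  int wh, int channel, float out_scale[2]) {
    float max = 0.0f;
    for (int i = 0; i < wh; ++i)
      for (int c = 0; c < channel; ++c) {
        float v = in[i * channel + c] * scale[c];
        out[i * channel + c] = v;
        max = std::max(max, std::abs(v));
      }
    out_scale[0] = max / 127.0f;  // dequantization step
    out_scale[1] = 127.0f / max;  // quantization step
  }
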
bool dispatch() { bool dispatch() {
// cpu_compute();
// return true;
if (param_.scale->dataType() == FP16) {
DepthwiseConvParam& dw_param = dw_pe_.param();
memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
param_.scale->data<float16>(),
param_.scale->shape().numel() * sizeof(float16));
dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
dw_param.quantizedFilter()->flush();
// apply();
}
param_.input->syncToDevice(); param_.input->syncToDevice();
return compute_fpga_scale(param_.args) == 0; return dw_pe_.dispatch();
} }
ScaleParam& param() { return param_; } ScaleParam& param() { return param_; }
private: private:
ScaleParam param_; ScaleParam param_;
Tensor filter;
DepthwiseConvPE dw_pe_;
}; };
} // namespace zynqmp } // namespace zynqmp
} // namespace paddle } // namespace paddle
...@@ -23,6 +23,7 @@ limitations under the License. */ ...@@ -23,6 +23,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace zynqmp { namespace zynqmp {
static struct None none_;
static struct NCHW nchw_; static struct NCHW nchw_;
static struct NHWC nhwc_; static struct NHWC nhwc_;
static struct NC nc_; static struct NC nc_;
...@@ -82,6 +83,9 @@ class Shape { ...@@ -82,6 +83,9 @@ class Shape {
void setLayoutType(LayoutType layout) { void setLayoutType(LayoutType layout) {
this->layoutType_ = layout; this->layoutType_ = layout;
switch (layout) { switch (layout) {
case None:
layout_ = &none_;
break;
case NCHW: case NCHW:
layout_ = &nchw_; layout_ = &nchw_;
break; break;
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <stdio.h> #include <stdio.h>
#include <unistd.h>
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>
...@@ -24,13 +25,10 @@ limitations under the License. */ ...@@ -24,13 +25,10 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
// #include "lite/core/tensor.h"
#include "lite/backends/fpga/KD/dl_engine.hpp" #include "lite/backends/fpga/KD/dl_engine.hpp"
#include "lite/backends/fpga/KD/float16.hpp" #include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/shape.hpp" #include "lite/backends/fpga/KD/shape.hpp"
// #include "lite/backends/fpga/KD/types.hpp"
namespace paddle { namespace paddle {
namespace zynqmp { namespace zynqmp {
...@@ -117,7 +115,8 @@ class Tensor { ...@@ -117,7 +115,8 @@ class Tensor {
template <typename Dtype> template <typename Dtype>
Dtype* mutableData() { Dtype* mutableData() {
size_t memorySize = shape_->memorySize(CellSize(dataType_)); size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
if (placeHolder_ != nullptr) { if (placeHolder_ != nullptr) {
if (memorySize > placeHolder_->memorySize()) { if (memorySize > placeHolder_->memorySize()) {
placeHolder_.reset(new PlaceHolder(memorySize)); placeHolder_.reset(new PlaceHolder(memorySize));
...@@ -241,6 +240,10 @@ class Tensor { ...@@ -241,6 +240,10 @@ class Tensor {
} }
} }
void setMemScale(float scale_factor) {
this->mem_scale_factor_ = scale_factor;
}
void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); } void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); }
void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) { void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) {
...@@ -276,9 +279,11 @@ class Tensor { ...@@ -276,9 +279,11 @@ class Tensor {
.height = 1, .height = 1,
.pad_width = 0u, .pad_width = 0u,
.pad_height = 0u}; .pad_height = 0u};
args.output = {
ImageOutputArgs output = {
.address = data<void>(), .scale_address = scale(), .address = data<void>(), .scale_address = scale(),
}; };
args.output = output;
src->syncToDevice(); src->syncToDevice();
size_t aligned_remainder = src->shape().numel() % 16; size_t aligned_remainder = src->shape().numel() % 16;
if (aligned_remainder > 0) { if (aligned_remainder > 0) {
...@@ -294,10 +299,16 @@ class Tensor { ...@@ -294,10 +299,16 @@ class Tensor {
this->invalidate(); this->invalidate();
} }
void flush() { fpga_flush(placeHolder_->data(), placeHolder_->memorySize()); } void flush() {
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
fpga_flush(placeHolder_->data(), memorySize);
}
void invalidate() { void invalidate() {
fpga_invalidate(placeHolder_->data(), placeHolder_->memorySize()); size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
fpga_invalidate(placeHolder_->data(), memorySize);
} }
void sync() { void sync() {
...@@ -337,6 +348,18 @@ class Tensor { ...@@ -337,6 +348,18 @@ class Tensor {
if (placeHolder_ == nullptr) { if (placeHolder_ == nullptr) {
return; return;
} }
std::cout << scale()[0] << " , " << scale()[1] << std::endl;
}
void printScale(std::string type) {
std::cout << type << " : "
<< std::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" +
std::to_string(shape_->width())
<< std::endl;
std::cout << type << " \n";
printScale();
} }
std::string dimsFileName() { std::string dimsFileName() {
...@@ -358,33 +381,14 @@ class Tensor { ...@@ -358,33 +381,14 @@ class Tensor {
saveToFile(path); saveToFile(path);
} }
friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) {
os << "tensor:"
<< "\n";
os << "dims: {";
for (int i = 0; i < tensor.shape().dimSize(); ++i) {
os << tensor.shape()[i] << " ";
}
os << "}\n";
for (int i = 0; i < tensor.shape().numel(); i++) {
float value = 0;
if (tensor.dataType() == FP32) {
value = tensor.data<float>()[i];
} else {
value = half_to_float(tensor.data<float16>()[i]);
}
os << value << " ";
}
os << "\n";
return os;
}
void saveToFile(std::string path) { void saveToFile(std::string path) {
syncToCPU(); syncToCPU();
invalidate();
std::ofstream ofs; std::ofstream ofs;
static int counter = 0; static int counter = 0;
std::string npath = std::to_string(counter) + "_" + path; std::string npath = std::to_string(counter) + "_" + path;
counter++; counter++;
std::cout << "======== saving file:" << npath << " ============\n";
save_file_with_name(npath); save_file_with_name(npath);
} }
...@@ -392,14 +396,16 @@ class Tensor { ...@@ -392,14 +396,16 @@ class Tensor {
// return; // return;
invalidate(); invalidate();
std::ofstream ofs; std::ofstream ofs;
ofs.open(path); ofs.open(path);
for (int i = 0; i < shape_->numel(); i++) { for (int i = 0; i < shape_->numel(); i++) {
float value = 0; float value = 0;
if (dataType_ == FP32) { if (dataType_ == FP32) {
value = data<float>()[i]; value = data<float>()[i];
} else { } else if (dataType_ == FP16) {
value = half_to_float(data<float16>()[i]); value = half_to_float(data<float16>()[i]);
} else {
value = data<int8_t>()[i];
} }
ofs << value << std::endl; ofs << value << std::endl;
} }
...@@ -415,6 +421,7 @@ class Tensor { ...@@ -415,6 +421,7 @@ class Tensor {
int num = shape_->numel(); int num = shape_->numel();
invalidate(); invalidate();
float max = 0.0f; float max = 0.0f;
if (dataType_ == FP16) {
float16* data = mutableData<float16>(); float16* data = mutableData<float16>();
for (int i = 0; i < num; ++i) { for (int i = 0; i < num; ++i) {
float value = 0; float value = 0;
...@@ -422,11 +429,41 @@ class Tensor { ...@@ -422,11 +429,41 @@ class Tensor {
max = std::max(std::abs(value), max); max = std::max(std::abs(value), max);
data[i] = float_to_half(value); data[i] = float_to_half(value);
} }
} else {
float* data = mutableData<float>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = value;
}
}
flush(); flush();
placeHolder_->scale_[0] = max / 127.0f; placeHolder_->scale_[0] = max / 127.0f;
placeHolder_->scale_[1] = 127.0f / max; placeHolder_->scale_[1] = 127.0f / max;
} }
friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) {
os << "tensor:"
<< "\n";
os << "dims: {";
for (int i = 0; i < tensor.shape().dimSize(); ++i) {
os << tensor.shape()[i] << " ";
}
os << "}\n";
for (int i = 0; i < tensor.shape().numel(); i++) {
float value = 0;
if (tensor.dataType() == FP32) {
value = tensor.data<float>()[i];
} else {
value = half_to_float(tensor.data<float16>()[i]);
}
os << value << " ";
}
os << "\n";
return os;
}
~Tensor() { ~Tensor() {
if (shape_ != nullptr) { if (shape_ != nullptr) {
delete shape_; delete shape_;
...@@ -436,6 +473,7 @@ class Tensor { ...@@ -436,6 +473,7 @@ class Tensor {
private: private:
int offset = 0; int offset = 0;
float mem_scale_factor_ = 1.0f;
std::shared_ptr<PlaceHolder> placeHolder_; std::shared_ptr<PlaceHolder> placeHolder_;
Shape* shape_ = nullptr; Shape* shape_ = nullptr;
DataType dataType_ = FP32; DataType dataType_ = FP32;
......
...@@ -95,16 +95,14 @@ void TensorLite::CopyDataFrom(const TensorLite &other) { ...@@ -95,16 +95,14 @@ void TensorLite::CopyDataFrom(const TensorLite &other) {
dims_ = other.dims_; dims_ = other.dims_;
target_ = other.target_; target_ = other.target_;
lod_ = other.lod_; lod_ = other.lod_;
// memory_size_ = other.memory_size_; auto dt = zynq_tensor_->dataType();
// buffer_->CopyDataFrom(*other.buffer_, memory_size_);
zynq_tensor_->mutableData<void>(other.zynq_tensor_->dataType(),
other.zynq_tensor_->shape());
}
// template <typename T> auto shape = other.zynq_tensor_->shape();
// void TensorLite::mutable_data_internal() {
// } Resize(other.dims());
zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape);
this->ZynqTensor()->copyFrom(other.ZynqTensor());
}
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -106,7 +106,7 @@ class TensorLite { ...@@ -106,7 +106,7 @@ class TensorLite {
// For other devices, T and R may be the same type. // For other devices, T and R may be the same type.
template <typename T, typename R = T> template <typename T, typename R = T>
const R *data() const { const R *data() const {
return zynq_tensor_->data<R>(); return zynq_tensor_->data<R>() + offset_;
} }
void Resize(const DDimLite &ddim) { dims_ = ddim; } void Resize(const DDimLite &ddim) { dims_ = ddim; }
...@@ -125,6 +125,7 @@ class TensorLite { ...@@ -125,6 +125,7 @@ class TensorLite {
bool persistable() const { return persistable_; } bool persistable() const { return persistable_; }
void set_persistable(bool persistable) { persistable_ = persistable; } void set_persistable(bool persistable) { persistable_ = persistable; }
// T is the data type and R is the return type // T is the data type and R is the return type
// For OpenCL, the return type can be cl::Buffer // For OpenCL, the return type can be cl::Buffer
// and the data type can be float/int8_t. // and the data type can be float/int8_t.
...@@ -147,6 +148,8 @@ class TensorLite { ...@@ -147,6 +148,8 @@ class TensorLite {
size_t memory_size() const { return zynq_tensor_->memorySize(); } size_t memory_size() const { return zynq_tensor_->memorySize(); }
size_t offset() const { return offset_; }
bool IsInitialized() const { return buffer_->data(); } bool IsInitialized() const { return buffer_->data(); }
// Other share data to this. // Other share data to this.
...@@ -157,8 +160,14 @@ class TensorLite { ...@@ -157,8 +160,14 @@ class TensorLite {
template <typename T> template <typename T>
TensorLite Slice(int64_t begin, int64_t end) const; TensorLite Slice(int64_t begin, int64_t end) const;
template <typename T>
void Slice(TensorLite &dst, int64_t begin, int64_t end) const; // NOLINT
TargetType target() const { return target_; } TargetType target() const { return target_; }
// template <typename T>
// TensorLite Slice(int64_t begin, int64_t end) const;
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; } zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) { friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
...@@ -173,16 +182,21 @@ class TensorLite { ...@@ -173,16 +182,21 @@ class TensorLite {
private: private:
TargetType target_{TargetType::kHost}; TargetType target_{TargetType::kHost};
// precision_ and persistable_ are only used for persistable vars.
// If your tensor wants to be saved and loaded correctly, you must
// set values of precision_ and persistable_ after updating it.
// If your tensor is just a temp tensor, such as activations,
// you can ignore these two attributes.
PrecisionType precision_{PrecisionType::kUnk};
bool persistable_{false};
DDimLite dims_; DDimLite dims_;
std::shared_ptr<Buffer> buffer_; std::shared_ptr<Buffer> buffer_;
LoD lod_; LoD lod_;
size_t memory_size_{}; size_t memory_size_{};
size_t offset_{0}; size_t offset_{0};
PrecisionType precision_{PrecisionType::kUnk};
bool persistable_{false};
zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor(); zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
template <typename T> template <typename T>
...@@ -197,6 +211,9 @@ R *TensorLite::mutable_data() { ...@@ -197,6 +211,9 @@ R *TensorLite::mutable_data() {
} }
zynqmp::LayoutType layout_type = zynqmp::NCHW; zynqmp::LayoutType layout_type = zynqmp::NCHW;
switch (v.size()) { switch (v.size()) {
case 0:
layout_type = zynqmp::None;
break;
case 1: case 1:
layout_type = zynqmp::N; layout_type = zynqmp::N;
break; break;
...@@ -228,24 +245,63 @@ R *TensorLite::mutable_data(TargetType target) { ...@@ -228,24 +245,63 @@ R *TensorLite::mutable_data(TargetType target) {
return mutable_data<T>(); return mutable_data<T>();
} }
template <typename TensorT>
bool TensorCompareWith(const TensorT &a, const TensorT &b) {
if (a.dims() != b.dims()) return false;
if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false;
return true;
}
template <typename T> template <typename T>
TensorLite TensorLite::Slice(int64_t begin, int64_t end) const { TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
CHECK_GE(begin, 0);
CHECK_LE(end, dims_[0]);
CHECK_LT(begin, end);
if (dims_[0] == 1) {
return *this;
} else {
int64_t base = numel() / dims_[0]; int64_t base = numel() / dims_[0];
TensorLite dst; TensorLite dst;
dst.buffer_ = buffer_; // dst.buffer_ = buffer_;
// dst.zynq_tensor_ = zynq_tensor_;
dst.target_ = target_; dst.target_ = target_;
auto dst_dims = dims_; auto dst_dims = dims_;
dst_dims[0] = end - begin; dst_dims[0] = end - begin;
dst.Resize(dst_dims); dst.Resize(dst_dims);
dst.offset_ = offset_ + static_cast<size_t>(begin * base) * sizeof(T); void *dst_data = dst.mutable_data<T>();
T *src_data = const_cast<T *>(data<T>());
memcpy(dst_data,
src_data + static_cast<size_t>(begin * base) * sizeof(T),
dst_dims.production() * sizeof(T));
dst.ZynqTensor()->saveToFile("_slice", true);
// dst.offset_ = offset_ + static_cast<size_t>(begin * base) * sizeof(T);
return dst; return dst;
}
} }
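
Slice now deep-copies rows [begin, end) instead of aliasing the buffer through an offset; each row is a contiguous block of base = numel() / dims_[0] elements. The copy, reduced to its essentials:

  #include <cstdint>
  #include <cstring>

  template <typename T>
  void slice_rows(const T* src, T* dst, int64_t base, int64_t begin,
                  int64_t end) {
    // Rows are contiguous, so the slice is a single span copy.
    std::memcpy(dst, src + begin * base,
                static_cast<size_t>((end - begin) * base) * sizeof(T));
  }
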
template <typename T>
void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const {
CHECK_GE(begin, 0);
CHECK_LE(end, dims_[0]);
CHECK_LT(begin, end);
dst.target_ = target_;
auto dst_dims = dims_;
dst_dims[0] = end - begin;
dst.Resize(dst_dims);
void *dst_data = dst.mutable_data<T>();
int64_t base = numel() / dims_[0];
T *src_data = const_cast<T *>(data<T>());
memcpy(dst_data,
src_data + static_cast<size_t>(begin * dst_dims.production()),
dst_dims.production() * sizeof(T));
}
template <typename TensorT>
bool TensorCompareWith(const TensorT &a, const TensorT &b) {
if (a.dims() != b.dims()) return false;
if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false;
return true;
}
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle