Commit 2e2c9d4b authored by MyPandaShaoxiang

fix: update backend fpga patch

Parent 41826a31
......@@ -3,13 +3,35 @@ if (NOT LITE_WITH_FPGA)
endif()
set(LITE_FPGA_KD_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD")
set(LITE_FPGA_KD_LLAPI_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD/llapi")
set(LITE_FPGA_KD_PE_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD/pes")
set(LITE_FPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga")
message("fpga_kd_path ${LITE_FPGA_KD_PATH}")
message("fpga_path ${LITE_FPGA_PATH}")
file(GLOB_RECURSE KD_CPP *.cpp *.cc)
file(GLOB KD_CPP "${LITE_FPGA_KD_PATH}/*.cpp")
file(GLOB PE_CPP "${LITE_FPGA_KD_PE_PATH}/*.cpp")
file(GLOB LLAPI_CPP "${LITE_FPGA_KD_LLAPI_PATH}/*.cpp")
file(GLOB FPGA_CPP "${LITE_FPGA_PATH}/*.cc")
cc_library(kernel_fpga SRCS ${KD_CPP} ${FPGA_CPP})
set(FPGA_ALL_CPP "")
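# Re-root each globbed absolute path as a path relative to this directory,
# since the cc_library() SRCS below are resolved against the current source dir.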
FOREACH(FILE_PATH ${KD_CPP})
STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
list(APPEND FPGA_ALL_CPP KD/${FILE_NAME})
ENDFOREACH(FILE_PATH)
FOREACH(FILE_PATH ${PE_CPP})
STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
list(APPEND FPGA_ALL_CPP KD/pes/${FILE_NAME})
ENDFOREACH(FILE_PATH)
FOREACH(FILE_PATH ${LLAPI_CPP})
STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
list(APPEND FPGA_ALL_CPP KD/llapi/${FILE_NAME})
ENDFOREACH(FILE_PATH)
FOREACH(FILE_PATH ${FPGA_CPP})
STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
list( APPEND FPGA_ALL_CPP ${FILE_NAME})
ENDFOREACH(FILE_PATH)
message("fpga kd: ${FPGA_ALL_CPP}")
cc_library(kernel_fpga SRCS ${FPGA_ALL_CPP})
#cc_library(kernel_fpga SRCS ${KD_CPP} ${FPGA_CPP})
cc_library(lite_tensor_fpga SRCS lite_tensor.cc DEPS memory)
cc_library(fpga_target_wrapper SRCS ${LITE_FPGA_PATH}/target_wrapper.cc DEPS kernel_fpga)
cc_library(fpga_target_wrapper SRCS target_wrapper.cc DEPS kernel_fpga)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
// #include "lite/backends/fpga/lite_tensor.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
#define FPGA_PRINT_TENSOR
class Debugger {
public:
static Debugger& get_instance() {
static Debugger s_instance;
return s_instance;
}
void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
// tensor->printScale();
if (op_type != "conv") {
// tensor->saveToFile(op_type, true);
}
}
private:
std::unordered_map<std::string, bool> op_config;
Debugger() {
op_config["concat"] = true;
op_config["conv"] = true;
op_config["crop"] = true;
}
};
inline void chw_to_hwc(Tensor* t, float* dst) {
int num = t->dims()[0];
int channel = t->dims()[1];
int height = 1;
int width = 1;
if (t->dims().size() > 2) {
height = t->dims()[2];
}
if (t->dims().size() > 3) {
width = t->dims()[3];
}
// int width = t->dims()[3];
const float* chw_data = t->data<float>();
float* hwc_data = dst;
int chw = channel * height * width;
int wc = width * channel;
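// NCHW -> NHWC: element (n, c, h, w) lands at n * chw + h * wc + w * channel + c.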
int index = 0;
for (int n = 0; n < num; n++) {
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index];
index++;
}
}
}
}
}
inline void read_from_file(lite::Tensor* t, const std::string& path) {
std::ifstream file_stream;
file_stream.open(path);
if (!file_stream) {
return;
}
float* data = t->mutable_data<float>();
int num = t->numel();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
data[i] = value;
}
// flush();
}
inline void save_float(float* data, const std::string& name, int len) {
// return;
static int counter = 0;
std::string old_string = std::to_string(counter);
std::string new_string =
std::string(3 - old_string.length(), '0') + old_string;
std::string file = "arm_" + new_string + name;
counter++;
std::cout
<< "-------------------------- saving file: --------------------------"
<< file << std::endl;
std::ofstream ofs;
ofs.open(file);
// float* data = dst;
for (int i = 0; i < len; i++) {
float value = data[i];
ofs << value << std::endl;
}
ofs.close();
}
inline void save_tensor(lite::Tensor* t,
const std::string& name,
bool convert = true) {
float* data = const_cast<float*>(t->data<float>());
float* dst = new float[t->numel()];
if (convert) {
chw_to_hwc(t, dst);
data = dst;
}
save_float(data, name, t->numel());
delete[] dst;
}
inline void save_tensor(const lite::Tensor* t,
const std::string& name,
bool convert = true) {
// return;
float* data = const_cast<float*>(t->data<float>());
float* dst = new float[t->numel()];
if (convert) {
chw_to_hwc(const_cast<lite::Tensor*>(t), dst);
data = dst;
}
save_float(data, name, t->numel());
delete[] dst;
}
} // namespace lite
} // namespace paddle
......@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/dl_engine.hpp"
namespace paddle {
namespace zynqmp {
DLEngine::DLEngine() {
open_device();
struct DeviceInfo info;
int ret = get_device_info(info);
filter::set_filter_capacity(info.filter_cap);
int ret = get_device_info(info_);
filter::set_filter_capacity(info_.filter_cap);
filter::set_colunm(info_.colunm);
}
} // namespace zynqmp
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once
#include <stdio.h>
#include "lite/backends/fpga/KD/llapi/filter.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -29,8 +28,15 @@ class DLEngine {
return s_instance;
}
DeviceInfo& deviceInfo();
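// device_type encodes the ZU series in its hundreds digit, e.g. 3xx => ZU3.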
bool isZU3() { return info_.device_type / 100 == 3; }
float* out_data = nullptr;
private:
DLEngine();
DeviceInfo info_;
};
} // namespace zynqmp
} // namespace paddle
......@@ -22,6 +22,7 @@ namespace paddle {
namespace zynqmp {
enum LayoutType {
None,
N,
NC,
NCHW,
......@@ -39,6 +40,15 @@ class Layout {
virtual int elementCount(const std::vector<int>& dims) = 0;
};
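// Scalar/no-layout case: one logical element whose aligned storage is padded to 16.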
struct None : Layout {
int numIndex() { return -1; }
int channelIndex() { return -1; }
int heightIndex() { return -1; }
int widthIndex() { return -1; }
int alignedElementCount(const std::vector<int>& dims) { return 16; }
virtual int elementCount(const std::vector<int>& dims) { return 1; }
};
struct NCHW : Layout {
int numIndex() { return 0; }
int channelIndex() { return 1; }
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include <memory.h>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/bias_scale.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -54,7 +55,7 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
*data_in = ptr_aligned;
}
void interleave(float **data_in, int num_after_alignment) {
size_t interleave(float **data_in, int num_after_alignment) {
float *ptr_uninterleaved = *data_in;
float *ptr_interleaved =
(float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT
......@@ -69,6 +70,7 @@ void interleave(float **data_in, int num_after_alignment) {
fpga_free(ptr_uninterleaved);
*data_in = ptr_interleaved;
return 2 * num_after_alignment * sizeof(float);
}
void format_bias_scale_array(float **bias_scale_array,
......@@ -78,8 +80,9 @@ void format_bias_scale_array(float **bias_scale_array,
int div_num = (num + element_num_per_division - 1) / element_num_per_division;
int element_num_after_division =
align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
interleave(bias_scale_array, div_num * element_num_after_division);
fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float));
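// interleave() returns the total byte count it wrote, so the flush covers every division.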
size_t mem =
interleave(bias_scale_array, div_num * element_num_after_division);
fpga_flush(*bias_scale_array, mem);
}
void format_bias_array(float **bias_array, int num) {
float *ptr_unaligned = *bias_array;
......
......@@ -19,7 +19,7 @@ namespace zynqmp {
namespace bias_scale {
void align_element(float** data_in, int num_per_div_before_alignment, int num);
void interleave(float** data_in, int num_after_alignment);
size_t interleave(float** data_in, int num_after_alignment);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division,
int num);
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "lite/backends/fpga/KD/llapi/filter.h"
#include <memory.h>
#include <algorithm>
#include <fstream>
#include <string>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -23,11 +25,42 @@ namespace zynqmp {
namespace filter {
static int FILTER_SIZE = 2048;
static int COLUMN = 4;
void saveToFile(std::string name, void* data_in, int size) {
// std::ofstream ofs;
// ofs.open(name);
// int8_t* data = (int8_t*)data_in;
// for (int i = 0; i < size; i++) {
// float value = data[i];
// ofs << value << std::endl;
// }
// ofs.close();
}
void saveFloatToFile(std::string name, float* data_in, int size) {
// std::ofstream ofs;
// ofs.open(name);
// for (int i = 0; i < size; i++) {
// float value = data_in[i];
// ofs << value << std::endl;
// }
// ofs.close();
}
void set_filter_capacity(uint32_t cap) { FILTER_SIZE = cap; }
void set_colunm(uint32_t column) { COLUMN = column; }
// replace zynqmp_api.h #define FILTER_NUM_ALIGNMENT
int get_filter_num_alignment() { return COLUMN * 4; }
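// Filters-per-division capacity: FILTER_SIZE / ceil(chw / 16) groups, each
// holding filter_num_alignment filters, capped at FILTER_SIZE.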
int calc_division_capacity(int chw) {
int n = FILTER_SIZE / ((chw + 15) / 16) * 32;
// int n = FILTER_SIZE / ((chw + 15) / 16) * 32;
int filter_num_alignment = get_filter_num_alignment();
int n = FILTER_SIZE / ((chw + 15) / 16) * filter_num_alignment;
return n < FILTER_SIZE ? n : FILTER_SIZE;
}
......@@ -52,28 +85,28 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
}
}
void convert_to_hwc(
char **data_in, int num, int channel, int height, int width) {
char *tmp = *data_in;
void convert_to_hwc(int8_t* chw_data,
int8_t* hwc_data,
int num,
int channel,
int height,
int width) {
int chw = channel * height * width;
char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT
int wc = width * channel;
int index = 0;
for (int n = 0; n < num; n++) {
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + n * chw + offset_height + w * channel + c) =
*((*data_in)++);
hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index];
index++;
}
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
float find_max(float *data_in, int data_size) {
float find_max(float* data_in, int data_size) {
float max = 0.0;
for (int i = 0; i < data_size; ++i) {
float value = data_in[i];
......@@ -83,166 +116,194 @@ float find_max(float *data_in, int data_size) {
return max;
}
signed char float_to_int8(float fdata) {
int8_t float_to_int8(float fdata) {
if (fdata < 0.0) {
fdata -= 0.5;
} else {
fdata += 0.5;
}
return (signed char)fdata;
return (int8_t)fdata;
}
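// Symmetric int8 quantization: values in [-max, max] map to [-127, 127],
// rounded half away from zero by float_to_int8 above.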
void quantize(float **data_in, int data_size, float max) {
float *tmp = *data_in;
void quantize(float* src, int8_t* dst, int len, float max) {
float fix_range = 127;
float scale = fix_range / max;
signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
for (int i = 0; i < data_size; i++) {
tmp_data[i] = float_to_int8(
(*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale);
for (int i = 0; i < len; i++) {
dst[i] = float_to_int8(src[i] * scale);
}
*data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp);
}
void align_element(char **data_in, int num, int chw) {
int j = 0;
bool should_align_chw(int chw) {
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
if (align_chw != chw) {
char *tmp = *data_in;
char *data_tmp =
(char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT
memset(data_tmp, 0, num * align_chw);
for (j = 0; j < num; j++) {
memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw);
}
*data_in = data_tmp;
fpga_free(tmp);
return align_chw != chw;
}
void align_chw(int8_t* src, int8_t* dst, int num, int chw) {
int aligned_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
memset(dst, 0, num * aligned_chw);
for (int j = 0; j < num; j++) {
memcpy((dst + j * aligned_chw), (src + j * chw), chw);
}
}
void align_num(char **data_in,
void align_num(int8_t* src,
int8_t* dst,
int num_per_div_before_alignment,
int num,
int chw) {
int i = 0;
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
int align_chw) {
int filter_num_alignment = get_filter_num_alignment();
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
align_to_x(num_per_div_before_alignment, filter_num_alignment);
char *tmp = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * align_chw;
char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(char));
memset(dst, 0, num_element * sizeof(int8_t));
int i = 0;
for (i = 0; i < div_num - 1; i++) {
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
memcpy(dst + num_per_div_after_alignment * align_chw * i,
src + num_per_div_before_alignment * align_chw * i,
num_per_div_before_alignment * align_chw);
}
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
memcpy(dst + num_per_div_after_alignment * align_chw * i,
src + num_per_div_before_alignment * align_chw * i,
(num - (div_num - 1) * num_per_div_before_alignment) * align_chw);
*data_in = data_tmp;
fpga_free(tmp);
}
void reorder(char **data_in, int num_after_alignment, int chw) {
void reorder(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
int index = 0;
int new_index = 0;
int filter_num_alignment = get_filter_num_alignment();
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
char *tmp = *data_in;
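// Within each filter_num_alignment block, the first half of the output takes
// the even 4-filter chunks of the source and the second half the odd chunks.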
for (index = 0; index < num_after_alignment; index++) {
new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) +
(index / 16 % 2 * 4);
memcpy(data_tmp + index * chw_align,
*data_in + new_index * chw_align,
chw_align);
new_index = index / filter_num_alignment * filter_num_alignment +
(index % (filter_num_alignment / 2) / 4 * 8) +
(index % (filter_num_alignment / 2) % 4) +
(index / (filter_num_alignment / 2) % 2 * 4);
memcpy((dst + index * chw_align), (src + new_index * chw_align), chw_align);
}
*data_in = data_tmp;
fpga_free(tmp);
}
size_t interleave(char **data_in, int num_after_alignment, int chw) {
int i = 0;
int j = 0;
int k = 0;
void interleave(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
int interleave_per_num = 16;
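// Interleave each pair of adjacent filters in 16-byte pieces: output chunk
// 2k comes from filter i, chunk 2k+1 from filter i+1.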
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
char *tmp = *data_in;
int interleave_num = chw_align * 2 / interleave_per_num;
for (i = 0; i < num_after_alignment; i += 2) {
for (j = 0, k = 0; j < interleave_num; j += 2, k++) {
memcpy(data_tmp + i * chw_align + interleave_per_num * j,
*data_in + i * chw_align + interleave_per_num * k,
for (int i = 0; i < num_after_alignment; i += 2) {
for (int j = 0, k = 0; j < interleave_num; j += 2, k++) {
memcpy(dst + i * chw_align + interleave_per_num * j,
src + i * chw_align + interleave_per_num * k,
interleave_per_num);
memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1),
*data_in + (i + 1) * chw_align + interleave_per_num * k,
memcpy(dst + i * chw_align + interleave_per_num * (j + 1),
src + (i + 1) * chw_align + interleave_per_num * k,
interleave_per_num);
}
}
*data_in = data_tmp;
fpga_free(tmp);
return chw_align * num_after_alignment;
}
size_t format_filter(float **data_in,
int num,
int channel,
int height,
int width,
int group_num,
float max) {
int8_t* format_filter(float* data_in,
int& mem_size_a, // NOLINT
int num,
int channel,
int height,
int width,
int group_num,
float max,
std::vector<float>& filter_max) { // NOLINT
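// Pipeline: per-filter int8 quantization -> CHW-to-HWC transpose -> chw
// alignment -> filter-count alignment -> reorder -> pairwise interleave ->
// cache flush. Returns the interleaved buffer; its size goes to mem_size_a.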
int data_size = channel * height * width * num;
int chw = channel * height * width;
int division_capacity = calc_division_capacity(chw);
int filter_num_alignment = get_filter_num_alignment();
int num_per_div_before_alignment =
calc_num_per_div(num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
align_to_x(num_per_div_before_alignment, filter_num_alignment);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
// int num_after_alignment = num_per_div_after_alignment * div_num;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw);
if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw);
align_to_x(residual, filter_num_alignment);
// saveFloatToFile("quantize_before", data_in, data_size);
int8_t* quantized_data =
reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
for (int n = 0; n < num; n++) {
float* filter_start = data_in + n * chw;
float f_max = find_max(filter_start, chw);
int8_t* quantized_start = quantized_data + n * chw;
// quantize(filter_start, quantized_start, chw, f_max);
quantize(filter_start, quantized_start, chw, max);
// filter_max.push_back(f_max);
filter_max.push_back(max);
}
reorder(quantize_data, num_after_alignment, chw);
size_t mem_size = interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data,
// saveToFile("chw.txt", quantized_data, data_size);
int8_t* hwc_data =
reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
convert_to_hwc(quantized_data, hwc_data, num, channel, height, width);
fpga_free(quantized_data);
// saveToFile("hwc.txt", hwc_data, data_size);
int8_t* temp_data = hwc_data; // NOLINT
int chw_aligned = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
if (should_align_chw(chw)) {
int8_t* hwc_aligned_data = reinterpret_cast<int8_t*>(
fpga_malloc(num * chw_aligned * sizeof(int8_t)));
align_chw(hwc_data, hwc_aligned_data, num, chw);
// saveToFile("align_el.txt", hwc_aligned_data, data_size * 2);
temp_data = hwc_aligned_data;
fpga_free(hwc_data);
}
if (num_after_alignment != num) {
int filter_num_alignment = get_filter_num_alignment();
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, filter_num_alignment);
// int div_num =
// (num + num_per_div_before_alignment - 1) /
// num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * chw_aligned;
int8_t* num_aligned_data =
reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
align_num(temp_data,
num_aligned_data,
num_per_div_before_alignment,
num,
chw_aligned);
// saveToFile("align_num.txt", num_aligned_data, data_size * 8);
fpga_free(temp_data);
temp_data = num_aligned_data;
}
int8_t* aligned_data =
reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
reorder(temp_data, aligned_data, num_after_alignment, chw);
// saveToFile("reorder.txt", aligned_data, data_size * 8);
fpga_free(temp_data);
int8_t* interleaved_data =
reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
interleave(aligned_data, interleaved_data, num_after_alignment, chw);
// saveToFile("interleave.txt", interleaved_data, data_size * 8);
fpga_free(aligned_data);
fpga_flush(interleaved_data,
align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment *
sizeof(char));
return mem_size;
mem_size_a = num_after_alignment * chw_aligned;
return interleaved_data;
}
void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
int16_t *tmp = *data_in;
int16_t *data_tmp =
(int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT
void convert_to_hwn(int16_t** data_in, int num, int height, int width) {
int16_t* tmp = *data_in;
int16_t* data_tmp =
(int16_t*)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
......@@ -254,16 +315,16 @@ void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
fpga_free(tmp);
}
size_t align_element_n(int16_t **data_in, int num, int height, int width) {
size_t align_element_n(int16_t** data_in, int num, int height, int width) {
int unalign_n = num;
int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT);
int num_element = height * width * align_n;
if (unalign_n != align_n) {
int16_t *tmp = *data_in;
int16_t* tmp = *data_in;
int num_element = height * width * align_n;
int16_t *data_tmp =
(int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT
int16_t* data_tmp =
(int16_t*)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(int16_t));
for (int h = 0; h < height; h++) {
......@@ -276,17 +337,37 @@ size_t align_element_n(int16_t **data_in, int num, int height, int width) {
}
}
*data_in = data_tmp;
free(tmp);
fpga_free(tmp);
}
return num_element * sizeof(int16_t);
}
void to_fp16(float* src,
float16* dst,
int num,
int height,
int width,
float* scale_ptr) {
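// Scale each filter by its per-filter factor, convert to fp16, and flush
// the result for the device.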
int size = num * height * width;
for (int n = 0; n < num; n++) {
float scale_val = scale_ptr[n];
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int index = n * height * width + h * width + w;
float value = src[index] * scale_val;
dst[index] = float_to_half(value);
}
}
}
fpga_flush(dst, size * sizeof(int16_t));
}
void quantize_to_fp16(
float **data_in, int num, int height, int width, float *scale_ptr) {
float *tmp = *data_in;
float** data_in, int num, int height, int width, float* scale_ptr) {
float* tmp = *data_in;
int size = num * height * width;
float16 *tmp_data = (float16 *)fpga_malloc(size * sizeof(float16)); // NOLINT
float16* tmp_data = (float16*)fpga_malloc(size * sizeof(float16)); // NOLINT
for (int n = 0; n < num; n++) {
float scale_val = scale_ptr[n];
for (int h = 0; h < height; h++) {
......@@ -298,13 +379,14 @@ void quantize_to_fp16(
}
}
fpga_flush(tmp_data, size * sizeof(int16_t));
*data_in = (float *)tmp_data; // NOLINT
*data_in = (float*)tmp_data; // NOLINT
fpga_free(tmp);
}
size_t format_dwconv_filter(
float **data_in, int num, int height, int width, float *scale_ptr) {
float** data_in, int num, int height, int width, float* scale_ptr) {
quantize_to_fp16(data_in, num, height, width, scale_ptr);
int16_t **quantize_data = (int16_t **)data_in; // NOLINT
int16_t** quantize_data = reinterpret_cast<int16_t**>(data_in);
convert_to_hwn(quantize_data, num, height, width);
size_t size = align_element_n(quantize_data, num, height, width);
fpga_flush(*quantize_data,
......
......@@ -18,38 +18,35 @@ limitations under the License. */
#include <cstdlib>
#include <cwchar>
#include <vector>
namespace paddle {
namespace zynqmp {
namespace filter {
void set_filter_capacity(uint32_t cap);
void set_colunm(uint32_t column);
int get_filter_num_alignment();
int calc_division_capacity(int chw);
int calc_split_num(int num, int division_capacity);
int calc_division_number(int num, int group_num, int division_capacity);
int calc_num_per_div(int num, int group_num, int division_capacity);
void convert_to_hwc(
char** data_in, int num, int channel, int height, int width);
float find_max(float* data_in, int data_size);
void quantize(float** data_in, int data_size, float max);
void align_element(char** data_in, int num, int chw);
void align_num(char** data_in,
int num_per_div_before_alignment,
int num,
int chw);
void reorder(char** data_in, int num_after_alignment, int chw);
size_t interleave(char** data_in, int num_after_alignment, int chw);
size_t format_filter(float** data_in,
int num,
int channel,
int height,
int width,
int group_num,
float max);
int8_t* format_filter(float* data_in,
int& mem_size, // NOLINT
int num,
int channel,
int height,
int width,
int group_num,
float max, // NOLINT
std::vector<float>& filter_max); // NOLINT
void convert_to_hwn(int16_t** data_in, int num, int height, int width);
size_t align_element_n(int16_t** data_in, int num, int height, int width);
void quantize_to_fp16(
float** data_in, int num, int height, int width, float* scale_ptr);
// void quantize_to_fp16(float** data_in, int num, int height, int width,
// float* scale_ptr);
size_t format_dwconv_filter(
float** data_in, int num, int height, int width, float* scale_ptr);
......
......@@ -23,13 +23,12 @@ limitations under the License. */
#include <map>
#include <utility>
#include "lite/backends/fpga/KD/llapi/config.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
namespace paddle {
namespace zynqmp {
#define PADDLE_LITE_OS_LINUX
#define PADDLE_OS_LINUX
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
......@@ -39,14 +38,10 @@ static size_t memory_size_max = 0;
static size_t memory_size = 0;
static inline int do_ioctl(uint64_t req, const void *arg) {
int ret = -1;
#ifdef PADDLE_LITE_OS_LINUX
ret = ioctl(fd, req, arg);
if (ret != 0) {
throw - 1;
}
#ifdef PADDLE_OS_LINUX
return ioctl(fd, req, arg);
#else
return ret;
return -1;
#endif
}
......@@ -66,7 +61,10 @@ void reset_device() {
// memory management;
void *fpga_malloc(size_t size) {
#ifdef PADDLE_LITE_OS_LINUX
#ifdef ENABLE_DEBUG
// std::cout << "fpga_malloc:" << size << std::endl;
#endif
#ifdef PADDLE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
if (ptr == NULL) {
......@@ -105,11 +103,8 @@ void fpga_free(void *ptr) {
size = iter->second;
memory_map.erase(iter);
}
memory_size -= size;
#ifdef PADDLE_LITE_OS_LINUX
#ifdef PADDLE_OS_LINUX
munmap(ptr, size);
#else
free(ptr);
......@@ -150,6 +145,11 @@ void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int fpga_reset() {
struct FpgaResetArgs args;
return do_ioctl(IOCTL_FPGA_RESET, &args);
}
int ioctl_conv(const struct ConvArgs &args) {
return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
......@@ -166,7 +166,6 @@ int compute_fpga_conv(const struct SplitConvArgs &args) {
}
if (split_num > 1) {
std::cout << "Split num > 1 !!!!!!!!!!!!!!!!!!" << std::endl;
exit(-1);
}
return ret;
......@@ -186,6 +185,7 @@ int get_device_info(const struct DeviceInfo &args) {
}
int perform_bypass(const struct BypassArgs &args) {
int ret = -1;
int size = args.image.channels * args.image.width * args.image.height;
int max_size = 1 << 21;
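// Inputs larger than max_size elements are pushed through the bypass engine
// in max_size chunks, tracking the maximum output scale across chunks.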
......@@ -213,7 +213,7 @@ int perform_bypass(const struct BypassArgs &args) {
reinterpret_cast<char *>(input_address + i * max_size * type_size);
bypassArgs.output.address =
reinterpret_cast<char *>(output_address + i * max_size * out_type_size);
int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
scale = std::max(scale, scales[0]);
if (ret != 0) {
......@@ -222,13 +222,15 @@ int perform_bypass(const struct BypassArgs &args) {
}
int remainder = size - max_size * count;
bypassArgs.image.channels = remainder;
bypassArgs.image.address =
reinterpret_cast<char *>(input_address + count * max_size * type_size);
bypassArgs.output.address = reinterpret_cast<char *>(
output_address + count * max_size * out_type_size);
int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
scale = std::max(scale, scales[0]);
if (remainder > 0) {
bypassArgs.image.channels = remainder;
bypassArgs.image.address =
reinterpret_cast<char *>(input_address + count * max_size * type_size);
bypassArgs.output.address = reinterpret_cast<char *>(
output_address + count * max_size * out_type_size);
ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
scale = std::max(scale, scales[0]);
}
args.output.scale_address[0] = scale;
args.output.scale_address[1] = 1.0f / scale;
return ret;
......@@ -237,52 +239,21 @@ int perform_bypass(const struct BypassArgs &args) {
int compute_fpga_concat(const struct ConcatArgs &args) { return -1; }
int compute_fpga_scale(const struct ScaleArgs &args) {
#ifdef ENABLE_DEBUG
std::cout << "======Compute Scale======";
std::cout << "scale_address:" << args.scale_address << std::endl;
std::cout << "bias_address:" << args.bias_address << std::endl;
std::cout << "wc_alignment:" << args.wc_alignment << std::endl;
std::cout << "channel_alignment:" << args.channel_alignment << std::endl;
std::cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_SCALE, &args);
}
int compute_fpga_dwconv(const struct DWconvArgs &args) {
#ifdef ENABLE_DEBUG
std::cout << "======Compute Basic Conv======";
std::cout << " relu_enabled:" << args.relu_enabled
<< " filter_address:" << args.filter_address;
std::cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
std::cout << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_DWCONV, &args);
}
int config_activation(const struct ActiveParamterArgs &args) {
return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args);
}
// int config_power(const struct PowerArgs& args) {
// return do_ioctl(IOCTL_CONFIG_POWER, &args);
// }
int config_inplace(const struct InplaceArgs &args) {
return do_ioctl(IOCTL_CONFIG_INPLACE, &args);
}
......
......@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
#ifndef PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#define PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#include <stdint.h>
#include <cstddef>
#include <iostream>
......@@ -40,6 +43,13 @@ enum DLayoutType {
LAYOUT_HWC = 0,
};
enum ActiveType {
TYPE_RELU = 0,
TYPE_RELU6 = 1,
TYPE_LEAK_RELU = 2,
TYPE_SIGMOID = 3,
};
struct VersionArgs {
void* buffer;
};
......@@ -48,7 +58,7 @@ struct DeviceInfo {
uint32_t filter_cap;
uint32_t version;
uint16_t device_type;
uint32_t reserved0;
uint32_t colunm;
uint32_t reserved1;
uint32_t reserved2;
uint32_t reserved3;
......@@ -108,6 +118,7 @@ struct ConvArgs {
void* filter_scale_address;
uint32_t filter_num;
uint32_t group_num;
uint32_t dilation;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
......@@ -199,9 +210,16 @@ struct NormalizeParameterArgs {
uint32_t hight_width;
};
struct ActiveParamterArgs {
ActiveType type;
uint16_t leaky_relu_factor;
};
struct InplaceArgs {
bool leaky_relu_enable;
bool relu_enable;
bool sigmoid_enable;
bool relu6_enable;
bool power_enable;
bool normalize_enable;
};
......@@ -216,7 +234,9 @@ struct FpgaRegReadArgs {
uint64_t value;
};
struct FpgaResetArgs {};
struct FpgaResetArgs {
uint32_t val;
};
#define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4)
......@@ -248,6 +268,8 @@ struct FpgaResetArgs {};
_IOW(IOCTL_FPGA_MAGIC, 41, struct PowerParameterArgs)
#define IOCTL_CONFIG_NORMALIZE_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs)
#define IOCTL_CONFIG_ACTIVATION_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 43, struct ActiveParamterArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs)
#define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs)
......@@ -331,6 +353,7 @@ int compute_fpga_scale(const struct ScaleArgs& args);
int compute_fpga_concat(const struct ConcatArgs& args);
int compute_fpga_resize(const struct ResizeArgs& args);
int config_activation(const struct ActiveParamterArgs& args);
int config_power(const struct PowerArgs& args);
int compute_fpga_dwconv(const struct DWconvArgs& args);
int config_norm_param(const struct NormalizeParameterArgs& args);
......@@ -341,7 +364,11 @@ int config_inplace(const struct InplaceArgs& args);
int flush_cache(void* addr, int size);
int invalidate_cache(void* addr, int size);
int fpga_reset();
int16_t fp32_2_fp16(float fp32_num);
float fp16_2_fp32(int16_t fp16_num);
} // namespace zynqmp
} // namespace paddle
#endif // PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
......@@ -32,6 +32,5 @@ class PE {
virtual ~PE() {}
};
} // namespace zynqmp
} // namespace paddle
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <stdio.h>
#include <string>
#include <vector>
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -26,6 +27,7 @@ namespace zynqmp {
struct ReLUParam {
public:
bool enabled = false;
float leaky_relu_factor = 0.0f;
};
struct PEParam {
......@@ -98,6 +100,24 @@ struct DepthwiseConvParam : ConvParam {
Tensor* quantizedFilter_ = new Tensor();
};
struct GRUParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* h0 = nullptr;
Tensor* weight = nullptr;
Tensor* bias = nullptr;
Tensor* batch_gate = nullptr;
Tensor* batch_reset_hidden_prev = nullptr;
Tensor* batch_hidden = nullptr;
Tensor* hidden = nullptr;
std::string gate_activation = "sigmoid";
std::string activation = "tanh";
bool is_reverse = false;
bool origin_mode = false;
};
enum PoolingType : int {
MAX = 0,
AVERAGE = 1,
......@@ -133,6 +153,12 @@ struct ElementwiseAddParam : PEParam {
EWAddArgs ewargs;
};
struct ElementwiseMulParam : PEParam {
public:
std::vector<Tensor*> inputs;
Tensor* output = nullptr;
};
struct FullyConnectedParam : PEParam {
public:
Tensor* input = nullptr;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <arm_neon.h>
#include <algorithm>
#include <vector>
#include "lite/backends/fpga/KD/pe.hpp"
......@@ -49,7 +50,111 @@ class ConvPE : public PE {
concatPE_.init();
concatPE_.apply();
}
if (DLEngine::get_instance().isZU3() &&
param_.input->shape().dimSize() == 4 &&
param_.input->shape().width() == 1 &&
param_.input->shape().channel() >= 2048) {
use_cpu_ = true;
}
if (param_.filter->shape().width() == 1 &&
param_.filter->shape().height() == 1) {
// use_cpu_ = true;
}
if (!use_cpu_) {
// param_.filter->releaseData();
}
}
void cpu_conv_hwc() {
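// CPU reference implementation: naive NHWC convolution with padding and
// stride, computed in float and re-quantized into the output scale.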
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
Tensor float_input;
Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float_input.syncToCPU();
float* out = float_output.mutableData<float>(FP32, output->shape());
int out_width = output->shape().width();
int out_channel = output->shape().channel();
int in_channel = input->shape().channel();
float* filter_data = param_.filter->data<float>();
int image_height = input->shape().height();
int image_width = input->shape().width();
int image_channels = input->shape().channel();
int image_pad_h = param_.paddings[0];
int image_pad_w = param_.paddings[1];
int kernel_height = param_.filter->shape().height();
int kernel_width = param_.filter->shape().width();
int kernel_step_h = param_.strides[0];
int kernel_step_w = param_.strides[1];
// int out_channel = param_.strides[1];
int pooled_height_ = output->shape().height();
int pooled_width_ = out_width;
int filter_chw = image_channels * kernel_height * kernel_width;
float max = 0;
for (int ph = 0; ph < pooled_height_; ph++) {
for (int pw = 0; pw < pooled_width_; pw++) {
int hstart = ph * kernel_step_h - image_pad_h;
int wstart = pw * kernel_step_w - image_pad_w;
int hend =
std::min(hstart + kernel_height, static_cast<int>(image_height));
int wend =
std::min(wstart + kernel_width, static_cast<int>(image_width));
hstart = std::max(hstart, static_cast<int>(0));
wstart = std::max(wstart, static_cast<int>(0));
for (int oc = 0; oc < out_channel; oc++) {
float sum = 0.0f;
const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
for (int c = 0; c < image_channels; c++) {
for (int h = hstart; h < hend; h++) {
int hi = 0;
if (ph == 0) {
hi = h - hstart + image_pad_h;
} else {
hi = h - hstart;
}
for (int w = wstart; w < wend; w++) {
int wi = 0;
if (pw == 0) {
wi = w - wstart + image_pad_w;
} else {
wi = w - wstart;
}
const int index = (h * image_width + w) * image_channels + c;
int weight_index = oc * filter_chw +
kernel_width * kernel_height * c +
kernel_width * hi + wi;
float value = image_addr[index] * filter_data[weight_index];
sum += value;
}
}
}
if (param_.relu.enabled && sum < 0) {
sum = 0;
}
if (sum > max) {
max = sum;
}
out[pool_index] = sum;
}
}
}
float_output.flush();
output->copyFrom(&float_output);
output->scale()[0] = max / 127;
output->scale()[1] = 127 / max;
}
void cpu_compute() {
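// CPU fallback used when use_cpu_ is set: treats the conv as a per-pixel
// dot product over input channels (1x1 kernel).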
Tensor* input = param_.input;
Tensor* output = param_.output;
......@@ -59,43 +164,79 @@ class ConvPE : public PE {
Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float_input.syncToCPU();
float* out = float_output.mutableData<float>(FP32, output->shape());
float* bias_data = param_.bias()->data<float>();
int out_width = output->shape().width();
int out_channel = output->shape().channel();
int in_channel = input->shape().channel();
float* filter_data = param_.filter->data<float>();
float* mi = new float[in_channel];
float max = 0;
int out_index = 0;
for (int i = 0; i < out_channel; i++) {
float* image = image_addr;
float* filter_ptr = filter_data + i * in_channel;
float* out_ptr = mi;
#pragma omp parallel for
for (int j = 0; j < in_channel; j++) {
float value = image_addr[j] * filter_ptr[j];
mi[j] = value;
}
float sum = 0;
for (int j = 0; j < in_channel; j++) {
sum += mi[j];
for (int h = 0; h < output->shape().height(); h++) {
for (int w = 0; w < output->shape().width(); w++) {
float sum = 0;
// #pragma omp parallel for
for (int j = 0; j < in_channel; j++) {
int image_index = h * out_width * in_channel + w * in_channel + j;
float value = image_addr[image_index] * filter_ptr[j];
sum += value;
}
sum += bias_data[i];
if (param_.relu.enabled && sum < 0) {
sum = 0;
}
if (sum > max) {
max = sum;
}
out_index = h * out_width * out_channel + w * out_channel + i;
out[out_index] = sum;
// out_index++;
}
}
out[i] = sum;
}
delete[] mi;
float_output.flush();
output->copyFrom(&float_output);
output->scale()[0] = max / 127;
output->scale()[1] = 127 / max;
}
bool dispatch() {
inplace_.relu_enable = param_.relu.enabled;
if (use_cpu_) {
cpu_compute();
return true;
}
inplace_.leaky_relu_enable =
(param_.relu.leaky_relu_factor != 0) ? true : false;
inplace_.relu_enable =
inplace_.leaky_relu_enable ? false : param_.relu.enabled;
inplace_.power_enable = false;
inplace_.normalize_enable = false;
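// Leaky ReLU takes precedence over plain ReLU; the inplace/activation
// registers are configured before the conv runs and cleared again afterwards.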
if (param_.relu.enabled) {
inplace_.relu_enable = param_.relu.enabled;
if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
config_inplace(inplace_);
if (inplace_.leaky_relu_enable) {
activeParamterArgs.type = TYPE_LEAK_RELU;
activeParamterArgs.leaky_relu_factor =
fp32_2_fp16(param_.relu.leaky_relu_factor);
config_activation(activeParamterArgs);
}
}
std::vector<BasicConvParam*>& params = param_.splitParams();
......@@ -104,9 +245,16 @@ class ConvPE : public PE {
ret |= compute_fpga_conv_basic(conv_param->args);
}
if (param_.relu.enabled) {
if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
inplace_.relu_enable = false;
inplace_.leaky_relu_enable = false;
config_inplace(inplace_);
if (inplace_.leaky_relu_enable) {
activeParamterArgs.type = TYPE_LEAK_RELU;
activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0);
config_activation(activeParamterArgs);
}
}
size_t size = params.size();
......@@ -127,11 +275,13 @@ class ConvPE : public PE {
ConvParam& param() { return param_; }
private:
bool use_cpu_ = false;
ConvParam param_;
ConcatPE concatPE_;
ElementwiseAddPE addPE_;
int split_axis = 0;
InplaceArgs inplace_ = {0};
ActiveParamterArgs activeParamterArgs;
};
} // namespace zynqmp
......
......@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
#ifndef conv_process_hpp
#define conv_process_hpp
#include <string.h>
#include <cmath>
#include <vector>
......@@ -45,7 +48,9 @@ inline int get_split_num(Tensor* filter) {
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
return filter::calc_split_num(num, div_capacity);
int filter_num_alignment = filter::get_filter_num_alignment();
int aligned_num = align_to_x(num, filter_num_alignment);
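// Split on the aligned filter count, so padded filters are included in the
// per-division capacity check.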
return filter::calc_split_num(aligned_num, div_capacity);
}
inline void fill_scale_bias_const(ConvParam* param_) {
......@@ -126,41 +131,87 @@ inline void format_scale_bias(Tensor* scale,
bias_data = bias->data<float>();
}
int channel = filter->shape().num();
Shape bias_scale_shape(N, {2 * channel});
int scale_bias_len = align_to_x(channel / group, BS_NUM_ALIGNMENT) * group;
int c_per_group = channel / group;
int aligned_c_per_group = align_to_x(channel / group, BS_NUM_ALIGNMENT);
Shape bias_scale_shape(N, {2 * scale_bias_len});
float* bs_data = scale_bias->mutableData<float>(FP32, bias_scale_shape);
for (int i = 0; i < channel; i++) {
float scale_value = scale_data == nullptr ? 1 : scale_data[i];
float bias_value = bias_data == nullptr ? 0 : bias_data[i];
bs_data[i + channel] = scale_value;
bs_data[i] = bias_value;
float* temp_data =
reinterpret_cast<float*>(fpga_malloc(2 * scale_bias_len * sizeof(float)));
memset(temp_data, 0, 2 * scale_bias_len * sizeof(float));
std::vector<float> scales;
if (scale_data != nullptr) {
for (int i = 0; i < channel; ++i) {
scales.push_back(scale_data[i]);
}
for (int i = 0; i < scale_bias_len - channel; i++) {
scales.push_back(1);
}
} else {
for (int i = 0; i < scale_bias_len; i++) {
scales.push_back(1);
}
}
int element_num_per_div = get_filter_num_per_div(filter, group);
bias_scale::format_bias_scale_array(&bs_data, element_num_per_div, channel);
for (int i = 0; i < scale_bias_len; ++i) {
temp_data[i + scale_bias_len] = 1;
temp_data[i] = 0;
}
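// Scatter the per-channel values into group-aligned slots: biases occupy the
// first scale_bias_len entries, scales the second.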
for (int g = 0; g < group; g++) {
for (int c = 0; c < c_per_group; c++) {
int src_index = g * c_per_group + c;
int dst_index = g * aligned_c_per_group + c;
float scale_value = scales[src_index];
float bias_value = bias_data == nullptr ? 0 : bias_data[src_index];
temp_data[dst_index + scale_bias_len] = scale_value;
temp_data[dst_index] = bias_value;
}
}
// int element_num_per_div = get_filter_num_per_div(filter, group);
// int scale_bias_len = align_to_x(channel / group, 8) * group;
bias_scale::format_bias_scale_array(
&temp_data, scale_bias_len / group, scale_bias_len);
memcpy(bs_data, temp_data, 2 * scale_bias_len * sizeof(float));
}
inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group) {
inline void format_filter(Tensor* filter,
Tensor* quantized_filter,
int group,
std::vector<float>& scales) { // NOLINT
float max_value = find_max(*filter);
Shape& filter_shape = filter->shape();
int mem_size;
std::vector<float> max_values;
int8_t* quantized_data = filter::format_filter(filter->data<float>(),
mem_size,
filter_shape.num(),
filter_shape.channel(),
filter_shape.height(),
filter_shape.width(),
group,
max_value,
max_values);
float mem_factor = mem_size * 1.0f / filter->shape().numel();
quantized_filter->setMemScale(mem_factor);
quantized_filter->setAligned(true);
quantized_filter->mutableData<int8_t>(INT8, filter->shape());
int8_t* src = quantized_filter->mutableData<int8_t>(INT8, filter->shape());
quantized_filter->scale()[0] = max_value / 127.0f;
quantized_filter->scale()[1] = 127.0f / max_value;
auto memory_size = filter->shape().memorySize(sizeof(float));
auto new_data = reinterpret_cast<float*>(fpga_malloc(memory_size));
memcpy(new_data, filter->data<float>(), memory_size);
size_t mem_size = filter::format_filter(&new_data,
filter_shape.num(),
filter_shape.channel(),
filter_shape.height(),
filter_shape.width(),
group,
max_value);
int8_t* src = quantized_filter->mutableData<int8_t>(INT8, filter->shape());
memcpy(src, new_data, mem_size);
fpga_free(new_data);
memcpy(src, quantized_data, mem_size);
quantized_filter->flush();
for (size_t i = 0; i < max_values.size(); i++) {
scales.push_back(max_values[i] / max_value);
}
}
inline void format_dw_filter(Tensor* filter,
......@@ -207,10 +258,20 @@ inline void split_filter_num(const ConvParam& c_param) {
Tensor* out = param.output;
Tensor* filter = param.filter;
auto channel = out->shape().channel();
int split_num = param.groups == 1 ? get_split_num(param.filter) : 1;
int filter_num_per_div = get_filter_num_per_div(filter, param.groups);
auto chw = filter->shape().channel() * filter->shape().height() *
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
int filter_num_alignment = filter::get_filter_num_alignment();
int aligned_num =
align_to_x(num / param.groups, filter_num_alignment) * param.groups;
// int aligned_num = align_to_x(num / param.groups ,FILTER_NUM_ALIGNMENT) *
// param.groups;
split_num = filter::calc_split_num(aligned_num, div_capacity);
Shape& out_shape = out->shape();
for (int i = 0; i < split_num; i++) {
BasicConvParam* conv_param = new BasicConvParam();
......@@ -251,9 +312,17 @@ inline void split_filter_num(const ConvParam& c_param) {
filter->data<float>() + i * filter_num_per_div * filter_hwc,
filter_num * filter_hwc * sizeof(float));
new_filter.flush();
conv_param->filter.mutableData<float>(FP32, f_shape);
format_filter(&new_filter, &(conv_param->filter), param.groups);
if (param.groups != 1) {
int mem_factor =
32 / filter_num_per_div; // TODO(chonwhite): change 32 to param;
conv_param->filter.setMemScale(mem_factor);
}
std::vector<float> v; // TODO(chonwhite): change local variable name
format_filter(&new_filter, &(conv_param->filter), param.groups, v);
conv_param->filter.setDataType(INT8);
int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT);
Tensor scale;
......@@ -265,7 +334,7 @@ inline void split_filter_num(const ConvParam& c_param) {
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_num; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
}
for (int n = 0; n < filter_num; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
......@@ -276,11 +345,14 @@ inline void split_filter_num(const ConvParam& c_param) {
&conv_param->filter,
&conv_param->scaleBias,
param.groups);
conv_param->scaleBias.flush();
float* bs_data = conv_param->scaleBias.data<float>();
args.group_num = param.groups;
args.relu_enabled = param.relu.enabled;
args.sb_address = conv_param->scaleBias.data<float>();
args.sb_address = bs_data;
args.kernel.stride_h = param.strides[1];
args.kernel.stride_w = param.strides[0];
args.kernel.height = new_filter.shape().height();
......@@ -294,17 +366,13 @@ inline void split_filter_num(const ConvParam& c_param) {
args.image.channels = input->shape().channel();
args.image.width = input->shape().width();
args.image.height = input->shape().height();
auto paddings = *param.padding;
args.image.pad_width = param.paddings[2];
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
// dilations[0] = dilations[1] ;
args.dilation = param.dilations[0];
args.output.address = out_address;
args.output.scale_address = out_scale_address;
bool pad_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
if (!pad_equal) {
LOG(FATAL) << "This pad is not supported! " << paddings[0] << ", "
<< paddings[1] << ", " << paddings[2] << ", " << paddings[3];
}
param.splitParams().push_back(conv_param);
}
}
......@@ -317,7 +385,7 @@ inline void split_channel(const ConvParam& c_param) {
int num = ceil(input->shape().channel() * 1.0f / 2047);
int channel = input->shape().channel() / num;
std::cout << "channel::" << channel << "num::" << num << std::endl;
Shape bs_shape(N, {channel});
for (int i = 0; i < num; i++) {
......@@ -331,6 +399,7 @@ inline void split_channel(const ConvParam& c_param) {
// filter transformation;
Shape f_shape(NCHW, {param.filter->shape().num(), channel, 1, 1});
Tensor new_filter;
float* dst = new_filter.mutableData<float>(FP32, f_shape);
......@@ -341,7 +410,8 @@ inline void split_channel(const ConvParam& c_param) {
src += param.filter->shape().channel();
}
new_filter.flush();
format_filter(&new_filter, &(conv_param->filter), param.groups);
std::vector<float> scales;
format_filter(&new_filter, &(conv_param->filter), param.groups, scales);
Tensor bias;
Tensor scale;
......@@ -354,6 +424,7 @@ inline void split_channel(const ConvParam& c_param) {
}
scale.flush();
bias.flush();
// Shape sb_shape(N, {2 * channel});
format_scale_bias(&scale,
&bias,
&conv_param->filter,
......@@ -379,18 +450,12 @@ inline void split_channel(const ConvParam& c_param) {
args.image.channels = conv_param->input.shape().channel();
args.image.width = conv_param->input.shape().width();
args.image.height = conv_param->input.shape().height();
auto paddings = *param.paddings;
args.image.pad_width = paddings[2];
args.image.pad_height = paddings[0];
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
// dilations[0] = dilations[1]
args.dilation = param.dilations[0];
args.output.address = conv_param->output.mutableData<void>();
args.output.scale_address = conv_param->output.scale();
bool pad_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
if (!pad_equal) {
LOG(FATAL) << "This pad is not supported! " << paddings[0] << ", "
<< paddings[1] << ", " << paddings[2] << ", " << paddings[3];
}
param.splitParams().push_back(conv_param);
}
}
......@@ -418,11 +483,11 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
}
size_t size = params.size();
if (ret == 0 && size > 1) {
// Tensor* output = conv_params.output;
Tensor& img = params[0]->output;
for (int i = 0; i < 1; i++) {
for (int i = 0; i < img.shape().numel(); i++) {
float value = half_to_float(img.data<float16>()[i]);
std::cout << "value:" << value << std::endl;
}
}
}
......@@ -431,3 +496,5 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
} // namespace zynqmp
} // namespace paddle
#endif /* conv_process_hpp */
......@@ -14,8 +14,6 @@ limitations under the License. */
#include "lite/backends/fpga/KD/pes/crop_pe.hpp"
#include <vector>
namespace paddle {
namespace zynqmp {
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <cstring>
#include <vector>
......
......@@ -37,18 +37,37 @@ class DepthwiseConvPE : public PE {
Tensor* output = param.output;
int channel = output->shape().channel();
float* new_scale_data = param_.scale()->data<float>();
float* new_bias_data = param_.bias()->data<float>();
float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
for (int i = 0; i < channel; i++) {
b_data[i] = float_to_half(new_bias_data[i]);
if (param_.bias()->dataType() == FP32) {
float* new_bias_data = param_.bias()->data<float>();
// convert the bias from float to float16
for (int i = 0; i < channel; i++) {
b_data[i] = float_to_half(new_bias_data[i]);
}
bias_.flush();
} else {
float16* new_bias_data = param_.bias()->data<float16>();
memcpy(b_data, new_bias_data, channel * sizeof(float16));
bias_.flush();
}
bias_.flush();
Tensor* quantized_filter = param.quantizedFilter();
quantized_filter->mutableData<float16>(FP16, param.filter->shape());
format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
if (param_.scale()->dataType() == FP32) {
float* new_scale_data = param_.scale()->data<float>();
Tensor* quantized_filter = param.quantizedFilter();
quantized_filter->mutableData<float16>(FP16, param.filter->shape());
format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
} else {
// used when the filter is all ones and the channel count is already aligned
float16* scale_data = param_.scale()->data<float16>();
float16* filter_data = param.quantizedFilter()->mutableData<float16>(
FP16, param.filter->shape());
// memcpy(filter_data, scale_data, channel * sizeof(float16));
memcpy(filter_data,
scale_data,
param.filter->shape().numel() * sizeof(float16));
param.quantizedFilter()->flush();
}
DWconvArgs args = {0};
args.bias_address = b_data;
......@@ -61,21 +80,14 @@ class DepthwiseConvPE : public PE {
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
auto paddings = *param.paddings;
args.image.pad_width = param.paddings[2];
args.image.pad_height = param.paddings[0];
args.image.pad_width = param.paddings[0];
args.image.pad_height = param.paddings[1];
args.image.scale_address = input->scale();
args.output.address = output->data<void>();
args.output.scale_address = output->scale();
args.out_width = param.output->shape().width();
args.out_height = param.output->shape().height();
args.sub_conv_num = 1;
bool pad_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
if (!pad_equal) {
LOG(FATAL) << "This pad is not supported! " << paddings[0] << ", "
<< paddings[1] << ", " << paddings[2] << ", " << paddings[3];
}
param.args = args;
inplace_.relu_enable = param_.relu.enabled;
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
namespace paddle {
namespace zynqmp {
class ElementwiseMulPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
output->setDataLocation(Device);
return true;
}
void apply() {
Tensor* input = param_.inputs[0];
Tensor* output = param_.output;
int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
// int wc_aligned = / 32 * 32;
Shape s(N, {wc_aligned});
float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
memset(bias_data, 0, wc_aligned * sizeof(float16));
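// Map elementwise multiply onto the FPGA scale engine: the whole input is
// treated as one 1x1 pixel with wc_aligned channels, inputs[1] supplies the
// per-channel scale, and the bias vector is zeroed.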
ScaleArgs& args = args_;
args.scale_address = param_.inputs[1]->data<void>();
args.bias_address = bias_tensor.data<void>();
args.wc_alignment = wc_aligned;
args.channel_alignment = wc_aligned;
args.image.address = input->data<void>();
args.image.scale_address = input->scale();
args.image.channels = wc_aligned;
args.image.height = 1;
args.image.width = 1;
args.image.pad_width = 0;
args.image.pad_height = 0;
args.output.address = output->data<void>();
args.output.scale_address = output->scale();
}
void updateInput(Tensor* t, int index) {
if (index == 0) {
args_.scale_address = t->data<void>(); // replace inputs?
}
}
bool dispatch() {
return compute_fpga_scale(args_) == 0;
}
ElementwiseMulParam& param() { return param_; }
private:
ElementwiseMulParam param_;
ScaleArgs args_ = {0};
Tensor bias_tensor;
};
} // namespace zynqmp
} // namespace paddle
......@@ -37,7 +37,10 @@ class FullyConnectedPE : public PE {
ConvParam& convParam_ = convPE_.param();
Tensor* input = param_.input;
convParam_.input = param_.input;
num_ = param_.input->shape().num();
convParam_.output = param_.output;
convParam_.groups = 1;
convParam_.strides = {1, 1};
convParam_.paddings = {0, 0};
......@@ -63,7 +66,6 @@ class FullyConnectedPE : public PE {
new_filter_data[i * chw + j] = scale;
}
}
conv_filter->flush();
convParam_.filter = conv_filter;
......@@ -89,6 +91,8 @@ class FullyConnectedPE : public PE {
private:
FullyConnectedParam param_;
ConvPE convPE_;
Tensor tempOut_;
int num_ = 1;
};
} // namespace zynqmp
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "lite/backends/arm/math/sgemm.h"
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp"
#include "lite/backends/fpga/KD/pes/fully_connected_pe.hpp"
#include "lite/backends/fpga/KD/pes/relu_pe.hpp"
#include "lite/api/paddle_place.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace zynqmp {
struct GRUTensors {
Tensor* gate;
Tensor* pre_output;
Tensor* output;
Tensor* reset_output;
};
class GRUPE : public PE {
public:
bool init() {
// Tensor* output = param_.output;
// output->setAligned(true);
// output->setDataLocation(Device);
return true;
}
void apply() {
auto hidden = param_.hidden;
// auto hidden_dims = hidden->dims();
int frame_size = hidden->shape().channel();
zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
float16* prev_hidden_data =
prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
// set previous hidden data to 0;
memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
// copy 2/3 weight from param.weight;
zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memset(weight_data, 0, weight_shape.numel() * sizeof(float));
weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memcpy(weight_data,
param_.weight->data<float>(),
weight_shape.numel() * sizeof(float));
Shape gate_shape(zynqmp::NC, {1, frame_size * 2});
gate_ping_.mutableData<void>(FP32, gate_shape);
gate_pong_.mutableData<void>(FP16, gate_shape);
zynqmp::FullyConnectedParam& pre_out_param = pre_out_pe_.param();
pre_out_param.input = &prev_hidden_;
pre_out_param.output = &gate_pong_;
pre_out_param.filter = &weight_;
pre_out_param.bias = &gate_ping_;
pre_out_pe_.init();
pre_out_pe_.apply();
// // ============= C
// ElementwiseAddParam& bias_add_param = bias_ew_pe_.param();
// bias_add_param.inputs = {&pre_output_, &pre_input_};
// bias_add_param.output = &pre_input_;
// bias_ew_pe_.init();
// bias_ew_pe_.apply();
// // ====================
// Shape state_weight_shape(NC,{frame_size, frame_size});
// float* state_weight_data = state_weight_.mutableData<float>(FP32,
// state_weight_shape);
// memcpy(state_weight_data, weight_data + 2 * frame_size * frame_size,
// state_weight_shape.numel() * sizeof(float));
// FullyConnectedParam& reset_out_param = reset_out_pe_.param();
// reset_out_param.input = &prev_hidden;
// reset_out_param.output = &gate_ping;
// reset_out_param.filter = &state_weight_;
// // ============== unit reset;
// update_gate_.mutableData<void>(FP16, pre_input_shape);
// InputParam& relu_param = update_relu_pe_.param();
// relu_param.input = &tempTensor;
// relu_param.output = &update_gate_;
// update_relu_pe_.init();
// update_relu_pe_.apply();
reset_gate_.mutableData<void>(FP16, hidden_shape);
prev_hidden_.mutableData<void>(FP16, hidden_shape);
reset_hidden_.mutableData<void>(FP16, hidden_shape);
// InputParam& reset_param = reset_relu_pe_.param();
// reset_param.input = &tempTensor;
// reset_param.output = &reset_gate_;
// reset_relu_pe_.init();
// reset_relu_pe_.apply();
// float16* prev_data = prev_.mutableData<float16>(FP16, pre_input_shape);
// memset(prev_data, 0, (pre_input_shape.numel() + 32) * sizeof(float16));
// // TODO
// reset_hidden_prev_.mutableData<float16>(FP16, pre_input_shape);
ElementwiseMulParam& mul_param = mul_pe_.param();
mul_param.inputs = {&reset_gate_, &prev_hidden_};
mul_param.output = &reset_hidden_;
mul_pe_.init();
mul_pe_.apply();
// ==============
}
bool dispatch() { return true; }
void gru_unit_reset_act(const lite_api::ActivationType active_gate,
GRUTensors& value, // NOLINT
int frame_size,
int batch_size) {
int stride_update = 3 * frame_size;
int stride_cell_state = 3 * frame_size;
int stride_hidden_prev = frame_size;
int stride_hidden = frame_size;
// Tensor* gate = value.gate;
// value.gate->saveToFile("value_input.txt");
float* update_gate_data = gate_ping_.data<float>();
float* reset_gate_data = update_gate_data + frame_size;
for (int b = 0; b < batch_size; b++) {
// memcpy(tempTensor.data<void>(), reset_gate_data, gate->shape().numel()
// * sizeof(float));
// tempTensor.flush();
Tensor tmp;
Shape s(NC, {1, frame_size});
float* tmp_data = tmp.mutableData<float>(FP32, s);
for (int i = 0; i < frame_size; i++) {
// f(x) = x / (1 + abs(x))?
update_gate_data[i] =
lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(
update_gate_data[i]);
reset_gate_data[i] =
lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(
reset_gate_data[i]);
}
memcpy(tmp_data, reset_gate_data, frame_size * sizeof(float));
tmp.flush();
reset_gate_.copyFrom(&tmp);
// reset_gate_.copyFrom(&tempTensor);
Tensor* hidden_prev = value.pre_output;
if (hidden_prev) {
// memcpy(prev_data, )
// TODO(chonwhite): change to pre_out;
prev_hidden_.copyFrom(value.pre_output);
prev_hidden_.saveToFile("prev_.txt");
}
// // 4.0 reset_date * hidden_prev;
// // reset_hidden_prev[i] = reset_gate[i] * prev;
mul_pe_.dispatch();
reset_hidden_.saveToFile("reset_hidden_.txt");
update_gate_data += stride_update;
reset_gate_data += stride_update;
// reset_hidden_prev += stride_hidden;// TODO
}
}
void gru_unit_out_act(const lite_api::ActivationType active_node,
bool origin_mode,
GRUTensors& value, // NOLINT
int frame_size,
int batch_size) {
// int stride_update = 3 * frame_size;
// int stride_cell_state = 3 * frame_size;
// int stride_hidden_prev = frame_size;
// int stride_hidden = frame_size;
// Tensor* hidden = value.output_value;
// float* hidden_prev = nullptr;
// if (hidden) {
// hidden_prev = hidden->data<float>();
// }
// float* cell_state = value.gate->data<float>() + 2 * frame_size;
// float* updata_gate = value.gate->data<float>();
// // float* reset_gate_data = update_gate_data + frame_size;
// float prev = 0.0f;
// for (int b = 0; b < batch_size; ++b) {
// if (origin_mode) {
// // for (int i = 0; i < frame_size; i++) {
// // float prev = 0;
// // if (hidden_prev) {
// // prev = hidden_prev[i];
// // }
// // cell_state[i] =
// lite::arm::math::active_f32<kSigmoid>(cell_state[i]);
// // hidden[i] =
// // cell_state[i] * (1.f - updata_gate[i]) + updata_gate[i] *
// prev;
// // }
// } else {
// for (int i = 0; i < frame_size; ++i) {
// cell_state[i] =
// lite::arm::math::active_f32<lite_api::ActivationType::kRelu>(cell_state[i]);
// if (hidden_prev) {
// prev = hidden_prev[i];
// }
// float hidden_value =
// prev * (1.f - updata_gate[i]) + updata_gate[i] * cell_state[i];
// hidden_prev[i] = hidden_value;
// std::cout << "hidden_value::" << hidden_value << std::endl;
// }
// }
// updata_gate += stride_update;
// cell_state += stride_cell_state;
// hidden_prev += frame_size;
// }
}
void copy_input(GRUTensors& value) { // NOLINT
float max = find_max(*(value.gate));
gate_ping_.mutableData<void>(FP32, value.gate->shape());
gate_ping_.copyFrom(value.gate);
// update input pointer?
// gate_.readFromFile("input/in.txt");
// // pre_input_.saveToFile("pppp_in.txt");
// gate_.scale()[0] = max / 127;
// gate_.scale()[1] = 127 / max;
// gate_.printScale("pre_input_");
// gate_.saveToFile("pre_input_.txt");
// pre_out_pe_.dispatch();
// pre_output_.saveToFile("pp_out.txt");
}
void GRUCompute(GRUTensors& value, // NOLINT
int frame_size,
int batch_size,
const lite_api::ActivationType active_node,
const lite_api::ActivationType active_gate,
bool origin_mode) {
copy_input(value);
if (value.pre_output) {
// copy by batch;
pre_out_pe_.dispatch();
gate_ping_.copyFrom(&gate_pong_);
}
gru_unit_reset_act(active_gate, value, frame_size, batch_size);
// if (value.pre_output) {
// // state weight;
// reset_out_pe_.dispatch();
// }
// gru_unit_out_act(active_node, origin_mode, value, frame_size,
// batch_size);
}
GRUParam& param() { return param_; }
// Tensor* preOutput() {
// return &pre_output_;
// }
// Tensor* gate() {
// return &gate_;
// }
Tensor* updateGate() { return &update_gate_; }
Tensor* resetGate() { return &reset_gate_; }
private:
GRUParam param_;
zynqmp::Tensor gate_ping_;
zynqmp::Tensor gate_pong_;
zynqmp::Tensor bias_;
zynqmp::Tensor weight_;
zynqmp::Tensor state_weight_;
// =================================
zynqmp::Tensor update_gate_;
zynqmp::Tensor reset_gate_;
zynqmp::Tensor cell_state_;
zynqmp::Tensor prev_hidden_;
zynqmp::Tensor reset_hidden_;
Tensor tempTensor;
// =================================
ReluPE update_relu_pe_;
ReluPE reset_relu_pe_;
zynqmp::ElementwiseMulPE mul_pe_;
zynqmp::FullyConnectedPE pre_out_pe_;
zynqmp::FullyConnectedPE reset_out_pe_;
zynqmp::ElementwiseAddPE bias_ew_pe_;
};
} // namespace zynqmp
} // namespace paddle
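// For reference (added for illustration, not part of the patch): the GRU cell
// this PE targets, matching the fused [update | reset | cell] gate layout and
// the 3 * frame_size strides used in gru_unit_reset_act above:
//   u_t = sigmoid(W_u x_t + U_u h_{t-1})        update gate
//   r_t = sigmoid(W_r x_t + U_r h_{t-1})        reset gate
//   c_t = act(W_c x_t + U_c (r_t * h_{t-1}))    cell / candidate state
//   h_t = c_t * (1 - u_t) + u_t * h_{t-1}       when origin_mode == true
//   h_t = h_{t-1} * (1 - u_t) + u_t * c_t       when origin_mode == false
// where * on vectors is elementwise; this is the computation sketched in the
// commented-out gru_unit_out_act body.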
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/backends/arm/math/gru_utils.h"
namespace paddle {
namespace lite {
namespace fpga {}
}  // namespace lite
}  // namespace paddle
......@@ -25,6 +25,8 @@ class OutputPE : public PE {
bool init() {
Tensor* output = param_.output;
output->setAligned(false);
DLEngine::get_instance().out_data = reinterpret_cast<float*>(
fpga_malloc(output->shape().numel() * sizeof(float)));
return true;
}
......@@ -41,6 +43,15 @@ class OutputPE : public PE {
} else {
output->copyFrom(input);
}
//
output->syncToCPU();
if (DLEngine::get_instance().out_data == nullptr) {
DLEngine::get_instance().out_data = reinterpret_cast<float*>(
fpga_malloc(output->shape().numel() * sizeof(float)));
}
memcpy(DLEngine::get_instance().out_data,
output->data<void>(),
output->shape().numel() * sizeof(float));
return true;
}
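// Host-side view (a sketch under assumptions; only the out_data member is
// visible in this patch): after dispatch(), the final result has been copied
// into a CPU buffer owned by DLEngine, sized output->shape().numel() floats
// and reused across inferences:
//   float* result = DLEngine::get_instance().out_data;
//   // read result[0] .. result[numel - 1]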
......
......@@ -35,24 +35,25 @@ class PoolingPE : public PE {
Tensor* input = param_.input;
Tensor* output = param_.output;
uint32_t k_height = param_.kernelSize[0];
uint32_t k_width = param_.kernelSize[1];
if (param_.globalPooling) {
k_width = input->shape().width();
k_height = input->shape().height();
param_.kernelSize[0] = k_height;
param_.kernelSize[1] = k_width;
}
PoolingArgs args = {0};
args.mode = param_.type;
auto paddings = *param_.paddings;
args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
args.image.address = input->data<float16>();
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
args.image.pad_height = paddings[0];
args.image.pad_width = paddings[2];
args.image.scale_address = input->scale();
args.output.address = output->mutableData<float16>();
args.output.scale_address = output->scale();
......@@ -66,6 +67,10 @@ class PoolingPE : public PE {
use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
(k_width > 7 || k_height > 7);
// use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
// &&
// (k_width > 255 || k_height > 255);
use_cpu_ = param_.type == AVERAGE;
}
void compute() {
......@@ -74,16 +79,16 @@ class PoolingPE : public PE {
input->syncToCPU();
Tensor float_input;
// Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
auto paddings = *param_.paddings;
int image_height = input->shape().height();
int image_width = input->shape().width();
int image_channels = input->shape().channel();
int image_pad_h = paddings[0];
int image_pad_w = paddings[2];
int kernel_height = param_.kernelSize[1];
int kernel_width = param_.kernelSize[0];
int kernel_step_h = param_.strides[0];
......@@ -129,7 +134,7 @@ class PoolingPE : public PE {
output->flush();
}
void cpu_compute() {
void cpu_compute1() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
......@@ -138,7 +143,6 @@ class PoolingPE : public PE {
float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
float scale_max = 0;
......@@ -154,13 +158,43 @@ class PoolingPE : public PE {
}
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
std::cout << "pool scale:" << scale_max / 127.0f << std::endl;
output->flush();
}
void cpu_compute() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
Tensor float_input;
float* float_input_data =
float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
float scale_max = 0;
for (int i = 0; i < output->shape().channel(); i++) {
float sum = 0;
for (int j = 0; j < kernel_hw; j++) {
sum += float_input_data[i * kernel_hw + j];
}
float value = sum / kernel_hw;
data_out[i] = float_to_half(value);
scale_max = std::max(scale_max, std::abs(value));
}
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush();
}
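// Note (inferred from the code above, not stated in the patch): KD tensors
// carry a two-element scale pair, with m = max |value| over the output:
//   scale()[0] = m / 127.0f   // dequantization step, int8 -> float
//   scale()[1] = 127.0f / m   // quantization step, float -> int8 range
// Both CPU pooling paths recompute this pair after writing the output, just
// as the FPGA kernels do through args.output.scale_address.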
bool dispatch() {
if (use_cpu_) {
// cpu_compute();
compute();
// exit(-1);
return true;
}
param_.input->syncToDevice();
......
......@@ -253,9 +253,8 @@ bool PriorBoxPE::dispatch() {
if (cachedBoxes_ == nullptr) {
cachedBoxes_ = new Tensor();
cachedVariances_ = new Tensor();
cachedBoxes_->mutableData<float>(FP32, param_.outputBoxes->shape());
cachedVariances_->mutableData<float>(FP32, param_.outputVariances->shape());
cachedBoxes_->setDataLocation(CPU);
cachedVariances_->setDataLocation(CPU);
compute_prior_box();
......
......@@ -14,11 +14,16 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp"
#include "lite/backends/fpga/KD/tensor.hpp"
namespace paddle {
namespace zynqmp {
class ScalePE : public PE {
public:
inline int gcd(int a, int b) {
......@@ -42,6 +47,8 @@ class ScalePE : public PE {
Tensor* input = param_.input;
Tensor* output = param_.output;
Shape& input_shape = input->shape();
DepthwiseConvParam& dw_param = dw_pe_.param();
int channel = input_shape.channel();
int repeat = 1;
int alignment = 16;
......@@ -51,70 +58,154 @@ class ScalePE : public PE {
int c_lcm = lcm(channel, alignment);
repeat = c_lcm / (channel);
}
// FPGA limits: if H > 2047, W > 1023, or W*C > 65536, the CPU implementation must be used.
Shape shape(N, {channel * repeat});
float* filter_data = filter.mutableData<float>(FP32, shape);
std::fill_n(filter_data, input->shape().channel(), 1.0f);
Tensor* scale = dw_param.scale();
float16* scale_data = scale->mutableData<float16>(FP16, shape);
// memcpy(scale_data, param_.scale->data<float>(), input->shape().channel()
// * sizeof(float));
Tensor* bias = dw_param.bias();
float16* bias_data = bias->mutableData<float16>(FP16, shape);
std::fill_n(bias_data, input->shape().channel(), 0);
    if (param_.scale->dataType() == FP32) {
      if (param_.bias != nullptr) {
        float* bias_data_float = param_.bias->data<float>();
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            float16 value = float_to_half(bias_data_float[j]);
            bias_data[i * length + j] = value;
          }
        }
      } else {
        float16 zero = float_to_half(0.0f);
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            bias_data[i * length + j] = zero;
          }
        }
      }
      float* scale_data_float = param_.scale->data<float>();
      for (int i = 0; i < repeat; i++) {
        for (int j = 0; j < length; j++) {
          float16 value = float_to_half(scale_data_float[j]);
          scale_data[i * length + j] = value;
        }
      }
    } else {
      if (param_.bias != nullptr) {
        float16* bias_data_float = param_.bias->data<float16>();
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            bias_data[i * length + j] = bias_data_float[j];
          }
        }
      } else {
        float16 zero = float_to_half(0.0f);
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            bias_data[i * length + j] = zero;
          }
        }
      }
      float16* scale_data_float = param_.scale->data<float16>();
      for (int i = 0; i < repeat; i++) {
        for (int j = 0; j < length; j++) {
          scale_data[i * length + j] = scale_data_float[j];
        }
      }
    }
// if (param_.bias != nullptr) {
// memcpy(bias_data, param_.bias->data<float>(), input->shape().channel()
// * sizeof(float));
// }
dw_param.input = param_.input;
dw_param.output = param_.output;
dw_param.filter = &filter;
dw_param.strides = {1, 1};
dw_param.paddings = {0, 0};
dw_param.kernelSize = {1, 1};
dw_param.dilations = {1, 1};
dw_pe_.init();
dw_pe_.apply();
}
  void cpu_compute() {
    Tensor* input = param_.input;
    Tensor* output = param_.output;
    Tensor float_input;
    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
    input->syncToCPU();
    float_input.copyFrom(input);

    float16* data_out = output->data<float16>();
    float* scale_data = param_.scale->data<float>();
    int wh = input->shape().width() * input->shape().height();
    float16* in_data = input->data<float16>();

    float max = 0;
    for (int i = 0; i < wh; i++) {
      for (int c = 0; c < input->shape().channel(); c++) {
        int index = i * input->shape().channel() + c;
        float value = half_to_float(in_data[index]) * scale_data[c];
        data_out[index] = float_to_half(value);
        if (value < 0) {
          value = -value;
        }
        if (value > max) {
          max = value;
        }
      }
    }
    output->flush();
    output->scale()[0] = max / 127.0f;
    output->scale()[1] = 127.0f / max;
  }
bool dispatch() {
// cpu_compute();
// return true;
if (param_.scale->dataType() == FP16) {
DepthwiseConvParam& dw_param = dw_pe_.param();
memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
param_.scale->data<float16>(),
param_.scale->shape().numel() * sizeof(float16));
dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
dw_param.quantizedFilter()->flush();
// apply();
}
param_.input->syncToDevice();
return dw_pe_.dispatch();
}
ScaleParam& param() { return param_; }
private:
ScaleParam param_;
Tensor filter;
DepthwiseConvPE dw_pe_;
};
} // namespace zynqmp
} // namespace paddle
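// Reference semantics of ScalePE (a sketch for illustration, not FPGA code):
// a per-channel affine transform. This is why it can be lowered to the 1x1
// depthwise convolution above, with an all-ones filter and the per-channel
// scale/bias carrying the actual weights.
inline void scale_reference(const float* in, const float* scale,
                            const float* bias, float* out, int hw,
                            int channel) {
  for (int i = 0; i < hw; ++i) {
    for (int c = 0; c < channel; ++c) {
      const int idx = i * channel + c;  // NHWC layout, as elsewhere in KD
      out[idx] = in[idx] * scale[c] + bias[c];
    }
  }
}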
......@@ -23,6 +23,7 @@ limitations under the License. */
namespace paddle {
namespace zynqmp {
static struct None none_;
static struct NCHW nchw_;
static struct NHWC nhwc_;
static struct NC nc_;
......@@ -82,6 +83,9 @@ class Shape {
void setLayoutType(LayoutType layout) {
this->layoutType_ = layout;
switch (layout) {
case None:
layout_ = &none_;
break;
case NCHW:
layout_ = &nchw_;
break;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <stdio.h>
#include <unistd.h>
#include <algorithm>
#include <cmath>
#include <cstring>
......@@ -24,13 +25,10 @@ limitations under the License. */
#include <string>
#include <vector>
// #include "lite/core/tensor.h"
#include "lite/backends/fpga/KD/dl_engine.hpp"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/shape.hpp"
// #include "lite/backends/fpga/KD/types.hpp"
namespace paddle {
namespace zynqmp {
......@@ -117,7 +115,8 @@ class Tensor {
template <typename Dtype>
Dtype* mutableData() {
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
if (placeHolder_ != nullptr) {
if (memorySize > placeHolder_->memorySize()) {
placeHolder_.reset(new PlaceHolder(memorySize));
......@@ -241,6 +240,10 @@ class Tensor {
}
}
void setMemScale(float scale_factor) {
this->mem_scale_factor_ = scale_factor;
}
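  // A hypothetical use of setMemScale (inferred from mutableData(), flush()
  // and invalidate() above, which all multiply memorySize by this factor):
  //   Tensor t;
  //   t.setMemScale(2.0f);  // reserve 2x so in-place alignment padding fits
  //   t.mutableData<float16>(FP16, shape);
  // The factor must be set before the first allocation; the 2.0f here is
  // illustrative only.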
void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); }
void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) {
......@@ -276,9 +279,11 @@ class Tensor {
.height = 1,
.pad_width = 0u,
.pad_height = 0u};
ImageOutputArgs output = {
.address = data<void>(), .scale_address = scale(),
};
args.output = output;
src->syncToDevice();
size_t aligned_remainder = src->shape().numel() % 16;
if (aligned_remainder > 0) {
......@@ -294,10 +299,16 @@ class Tensor {
this->invalidate();
}
void flush() {
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
fpga_flush(placeHolder_->data(), memorySize);
}
void invalidate() {
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
fpga_invalidate(placeHolder_->data(), memorySize);
}
void sync() {
......@@ -337,6 +348,18 @@ class Tensor {
if (placeHolder_ == nullptr) {
return;
}
std::cout << scale()[0] << " , " << scale()[1] << std::endl;
}
void printScale(std::string type) {
std::cout << type << " : "
<< std::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" +
std::to_string(shape_->width())
<< std::endl;
printScale();
}
std::string dimsFileName() {
......@@ -358,33 +381,14 @@ class Tensor {
saveToFile(path);
}
void saveToFile(std::string path) {
syncToCPU();
invalidate();
std::ofstream ofs;
static int counter = 0;
std::string npath = std::to_string(counter) + "_" + path;
counter++;
std::cout << "======== saving file:" << npath << " ============\n";
save_file_with_name(npath);
}
......@@ -392,14 +396,16 @@ class Tensor {
// return;
invalidate();
std::ofstream ofs;
ofs.open(path);
for (int i = 0; i < shape_->numel(); i++) {
float value = 0;
if (dataType_ == FP32) {
value = data<float>()[i];
} else {
} else if (dataType_ == FP16) {
value = half_to_float(data<float16>()[i]);
} else {
value = data<int8_t>()[i];
}
ofs << value << std::endl;
}
......@@ -415,18 +421,49 @@ class Tensor {
int num = shape_->numel();
invalidate();
float max = 0.0f;
if (dataType_ == FP16) {
float16* data = mutableData<float16>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = float_to_half(value);
}
} else {
float* data = mutableData<float>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = value;
}
}
flush();
placeHolder_->scale_[0] = max / 127.0f;
placeHolder_->scale_[1] = 127.0f / max;
}
friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) {
os << "tensor:"
<< "\n";
os << "dims: {";
for (int i = 0; i < tensor.shape().dimSize(); ++i) {
os << tensor.shape()[i] << " ";
}
os << "}\n";
for (int i = 0; i < tensor.shape().numel(); i++) {
float value = 0;
if (tensor.dataType() == FP32) {
value = tensor.data<float>()[i];
} else {
value = half_to_float(tensor.data<float16>()[i]);
}
os << value << " ";
}
os << "\n";
return os;
}
~Tensor() {
if (shape_ != nullptr) {
delete shape_;
......@@ -436,6 +473,7 @@ class Tensor {
private:
int offset = 0;
float mem_scale_factor_ = 1.0f;
std::shared_ptr<PlaceHolder> placeHolder_;
Shape* shape_ = nullptr;
DataType dataType_ = FP32;
......
......@@ -95,16 +95,14 @@ void TensorLite::CopyDataFrom(const TensorLite &other) {
dims_ = other.dims_;
target_ = other.target_;
lod_ = other.lod_;
  auto dt = zynq_tensor_->dataType();
  auto shape = other.zynq_tensor_->shape();
  Resize(other.dims());
  zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape);
  this->ZynqTensor()->copyFrom(other.ZynqTensor());
}
} // namespace lite
} // namespace paddle
......@@ -106,7 +106,7 @@ class TensorLite {
// For other devices, T and R may be the same type.
template <typename T, typename R = T>
const R *data() const {
return zynq_tensor_->data<R>();
return zynq_tensor_->data<R>() + offset_;
}
void Resize(const DDimLite &ddim) { dims_ = ddim; }
......@@ -125,6 +125,7 @@ class TensorLite {
bool persistable() const { return persistable_; }
void set_persistable(bool persistable) { persistable_ = persistable; }
// T is the data type and R is the return type
// For OpenCL, the return type can be cl::Buffer
// and the data type can be float/int8_t.
......@@ -147,6 +148,8 @@ class TensorLite {
size_t memory_size() const { return zynq_tensor_->memorySize(); }
size_t offset() const { return offset_; }
bool IsInitialized() const { return buffer_->data(); }
// Other share data to this.
......@@ -157,8 +160,14 @@ class TensorLite {
template <typename T>
TensorLite Slice(int64_t begin, int64_t end) const;
template <typename T>
void Slice(TensorLite &dst, int64_t begin, int64_t end) const; // NOLINT
TargetType target() const { return target_; }
// template <typename T>
// TensorLite Slice(int64_t begin, int64_t end) const;
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
......@@ -173,16 +182,21 @@ class TensorLite {
private:
TargetType target_{TargetType::kHost};
// precision_ and persistable_ are only used for persistable vars.
// If your tensor wants to be saved and loaded correctly, you must
// set values of precision_ and persistable_ after updating it.
// If your tensor is just a temp tensor, such as activations,
// you can ignore these two attributes.
PrecisionType precision_{PrecisionType::kUnk};
bool persistable_{false};
DDimLite dims_;
std::shared_ptr<Buffer> buffer_;
LoD lod_;
size_t memory_size_{};
size_t offset_{0};
zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
template <typename T>
......@@ -197,6 +211,9 @@ R *TensorLite::mutable_data() {
}
zynqmp::LayoutType layout_type = zynqmp::NCHW;
switch (v.size()) {
case 0:
layout_type = zynqmp::None;
break;
case 1:
layout_type = zynqmp::N;
break;
......@@ -228,24 +245,63 @@ R *TensorLite::mutable_data(TargetType target) {
return mutable_data<T>();
}
template <typename T>
TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
  throw -1;  // this overload appears unsupported on FPGA; use Slice(dst, begin, end)
  CHECK_GE(begin, 0);
  CHECK_LE(end, dims_[0]);
  CHECK_LT(begin, end);
  if (dims_[0] == 1) {
    return *this;
  } else {
    int64_t base = numel() / dims_[0];
    TensorLite dst;
    dst.target_ = target_;
    auto dst_dims = dims_;
    dst_dims[0] = end - begin;
    dst.Resize(dst_dims);
    void *dst_data = dst.mutable_data<T>();
    T *src_data = const_cast<T *>(data<T>());
    // src_data is a T*, so the offset is already in units of T.
    memcpy(dst_data,
           src_data + static_cast<size_t>(begin * base),
           dst_dims.production() * sizeof(T));
    return dst;
  }
}
template <typename T>
void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const {  // NOLINT
  CHECK_GE(begin, 0);
  CHECK_LE(end, dims_[0]);
  CHECK_LT(begin, end);
  dst.target_ = target_;
  auto dst_dims = dims_;
  dst_dims[0] = end - begin;
  dst.Resize(dst_dims);
  void *dst_data = dst.mutable_data<T>();
  int64_t base = numel() / dims_[0];
  T *src_data = const_cast<T *>(data<T>());
  // Copy rows [begin, end); base is the element count of one outer-dim row.
  memcpy(dst_data,
         src_data + static_cast<size_t>(begin * base),
         dst_dims.production() * sizeof(T));
}
template <typename TensorT>
bool TensorCompareWith(const TensorT &a, const TensorT &b) {
if (a.dims() != b.dims()) return false;
if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false;
return true;
}
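// Usage sketch for the copying Slice overload above (names illustrative):
//   TensorLite src, rows;
//   src.Resize(DDimLite(std::vector<int64_t>({4, 8})));
//   src.mutable_data<float>();
//   src.Slice<float>(rows, 1, 3);  // rows <- src rows [1, 3), dims {2, 8}
// Unlike the host tensor, this overload copies rather than sharing the
// buffer, since zynqmp tensors manage FPGA-visible memory.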
} // namespace lite
} // namespace paddle