Commit b8923be2 authored by chonwhite

updated backends/fpga and kernels/fpga files

Parent 97882cfa
./lite/tools/build.sh \
--arm_os=armlinux \
--arm_abi=armv8 \
--arm_lang=gcc \
test
File mode changed from 100644 to 100755
#pragma once
#include "paddle_lite_factory_helper.h"
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out);
USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, int8_out);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, fp32_out);
USE_LITE_KERNEL(expand, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fill_constant, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fill_constant_batch_size_like, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(leaky_relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu_clipped, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(prelu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(sigmoid, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(tanh, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(swish, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu6, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(log, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(exp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(floor, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(rsqrt, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out);
USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out);
USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def);
USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nchw2nhwc);
USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nhwc2nchw);
USE_LITE_KERNEL(layout, kARM, kInt8, kNCHW, int8_nchw2nhwc);
USE_LITE_KERNEL(layout, kARM, kInt8, kNCHW, int8_nhwc2nchw);
USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nchw2nhwc);
USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nhwc2nchw);
USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nchw2nhwc);
USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nhwc2nchw);
USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_sub, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_sub_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_div, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_div_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(shuffle_channel, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(arg_max, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def);
USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def);
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8);
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32);
USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8);
USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32);
USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(range, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(unsqueeze, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(unsqueeze2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(squeeze, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(squeeze2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(cast, kARM, kFloat, kNCHW, def);
\ No newline at end of file
#pragma once
#include "paddle_lite_factory_helper.h"
USE_LITE_OP(feed);
USE_LITE_OP(mul);
USE_LITE_OP(search_fc);
USE_LITE_OP(conv2d);
USE_LITE_OP(depthwise_conv2d);
USE_LITE_OP(layout);
USE_LITE_OP(split);
USE_LITE_OP(stack);
USE_LITE_OP(scale);
USE_LITE_OP(multiclass_nms);
USE_LITE_OP(fusion_elementwise_sub_activation);
USE_LITE_OP(fusion_elementwise_add_activation);
USE_LITE_OP(fusion_elementwise_mul_activation);
USE_LITE_OP(fusion_elementwise_max_activation);
USE_LITE_OP(fusion_elementwise_div_activation);
USE_LITE_OP(unsqueeze);
USE_LITE_OP(unsqueeze2);
USE_LITE_OP(sequence_topk_avg_pooling);
USE_LITE_OP(transpose);
USE_LITE_OP(transpose2);
USE_LITE_OP(dropout);
USE_LITE_OP(pool2d);
USE_LITE_OP(elementwise_sub);
USE_LITE_OP(elementwise_add);
USE_LITE_OP(elementwise_mul);
USE_LITE_OP(elementwise_max);
USE_LITE_OP(elementwise_div);
USE_LITE_OP(io_copy_once);
USE_LITE_OP(batch_norm);
USE_LITE_OP(reshape);
USE_LITE_OP(reshape2);
USE_LITE_OP(expand);
USE_LITE_OP(matmul);
USE_LITE_OP(concat);
USE_LITE_OP(slice);
USE_LITE_OP(graph_op);
USE_LITE_OP(pad2d);
USE_LITE_OP(squeeze);
USE_LITE_OP(squeeze2);
USE_LITE_OP(arg_max);
USE_LITE_OP(fill_constant);
USE_LITE_OP(fill_constant_batch_size_like);
USE_LITE_OP(square);
USE_LITE_OP(relu);
USE_LITE_OP(leaky_relu);
USE_LITE_OP(relu_clipped);
USE_LITE_OP(prelu);
USE_LITE_OP(sigmoid);
USE_LITE_OP(tanh);
USE_LITE_OP(swish);
USE_LITE_OP(relu6);
USE_LITE_OP(log);
USE_LITE_OP(exp);
USE_LITE_OP(floor);
USE_LITE_OP(hard_sigmoid);
USE_LITE_OP(sqrt);
USE_LITE_OP(rsqrt);
USE_LITE_OP(softsign);
USE_LITE_OP(range);
USE_LITE_OP(box_coder);
USE_LITE_OP(calib);
USE_LITE_OP(mean);
USE_LITE_OP(conv2d_transpose);
USE_LITE_OP(fetch);
USE_LITE_OP(prior_box);
USE_LITE_OP(fake_quantize_moving_average_abs_max);
USE_LITE_OP(io_copy);
USE_LITE_OP(nearest_interp);
USE_LITE_OP(bilinear_interp);
USE_LITE_OP(softmax);
USE_LITE_OP(reduce_mean);
USE_LITE_OP(affine_channel);
USE_LITE_OP(cast);
USE_LITE_OP(fake_dequantize_max_abs);
USE_LITE_OP(shuffle_channel);
USE_LITE_OP(yolo_box);
USE_LITE_OP(fc);
\ No newline at end of file
This diff is collapsed.
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
// #include "lite/backends/fpga/lite_tensor.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
class Debugger {
public:
static Debugger& get_instance() {
static Debugger s_instance;
return s_instance;
}
void registerOutput(std::string op_type, Tensor* tensor) {
// tensor->printScale();
// tensor->saveToFile(op_type, true);
}
private:
std::unordered_map<std::string, bool> op_config;
Debugger() {
op_config["concat"] = true;
op_config["conv_add_bn"] = true;
op_config["conv_add_bn_relu"] = true;
op_config["conv_add"] = true;
op_config["conv_add_relu"] = true;
op_config["conv_bn"] = true;
op_config["conv_bn_relu"] = true;
op_config["crop"] = true;
}
};
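A minimal usage sketch (hypothetical kernel code; note that registerOutput is
currently a no-op because its body is commented out):

  // Inside a kernel's Run(), assuming `output` points at the kernel's result.
  paddle::lite::Debugger::get_instance().registerOutput("conv_add_bn", output);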
inline void chw_to_hwc(Tensor* t, float* dst) {
int num = t->dims()[0];
int channel = t->dims()[1];
int height = 1;
int width = 1;
  if (t->dims().size() > 2) {
    height = t->dims()[2];
  }
  if (t->dims().size() > 3) {
    width = t->dims()[3];
  }
const float* chw_data = t->data<float>();
float* hwc_data = dst;
int chw = channel * height * width;
int wc = width * channel;
int index = 0;
for (int n = 0; n < num; n++) {
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index];
index++;
}
}
}
}
}
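As a worked example, a 1x2x2x2 NCHW tensor with channel 0 = {0,1,2,3} and
channel 1 = {4,5,6,7} comes out in HWC order as {0,4,1,5,2,6,3,7}. The same
index math on raw arrays, as a self-contained sketch:

  // Same loop structure as chw_to_hwc above, minus the lite::Tensor plumbing.
  float chw_data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  float hwc_data[8];
  int channel = 2, height = 2, width = 2, index = 0;
  for (int c = 0; c < channel; c++)
    for (int h = 0; h < height; h++)
      for (int w = 0; w < width; w++)
        hwc_data[(h * width + w) * channel + c] = chw_data[index++];
  // hwc_data is now {0, 4, 1, 5, 2, 6, 3, 7}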
inline void read_from_file(lite::Tensor* t, const std::string& path) {
std::ifstream file_stream;
file_stream.open(path);
if (!file_stream) {
return;
}
float* data = t->mutable_data<float>();
int num = t->numel();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
data[i] = value;
}
}
inline void save_float(float* data, const std::string& name, int len) {
  // Saving is disabled by this early return; drop it to dump values to
  // numbered "arm_XXX<name>" text files for debugging.
  return;
static int counter = 0;
std::string old_string = std::to_string(counter);
std::string new_string = std::string(3 - old_string.length(), '0') + old_string;
std::string file = "arm_" + new_string + name;
counter++;
std::cout << "-------------------------- saving file: --------------------------" << file << std::endl;
std::ofstream ofs;
ofs.open(file);
// float* data = dst;
for (int i = 0; i < len; i++) {
float value = data[i];
ofs << value << std::endl;
}
ofs.close();
}
inline void save_tensor(lite::Tensor* t, const std::string& name, bool convert = true) {
float* data = const_cast<float*>(t->data<float>());
float* dst = new float[t->numel()];
if (convert) {
chw_to_hwc(t, dst);
data = dst;
}
save_float(data, name, t->numel());
delete[] dst;
}
inline void save_tensor(const lite::Tensor* t, const std::string& name, bool convert = true) {
float* data = const_cast<float*>(t->data<float>());
float* dst = new float[t->numel()];
if (convert) {
chw_to_hwc(const_cast<lite::Tensor*>(t), dst);
data = dst;
}
save_float(data, name, t->numel());
delete[] dst;
}
}  // namespace lite
}  // namespace paddle
\ No newline at end of file
......@@ -13,14 +13,21 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/dl_engine.hpp"
namespace paddle {
namespace zynqmp {
DLEngine::DLEngine() {
open_device();
  int ret = get_device_info(info_);
  filter::set_filter_capacity(info_.filter_cap);
filter::set_colunm(info_.colunm);
std::cout << " version:" << info_.version;
std::cout << " device_type:" << info_.device_type;
std::cout << " filter_cap:" << info_.filter_cap;
std::cout << " colunm:" << info_.colunm << std::endl;
}
} // namespace zynqmp
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once
#include <stdio.h>
#include "lite/backends/fpga/KD/llapi/filter.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -29,8 +28,16 @@ class DLEngine {
return s_instance;
}
DeviceInfo& deviceInfo();
  // Temporarily hard-coded to true; the original device_type check is kept
  // for reference:
  // bool isZU3() { return info_.device_type / 100 == 3; }
  bool isZU3() { return true; }
float* out_data = nullptr;
private:
DLEngine();
DeviceInfo info_;
};
} // namespace zynqmp
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/fpga_cv.hpp"
using paddle::zynqmp::float16;
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height) {
paddle::zynqmp::InplaceArgs inplace_args = {0, 0, 0};
paddle::zynqmp::config_inplace(inplace_args);
paddle::zynqmp::ImageInputArgs input_args = {nullptr};
input_args.address = nullptr;
input_args.scale_address = nullptr;
float16* input_image_address =
reinterpret_cast<float16*>(paddle::zynqmp::fpga_malloc(
input_width * input_height * input_channel * sizeof(float16)));
  for (int i = 0; i < input_width * input_height * input_channel; i++) {
    input_image_address[i] = float16(1.0 * input[i]);
  }
paddle::zynqmp::ResizeArgs resize_args = {0};
resize_args.input_width = input_width;
resize_args.input_height = input_height;
resize_args.image_channel = input_channel;
resize_args.output_width = output_width;
resize_args.output_height = output_height;
float height_ratio = static_cast<float>(input_height) /
static_cast<float>(resize_args.output_height);
float width_ratio = static_cast<float>(input_width) /
static_cast<float>(resize_args.output_width);
resize_args.height_ratio = *reinterpret_cast<uint32_t*>(&height_ratio);
resize_args.width_ratio = *reinterpret_cast<uint32_t*>(&width_ratio);
int output_size =
resize_args.output_width * resize_args.output_height * input_channel;
float16* fpga_output = reinterpret_cast<float16*>(
paddle::zynqmp::fpga_malloc(output_size * sizeof(float16)));
resize_args.input_image_address = input_image_address;
resize_args.output_image_address = fpga_output;
memset(fpga_output, 0, output_size * sizeof(float16));
paddle::zynqmp::fpga_flush(
input_image_address,
input_width * input_height * input_channel * sizeof(float16));
paddle::zynqmp::fpga_flush(resize_args.output_image_address,
output_size * sizeof(float16));
int ret = paddle::zynqmp::compute_fpga_resize(resize_args);
if (ret == 0) {
paddle::zynqmp::fpga_invalidate(resize_args.output_image_address,
output_size * sizeof(float16));
}
  for (int i = 0; i < output_size; i++) {
    output[i] = fpga_output[i];
  }
  // Release the staging buffers allocated above.
  paddle::zynqmp::fpga_free(input_image_address);
  paddle::zynqmp::fpga_free(fpga_output);
}
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdlib.h>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/pe.hpp"
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height);
......@@ -22,6 +22,7 @@ namespace paddle {
namespace zynqmp {
enum LayoutType {
None,
N,
NC,
NCHW,
......@@ -39,6 +40,19 @@ class Layout {
virtual int elementCount(const std::vector<int>& dims) = 0;
};
struct None : Layout {
int numIndex() { return -1; }
int channelIndex() { return -1; }
int heightIndex() { return -1; }
int widthIndex() { return -1; }
int alignedElementCount(const std::vector<int>& dims) {
return 16;
}
virtual int elementCount(const std::vector<int>& dims) {
return 1;
}
};
struct NCHW : Layout {
int numIndex() { return 0; }
int channelIndex() { return 1; }
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include <memory.h>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/bias_scale.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -24,74 +25,72 @@ namespace bias_scale {
void align_element(float **data_in, int num_per_div_before_alignment, int num) {
int copynum = 0;
float *ptr_unaligned = *data_in;
int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_per_div_after_alignment = align_to_x(num_per_div_before_alignment,
BS_NUM_ALIGNMENT);
int num_element = 2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned = (float *)fpga_malloc(num_element * sizeof(float)); // NOLINT
memset(ptr_aligned, 0, num_element * sizeof(float));
for (int i = 0; i < div_num; i++) {
if (i == div_num - 1) {
copynum = (num_per_div_after_alignment * div_num > num)
? (num % num_per_div_after_alignment)
: (num_per_div_before_alignment);
} else {
copynum = num_per_div_before_alignment;
}
memcpy(ptr_aligned + i * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i,
copynum * sizeof(float));
memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i + num,
copynum * sizeof(float));
}
fpga_free(ptr_unaligned);
*data_in = ptr_aligned;
}
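A worked example of the alignment arithmetic, assuming BS_NUM_ALIGNMENT = 8:
for num = 10 and num_per_div_before_alignment = 6, div_num = 2 and
num_per_div_after_alignment = 8, so num_element = 2 * 2 * 8 = 32 floats; the
first 16 hold the zero-padded per-division biases and the next 16 hold the
scales, which the interleave() function below then merges into alternating
groups of four.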
size_t interleave(float **data_in, int num_after_alignment) {
float *ptr_uninterleaved = *data_in;
float *ptr_interleaved =
(float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT
int num = num_after_alignment / 4;
for (int i = 0; i < num; i++) {
memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i,
4 * sizeof(float));
memcpy(ptr_interleaved + 8 * i + 4,
ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float));
}
fpga_free(ptr_uninterleaved);
*data_in = ptr_interleaved;
return 2 * num_after_alignment * sizeof(float);
}
void format_bias_scale_array(float **bias_scale_array,
                             int element_num_per_division, int num) {
align_element(bias_scale_array, element_num_per_division, num);
int div_num = (num + element_num_per_division - 1) / element_num_per_division;
int element_num_after_division =
      align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
  size_t mem =
      interleave(bias_scale_array, div_num * element_num_after_division);
  fpga_flush(*bias_scale_array, mem);
}
void format_bias_array(float **bias_array, int num) {
float *ptr_unaligned = *bias_array;
int num_before_align = num;
int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT);
int16_t *ptr_aligned =
      (int16_t *)fpga_malloc(num_after_align * sizeof(int16_t));  // NOLINT
memset(ptr_aligned, 0, num_after_align * sizeof(int16_t));
  for (int i = 0; i < num_before_align; i++) {
    ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]);
  }
*bias_array = (float *)ptr_aligned; // NOLINT
fpga_free(ptr_unaligned);
......
......@@ -19,10 +19,9 @@ namespace zynqmp {
namespace bias_scale {
void align_element(float** data_in, int num_per_div_before_alignment, int num);
size_t interleave(float** data_in, int num_after_alignment);
void format_bias_scale_array(float** bias_scale_array,
                             int element_num_per_division, int num);
void format_bias_array(float** bias_array, int num);
} // namespace bias_scale
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define PADDLE_LITE_ZU5
#define FPGA_PRINT_MODE
#define PADDLE_LITE_PROFILE
......@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/llapi/filter.h"
#include <memory.h>
#include <algorithm>
#include <fstream>
#include <string>
#include "lite/backends/fpga/KD/llapi/filter.h"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -23,11 +25,48 @@ namespace zynqmp {
namespace filter {
static int FILTER_SIZE = 2048;
static int COLUMN = 4;
// Debug dump helpers; bodies are intentionally commented out.
void saveToFile(std::string name, void* data_in, int size) {
// std::ofstream ofs;
// ofs.open(name);
// int8_t* data = (int8_t*)data_in;
// for (int i = 0; i < size; i++) {
// float value = data[i];
// ofs << value << std::endl;
// }
// ofs.close();
}
void saveFloatToFile(std::string name, float* data_in, int size) {
// std::ofstream ofs;
// ofs.open(name);
// for (int i = 0; i < size; i++) {
// float value = data_in[i];
// ofs << value << std::endl;
// }
// ofs.close();
}
void set_filter_capacity(uint32_t cap) { FILTER_SIZE = cap; }
// Note: "colunm" (sic) is the spelling used throughout the FPGA backend API.
void set_colunm(uint32_t column) { COLUMN = column; }
// Replaces the FILTER_NUM_ALIGNMENT #define from zynqmp_api.h.
int get_filter_num_alignment() {
return COLUMN * 4;
}
int calc_division_capacity(int chw) {
  int filter_num_alignment = get_filter_num_alignment();
  int n = FILTER_SIZE / ((chw + 15) / 16) * filter_num_alignment;
return n < FILTER_SIZE ? n : FILTER_SIZE;
}
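A worked example, assuming the defaults above (FILTER_SIZE = 2048, COLUMN = 4,
so get_filter_num_alignment() returns 16): a 3x3x3 filter has chw = 27,
(27 + 15) / 16 = 2, and 2048 / 2 * 16 = 16384, which the final comparison caps
at 2048.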
......@@ -52,28 +91,24 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
}
}
void convert_to_hwc(int8_t* chw_data, int8_t* hwc_data, int num, int channel,
                    int height, int width) {
  int chw = channel * height * width;
  int wc = width * channel;
  int index = 0;
  for (int n = 0; n < num; n++) {
    for (int c = 0; c < channel; c++) {
      for (int h = 0; h < height; h++) {
        for (int w = 0; w < width; w++) {
          hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index];
          index++;
        }
      }
    }
  }
}
float find_max(float* data_in, int data_size) {
float max = 0.0;
for (int i = 0; i < data_size; ++i) {
float value = data_in[i];
......@@ -83,166 +118,178 @@ float find_max(float *data_in, int data_size) {
return max;
}
int8_t float_to_int8(float fdata) {
if (fdata < 0.0) {
fdata -= 0.5;
} else {
fdata += 0.5;
}
  return (int8_t)fdata;
}
void quantize(float* src, int8_t* dst, int len, float max) {
  float fix_range = 127;
  float scale = fix_range / max;
  for (int i = 0; i < len; i++) {
    dst[i] = float_to_int8(src[i] * scale);
  }
}
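For instance, with max = 2.0 the scale is 127 / 2.0 = 63.5, so an input value
of 1.0 becomes float_to_int8(63.5) = 64 and -1.0 becomes -64.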
bool should_align_chw(int chw) {
  int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  return align_chw != chw;
}
void align_chw(int8_t* src, int8_t* dst, int num, int chw) {
int aligned_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
memset(dst, 0, num * aligned_chw);
for (int j = 0; j < num; j++) {
memcpy((dst + j * aligned_chw), (src + j * chw), chw);
}
}
void align_num(int8_t* src, int8_t* dst, int num_per_div_before_alignment,
               int num, int align_chw) {
  int filter_num_alignment = get_filter_num_alignment();
  int num_per_div_after_alignment =
      align_to_x(num_per_div_before_alignment, filter_num_alignment);
  int div_num =
      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
  int num_element = div_num * num_per_div_after_alignment * align_chw;
  memset(dst, 0, num_element * sizeof(int8_t));
  int i = 0;
  for (i = 0; i < div_num - 1; i++) {
    memcpy(dst + num_per_div_after_alignment * align_chw * i,
           src + num_per_div_before_alignment * align_chw * i,
           num_per_div_before_alignment * align_chw);
  }
  memcpy(dst + num_per_div_after_alignment * align_chw * i,
         src + num_per_div_before_alignment * align_chw * i,
         (num - (div_num - 1) * num_per_div_before_alignment) * align_chw);
}
void reorder(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
  int index = 0;
  int new_index = 0;
  int filter_num_alignment = get_filter_num_alignment();
  int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  for (index = 0; index < num_after_alignment; index++) {
    new_index = index / filter_num_alignment * filter_num_alignment +
                (index % (filter_num_alignment / 2) / 4 * 8) +
                (index % (filter_num_alignment / 2) % 4) +
                (index / (filter_num_alignment / 2) % 2 * 4);
    memcpy((dst + index * chw_align), (src + new_index * chw_align), chw_align);
  }
}
void interleave(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
  int interleave_per_num = 16;
  int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  int interleave_num = chw_align * 2 / interleave_per_num;
  for (int i = 0; i < num_after_alignment; i += 2) {
    for (int j = 0, k = 0; j < interleave_num; j += 2, k++) {
      memcpy(dst + i * chw_align + interleave_per_num * j,
             src + i * chw_align + interleave_per_num * k, interleave_per_num);
      memcpy(dst + i * chw_align + interleave_per_num * (j + 1),
             src + (i + 1) * chw_align + interleave_per_num * k,
             interleave_per_num);
    }
  }
}
int8_t* format_filter(float* data_in, int& mem_size_a, int num, int channel,
                      int height, int width, int group_num, float max,
                      std::vector<float>& filter_max) {
  int data_size = channel * height * width * num;
  int chw = channel * height * width;

  int division_capacity = calc_division_capacity(chw);
  int filter_num_alignment = get_filter_num_alignment();
  int num_per_div_before_alignment =
      calc_num_per_div(num, group_num, division_capacity);
  int num_per_div_after_alignment =
      align_to_x(num_per_div_before_alignment, filter_num_alignment);
  int div_num =
      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
  int residual = num % num_per_div_before_alignment;
  int num_after_alignment = num_per_div_after_alignment *
                                ((residual == 0) ? div_num : (div_num - 1)) +
                            align_to_x(residual, filter_num_alignment);

  int8_t* quantized_data =
      reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
  for (int n = 0; n < num; n++) {
    float* filter_start = data_in + n * chw;
    int8_t* quantized_start = quantized_data + n * chw;
    // Per-filter scales (quantize with find_max(filter_start, chw)) are
    // disabled; the global max is used for every filter.
    quantize(filter_start, quantized_start, chw, max);
    filter_max.push_back(max);
  }

  int8_t* hwc_data =
      reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
  convert_to_hwc(quantized_data, hwc_data, num, channel, height, width);
  fpga_free(quantized_data);

  int8_t* temp_data = hwc_data;
  int chw_aligned = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  if (should_align_chw(chw)) {
    int8_t* hwc_aligned_data = reinterpret_cast<int8_t*>(
        fpga_malloc(num * chw_aligned * sizeof(int8_t)));
    align_chw(hwc_data, hwc_aligned_data, num, chw);
    temp_data = hwc_aligned_data;
    fpga_free(hwc_data);
  }
  if (num_after_alignment != num) {
    int num_element = div_num * num_per_div_after_alignment * chw_aligned;
    int8_t* num_aligned_data =
        reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
    align_num(temp_data, num_aligned_data, num_per_div_before_alignment, num,
              chw_aligned);
    fpga_free(temp_data);
    temp_data = num_aligned_data;
  }

  int8_t* aligned_data = reinterpret_cast<int8_t*>(
      fpga_malloc(num_after_alignment * chw_aligned));
  reorder(temp_data, aligned_data, num_after_alignment, chw);
  fpga_free(temp_data);

  int8_t* interleaved_data = reinterpret_cast<int8_t*>(
      fpga_malloc(num_after_alignment * chw_aligned));
  interleave(aligned_data, interleaved_data, num_after_alignment, chw);
  fpga_free(aligned_data);

  fpga_flush(interleaved_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
                                   num_after_alignment * sizeof(char));
  mem_size_a = num_after_alignment * chw_aligned;
  return interleaved_data;
}
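A hedged caller sketch (raw_filter, the dims, group_num and max are assumed to
come from the conv op being prepared):

  std::vector<float> filter_max;
  int mem_size = 0;
  int8_t* fpga_filter = format_filter(raw_filter, mem_size, num, channel,
                                      height, width, group_num, max,
                                      filter_max);
  // fpga_filter now holds the quantized, HWC-converted, aligned, reordered
  // and interleaved weights; mem_size is num_after_alignment * chw_aligned
  // bytes, and filter_max carries one scale entry per output filter.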
void convert_to_hwn(int16_t** data_in, int num, int height, int width) {
  int16_t* tmp = *data_in;
  int16_t* data_tmp =
      (int16_t*)fpga_malloc(height * width * num * sizeof(int16_t));  // NOLINT
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
......@@ -254,16 +301,16 @@ void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
fpga_free(tmp);
}
size_t align_element_n(int16_t** data_in, int num, int height, int width) {
int unalign_n = num;
int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT);
int num_element = height * width * align_n;
if (unalign_n != align_n) {
    int16_t* tmp = *data_in;
    int16_t* data_tmp =
        (int16_t*)fpga_malloc(num_element * sizeof(int16_t));  // NOLINT
memset(data_tmp, 0, num_element * sizeof(int16_t));
for (int h = 0; h < height; h++) {
......@@ -276,17 +323,33 @@ size_t align_element_n(int16_t **data_in, int num, int height, int width) {
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
return num_element * sizeof(int16_t);
}
// Alternative fp16 conversion that writes into a caller-provided buffer
// (currently unused; quantize_to_fp16 below allocates its own).
void to_fp16(float* src, float16* dst, int num, int height, int width,
             float* scale_ptr) {
int size = num * height * width;
for (int n = 0; n < num; n++) {
float scale_val = scale_ptr[n];
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int index = n * height * width + h * width + w;
float value = src[index] * scale_val;
dst[index] = float_to_half(value);
}
}
}
fpga_flush(dst, size * sizeof(int16_t));
}
void quantize_to_fp16(float** data_in, int num, int height, int width,
float* scale_ptr) {
float* tmp = *data_in;
int size = num * height * width;
  float16* tmp_data = (float16*)fpga_malloc(size * sizeof(float16));  // NOLINT
for (int n = 0; n < num; n++) {
float scale_val = scale_ptr[n];
for (int h = 0; h < height; h++) {
......@@ -298,18 +361,23 @@ void quantize_to_fp16(
}
}
fpga_flush(tmp_data, size * sizeof(int16_t));
  *data_in = (float*)tmp_data;  // NOLINT
fpga_free(tmp);
}
size_t format_dwconv_filter(float** data_in, int num, int height, int width,
                            float* scale_ptr) {
  quantize_to_fp16(data_in, num, height, width, scale_ptr);
  int16_t** quantize_data = (int16_t**)data_in;  // NOLINT
convert_to_hwn(quantize_data, num, height, width);
size_t size = align_element_n(quantize_data, num, height, width);
  fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
                                 height * width * sizeof(int16_t));
return size;
}
} // namespace filter
......
......@@ -18,40 +18,31 @@ limitations under the License. */
#include <cstdlib>
#include <cwchar>
#include <vector>
namespace paddle {
namespace zynqmp {
namespace filter {
void set_filter_capacity(uint32_t cap);
void set_colunm(uint32_t column);
int get_filter_num_alignment();
int calc_division_capacity(int chw);
int calc_split_num(int num, int division_capacity);
int calc_division_number(int num, int group_num, int division_capacity);
int calc_num_per_div(int num, int group_num, int division_capacity);
float find_max(float* data_in, int data_size);
int8_t* format_filter(float* data_in, int& mem_size, int num, int channel,
                      int height, int width, int group_num, float max,
                      std::vector<float>& filter_max);
void convert_to_hwn(int16_t** data_in, int num, int height, int width);
size_t align_element_n(int16_t** data_in, int num, int height, int width);
void quantize_to_fp16(float** data_in, int num, int height, int width,
                      float* scale_ptr);

size_t format_dwconv_filter(float** data_in, int num, int height, int width,
                            float* scale_ptr);
} // namespace filter
} // namespace zynqmp
......
......@@ -23,13 +23,12 @@ limitations under the License. */
#include <map>
#include <utility>
#include "lite/backends/fpga/KD/llapi/config.h"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
namespace paddle {
namespace zynqmp {
#define PADDLE_MOBILE_OS_LINUX
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
......@@ -39,21 +38,19 @@ static size_t memory_size_max = 0;
static size_t memory_size = 0;
static inline int do_ioctl(uint64_t req, const void *arg) {
#ifdef PADDLE_MOBILE_OS_LINUX
  return ioctl(fd, req, arg);
#else
  return -1;
#endif
}
int open_device() {
if (fd == -1) {
fd = open(device_path, O_RDWR);
}
return fd;
}
......@@ -66,13 +63,19 @@ void reset_device() {
// memory management;
void *fpga_malloc(size_t size) {
#ifdef PADDLE_MOBILE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
  if (ptr == MAP_FAILED) {  // mmap64 signals failure with MAP_FAILED, not NULL
std::cout << "not enough memory !";
exit(-1);
}
memory_map.insert(std::make_pair(ptr, size));
memory_size += size;
if (memory_size > memory_size_max) {
......@@ -90,6 +93,8 @@ size_t fpga_get_memory_size_max() { return memory_size_max; }
size_t fpga_diagnose_memory(int detailed) {
size_t total = 0;
auto iter = memory_map.begin(); // std::map<void *, size_t>::iterator
while (iter != memory_map.end()) {
total += iter->second;
......@@ -108,7 +113,7 @@ void fpga_free(void *ptr) {
memory_size -= size;
#ifdef PADDLE_MOBILE_OS_LINUX
munmap(ptr, size);
#else
......@@ -126,6 +131,9 @@ int fpga_flush(void *address, size_t size) {
}
int fpga_invalidate(void *address, size_t size) {
struct MemoryCacheArgs args;
args.address = address;
args.size = size;
......@@ -150,18 +158,85 @@ void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int fpga_reset() {
struct FpgaResetArgs args;
return do_ioctl(IOCTL_FPGA_RESET, &args);
}
int ioctl_conv(const struct ConvArgs &args) {
  // Verbose ConvArgs field logging (under ENABLE_DEBUG) omitted for brevity.
  return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
int compute_fpga_conv_basic(const struct ConvArgs &args) {
  // Verbose ConvArgs field logging (under ENABLE_DEBUG) omitted for brevity.
return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
int compute_fpga_conv(const struct SplitConvArgs &args) {
int split_num = args.split_num;
int ret = -1;
for (int i = 0; i < split_num; i++) {
ret = compute_fpga_conv_basic(args.conv_arg[i]);
}
......@@ -181,11 +256,15 @@ int compute_fpga_ewadd(const struct EWAddArgs &args) {
}
int get_device_info(const struct DeviceInfo &args) {
  int ret = do_ioctl(IOCTL_DEVICE_INFO, &args);
return ret;
}
int perform_bypass(const struct BypassArgs &args) {
int ret = -1;
int size = args.image.channels * args.image.width * args.image.height;
int max_size = 1 << 21;
......@@ -206,6 +285,7 @@ int perform_bypass(const struct BypassArgs &args) {
bypassArgs.image.height = 1;
bypassArgs.output.scale_address = scales;
float scale = 0;
for (int i = 0; i < count; ++i) {
bypassArgs.image.channels = max_size;
......@@ -213,7 +293,7 @@ int perform_bypass(const struct BypassArgs &args) {
reinterpret_cast<char *>(input_address + i * max_size * type_size);
bypassArgs.output.address =
reinterpret_cast<char *>(output_address + i * max_size * out_type_size);
ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
scale = std::max(scale, scales[0]);
if (ret != 0) {
......@@ -222,13 +302,18 @@ int perform_bypass(const struct BypassArgs &args) {
}
int remainder = size - max_size * count;
if (remainder > 0) {
bypassArgs.image.channels = remainder;
bypassArgs.image.address =
reinterpret_cast<char *>(input_address + count * max_size * type_size);
bypassArgs.output.address = reinterpret_cast<char *>(
output_address + count * max_size * out_type_size);
ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
scale = std::max(scale, scales[0]);
}
args.output.scale_address[0] = scale;
args.output.scale_address[1] = 1.0f / scale;
return ret;
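As a worked example of the chunking: for channels * width * height = 5,000,000
elements and max_size = 1 << 21 = 2,097,152, the loop issues two full-size
bypass calls (count = 2) and the remainder branch covers the final 805,696
elements; the reported output scale is the maximum over all chunks, stored
together with its reciprocal.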
......@@ -279,10 +364,21 @@ int compute_fpga_dwconv(const struct DWconvArgs &args) {
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
// float *in_scale = (float *)args.image.scale_address;
// std::cout << "inv_scale:" << in_scale[0] << "," << in_scale[1] <<
// std::endl;
#endif
return do_ioctl(IOCTL_CONFIG_DWCONV, &args);
}
int config_activation(const struct ActiveParamterArgs& args) {
return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args);
}
// int config_power(const struct PowerArgs& args) {
// return do_ioctl(IOCTL_CONFIG_POWER, &args);
// }
int config_inplace(const struct InplaceArgs &args) {
return do_ioctl(IOCTL_CONFIG_INPLACE, &args);
}
......
......@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
#ifndef PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#define PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
#include <stdint.h>
#include <cstddef>
#include <iostream>
......@@ -40,6 +43,13 @@ enum DLayoutType {
LAYOUT_HWC = 0,
};
enum ActiveType {
TYPE_RELU = 0,
TYPE_RELU6 = 1,
TYPE_LEAK_RELU = 2,
TYPE_SIGMOID = 3,
};
struct VersionArgs {
void* buffer;
};
......@@ -48,7 +58,7 @@ struct DeviceInfo {
uint32_t filter_cap;
uint32_t version;
uint16_t device_type;
uint32_t colunm;
uint32_t reserved1;
uint32_t reserved2;
uint32_t reserved3;
......@@ -108,6 +118,7 @@ struct ConvArgs {
void* filter_scale_address;
uint32_t filter_num;
uint32_t group_num;
uint32_t dilation;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
......@@ -199,9 +210,16 @@ struct NormalizeParameterArgs {
uint32_t hight_width;
};
struct ActiveParamterArgs {
ActiveType type;
uint16_t leaky_relu_factor;
};
struct InplaceArgs {
bool leaky_relu_enable;
bool relu_enable;
bool sigmoid_enable;
bool relu6_enable;
bool power_enable;
bool normalize_enable;
};
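A hedged usage sketch, mirroring how ConvPE::dispatch() drives these flags
around a conv call:

  paddle::zynqmp::InplaceArgs inplace = {0};
  inplace.relu_enable = true;
  paddle::zynqmp::config_inplace(inplace);  // fuse ReLU into subsequent convs
  // ... compute_fpga_conv_basic(...) ...
  inplace.relu_enable = false;
  paddle::zynqmp::config_inplace(inplace);  // restore pass-through behavior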
......@@ -216,7 +234,9 @@ struct FpgaRegReadArgs {
uint64_t value;
};
struct FpgaResetArgs {
  uint32_t val;
};
#define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4)
......@@ -248,6 +268,8 @@ struct FpgaResetArgs {};
_IOW(IOCTL_FPGA_MAGIC, 41, struct PowerParameterArgs)
#define IOCTL_CONFIG_NORMALIZE_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs)
#define IOCTL_CONFIG_ACTIVATION_PARAMETER \
_IOW(IOCTL_FPGA_MAGIC, 43, struct ActiveParamterArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs)
#define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs)
......@@ -331,6 +353,7 @@ int compute_fpga_scale(const struct ScaleArgs& args);
int compute_fpga_concat(const struct ConcatArgs& args);
int compute_fpga_resize(const struct ResizeArgs& args);
int config_activation(const struct ActiveParamterArgs& args);
int config_power(const struct PowerArgs& args);
int compute_fpga_dwconv(const struct DWconvArgs& args);
int config_norm_param(const struct NormalizeParameterArgs& args);
......@@ -341,7 +364,11 @@ int config_inplace(const struct InplaceArgs& args);
int flush_cache(void* addr, int size);
int invalidate_cache(void* addr, int size);
int fpga_reset();
int16_t fp32_2_fp16(float fp32_num);
float fp16_2_fp32(int16_t fp16_num);
} // namespace zynqmp
} // namespace paddle
#endif // PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
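A small round-trip sketch for the conversion helpers declared above (the
result is approximate, within fp16 precision):

  int16_t half = paddle::zynqmp::fp32_2_fp16(0.1f);
  float back = paddle::zynqmp::fp16_2_fp32(half);  // ~0.1f after fp16 rounding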
File mode changed from 100644 to 100755
......@@ -26,6 +26,7 @@ namespace zynqmp {
struct ReLUParam {
public:
bool enabled = false;
float leaky_relu_factor = 0.0f;
};
struct PEParam {
......@@ -98,6 +99,24 @@ struct DepthwiseConvParam : ConvParam {
Tensor* quantizedFilter_ = new Tensor();
};
struct GRUParam : PEParam {
public:
Tensor* input = nullptr;
Tensor* h0 = nullptr;
Tensor* weight = nullptr;
Tensor* bias = nullptr;
Tensor* batch_gate = nullptr;
Tensor* batch_reset_hidden_prev = nullptr;
Tensor* batch_hidden = nullptr;
Tensor* hidden = nullptr;
std::string gate_activation = "sigmoid";
  std::string activation = "tanh";
bool is_reverse = false;
bool origin_mode = false;
};
enum PoolingType : int {
MAX = 0,
AVERAGE = 1,
......@@ -133,6 +152,12 @@ struct ElementwiseAddParam : PEParam {
EWAddArgs ewargs;
};
struct ElementwiseMulParam : PEParam {
public:
std::vector<Tensor*> inputs;
Tensor* output = nullptr;
};
struct FullyConnectedParam : PEParam {
public:
Tensor* input = nullptr;
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -49,7 +49,118 @@ class ConvPE : public PE {
concatPE_.init();
concatPE_.apply();
}
    if (DLEngine::get_instance().isZU3() &&
        param_.input->shape().dimSize() == 4 &&
        param_.input->shape().width() == 1 &&
        param_.input->shape().channel() >= 2048) {
      // NOTE: the original condition tested width() twice (== 1 and >= 2048),
      // which can never hold; channel() is the likely intent.
      use_cpu_ = true;
    }
if (param_.filter->shape().width() == 1 &&
param_.filter->shape().height() == 1) {
use_cpu_ = true;
}
    if (!use_cpu_) {
      // param_.filter->releaseData();
    }
}
void cpu_conv_hwc() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
Tensor float_input;
Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float_input.syncToCPU();
float* out = float_output.mutableData<float>(FP32, output->shape());
int out_width = output->shape().width();
int out_channel = output->shape().channel();
int in_channel = input->shape().channel();
float* filter_data = param_.filter->data<float>();
int image_height = input->shape().height();
int image_width = input->shape().width();
int image_channels = input->shape().channel();
int image_pad_h = param_.paddings[0];
int image_pad_w = param_.paddings[1];
int kernel_height = param_.filter->shape().height();
int kernel_width = param_.filter->shape().width();
int kernel_step_h = param_.strides[0];
int kernel_step_w = param_.strides[1];
int pooled_height_ = output->shape().height();
int pooled_width_ = out_width;
int filter_chw = image_channels * kernel_height * kernel_width;
float max = 0;
for (int ph = 0; ph < pooled_height_; ph++) {
for (int pw = 0; pw < pooled_width_; pw++) {
int hstart = ph * kernel_step_h - image_pad_h;
int wstart = pw * kernel_step_w - image_pad_w;
int hend = std::min(hstart + kernel_height, (int) image_height);
int wend = std::min(wstart + kernel_width, (int) image_width);
hstart = std::max(hstart, (int) 0);
wstart = std::max(wstart, (int) 0);
for (int oc = 0; oc < out_channel; oc++) {
float sum = 0.0f;
const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
for (int c = 0; c < image_channels; c++) {
for (int h = hstart; h < hend; h++) {
int hi = 0;
if (ph == 0) {
hi = h - hstart + image_pad_h;
                } else {
hi = h - hstart;
}
for (int w = wstart; w < wend; w++) {
int wi = 0;
if (pw == 0) {
wi = w - wstart + image_pad_w;
                  } else {
wi = w - wstart;
}
const int index = (h * image_width + w) * image_channels + c;
int weight_index = oc * filter_chw + kernel_width *
kernel_height * c + kernel_width * hi + wi;
float value = image_addr[index] * filter_data[weight_index];
sum += value;
}
}
}
          if (param_.relu.enabled && sum < 0) {
            // ReLU clamps negatives to zero (cpu_compute below does the same;
            // the original `sum = -sum` here looks like a typo).
            sum = 0;
          }
if (sum > max) {
max = sum;
}
out[pool_index] = sum;
}
}
}
float_output.flush();
output->copyFrom(&float_output);
output->scale()[0] = max / 127;
output->scale()[1] = 127 / max;
}
void cpu_compute() {
Tensor* input = param_.input;
Tensor* output = param_.output;
......@@ -59,79 +170,161 @@ class ConvPE : public PE {
Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float_input.syncToCPU();
float* out = float_output.mutableData<float>(FP32, output->shape());
float* bias_data = param_.bias()->data<float>();
int out_width = output->shape().width();
int out_channel = output->shape().channel();
int in_channel = input->shape().channel();
float* filter_data = param_.filter->data<float>();
    float max = 0;
    int out_index = 0;
    for (int i = 0; i < out_channel; i++) {
      float* filter_ptr = filter_data + i * in_channel;
      for (int h = 0; h < output->shape().height(); h++) {
        for (int w = 0; w < output->shape().width(); w++) {
          float sum = 0;
          for (int j = 0; j < in_channel; j++) {
            int image_index = h * out_width * in_channel + w * in_channel + j;
            sum += image_addr[image_index] * filter_ptr[j];
          }
          sum += bias_data[i];
          if (param_.relu.enabled && sum < 0) {
            sum = 0;
          }
          if (sum > max) {
            max = sum;
          }
          out_index = h * out_width * out_channel + w * out_channel + i;
          out[out_index] = sum;
        }
      }
    }
float_output.flush();
output->copyFrom(&float_output);
output->scale()[0] = max / 127;
output->scale()[1] = 127 / max;
}
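The running max doubles as the quantization range here: scale()[0] = max / 127
maps int8 values back to floats and scale()[1] = 127 / max is its reciprocal,
matching the (scale, inverse scale) pair the FPGA ops write (see
perform_bypass above).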
bool dispatch() {
if (use_cpu_) {
cpu_compute();
return true;
}
inplace_.leaky_relu_enable =
(param_.relu.leaky_relu_factor != 0) ? true : false;
inplace_.relu_enable =
inplace_.leaky_relu_enable ? false : param_.relu.enabled;
inplace_.power_enable = false;
inplace_.normalize_enable = false;
    if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
config_inplace(inplace_);
if (inplace_.leaky_relu_enable) {
activeParamterArgs.type = TYPE_LEAK_RELU;
activeParamterArgs.leaky_relu_factor =
fp32_2_fp16(param_.relu.leaky_relu_factor);
config_activation(activeParamterArgs);
}
}
std::vector<BasicConvParam*>& params = param_.splitParams();
int ret = 0;
for (auto conv_param : params) {
ret |= compute_fpga_conv_basic(conv_param->args);
}
    if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
      // Remember the leaky flag before clearing it; the original code tested
      // it after zeroing, so the activation reset below never fired.
      bool was_leaky_relu = inplace_.leaky_relu_enable;
      inplace_.relu_enable = false;
      inplace_.leaky_relu_enable = false;
      config_inplace(inplace_);
      if (was_leaky_relu) {
        activeParamterArgs.type = TYPE_LEAK_RELU;
        activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0);
        config_activation(activeParamterArgs);
      }
    }
size_t size = params.size();
if (split_axis == 0 && ret == 0 && size > 1) {
concatPE_.dispatch();
}
if (split_axis == 1 && ret == 0 && size > 1) {
// for (int n = 0; n < size - 1; n++) {
ElementwiseAddParam& add_param = addPE_.param();
add_param.inputs = {&params[0]->output, &params[1]->output};
add_param.output = param_.output;
addPE_.init();
addPE_.apply();
addPE_.dispatch();
// }
}
return ret == 0;
}
ConvParam& param() { return param_; }
private:
bool use_cpu_ = false;
ConvParam param_;
ConcatPE concatPE_;
ElementwiseAddPE addPE_;
int split_axis = 0;
InplaceArgs inplace_ = {0};
ActiveParamterArgs activeParamterArgs;
};
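// Usage sketch (hypothetical tensors; the PE lifecycle used across this
// backend is init() -> apply() -> dispatch()):
//   ConvPE pe;
//   ConvParam& p = pe.param();
//   p.input = &input;  p.output = &output;  p.filter = &filter;
//   pe.init();
//   pe.apply();     // splits the filter and formats weights and scale/bias
//   pe.dispatch();  // runs on the FPGA, or falls back to cpu_compute()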
} // namespace zynqmp
......
......@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
#ifndef conv_process_hpp
#define conv_process_hpp
#include <string.h>
#include <cmath>
#include <vector>
......@@ -45,7 +48,10 @@ inline int get_split_num(Tensor* filter) {
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
  int filter_num_alignment = filter::get_filter_num_alignment();
  int aligned_num = align_to_x(num, filter_num_alignment);
  return filter::calc_split_num(aligned_num, div_capacity);
}
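// Worked example (hypothetical numbers): for a 3x3x512 filter, chw = 4608;
// if calc_division_capacity(4608) returned 1024 filters per pass and the
// aligned filter count were 2048, calc_split_num would report 2 passes.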
inline void fill_scale_bias_const(ConvParam* param_) {
......@@ -79,8 +85,7 @@ inline void combine_bn_params(BatchnormParam* bn, ConvParam* param_) {
}
}
inline void combine_add_bn_params(BatchnormParam* bn, Tensor* bias,
                                  ConvParam* param_) {
int channel = param_->output->shape().channel();
Shape sb_shape(N, {channel});
......@@ -112,59 +117,102 @@ inline void combine_add_bn_params(BatchnormParam* bn,
param_->bias()->setDataLocation(CPU);
}
inline void format_scale_bias(Tensor* scale, Tensor* bias, Tensor* filter,
                              Tensor* scale_bias, int group) {
  float* scale_data = nullptr;
  float* bias_data = nullptr;
  if (scale != nullptr) {
    scale_data = scale->data<float>();
  }
  if (bias != nullptr) {
    bias_data = bias->data<float>();
  }
  int channel = filter->shape().num();
  int scale_bias_len = align_to_x(channel / group, BS_NUM_ALIGNMENT) * group;

  int c_per_group = channel / group;
  int aligned_c_per_group = align_to_x(channel / group, BS_NUM_ALIGNMENT);

  Shape bias_scale_shape(N, {2 * scale_bias_len});
  float* bs_data = scale_bias->mutableData<float>(FP32, bias_scale_shape);
  float* temp_data = reinterpret_cast<float*>(
      fpga_malloc(2 * scale_bias_len * sizeof(float)));
  memset(temp_data, 0, 2 * scale_bias_len * sizeof(float));

  std::vector<float> scales;
  if (scale_data != nullptr) {
    for (int i = 0; i < channel; ++i) {
      scales.push_back(scale_data[i]);
    }
    for (int i = 0; i < scale_bias_len - channel; i++) {
      scales.push_back(1);
    }
  } else {
    for (int i = 0; i < scale_bias_len; i++) {
      scales.push_back(1);
    }
  }

  // default to identity scale and zero bias
  for (int i = 0; i < scale_bias_len; ++i) {
    temp_data[i + scale_bias_len] = 1;
    temp_data[i] = 0;
  }

  // pack [bias | scale] per group, padded to the alignment
  for (int g = 0; g < group; g++) {
    for (int c = 0; c < c_per_group; c++) {
      int src_index = g * c_per_group + c;
      int dst_index = g * aligned_c_per_group + c;
      temp_data[dst_index + scale_bias_len] = scales[src_index];
      temp_data[dst_index] = bias_data == nullptr ? 0 : bias_data[src_index];
    }
  }

  bias_scale::format_bias_scale_array(&temp_data, scale_bias_len / group,
                                      scale_bias_len);
  memcpy(bs_data, temp_data, 2 * scale_bias_len * sizeof(float));
  fpga_free(temp_data);
}
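// Packed layout produced above: temp_data[0 .. scale_bias_len) holds biases
// and temp_data[scale_bias_len .. 2 * scale_bias_len) holds scales, each
// group padded to BS_NUM_ALIGNMENT entries. E.g. (hypothetical) channel = 6,
// group = 2 and BS_NUM_ALIGNMENT = 8 give scale_bias_len = 16, with the
// three real channels of each group starting at offsets 0 and 8.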
inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group,
                          std::vector<float>& scales) {
  float max_value = find_max(*filter);
  Shape& filter_shape = filter->shape();

  int mem_size;
  std::vector<float> max_values;
  int8_t* quantized_data = filter::format_filter(
      filter->data<float>(), mem_size, filter_shape.num(),
      filter_shape.channel(), filter_shape.height(), filter_shape.width(),
      group, max_value, max_values);

  float mem_factor = mem_size * 1.0f / filter->shape().numel();
  quantized_filter->setMemScale(mem_factor);
  quantized_filter->setAligned(true);
  int8_t* src = quantized_filter->mutableData<int8_t>(INT8, filter->shape());
  quantized_filter->scale()[0] = max_value / 127.0f;
  quantized_filter->scale()[1] = 127.0f / max_value;

  memcpy(src, quantized_data, mem_size);
  quantized_filter->flush();

  for (size_t i = 0; i < max_values.size(); i++) {
    scales.push_back(max_values[i] / max_value);
  }
}
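// The scales returned above are each filter's own maximum relative to the
// global maximum; split_filter_num() multiplies them back into the
// per-channel scale so filters quantized against a smaller local range stay
// numerically consistent.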
inline void format_dw_filter(Tensor* filter, Tensor* quantized_filter,
                             float* scale) {
int num = filter->shape().num();
int height = filter->shape().height();
......@@ -207,10 +255,18 @@ inline void split_filter_num(const ConvParam& c_param) {
Tensor* out = param.output;
Tensor* filter = param.filter;
auto channel = out->shape().channel();
  int filter_num_per_div = get_filter_num_per_div(filter, param.groups);

  auto chw = filter->shape().channel() * filter->shape().height() *
             filter->shape().width();
  auto num = filter->shape().num();
  int div_capacity = filter::calc_division_capacity(chw);
  int filter_num_alignment = filter::get_filter_num_alignment();
  int aligned_num =
      align_to_x(num / param.groups, filter_num_alignment) * param.groups;
  int split_num = filter::calc_split_num(aligned_num, div_capacity);
Shape& out_shape = out->shape();
for (int i = 0; i < split_num; i++) {
BasicConvParam* conv_param = new BasicConvParam();
......@@ -224,23 +280,20 @@ inline void split_filter_num(const ConvParam& c_param) {
ConvArgs& args = conv_param->args;
    if (split_num == 1) {
      out_address = out->data<float16>();
      out_scale_address = out->scale();
    }
    filter_num = i == split_num - 1
                     ? channel - (split_num - 1) * filter_num_per_div  // NOLINT
                     : filter_num_per_div;
    if (split_num != 1) {
      Shape shape(NHWC, {1, out_shape.height(), out_shape.width(), filter_num});
      out_address = conv_param->output.mutableData<float16>(FP16, shape);
      out_scale_address = conv_param->output.scale();
    }
    Shape f_shape(NCHW, {filter_num, filter->shape().channel(),
                         filter->shape().height(), filter->shape().width()});
Tensor new_filter;
float* new_filter_data = new_filter.mutableData<float>(FP32, f_shape);
......@@ -251,9 +304,16 @@ inline void split_filter_num(const ConvParam& c_param) {
filter->data<float>() + i * filter_num_per_div * filter_hwc,
filter_num * filter_hwc * sizeof(float));
new_filter.flush();
    conv_param->filter.mutableData<float>(FP32, f_shape);

    if (param.groups != 1) {
      // TODO: derive from filter::get_filter_num_alignment() instead of 32
      int mem_factor = 32 / filter_num_per_div;
      conv_param->filter.setMemScale(mem_factor);
    }

    std::vector<float> filter_scales;  // per-filter max / global max
    format_filter(&new_filter, &(conv_param->filter), param.groups,
                  filter_scales);
conv_param->filter.setDataType(INT8);
int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT);
Tensor scale;
......@@ -264,23 +324,30 @@ inline void split_filter_num(const ConvParam& c_param) {
Shape s_shape(N, {filter_num});
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
std::cout << "v size: " << v.size() << std::endl;
for (int n = 0; n < filter_num; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
// scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
}
for (int n = 0; n < filter_num; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
}
Shape sb_shape(N, {sb_num});
    format_scale_bias(&scale, &bias, &conv_param->filter,
                      &conv_param->scaleBias, param.groups);
    conv_param->scaleBias.flush();
    float* bs_data = conv_param->scaleBias.data<float>();
args.group_num = param.groups;
args.relu_enabled = param.relu.enabled;
args.sb_address = conv_param->scaleBias.data<float>();
args.sb_address = bs_data;
args.kernel.stride_h = param.strides[1];
args.kernel.stride_w = param.strides[0];
args.kernel.height = new_filter.shape().height();
......@@ -294,17 +361,13 @@ inline void split_filter_num(const ConvParam& c_param) {
args.image.channels = input->shape().channel();
args.image.width = input->shape().width();
args.image.height = input->shape().height();
    args.image.pad_width = param.paddings[1];
    args.image.pad_height = param.paddings[0];
    // TODO: assumes dilations[0] == dilations[1]
    args.dilation = param.dilations[0];
    args.output.address = out_address;
    args.output.scale_address = out_scale_address;
param.splitParams().push_back(conv_param);
}
}
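// split_filter_num() splits a convolution whose filter count exceeds one
// pass's capacity into several BasicConvParams along the output-channel
// axis; ConvPE::dispatch() concatenates the partial outputs (split_axis 0).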
......@@ -317,7 +380,7 @@ inline void split_channel(const ConvParam& c_param) {
int num = ceil(input->shape().channel() * 1.0f / 2047);
int channel = input->shape().channel() / num;
std::cout << "channel::" << channel << "num::" << num << std::endl;
Shape bs_shape(N, {channel});
for (int i = 0; i < num; i++) {
......@@ -331,6 +394,7 @@ inline void split_channel(const ConvParam& c_param) {
// filter transformation;
Shape f_shape(NCHW, {param.filter->shape().num(), channel, 1, 1});
Tensor new_filter;
float* dst = new_filter.mutableData<float>(FP32, f_shape);
......@@ -341,7 +405,8 @@ inline void split_channel(const ConvParam& c_param) {
src += param.filter->shape().channel();
}
new_filter.flush();
    std::vector<float> scales;
    format_filter(&new_filter, &(conv_param->filter), param.groups, scales);
Tensor bias;
Tensor scale;
......@@ -354,12 +419,11 @@ inline void split_channel(const ConvParam& c_param) {
}
scale.flush();
bias.flush();
    format_scale_bias(&scale, &bias, &conv_param->filter,
                      &conv_param->scaleBias, param.groups);
    conv_param->scaleBias.flush();
ConvArgs& args = conv_param->args;
args.group_num = param.groups;
......@@ -379,18 +443,12 @@ inline void split_channel(const ConvParam& c_param) {
args.image.channels = conv_param->input.shape().channel();
args.image.width = conv_param->input.shape().width();
args.image.height = conv_param->input.shape().height();
    args.image.pad_width = param.paddings[1];
    args.image.pad_height = param.paddings[0];
    // TODO: assumes dilations[0] == dilations[1]
    args.dilation = param.dilations[0];
    args.output.address = conv_param->output.mutableData<void>();
    args.output.scale_address = conv_param->output.scale();
param.splitParams().push_back(conv_param);
}
}
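// split_channel() splits along the input-channel axis instead (used when the
// channel count exceeds the 2047 hardware limit); the partial results are
// combined by elementwise addition (split_axis 1 in ConvPE::dispatch()).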
......@@ -407,6 +465,7 @@ inline int fill_split_arg(const ConvParam& c_param) {
split_filter_num(c_param);
return 0;
}
// split_filter_num(c_param);
}
inline bool compute_conv(const ConvParam& c_conv_params) {
......@@ -418,11 +477,11 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
}
size_t size = params.size();
......@@ -431,3 +490,5 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
} // namespace zynqmp
} // namespace paddle
#endif /* conv_process_hpp */
File mode changed from 100644 to 100755
......@@ -36,19 +36,36 @@ class DepthwiseConvPE : public PE {
Tensor* input = param.input;
Tensor* output = param.output;
int channel = output->shape().channel();
    float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
    if (param_.bias()->dataType() == FP32) {
      float* new_bias_data = param_.bias()->data<float>();
      // convert the bias from float to float16
      for (int i = 0; i < channel; i++) {
        b_data[i] = float_to_half(new_bias_data[i]);
      }
    } else {
      float16* new_bias_data = param_.bias()->data<float16>();
      memcpy(b_data, new_bias_data, channel * sizeof(float16));
    }
    bias_.flush();
    if (param_.scale()->dataType() == FP32) {
      float* new_scale_data = param_.scale()->data<float>();
      Tensor* quantized_filter = param.quantizedFilter();
      quantized_filter->mutableData<float16>(FP16, param.filter->shape());
      format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
    } else {
      // TODO: only valid when the filter is all ones and the channel count
      // is already aligned
      float16* scale_data = param_.scale()->data<float16>();
      float16* filter_data = param.quantizedFilter()->mutableData<float16>(
          FP16, param.filter->shape());
      memcpy(filter_data, scale_data,
             param.filter->shape().numel() * sizeof(float16));
      param.quantizedFilter()->flush();
    }
DWconvArgs args = {0};
args.bias_address = b_data;
......@@ -61,21 +78,14 @@ class DepthwiseConvPE : public PE {
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
    args.image.pad_width = param.paddings[0];
    args.image.pad_height = param.paddings[1];
args.image.scale_address = input->scale();
args.output.address = output->data<void>();
args.output.scale_address = output->scale();
args.out_width = param.output->shape().width();
args.out_height = param.output->shape().height();
args.sub_conv_num = 1;
param.args = args;
inplace_.relu_enable = param_.relu.enabled;
......@@ -106,4 +116,4 @@ class DepthwiseConvPE : public PE {
};
} // namespace zynqmp
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
namespace paddle {
namespace zynqmp {
class ElementwiseMulPE : public PE {
public:
bool init() {
Tensor* output = param_.output;
output->setAligned(true);
output->setDataLocation(Device);
return true;
}
void apply() {
Tensor* input = param_.inputs[0];
Tensor* output = param_.output;
int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
Shape s(N, {wc_aligned});
float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
memset(bias_data, 0, wc_aligned * sizeof(float16));
ScaleArgs& args = args_;
args.scale_address = param_.inputs[1]->data<void>();
args.bias_address = bias_tensor.data<void>();
args.wc_alignment = wc_aligned;
args.channel_alignment = wc_aligned;
args.image.address = input->data<void>();
args.image.scale_address = input->scale();
args.image.channels = wc_aligned;
args.image.height = 1;
args.image.width = 1;
args.image.pad_width = 0;
args.image.pad_height = 0;
args.output.address = output->data<void>();
args.output.scale_address = output->scale();
}
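  // apply() maps the elementwise multiply onto the FPGA scale unit: the
  // first input is treated as a single 1x1 "image" with wc_aligned channels,
  // the second input supplies the per-element scale, and the bias vector is
  // zeroed, so the unit computes output = input0 * input1.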
void updateInput(Tensor* t, int index) {
if (index == 0) {
      args_.scale_address = t->data<void>();  // TODO: replace inputs?
}
}
  bool dispatch() {
    // propagate the FPGA return code instead of discarding it
    return compute_fpga_scale(args_) == 0;
  }
ElementwiseMulParam& param() { return param_; }
private:
ElementwiseMulParam param_;
ScaleArgs args_ = {0};
Tensor bias_tensor;
};
} // namespace zynqmp
} // namespace paddle
......@@ -37,7 +37,16 @@ class FullyConnectedPE : public PE {
ConvParam& convParam_ = convPE_.param();
Tensor* input = param_.input;
convParam_.input = param_.input;
num_ = param_.input->shape().num();
convParam_.output = param_.output;
convParam_.groups = 1;
convParam_.strides = {1, 1};
convParam_.paddings = {0, 0};
......@@ -63,9 +72,11 @@ class FullyConnectedPE : public PE {
new_filter_data[i * chw + j] = scale;
}
}
// conv_filter->copyFrom(param_.filter);
conv_filter->flush();
convParam_.filter = conv_filter;
// convParam_.filter = param_.filter;
Shape sb_shape(N, {channel});
float* scale_data = convParam_.scale()->mutableData<float>(FP32, sb_shape);
......@@ -82,13 +93,31 @@ class FullyConnectedPE : public PE {
convPE_.apply();
}
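  // apply() appears to lower the fully-connected layer to a 1x1 convolution
  // over the flattened input, so dispatch() can reuse ConvPE end to end.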
  bool dispatch() { return convPE_.dispatch(); }
FullyConnectedParam& param() { return param_; }
private:
FullyConnectedParam param_;
ConvPE convPE_;
Tensor tempOut_;
int num_ = 1;
};
} // namespace zynqmp
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp"
#include "lite/backends/fpga/KD/pes/fully_connected_pe.hpp"
#include "lite/backends/fpga/KD/pes/relu_pe.hpp"
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/arm/math/sgemm.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/api/paddle_place.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace zynqmp {
struct GRUTensors {
Tensor* gate;
Tensor* pre_output;
Tensor* output;
Tensor* reset_output;
};
class GRUPE : public PE {
public:
bool init() {
// Tensor* output = param_.output;
// output->setAligned(true);
// output->setDataLocation(Device);
return true;
}
void apply() {
auto hidden = param_.hidden;
// auto hidden_dims = hidden->dims();
int frame_size = hidden->shape().channel();
zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
float16* prev_hidden_data = prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
// set previous hidden data to 0;
memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
// copy 2/3 weight from param.weight;
zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memset(weight_data, 0, weight_shape.numel() * sizeof(float));
weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memcpy(weight_data, param_.weight->data<float>(), weight_shape.numel() * sizeof(float));
Shape gate_shape(zynqmp::NC, {1, frame_size * 2});
gate_ping_.mutableData<void>(FP32, gate_shape);
gate_pong_.mutableData<void>(FP16, gate_shape);
zynqmp::FullyConnectedParam& pre_out_param = pre_out_pe_.param();
pre_out_param.input = &prev_hidden_;
pre_out_param.output = &gate_pong_;
pre_out_param.filter = &weight_;
pre_out_param.bias = &gate_ping_;
pre_out_pe_.init();
pre_out_pe_.apply();
// // ============= C
// ElementwiseAddParam& bias_add_param = bias_ew_pe_.param();
// bias_add_param.inputs = {&pre_output_, &pre_input_};
// bias_add_param.output = &pre_input_;
// bias_ew_pe_.init();
// bias_ew_pe_.apply();
// // ====================
// Shape state_weight_shape(NC,{frame_size, frame_size});
// float* state_weight_data = state_weight_.mutableData<float>(FP32, state_weight_shape);
// memcpy(state_weight_data, weight_data + 2 * frame_size * frame_size,
// state_weight_shape.numel() * sizeof(float));
// FullyConnectedParam& reset_out_param = reset_out_pe_.param();
// reset_out_param.input = &prev_hidden;
// reset_out_param.output = &gate_ping;
// reset_out_param.filter = &state_weight_;
// // ============== unit reset;
// update_gate_.mutableData<void>(FP16, pre_input_shape);
// InputParam& relu_param = update_relu_pe_.param();
// relu_param.input = &tempTensor;
// relu_param.output = &update_gate_;
// update_relu_pe_.init();
// update_relu_pe_.apply();
reset_gate_.mutableData<void>(FP16, hidden_shape);
prev_hidden_.mutableData<void>(FP16, hidden_shape);
reset_hidden_.mutableData<void>(FP16, hidden_shape);
// InputParam& reset_param = reset_relu_pe_.param();
// reset_param.input = &tempTensor;
// reset_param.output = &reset_gate_;
// reset_relu_pe_.init();
// reset_relu_pe_.apply();
// float16* prev_data = prev_.mutableData<float16>(FP16, pre_input_shape);
// memset(prev_data, 0, (pre_input_shape.numel() + 32) * sizeof(float16)); // TODO
// reset_hidden_prev_.mutableData<float16>(FP16, pre_input_shape);
ElementwiseMulParam& mul_param = mul_pe_.param();
mul_param.inputs = {&reset_gate_, &prev_hidden_};
mul_param.output = &reset_hidden_;
mul_pe_.init();
mul_pe_.apply();
// ==============
}
bool dispatch() {
return true;
}
void gru_unit_reset_act(const lite_api::ActivationType active_gate, GRUTensors& value,
int frame_size, int batch_size) {
int stride_update = 3 * frame_size;
int stride_cell_state = 3 * frame_size;
int stride_hidden_prev = frame_size;
int stride_hidden = frame_size;
// Tensor* gate = value.gate;
// value.gate->saveToFile("value_input.txt");
float* update_gate_data = gate_ping_.data<float>();
float* reset_gate_data = update_gate_data + frame_size;
for (int b = 0; b < batch_size; b++) {
// memcpy(tempTensor.data<void>(), reset_gate_data, gate->shape().numel() * sizeof(float));
// tempTensor.flush();
Tensor tmp;
Shape s(NC, {1, frame_size}); //TODO
float* tmp_data = tmp.mutableData<float>(FP32, s);
for (int i = 0; i < frame_size; i++) {
// f(x) = x / (1 + abs(x))?
update_gate_data[i] = lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(update_gate_data[i]);
reset_gate_data[i] = lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(reset_gate_data[i]);
}
memcpy(tmp_data, reset_gate_data, frame_size * sizeof(float));
tmp.flush();
reset_gate_.copyFrom(&tmp);
// reset_gate_.copyFrom(&tempTensor);
Tensor* hidden_prev = value.pre_output;
if (hidden_prev) {
// memcpy(prev_data, )
// TODO change to pre_out;
prev_hidden_.copyFrom(value.pre_output);
}
// // 4.0 reset_date * hidden_prev;
// // reset_hidden_prev[i] = reset_gate[i] * prev;
mul_pe_.dispatch();
update_gate_data += stride_update;
reset_gate_data += stride_update;
// reset_hidden_prev += stride_hidden;// TODO
}
}
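  // Reset phase computed above, per batch row, in the usual GRU notation:
  //   u_t = sigmoid(gate[0 .. frame_size))              (update gate)
  //   r_t = sigmoid(gate[frame_size .. 2 * frame_size)) (reset gate)
  //   reset_hidden = r_t * h_{t-1}                      (elementwise, mul_pe_)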
void gru_unit_out_act(const lite_api::ActivationType active_node, bool origin_mode,
GRUTensors& value, int frame_size, int batch_size) {
// int stride_update = 3 * frame_size;
// int stride_cell_state = 3 * frame_size;
// int stride_hidden_prev = frame_size;
// int stride_hidden = frame_size;
// Tensor* hidden = value.output_value;
// float* hidden_prev = nullptr;
// if (hidden) {
// hidden_prev = hidden->data<float>();
// }
// float* cell_state = value.gate->data<float>() + 2 * frame_size;
// float* updata_gate = value.gate->data<float>();
// // float* reset_gate_data = update_gate_data + frame_size;
// float prev = 0.0f;
// for (int b = 0; b < batch_size; ++b) {
// if (origin_mode) {
// // for (int i = 0; i < frame_size; i++) {
// // float prev = 0;
// // if (hidden_prev) {
// // prev = hidden_prev[i];
// // }
// // cell_state[i] = lite::arm::math::active_f32<kSigmoid>(cell_state[i]);
// // hidden[i] =
// // cell_state[i] * (1.f - updata_gate[i]) + updata_gate[i] * prev;
// // }
// } else {
// for (int i = 0; i < frame_size; ++i) {
// cell_state[i] = lite::arm::math::active_f32<lite_api::ActivationType::kRelu>(cell_state[i]);
// if (hidden_prev) {
// prev = hidden_prev[i];
// }
// float hidden_value =
// prev * (1.f - updata_gate[i]) + updata_gate[i] * cell_state[i];
// hidden_prev[i] = hidden_value;
// std::cout << "hidden_value::" << hidden_value << std::endl;
// }
// }
// updata_gate += stride_update;
// cell_state += stride_cell_state;
// hidden_prev += frame_size;
// }
}
void copy_input(GRUTensors& value) {
float max = find_max(*(value.gate));
gate_ping_.mutableData<void>(FP32, value.gate->shape());
gate_ping_.copyFrom(value.gate);
// TODO update input pointer?
// gate_.readFromFile("input/in.txt");
// // pre_input_.saveToFile("pppp_in.txt");
// gate_.scale()[0] = max / 127;
// gate_.scale()[1] = 127 / max;
// gate_.printScale("pre_input_");
// gate_.saveToFile("pre_input_.txt");
// pre_out_pe_.dispatch();
// pre_output_.saveToFile("pp_out.txt");
}
void GRUCOmpute(GRUTensors& value,
int frame_size,
int batch_size,
const lite_api::ActivationType active_node,
const lite_api::ActivationType active_gate,
bool origin_mode) {
copy_input(value);
if (value.pre_output) {
// copy by batch;
pre_out_pe_.dispatch();
gate_ping_.copyFrom(&gate_pong_);
}
gru_unit_reset_act(active_gate, value, frame_size, batch_size);
// if (value.pre_output) {
// // state weight;
// reset_out_pe_.dispatch();
// }
// gru_unit_out_act(active_node, origin_mode, value, frame_size, batch_size);
}
GRUParam& param() { return param_; }
// Tensor* preOutput() {
// return &pre_output_;
// }
// Tensor* gate() {
// return &gate_;
// }
Tensor* updateGate() {
return &update_gate_;
}
Tensor* resetGate() {
return &reset_gate_;
}
private:
GRUParam param_;
zynqmp::Tensor gate_ping_;
zynqmp::Tensor gate_pong_;
zynqmp::Tensor bias_;
zynqmp::Tensor weight_;
zynqmp::Tensor state_weight_;
// =================================
zynqmp::Tensor update_gate_;
zynqmp::Tensor reset_gate_;
zynqmp::Tensor cell_state_;
zynqmp::Tensor prev_hidden_;
zynqmp::Tensor reset_hidden_;
Tensor tempTensor;
// =================================
ReluPE update_relu_pe_;
ReluPE reset_relu_pe_;
zynqmp::ElementwiseMulPE mul_pe_;
zynqmp::FullyConnectedPE pre_out_pe_;
zynqmp::FullyConnectedPE reset_out_pe_;
zynqmp::ElementwiseAddPE bias_ew_pe_;
};
} // namespace zynqmp
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/backends/arm/math/gru_utils.h"
namespace paddle {
namespace lite {
namespace fpga {
// inline void gru_unit_reset_act(lite_api::ActivationType act_type,
// GRUMetaValue<float> value,
// int frame_size,
// int batch_size) {
// auto updata_gate = value.gate_value;
// auto reset_gate = value.gate_value + frame_size;
// auto hidden_prev = value.prev_out_value;
// auto reset_hidden_prev = value.reset_output_value;
// int stride_update = 3 * frame_size;
// int stride_reset = 3 * frame_size;
// int stride_hidden_prev = frame_size;
// int stride_reset_hidden_prev = frame_size;
// if (act_type == kRelu) {
// }
// }
// void gru_compute(arm::math::GRUMetaValue<float> value,
// int frame_size,
// int batch_size,
// const lite_api::ActivationType active_node,
// const lite_api::ActivationType active_gate,
// bool origin_mode) {
// std::cout << " =================== gru gru_compute =================== \n";
// // exit(-1);
// // sgemm(bool is_transA,
// // bool is_transB,
// // int M,
// // int N,
// // int K,
// // float alpha,
// // const float* A,
// // int lda,
// // const float* B,
// // int ldb,
// // float beta,
// // float* C,
// // int ldc,
// // const float* bias,
// // bool is_bias,
// // bool is_relu,
// // ARMContext* ctx);
// // sgemm for fc;
// // lite::arm::math::sgemm(false,
// // false,
// // m_,// batch;
// // n_,// filter num;
// // k_,// input_channel;
// // 1.f,
// // i_data,// input data;
// // k_,
// // w_data,// weight data;
// // n_,
// // 0.f,//beta;
// // o_data,// out data;
// // n_,
// // b_data,// bias;
// // false,
// // false,
// // &ctx);
// // C := alpha*op( A )*op( B ) + beta*C,
// if (value.prev_out_value) {
// // sgemm(false, // is_transA
// // false, // is_transB
// // batch_size, // M specifies the number of rows of the matrix
// // frame_size * 2, // N specifies the number of columns of the matrix
// // frame_size, // K
// // 1.f, // alpha
// // value.prev_out_value, // float* A,
// // frame_size, // lda
// // value.gate_weight, // float* B,
// // frame_size * 2, // ldb
// // 1.f, // beta
// // value.gate_value, // C*
// // frame_size * 3, // ldc
// // nullptr, // bias
// // false, // is_bias
// // false, // is_relu
// // ctx); // context
// }
// // gru_unit_reset_act(active_gate, value, frame_size, batch_size);
// if (value.prev_out_value) {
// // sgemm(false,
// // false,
// // batch_size,
// // frame_size,
// // frame_size,
// // 1.f,
// // value.reset_output_value,
// // frame_size,
// // value.state_weight,
// // frame_size,
// // 1.f,
// // value.gate_value + frame_size * 2,
// // frame_size * 3,
// // nullptr,
// // false,
// // false,
// // ctx);
// }
// // gru_unit_out_act(active_node, origin_mode, value, frame_size, batch_size);
// }
}  // namespace fpga
}  // namespace lite
}  // namespace paddle
\ No newline at end of file
File mode changed from 100644 to 100755
......@@ -25,6 +25,8 @@ class OutputPE : public PE {
bool init() {
Tensor* output = param_.output;
output->setAligned(false);
DLEngine::get_instance().out_data = reinterpret_cast<float*>(
fpga_malloc(output->shape().numel() * sizeof(float)));
return true;
}
......@@ -41,6 +43,15 @@ class OutputPE : public PE {
} else {
output->copyFrom(input);
}
output->syncToCPU();
if (DLEngine::get_instance().out_data == nullptr) {
DLEngine::get_instance().out_data = reinterpret_cast<float*>(
fpga_malloc(output->shape().numel() * sizeof(float)));
}
memcpy(DLEngine::get_instance().out_data,
output->data<void>(),
output->shape().numel() * sizeof(float));
return true;
}
......
......@@ -35,24 +35,26 @@ class PoolingPE : public PE {
Tensor* input = param_.input;
Tensor* output = param_.output;
    uint32_t k_height = param_.kernelSize[0];
    uint32_t k_width = param_.kernelSize[1];
if (param_.globalPooling) {
k_width = input->shape().width();
k_height = input->shape().height();
param_.kernelSize[0] = k_height;
param_.kernelSize[1] = k_width;
}
PoolingArgs args = {0};
args.mode = param_.type;
args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
args.image.address = input->data<float16>();
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
    args.image.pad_height = param_.paddings[0];
    args.image.pad_width = param_.paddings[1];
args.image.scale_address = input->scale();
args.output.address = output->mutableData<float16>();
args.output.scale_address = output->scale();
......@@ -66,6 +68,9 @@ class PoolingPE : public PE {
    // always take the CPU path for average pooling
    use_cpu_ = param_.type == AVERAGE;
}
void compute() {
......@@ -74,16 +79,16 @@ class PoolingPE : public PE {
input->syncToCPU();
Tensor float_input;
// Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
int image_height = input->shape().height();
int image_width = input->shape().width();
int image_channels = input->shape().channel();
    int image_pad_h = param_.paddings[0];
    int image_pad_w = param_.paddings[1];
    // kernelSize is {height, width} after apply()
    int kernel_height = param_.kernelSize[0];
    int kernel_width = param_.kernelSize[1];
int kernel_step_h = param_.strides[0];
......@@ -109,6 +114,8 @@ class PoolingPE : public PE {
for (int c = 0; c < image_channels; ++c) {
const int pool_index = (ph * pooled_width_ + pw) * image_channels + c;
float sum = 0;
// const int index =
// (hstart * image_width + wstart) * image_channels + c;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int index = (h * image_width + w) * image_channels + c;
......@@ -129,7 +136,7 @@ class PoolingPE : public PE {
output->flush();
}
  void cpu_compute1() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
......@@ -137,6 +144,7 @@ class PoolingPE : public PE {
Tensor float_input;
float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
// float_input.saveToFile("pool_float.txt");
float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
......@@ -154,13 +162,44 @@ class PoolingPE : public PE {
}
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
std::cout << "pool scale:" << scale_max / 127.0f << std::endl;
output->flush();
// exit(-1);
}
void cpu_compute() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
Tensor float_input;
float* float_input_data = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
float scale_max = 0;
for (int i = 0; i < output->shape().channel(); i++) {
float sum = 0;
for (int j = 0; j < kernel_hw; j++) {
sum += float_input_data[i * kernel_hw + j];
}
float value = sum / kernel_hw;
data_out[i] = float_to_half(value);
scale_max = std::max(scale_max, std::abs(value));
}
output->scale()[0] = scale_max / 127.0f;
output->scale()[1] = 127.0f / scale_max;
output->flush();
// exit(-1);
}
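  // Note: this cpu_compute() assumes global average pooling: each output
  // channel is the mean of its kernel_hw contiguous input elements, which
  // only matches the layout when the kernel covers the whole feature map.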
bool dispatch() {
if (use_cpu_) {
      compute();
return true;
}
param_.input->syncToDevice();
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -17,8 +17,16 @@ limitations under the License. */
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include <algorithm>
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/tensor.hpp"
#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp"
namespace paddle {
namespace zynqmp {
class ScalePE : public PE {
public:
inline int gcd(int a, int b) {
......@@ -38,10 +46,87 @@ class ScalePE : public PE {
return true;
}
void apply() {
Tensor* input = param_.input;
Tensor* output = param_.output;
    Shape& input_shape = input->shape();
DepthwiseConvParam& dw_param = dw_pe_.param();
int channel = input_shape.channel();
int repeat = 1;
int alignment = 16;
......@@ -51,70 +136,162 @@ class ScalePE : public PE {
int c_lcm = lcm(channel, alignment);
repeat = c_lcm / (channel);
}
    // TODO: FPGA limits (H > 2047, W > 1023, or W*C > 65536) require the
    // CPU implementation instead
    Shape shape(N, {channel * repeat});

    float* filter_data = filter.mutableData<float>(FP32, shape);
    std::fill_n(filter_data, input->shape().channel(), 1.0f);

    Tensor* scale = dw_param.scale();
    float16* scale_data = scale->mutableData<float16>(FP16, shape);

    Tensor* bias = dw_param.bias();
    float16* bias_data = bias->mutableData<float16>(FP16, shape);
    std::fill_n(bias_data, input->shape().channel(), 0);
    if (param_.scale->dataType() == FP32) {
      if (param_.bias != nullptr) {
        float* bias_data_float = param_.bias->data<float>();
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            bias_data[i * length + j] = float_to_half(bias_data_float[j]);
          }
        }
      } else {
        float16 zero = float_to_half(0.0f);
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            bias_data[i * length + j] = zero;
          }
        }
      }
      float* scale_data_float = param_.scale->data<float>();
      for (int i = 0; i < repeat; i++) {
        for (int j = 0; j < length; j++) {
          scale_data[i * length + j] = float_to_half(scale_data_float[j]);
        }
      }
    } else {
      if (param_.bias != nullptr) {
        float16* bias_data_half = param_.bias->data<float16>();
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            bias_data[i * length + j] = bias_data_half[j];
          }
        }
      } else {
        float16 zero = float_to_half(0.0f);
        for (int i = 0; i < repeat; i++) {
          for (int j = 0; j < length; j++) {
            bias_data[i * length + j] = zero;
          }
        }
      }
      float16* scale_data_half = param_.scale->data<float16>();
      for (int i = 0; i < repeat; i++) {
        for (int j = 0; j < length; j++) {
          scale_data[i * length + j] = scale_data_half[j];
        }
      }
    }
// if (param_.bias != nullptr) {
// memcpy(bias_data, param_.bias->data<float>(), input->shape().channel() * sizeof(float));
// }
dw_param.input = param_.input;
dw_param.output = param_.output;
dw_param.filter = &filter;
dw_param.strides = {1, 1};
dw_param.paddings = {0, 0};
dw_param.kernelSize = {1, 1};
dw_param.dilations = {1, 1};
dw_pe_.init();
dw_pe_.apply();
}
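  // apply() re-expresses the scale op as a 1x1 depthwise convolution: an
  // all-ones filter, the per-channel scale as the depthwise scale and the
  // (optional) bias as the depthwise bias, so dispatch() can reuse
  // DepthwiseConvPE, including its FP16 path.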
void cpu_compute() {
Tensor* input = param_.input;
Tensor* output = param_.output;
Tensor float_input;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
input->syncToCPU();
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
float* scale_data = param_.scale->data<float>();
int wh = input->shape().width() * input->shape().height();
float16* in_data = input->data<float16>();
float max = 0;
for (int i = 0; i < wh; i++) {
for (int c = 0; c < input->shape().channel(); c++) {
int index = i * input->shape().channel() + c;
float value = half_to_float(in_data[index]) * scale_data[c];
std::cout << "value:" << value << " = " << half_to_float(in_data[index]) << " x " << scale_data[c] << std::endl;
data_out[index] = float_to_half(value);
if (value < 0) {
value = -value;
}
if (value > max) {
max = value;
}
}
}
output->flush();
output->scale()[0] = max / 127.0f;
output->scale()[1] = 127.0f / max;
}
  bool dispatch() {
    if (param_.scale->dataType() == FP16) {
      DepthwiseConvParam& dw_param = dw_pe_.param();
      memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
             param_.scale->data<float16>(),
             param_.scale->shape().numel() * sizeof(float16));
      dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
      dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
      dw_param.quantizedFilter()->flush();
    }
    param_.input->syncToDevice();
    return dw_pe_.dispatch();
}
ScaleParam& param() { return param_; }
private:
ScaleParam param_;
Tensor filter;
DepthwiseConvPE dw_pe_;
};
} // namespace zynqmp
} // namespace paddle
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -23,6 +23,7 @@ limitations under the License. */
namespace paddle {
namespace zynqmp {
static struct None none_;
static struct NCHW nchw_;
static struct NHWC nhwc_;
static struct NC nc_;
......@@ -82,6 +83,9 @@ class Shape {
void setLayoutType(LayoutType layout) {
this->layoutType_ = layout;
switch (layout) {
case None:
layout_ = &none_;
break;
case NCHW:
layout_ = &nchw_;
break;
......
......@@ -24,6 +24,8 @@ limitations under the License. */
#include <string>
#include <vector>
#include <unistd.h>
// #include "lite/core/tensor.h"
#include "lite/backends/fpga/KD/dl_engine.hpp"
......@@ -117,7 +119,8 @@ class Tensor {
template <typename Dtype>
Dtype* mutableData() {
    size_t memorySize =
        shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
if (placeHolder_ != nullptr) {
if (memorySize > placeHolder_->memorySize()) {
placeHolder_.reset(new PlaceHolder(memorySize));
......@@ -241,6 +244,10 @@ class Tensor {
}
}
void setMemScale(float scale_factor) {
this->mem_scale_factor_ = scale_factor;
}
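  // mem_scale_factor_ (set by format_filter and friends) lets aligned FPGA
  // layouts occupy more bytes than shape().memorySize() would suggest;
  // mutableData(), flush() and invalidate() all scale the nominal size by it.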
void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); }
void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) {
......@@ -276,9 +283,11 @@ class Tensor {
.height = 1,
.pad_width = 0u,
.pad_height = 0u};
    ImageOutputArgs output = {
        .address = data<void>(), .scale_address = scale(),
    };
    args.output = output;
src->syncToDevice();
size_t aligned_remainder = src->shape().numel() % 16;
if (aligned_remainder > 0) {
......@@ -294,10 +303,14 @@ class Tensor {
this->invalidate();
}
  void flush() {
    size_t memorySize =
        shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
    fpga_flush(placeHolder_->data(), memorySize);
  }
  void invalidate() {
    size_t memorySize =
        shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
    fpga_invalidate(placeHolder_->data(), memorySize);
  }
void sync() {
......@@ -337,6 +350,17 @@ class Tensor {
if (placeHolder_ == nullptr) {
return;
}
std::cout << scale()[0] << " , " << scale()[1] << std::endl;
}
  void printScale(std::string type) {
    std::cout << type << " : "
              << std::to_string(shape_->num()) + "_" +
                     std::to_string(shape_->channel()) + "_" +
                     std::to_string(shape_->height()) + "_" +
                     std::to_string(shape_->width())
              << std::endl;
    printScale();
  }
std::string dimsFileName() {
......@@ -358,48 +382,39 @@ class Tensor {
saveToFile(path);
}
  void saveToFile(std::string path) {
    syncToCPU();
    invalidate();
    static int counter = 0;
    std::string npath = std::to_string(counter) + "_" + path;
    counter++;
    save_file_with_name(npath);
  }
  void save_file_with_name(std::string path) {
    return;  // dumping tensors is disabled; remove this early return to enable
    invalidate();
    std::ofstream ofs;
    ofs.open(path);
ofs << "dataType: " << dataType_ << std::endl;
ofs << "scale: " << scale()[0] << " , " << scale()[1] << std::endl;
for (int i = 0; i < shape_->numel(); i++) {
float value = 0;
if (dataType_ == FP32) {
value = data<float>()[i];
      } else if (dataType_ == FP16) {
value = half_to_float(data<float16>()[i]);
} else {
value = data<int8_t>()[i];
}
ofs << value << std::endl;
}
......@@ -415,18 +430,49 @@ class Tensor {
int num = shape_->numel();
invalidate();
float max = 0.0f;
if (dataType_ == FP16) {
float16* data = mutableData<float16>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = float_to_half(value);
}
} else {
float* data = mutableData<float>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
max = std::max(std::abs(value), max);
data[i] = value;
}
}
flush();
placeHolder_->scale_[0] = max / 127.0f;
placeHolder_->scale_[1] = 127.0f / max;
}
friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) {
os << "tensor:"
<< "\n";
os << "dims: {";
for (int i = 0; i < tensor.shape().dimSize(); ++i) {
os << tensor.shape()[i] << " ";
}
os << "}\n";
for (int i = 0; i < tensor.shape().numel(); i++) {
float value = 0;
if (tensor.dataType() == FP32) {
value = tensor.data<float>()[i];
} else {
value = half_to_float(tensor.data<float16>()[i]);
}
os << value << " ";
}
os << "\n";
return os;
}
~Tensor() {
if (shape_ != nullptr) {
delete shape_;
......@@ -436,6 +482,7 @@ class Tensor {
private:
int offset = 0;
float mem_scale_factor_ = 1.0f;
std::shared_ptr<PlaceHolder> placeHolder_;
Shape* shape_ = nullptr;
DataType dataType_ = FP32;
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -92,13 +92,28 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
}
void TensorLite::CopyDataFrom(const TensorLite &other) {
  dims_ = other.dims_;
  target_ = other.target_;
  lod_ = other.lod_;

  auto shape = other.zynq_tensor_->shape();
  Resize(other.dims());
  zynq_tensor_->mutableData<void>(other.zynq_tensor_->dataType(), shape);
  this->ZynqTensor()->copyFrom(other.ZynqTensor());
}
// template <typename T>
......
......@@ -106,7 +106,7 @@ class TensorLite {
// For other devices, T and R may be the same type.
template <typename T, typename R = T>
const R *data() const {
return zynq_tensor_->data<R>();
return zynq_tensor_->data<R>() + offset_;
}
void Resize(const DDimLite &ddim) { dims_ = ddim; }
......@@ -125,6 +125,7 @@ class TensorLite {
bool persistable() const { return persistable_; }
void set_persistable(bool persistable) { persistable_ = persistable; }
// T is the data type and R is the return type
// For OpenCL, the return type can be cl::Buffer
// and the data type can be float/int8_t.
......@@ -147,6 +148,8 @@ class TensorLite {
size_t memory_size() const { return zynq_tensor_->memorySize(); }
size_t offset() const { return offset_; }
bool IsInitialized() const { return buffer_->data(); }
// Other share data to this.
......@@ -157,8 +160,14 @@ class TensorLite {
template <typename T>
TensorLite Slice(int64_t begin, int64_t end) const;
template <typename T>
void Slice(TensorLite& dst, int64_t begin, int64_t end) const;
TargetType target() const { return target_; }
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
......@@ -173,16 +182,21 @@ class TensorLite {
private:
TargetType target_{TargetType::kHost};
// precision_ and persistable_ are only used for persistable vars.
// If your tensor wants to be saved and loaded correctly, you must
// set values of precision_ and persistable_ after updating it.
// If your tensor is just a temp tensor, such as activations,
// you can ignore these two attributes.
PrecisionType precision_{PrecisionType::kUnk};
bool persistable_{false};
DDimLite dims_;
std::shared_ptr<Buffer> buffer_;
LoD lod_;
size_t memory_size_{};
size_t offset_{0};
zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
template <typename T>
......@@ -197,6 +211,9 @@ R *TensorLite::mutable_data() {
}
zynqmp::LayoutType layout_type = zynqmp::NCHW;
switch (v.size()) {
case 0:
layout_type = zynqmp::None;
break;
case 1:
layout_type = zynqmp::N;
break;
......@@ -228,24 +245,79 @@ R *TensorLite::mutable_data(TargetType target) {
return mutable_data<T>();
}
template <typename T>
TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
CHECK_GE(begin, 0);
CHECK_LE(end, dims_[0]);
CHECK_LT(begin, end);
if (dims_[0] == 1) {
return *this;
}
int64_t base = numel() / dims_[0];
TensorLite dst;
dst.target_ = target_;
auto dst_dims = dims_;
dst_dims[0] = end - begin;
dst.Resize(dst_dims);
void* dst_data = dst.mutable_data<T>();
const T* src_data = data<T>();
memcpy(dst_data, src_data + begin * base, dst_dims.production() * sizeof(T));
return dst;
}
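// A minimal usage sketch (illustrative shapes; assumes the usual DDimLite
// initializer-list construction):
//   TensorLite t;
//   t.Resize(DDimLite({4, 3, 2, 2}));
//   t.mutable_data<float>();
//   TensorLite rows = t.Slice<float>(1, 3);  // copies rows 1..2 -> {2, 3, 2, 2}
// The memcpy offset is begin * base in elements, not bytes, because
// src_data is already a typed pointer.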
template <typename T>
void TensorLite::Slice(TensorLite& dst, int64_t begin, int64_t end) const {
CHECK_GE(begin, 0);
CHECK_LE(end, dims_[0]);
CHECK_LT(begin, end);
dst.target_ = target_;
auto dst_dims = dims_;
dst_dims[0] = end - begin;
dst.Resize(dst_dims);
void* dst_data = dst.mutable_data<T>();
int64_t base = numel() / dims_[0];
const T* src_data = data<T>();
memcpy(dst_data, src_data + begin * base, dst_dims.production() * sizeof(T));
}
template <typename TensorT>
bool TensorCompareWith(const TensorT &a, const TensorT &b) {
if (a.dims() != b.dims()) return false;
if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false;
return true;
}
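// TensorCompareWith() compares raw bytes, so it is only meaningful when both
// tensors share the same precision and data layout.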
} // namespace lite
} // namespace paddle
File mode changed from 100644 to 100755
......@@ -4,29 +4,40 @@ endif()
set(fpga_deps fpga_target_wrapper kernel_fpga)
add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_acivation_fpga SRCS activation_compute_test.cc DEPS ${lite_kernel_deps} activation_compute_fpga ${fpga_deps})
# add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
# add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
# add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps})
# add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps})
add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps})
# add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps})
add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
# add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps})
# add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op)
# add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps})
add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps})
add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_softmax_compute_fpga SRCS softmax_compute_test.cc DEPS ${lite_kernel_deps} softmax_compute_fpga ${fpga_deps})
add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
# add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
# add_kernel(transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS ${fpga_deps})
add_kernel(io_copy_compute_fpga FPGA basic SRCS io_copy_compute.cc DEPS ${fpga_deps})
add_kernel(calib_compute_fpga FPGA basic SRCS calib_compute.cc DEPS ${fpga_deps})
add_kernel(layout_compute_fpga FPGA basic SRCS layout_compute.cc DEPS ${fpga_deps})
add_kernel(feed_compute_fpga FPGA basic SRCS feed_compute.cc DEPS ${fpga_deps})
add_kernel(fetch_compute_fpga FPGA basic SRCS fetch_compute.cc DEPS ${fpga_deps})
# add_kernel(while_compute_fpga FPGA extra SRCS while_compute.cc DEPS ${fpga_deps})
# add_kernel(write_to_array_compute_fpga FPGA extra SRCS write_to_array_compute.cc DEPS ${fpga_deps})
# lite_cc_test(test_acivation_fpga SRCS activation_compute_test.cc DEPS ${lite_kernel_deps} activation_compute_fpga ${fpga_deps})
lite_cc_test(test_conv_fpga SRCS conv_compute_test.cc DEPS ${lite_kernel_deps} conv_compute_fpga ${fpga_deps})
lite_cc_test(test_elementwise_fpga SRCS elementwise_compute_test.cc DEPS ${lite_kernel_deps} elementwise_compute_fpga ${fpga_deps})
lite_cc_test(test_fc_compute_fpga SRCS fc_compute_test.cc DEPS ${lite_kernel_deps} fc_compute_fpga ${fpga_deps})
lite_cc_test(test_pooling_compute_fpga SRCS pooling_compute_test.cc DEPS ${lite_kernel_deps} pooling_compute_fpga ${fpga_deps})
# lite_cc_test(test_softmax_compute_fpga SRCS softmax_compute_test.cc DEPS ${lite_kernel_deps} softmax_compute_fpga ${fpga_deps})
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/beam_search_decode_compute.h"
#include <algorithm>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
using LoDTensor = lite::Tensor;
using LoDTensorArray = std::vector<lite::Tensor>;
// All LoDs have two levels. The first is the source level, the second the
// sentence level. The source level describes how many prefixes (branches)
// each source sentence (beam) keeps; the sentence level describes how the
// candidates belong to those prefixes.
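// Example with made-up sizes: two source sentences keeping 2 and 3 prefixes
// respectively could yield
//   lod[kSourceLevel]   = {0, 2, 5}
//   lod[kSentenceLevel] = {0, 3, 5, 8, 9, 12}
// i.e. prefix 0 owns candidates [0, 3), prefix 1 owns [3, 5), and so on.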
const size_t kSourceLevel = 0;
const size_t kSentenceLevel = 1;
template <typename T>
struct Sentence {
std::vector<float> word_ids;
std::vector<T> scores;
};
template <typename T>
using SentenceVector = std::vector<Sentence<T>>;
template <typename T>
struct BeamSearchDecoder {
BeamSearchDecoder(size_t beam_size, int end_id)
: beam_size_(beam_size), end_id_(end_id) {}
/**
* Convert the result sentence_vector for each source sentence into two
* LoDTensors: one holds the word ids of all candidate sentences, the other
* holds their scores.
* Param:
* sentence_vector_list: sentence_vector for each source sentence.
* id_tensor: result LoDTensor holding sentence word ids.
* score_tensor: result LoDTensor holding sentence scores.
* reverse: whether the ids of each sentence in sentence_vector_list are
* stored in reverse order.
* sort_by_score: whether to sort the hypotheses of each sentence by score.
*/
void ConvertSentenceVectorToLodTensor(
std::vector<SentenceVector<T>> sentence_vector_list,
LoDTensor* id_tensor,
LoDTensor* score_tensor,
bool reverse = true,
bool sort_by_score = true) const {
size_t src_num = sentence_vector_list.size();
CHECK_GT(src_num, 0) << "src_num should not be 0";
std::vector<uint64_t> source_level_lod = {0};
std::vector<uint64_t> sentence_level_lod = {0};
std::vector<float> id_data;
std::vector<T> score_data;
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
if (sort_by_score) {
sort(sentence_vector_list[src_idx].begin(),
sentence_vector_list[src_idx].end(),
[reverse](const Sentence<T>& a, const Sentence<T>& b) {
if (reverse)
return a.scores.front() > b.scores.front();
else
return a.scores.back() > b.scores.back();
});
}
for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
if (reverse) {
id_data.insert(id_data.end(),
sentence.word_ids.rbegin(),
sentence.word_ids.rend());
score_data.insert(score_data.end(),
sentence.scores.rbegin(),
sentence.scores.rend());
} else {
id_data.insert(id_data.end(),
sentence.word_ids.begin(),
sentence.word_ids.end());
score_data.insert(
score_data.end(), sentence.scores.begin(), sentence.scores.end());
}
sentence_level_lod.push_back(sentence_level_lod.back() +
sentence.word_ids.size());
}
source_level_lod.push_back(source_level_lod.back() +
sentence_vector_list[src_idx].size());
}
LoD lod;
lod.push_back(source_level_lod);
lod.push_back(sentence_level_lod);
*(id_tensor->mutable_lod()) = lod;
id_tensor->Resize({static_cast<int64_t>(id_data.size())});
auto id_ptr = id_tensor->mutable_data<float>();
TargetCopy(
TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(float));
*(score_tensor->mutable_lod()) = lod;
score_tensor->Resize({static_cast<int64_t>(score_data.size())});
auto score_ptr = score_tensor->mutable_data<T>();
TargetCopy(TARGET(kARM),
score_ptr,
score_data.data(),
score_data.size() * sizeof(T));
}
/**
* Gather the hypotheses for each source sentence by backtracking through the
* LoDTensorArray step_ids, whose LoDs preserve the paths in the search tree.
*/
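// Sketch of the traversal (sizes made up): for three steps, Backtrace walks
// step_ids[2] -> step_ids[0]; at each step the sentence-level LoD maps every
// kept candidate back to its prefix in the previous step, and word ids and
// scores are appended per hypothesis (reversed again on output, since
// reverse = true below).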
void Backtrace(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor,
LoDTensor* score_tensor) const {
CHECK(!step_ids.empty()) << "step num should be larger than 0";
CHECK_EQ(step_ids.size(), step_scores.size())
<< "step_ids and step_scores should be the same";
const size_t step_num = step_ids.size();
const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
std::vector<SentenceVector<T>> sentence_vector_list(
src_num, SentenceVector<T>(beam_size_));
std::vector<std::vector<size_t>> prefix_idx_vector_list(src_num);
for (int step_id = step_num - 1; step_id >= 0; --step_id) {
auto& cur_ids = step_ids.at(step_id);
auto& cur_scores = step_scores.at(step_id);
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
// for each source sentence
auto& sentence_vector = sentence_vector_list.at(src_idx);
auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx);
size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx];
size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
if (prefix_idx_vector.empty()) { // be finished and pruned at this step
// or the last time step
for (size_t prefix_idx = src_prefix_start;
prefix_idx < src_prefix_end;
++prefix_idx) {
size_t candidate_start =
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
size_t candidate_end =
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1];
for (size_t candidate_idx = candidate_start;
candidate_idx < candidate_end;
++candidate_idx) {
prefix_idx_vector.push_back(prefix_idx);
size_t idx = prefix_idx_vector.size() - 1;
auto cur_id = cur_ids.data<float>()[candidate_idx];
auto cur_score = cur_scores.data<T>()[candidate_idx];
sentence_vector.at(idx).word_ids.push_back(cur_id);
sentence_vector.at(idx).scores.push_back(cur_score);
}
}
} else { // use prefix_idx_vector to backtrace
size_t src_candidate_start =
cur_ids.lod().at(kSentenceLevel)[src_prefix_start];
size_t prefix_idx = src_prefix_start;
size_t candidate_num =
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) {
auto candidate_idx = prefix_idx_vector.at(idx);
auto cur_id = cur_ids.data<float>()[candidate_idx];
auto cur_score = cur_scores.data<T>()[candidate_idx];
if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) {
// to skip redundant end tokens
sentence_vector.at(idx).word_ids.push_back(cur_id);
sentence_vector.at(idx).scores.push_back(cur_score);
}
while (src_candidate_start + candidate_num <=
candidate_idx) { // search the corresponding prefix
prefix_idx++;
candidate_num +=
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
}
prefix_idx_vector.at(idx) = prefix_idx;
}
}
}
}
ConvertSentenceVectorToLodTensor(
sentence_vector_list, id_tensor, score_tensor, true, true);
}
size_t beam_size_;
int end_id_;
};
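// Minimal usage sketch (tensor setup elided; mirrors what
// BeamSearchDecodeFunctor::apply() does below):
//   BeamSearchDecoder<float> decoder(/*beam_size=*/4, /*end_id=*/0);
//   decoder.Backtrace(step_ids, step_scores, &id_tensor, &score_tensor);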
struct BeamSearchDecodeFunctor {
BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor,
LoDTensor* score_tensor,
size_t beam_size,
int end_id)
: beam_size_(beam_size),
end_id_(end_id),
step_ids_(step_ids),
step_scores_(step_scores),
id_tensor_(id_tensor),
score_tensor_(score_tensor) {}
template <typename T>
void apply() const {
BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
beam_search_decoder.Backtrace(
step_ids_, step_scores_, id_tensor_, score_tensor_);
}
size_t beam_size_;
int end_id_;
const LoDTensorArray& step_ids_;
const LoDTensorArray& step_scores_;
LoDTensor* id_tensor_;
LoDTensor* score_tensor_;
};
template <>
void BeamSearchDecodeFunctor::apply<bool>() const {
LOG(FATAL) << "beam search decode op does not support bool!";
}
void BeamSearchDecodeCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
// inputs
auto ids = param.ids;
auto scores = param.scores;
// outputs
auto sentence_ids = param.sentence_ids;
auto sentence_scores = param.sentence_scores;
const size_t step_num = ids->size();
CHECK_GT(step_num, 0UL) << "beam search steps should be larger than 0";
const size_t source_num = ids->at(0).lod().at(0).size() - 1;
CHECK_GT(source_num, 0UL) << "source num should be larger than 0";
for (size_t i = 0; i < step_num; ++i) {
CHECK_EQ(ids->at(i).lod().size(), 2UL) << "Level of LodTensor should be 2";
}
//! fixme
// only support float score now
BeamSearchDecodeFunctor func(*ids,
*scores,
sentence_ids,
sentence_scores,
param.beam_size,
param.end_id);
func.apply<float>();
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(beam_search_decode,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::BeamSearchDecodeCompute,
def)
.BindInput("Ids", {LiteType::GetTensorListTy(TARGET(kARM))})
.BindInput("Scores", {LiteType::GetTensorListTy(TARGET(kARM))})
.BindOutput("SentenceIds", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("SentenceScores", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class BeamSearchDecodeCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::BeamSearchDecodeParam;
BeamSearchDecodeCompute() = default;
void Run() override;
virtual ~BeamSearchDecodeCompute() = default;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/box_coder_compute.h"
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/fpga/KD/float16.hpp"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
void BoxCoderCompute::Run() {
auto& param = Param<param_t>();
param.proposals->mutable_data<float16>();
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(box_coder,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::BoxCoderCompute,
def)
.BindInput("PriorBox",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("PriorBoxVar",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("TargetBox",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("OutputBox",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class BoxCoderCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::BoxCoderParam;
void Run() override;
virtual ~BoxCoderCompute() = default;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -23,24 +23,45 @@ namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
// void CalibComputeFp32ToFP16::PrepareForRun() {
// }
void CalibComputeFp32ToFP16::Run() {
auto& param = this->Param<operators::CalibParam>();
param.output->mutable_data<float16>();
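// copyFrom() performs the fp32 -> fp16 conversion internally, replacing the
// explicit per-element float_to_half loop this kernel used previously.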
param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
auto out_lod = param.output->mutable_lod();
*out_lod = param.input->lod();
}
// void CalibComputeFP16ToFp32::PrepareForRun() {
// }
void CalibComputeFP16ToFp32::Run() {
auto& param = this->Param<operators::CalibParam>();
param.output->mutable_data<float>();
param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
auto out_lod = param.output->mutable_lod();
*out_lod = param.input->lod();
}
......
......@@ -26,6 +26,8 @@ class CalibComputeFp32ToFP16
public:
using param_t = operators::CalibParam;
// void PrepareForRun() override;
void Run() override;
~CalibComputeFp32ToFP16() override{};
......@@ -38,6 +40,8 @@ class CalibComputeFP16ToFp32
public:
using param_t = operators::CalibParam;
// void PrepareForRun() override;
void Run() override;
~CalibComputeFP16ToFp32() override{};
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/concat_compute.h"
#include <string>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
void ConcatCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
param.output->mutable_data<float16>();
// ====================================================
zynqmp::ConcatParam& concat_param = pe_.param();
for (auto t : param.x) {
concat_param.inputs.push_back(t->ZynqTensor());
}
concat_param.output = param.output->ZynqTensor();
concat_param.axis = param.axis;
pe_.init();
pe_.apply();
}
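// PE lifecycle used throughout these kernels: param() is filled once,
// init()/apply() run at prepare time, and dispatch() executes the PE on
// every inference.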
void ConcatCompute::Run() {
pe_.dispatch();
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(concat,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::ConcatCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/operators/concat_op.h"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/concat_pe.hpp"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class ConcatCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::ConcatParam;
void PrepareForRun() override;
void Run() override;
virtual ~ConcatCompute() = default;
private:
zynqmp::ConcatPE pe_;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -25,37 +25,67 @@ using float16 = zynqmp::float16;
void ConvCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
param.output->mutable_data<float16>();
auto paddings = *param.paddings;
bool pad_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
if (!pad_equal) {
LOG(FATAL) << "Unsupported asymmetric padding: " << paddings[0] << ", "
<< paddings[1] << ", " << paddings[2] << ", " << paddings[3];
}
// ====================================================
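// Dispatch rule: a convolution whose group count equals its (non-unit) input
// channel count is depthwise, so it is routed to the DepthwiseConvPE;
// everything else goes through the general ConvPE.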
if (param.x->ZynqTensor()->shape().channel() != 1 &&
param.groups == param.x->ZynqTensor()->shape().channel()) {
zynqmp::DepthwiseConvParam& conv_param = dw_conv_pe_.param();
conv_param.input = param.x->ZynqTensor();
conv_param.output = param.output->ZynqTensor();
conv_param.filter = param.filter->ZynqTensor();
conv_param.filter->setDataType(zynqmp::FP32);
conv_param.groups = param.groups;
conv_param.strides = param.strides;
conv_param.paddings = param.paddings;
conv_param.dilations = param.dilations;
fill_scale_bias_const(&conv_param);
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
conv_param.relu.enabled = param.fuse_relu;
dw_conv_pe_.init();
dw_conv_pe_.apply();
} else {
zynqmp::ConvParam& conv_param = conv_pe_.param();
conv_param.input = param.x->ZynqTensor();
conv_param.output = param.output->ZynqTensor();
conv_param.filter = param.filter->ZynqTensor();
conv_param.filter->setDataType(zynqmp::FP32);
conv_param.groups = param.groups;
conv_param.strides = param.strides;
conv_param.paddings = param.paddings;
conv_param.dilations = param.dilations;
fill_scale_bias_const(&conv_param);
if (param.bias != nullptr) {
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
}
conv_param.relu.enabled = param.fuse_relu;
conv_pe_.init();
conv_pe_.apply();
}
}
void ConvCompute::Run() {
auto& param = this->Param<param_t>();
if (param.x->ZynqTensor()->shape().channel() != 1 &&
param.groups == param.x->ZynqTensor()->shape().channel()) {
dw_conv_pe_.dispatch();
} else {
conv_pe_.dispatch();
}
}
} // namespace fpga
......
......@@ -14,11 +14,13 @@
#pragma once
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/conv_pe.hpp"
#include "lite/core/kernel.h"
#include "lite/operators/conv_op.h"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/conv_pe.hpp"
#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp"
namespace paddle {
namespace lite {
namespace kernels {
......@@ -36,7 +38,8 @@ class ConvCompute
~ConvCompute() {}
private:
zynqmp::ConvPE conv_pe_;
zynqmp::DepthwiseConvPE dw_conv_pe_;
};
} // namespace fpga
......
......@@ -141,15 +141,13 @@ void conv_compute_ref(const operators::ConvParam& param) {
int group = param.groups;
int kernel_w = param.filter->dims()[2];
int kernel_h = param.filter->dims()[3];
auto paddings = *param.paddings;
auto dilations = *param.dilations;
int stride_w = param.strides[0];
int stride_h = param.strides[1];
int dila_w = dilations[0];
int dila_h = dilations[1];
int pad_w = paddings[2];
int pad_h = paddings[0];
bool flag_bias = (param.bias != nullptr);
bool flag_relu = param.fuse_relu;
......@@ -279,14 +277,10 @@ TEST(conv_fpga, compute) {
param.bias = &bias;
}
param.fuse_relu = flag_relu;
std::vector<int> paddings = {
padding, padding, padding, padding};
param.strides = std::vector<int>({stride, stride});
std::vector<int> dilations = {dilation, dilation};
param.paddings =
std::make_shared<std::vector<int>>(paddings);
param.dilations =
std::make_shared<std::vector<int>>(dilations);
param.groups = group;
conv.SetParam(param);
conv.Launch();
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class DensityPriorBoxCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::DensityPriorBoxParam;
void Run() override;
virtual ~DensityPriorBoxCompute() = default;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -16,6 +16,7 @@
#include <algorithm>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/scale_pe.hpp"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
......@@ -50,6 +51,17 @@ class ElementwiseAddActivationCompute
zynqmp::ElementwiseAddPE pe_;
};
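// Design note: elementwise_mul is lowered onto the Scale PE below, since a
// per-channel multiply is expressible as scale * x with zero bias (an
// inference from the member type, not from upstream documentation).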
class ElementwiseMulCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
virtual ~ElementwiseMulCompute() = default;
private:
zynqmp::ScalePE pe_;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -32,8 +32,6 @@ class FeedCompute
private:
zynqmp::InputPE pe_;
zynqmp::Tensor input_;
zynqmp::Tensor output_;
};
} // namespace fpga
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755