Commit d3d793c7 authored by chonwhite

YOLOV3 works

Parent a59d6fab
@@ -18,19 +18,14 @@ namespace paddle {
namespace zynqmp {

class Action {
 public:
  void readScale(float* scale) {}
  void writeScale(float* scale) {}

 private:
  int id_ = -1;
  int scaleIndex_ = -1;
};

}  // namespace zynqmp
}  // namespace paddle
\ No newline at end of file
@@ -21,20 +21,16 @@ namespace paddle {
namespace zynqmp {

class Transaction {
 public:
  void appendAction(Action* action) { actions_.push_back(action); }
  void startTraction() {}

 private:
  std::vector<Action*> actions_;
  int id_ = -1;
};

}  // namespace zynqmp
}  // namespace paddle
\ No newline at end of file
@@ -20,7 +20,7 @@ namespace paddle {
namespace zynqmp {

class TransactionManager {
 public:
  static TransactionManager& get_instance() {
    static TransactionManager s_instance;
    return s_instance;
@@ -34,14 +34,11 @@ public:
    return currentTransaction_;
  }
  void endTransaction() { currentTransaction_ = nullptr; }

 private:
  Transaction* currentTransaction_ = nullptr;
  std::vector<Transaction*> transactions_;
};

}  // namespace zynqmp
}  // namespace paddle
\ No newline at end of file
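Review note: a minimal usage sketch of the Action/Transaction/TransactionManager trio added above. Only get_instance(), appendAction(), startTraction(), and endTransaction() appear in this commit; beginTransaction() below is an assumed counterpart that allocates and stores currentTransaction_.

  // Hypothetical wiring, not part of this commit.
  void run_actions(paddle::zynqmp::Action* action) {
    auto& manager = paddle::zynqmp::TransactionManager::get_instance();
    paddle::zynqmp::Transaction* txn = manager.beginTransaction();  // assumed API
    txn->appendAction(action);  // queue work on the current transaction
    txn->startTraction();       // kick it off (method name as committed)
    manager.endTransaction();   // drop the current-transaction pointer
  }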
@@ -15,8 +15,5 @@ limitations under the License. */
#include "io.hpp"

namespace paddle {
namespace zynqmp {}  // namespace zynqmp
}  // namespace paddle
@@ -70,6 +70,10 @@ class ConvPE : public PE {
        param_.input->shape().channel() >= 2048) {
      use_cpu_ = true;
    }
    if (param_.filter->shape().width() == 1 &&
        param_.filter->shape().num() % 16 != 0) {
      use_cpu_ = true;
    }
    if (!use_cpu_) {
      // param_.filter->releaseData();
    }
@@ -93,34 +97,38 @@ class ConvPE : public PE {
    float* filter_data = param_.filter->data<float>();
    float* mi = new float[in_channel];
    int wh = input->shape().width() * input->shape().height();
    float max = 0;

    for (int i = 0; i < out_channel; i++) {
      float* filter_ptr = filter_data + i * in_channel;
      // #pragma omp parallel for
      for (int k = 0; k < wh; k++) {
        float* image = image_addr;
        float* out_ptr = mi;
        for (int j = 0; j < in_channel; j++) {
          float value = image_addr[k * in_channel + j] * filter_ptr[j];
          mi[j] = value;
        }

        float sum = 0;
        for (int j = 0; j < in_channel; j++) {
          sum += mi[j];
        }
        sum *= param_.scale()->data<float>()[i];
        sum += param_.bias()->data<float>()[i];
        out[i * wh + k] = sum;
        max = std::max(max, std::abs(sum));
      }
    }
    delete[] mi;
    float_output.flush();
    output->copyFrom(&float_output);
    output->scale()[0] = max / 127.0;
    output->scale()[1] = 127.0 / max;
    // output->saveToFile("cpu", true);
  }

  bool dispatch() {
@@ -206,7 +214,6 @@ class ConvPE : public PE {
        // std::cout << "\n ================== EW ================== \n";
        // }
    }
    return ret == 0;
  }
......
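Review note: the CPU fallback above now tags its output with a dynamic int8 quantization scale (scale()[0] = max/127, scale()[1] = 127/max). A self-contained sketch of that convention, with illustrative names only:

  #include <algorithm>
  #include <cmath>
  #include <cstdint>

  // scale0 = max_abs / 127 maps int8 back to float;
  // scale1 = 127 / max_abs maps float to int8.
  int8_t quantize(float v, float max_abs) {
    float q = v * (127.0f / max_abs);
    q = std::max(-127.0f, std::min(127.0f, q));  // clamp to the int8 range
    return static_cast<int8_t>(std::lround(q));
  }

  float dequantize(int8_t q, float max_abs) { return q * (max_abs / 127.0f); }

Note that an all-zero output (max == 0) would make 127.0 / max divide by zero; the kernel does not guard against that case.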
@@ -262,7 +262,7 @@ bool PriorBoxPE::dispatch() {
    param_.outputBoxes->copyFrom(this->cachedBoxes_);
    param_.outputVariances->copyFrom(this->cachedVariances_);
    param_.outputBoxes->flush();
    // param_.outputBoxes->syncToCPU();
    param_.outputVariances->flush();
......
@@ -84,28 +84,26 @@ class ResizePE : public PE {
    param_.input->syncToCPU();
    for (int h = 0; h < in_height; h++) {
      for (int w = 0; w < in_width; w++) {
        int src_index = in_width * channel * h + w * channel;
        float16* src = param_.input->data<float16>() + src_index;
        // std::cout << "src_index:" << src_index << std::endl;
        for (int v = 0; v < factor; v++) {
          for (int i = 0; i < factor; i++) {
            int dst_index = out_width * channel * h * factor +
                            out_width * channel * v + w * channel * factor +
                            channel * i;
            float16* dst = param_.output->data<float16>() + dst_index;
            memcpy(dst, src, channel * sizeof(float16));
            // std::cout << "dst_index:" << dst_index << std::endl;
          }
        }
      }
    }
    param_.output->flush();
    param_.output->copyScaleFrom(param_.input);
  }

  bool dispatch() {
    cpu_compute();
......
@@ -158,8 +158,9 @@ class ScalePE : public PE {
        int index = i * input->shape().channel() + c;
        float x = image_addr[index];
        float y = half_to_float(scale_data[c]);
        float value = x * y;
        // std::cout << " x = " << std::to_string(x) << " y = " <<
        // std::to_string(y) << " v = " << std::to_string(value) << std::endl;
        // float value = half_to_float(in_data[index]) * 19.3598f;
        data_out[index] = float_to_half(value);
@@ -188,9 +189,9 @@ class ScalePE : public PE {
    // dw_param.quantizedFilter()->flush();
    // }
    // param_.input->syncToDevice();
    return dw_pe_.dispatch();
    // cpu_compute();
    // return true;
  }
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <algorithm>
#include <cmath>
#include <cstring>
#include <iostream>

#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"

namespace paddle {
namespace zynqmp {

inline float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }
inline void GetYoloBox(float* box, const float* x, const int* anchors, int w,
                       int h, int an_idx, int grid_size, int input_size,
                       int index, int img_height, int img_width) {
  box[0] = (w + sigmoid(x[index])) * img_width * 1.0f / grid_size;
  box[1] = (h + sigmoid(x[index + 1])) * img_height * 1.0f / grid_size;
  box[2] = std::exp(x[index + 2]) * anchors[2 * an_idx] * img_width * 1.0f /
           input_size;
  box[3] = std::exp(x[index + 3]) * anchors[2 * an_idx + 1] * img_height *
           1.0f / input_size;
}
inline int GetEntryIndex(int batch, int an_idx, int hw_idx,
int an_num, int an_stride, int stride,
int entry) {
return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
}
inline void CalcDetectionBox(float* boxes, float* box, const int box_idx,
const int img_height,
const int img_width) {
boxes[box_idx] = box[0] - box[2] / 2;
boxes[box_idx + 1] = box[1] - box[3] / 2;
boxes[box_idx + 2] = box[0] + box[2] / 2;
boxes[box_idx + 3] = box[1] + box[3] / 2;
boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : 0;
  boxes[box_idx + 1] = boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : 0;
  boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 ? boxes[box_idx + 2]
                                                          : (img_width - 1);
  boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1
                           ? boxes[box_idx + 3]
                           : (img_height - 1);
}
inline void CalcLabelScore(float* scores, const float* input,
const int label_idx, const int score_idx,
const int class_num, const float conf) {
for (int i = 0; i < class_num; i++) {
scores[score_idx + i] = conf * sigmoid(input[label_idx + i]);
// std::cout << scores[score_idx + i] << " ";
}
// std::cout << std::endl;
}
class YoloBoxPE : public PE {
public:
bool init() {
param_.outputBoxes->setAligned(false);
param_.outputScores->setAligned(false);
param_.outputBoxes->setDataLocation(CPU);
param_.outputScores->setDataLocation(CPU);
return true;
}
bool dispatch() {
auto* input = param_.input;
auto* imgsize = param_.imgSize;
auto* boxes = param_.outputBoxes;
auto* scores = param_.outputScores;
auto anchors = param_.anchors;
int class_num = param_.classNum;
float conf_thresh = param_.confThresh;
int downsample_ratio = param_.downsampleRatio;
const int num = input->shape().num();
const int height = input->shape().height();
const int width = input->shape().width();
const int box_num = boxes->shape().channel();
const int an_num = anchors.size() / 2;
int input_size = downsample_ratio * height;
const int stride = height * width;
const int an_stride = (class_num + 5) * stride;
Tensor anchors_;
Shape anchors_shape(N, {an_num * 2});
auto anchors_data = anchors_.mutableData<int32_t>(INT32, anchors_shape);
std::copy(anchors.begin(), anchors.end(), anchors_data);
input->syncToCPU();
input->unalignImage();
// input->setAligned(false);
Tensor input_float;
input_float.setDataLocation(CPU);
float* input_data = input_float.mutableData<float>(FP32, input->shape());
input_float.copyFrom(input);
// input_float.saveToFile("input_yolobox_half", "true");
// input_float.setAligned(input->aligned());
// input_float.unalignImage();
// std::cout << "-------------unalignImage-----------------" << std::endl;
// for (int i = 0; i < input_float.shape().numel(); ++i)
// {
// std::cout << input_data[i] << " ";
// }
// std::cout << "-" << std::endl;
// std::cout << "-------------unalignImage-----------------" << std::endl;
// input_float.setAligned(false);
// input_float.saveToFile("input_yolobox_float", "true");
// input_float.syncToCPU();
// input_float.invalidate();
imgsize->saveToFile("img_size", true);
const int32_t* imgsize_data = imgsize->data<int32_t>();
  Tensor boxes_float;
  Tensor scores_float;
  boxes_float.setDataLocation(CPU);
  float* boxes_float_data =
      boxes_float.mutableData<float>(FP32, boxes->shape());
  memset(boxes_float_data, 0, boxes->shape().numel() * sizeof(float));
  scores_float.setDataLocation(CPU);
  float* scores_float_data =
      scores_float.mutableData<float>(FP32, scores->shape());
  memset(scores_float_data, 0, scores->shape().numel() * sizeof(float));
// float* boxes_data = boxes->mutableData<float>();
// memset(boxes_data, 0, boxes->shape().numel() * sizeof(float));
// float* scores_data = scores->mutableData<float>();
// memset(scores_data, 0, scores->shape().numel() * sizeof(float));
float box[4];
// for (int n = 0; n < num; n++) {
// int img_height = imgsize_data[2 * i];
// int img_width = imgsize_data[2 * i + 1];
int img_height = imgsize_data[0];
int img_width = imgsize_data[1];
std::cout << "YoloBoxPE imgsize:" << img_height << "," << img_width << std::endl;
int channel = input_float.shape().channel();
int count = 0;
  for (int h = 0; h < height; h++) {
    for (int w = 0; w < width; w++) {
      for (int n = 0; n < an_num; n++) {
        int obj_idx =
            channel * width * h + channel * w + n * (5 + class_num) + 4;
        // std::cout << obj_idx << " ";
        float conf = sigmoid(input_data[obj_idx]);
        if (conf < conf_thresh) {
          count++;
          continue;
        }
        int box_idx =
            channel * width * h + channel * w + n * (5 + class_num) + 0;
        GetYoloBox(box, input_data, anchors_data, w, h, n, height, input_size,
                   box_idx, img_height, img_width);
        box_idx = h * an_num * 4 * width + an_num * 4 * w + n * 4;
        CalcDetectionBox(boxes_float_data, box, box_idx, img_height,
                         img_width);
        int label_idx =
            channel * width * h + channel * w + n * (5 + class_num) + 5;
        int score_idx = h * an_num * class_num * width +
                        an_num * class_num * w + n * class_num;
        CalcLabelScore(scores_float_data, input_data, label_idx, score_idx,
                       class_num, conf);
      }
    }
  }
  boxes->copyFrom(&boxes_float);
  scores->copyFrom(&scores_float);
  input->setAligned(true);
  return true;
  }

  void apply() {}
YoloBoxParam& param() { return param_; }
private:
YoloBoxParam param_;
};
} // namespace zynqmp
} // namespace paddle
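Review note: GetEntryIndex is defined but unused; dispatch() indexes the NHWC input inline instead. A small self-contained check of that offset math, with made-up sizes:

  #include <cassert>

  void nhwc_index_example() {
    // For cell (h, w) and anchor n, the objectness logit sits at channel
    // offset n * (5 + class_num) + 4 within that pixel.
    int class_num = 80, an_num = 3, width = 13;
    int channel = an_num * (5 + class_num);
    int h = 2, w = 5, n = 1;
    int obj_idx = channel * width * h + channel * w + n * (5 + class_num) + 4;
    // Same thing grouped per pixel: (row-major pixel index) * channel + offset.
    assert(obj_idx == (h * width + w) * channel + n * (5 + class_num) + 4);
  }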
@@ -266,23 +266,21 @@ class Tensor {
      return;
    }
    BypassArgs args;
    args.input_data_type =
        src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
    args.output_data_type = dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
    args.input_layout_type = LAYOUT_HWC;
    args.output_layout_type = LAYOUT_HWC;
    args.image = {.address = src->data<void>(),
                  .scale_address = src->scale(),
                  .channels = (uint32_t)src->shape().numel(),
                  .width = 1,
                  .height = 1,
                  .pad_width = 0U,
                  .pad_height = 0U};

    ImageOutputArgs output = {
        .address = data<void>(), .scale_address = scale(),
    };
    args.output = output;
@@ -385,10 +383,11 @@ class Tensor {
  void save_file_with_name(std::string path) {
    // std::cout << "saving file: " << path << std::endl;
    void* add = (void*)this;
    // printf("tensor @: %p data: %p \n", (void*)add, (void*)data<void>());
    // return;
    std::ofstream ofs;
    ofs.open(path);

    ofs << "data type: " << dataType() << std::endl;
    ofs << scale()[0] << " / " << scale()[1] << std::endl;
    for (int i = 0; i < shape_->numel(); i++) {
@@ -406,13 +405,14 @@ class Tensor {
      if (dataType_ == INT32) {
        value = data<int32_t>()[i];
      }
      if (i < 10) {
        std::cout << value << ",";
      }
      // if (i > 1000) {
      //   break;
      // }
      ofs << value << std::endl;
    }
    usleep(30000);
    ofs.close();
@@ -465,7 +465,6 @@ class Tensor {
      value = half_to_float(tensor.data<float16>()[i]);
    }
    os << value << " ";
  }
  os << "\n";
  return os;
......
@@ -166,6 +166,9 @@ class TensorLite {
  void clear() {
    // zynq_tensor_->releaseData();
    if (zynq_tensor_) {
      memset(zynq_tensor_->data<void>(), 0, zynq_tensor_->memorySize());
    }
  }

  template <typename T>
......
@@ -105,7 +105,6 @@ class ChannelWiseDequantOpFuser : public FuseBase {
 */
class DeleteQuantDequantOpFuser : public FuseBase {
 public:
  void BuildPattern() override;
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
......
@@ -84,13 +84,13 @@ class KernelPlaceCorrectPass : public DebugPass {
      std::string node_name = out->AsArg().name;
      std::string arg_name = get_argname(node_name, inst.op_info()->outputs());
      auto op_type = inst.op_type();

      if (op_type == "reshape" || op_type == "reshape2") {
        for (auto* x_in : x->inlinks) {
          std::string in_name =
              get_argname(x_in->AsArg().name, inst.op_info()->inputs());
          // std::cout << "name: " << x_in->AsArg().name << std::endl;
          // std::cout << "in_name: " << in_name << std::endl;
          if (in_name == "X") {
@@ -101,9 +101,11 @@ class KernelPlaceCorrectPass : public DebugPass {
          }
          p = in->AsArg().type->precision();
          if (p != PrecisionType::kFP16) {
            // std::cout << "found an arm ............... : " <<
            // inst.kernels().size() << std::endl;
            // std::cout << "tt:" << TargetRepr(inst.kernels()[0]->target()) <<
            // std::endl;
            UpdateTarget(inst, TargetType::kHost);
            UpdateTensor(inst, in, out, TargetType::kHost);
          }
@@ -113,8 +115,9 @@ class KernelPlaceCorrectPass : public DebugPass {
        UpdateTarget(inst, TargetType::kFPGA);
      }

      if (inst.op_type() == "split" || inst.op_type() == "transpose" ||
          inst.op_type() == "transpose2") {
        if (p != PrecisionType::kFP16) {
          UpdateTarget(inst, TargetType::kARM);
          for (auto* x_out : x->outlinks) {
            UpdateTensor(inst, in, x_out, TargetType::kARM);
@@ -123,9 +126,12 @@ class KernelPlaceCorrectPass : public DebugPass {
        }
      }

      if (inst.op_type() == "concat") {
        std::cout << "concat target:" << TargetRepr(inst.kernels()[0]->target())
                  << std::endl;
        std::cout << "concat p:"
                  << PrecisionToStr(inst.kernels()[0]->precision())
                  << std::endl;
        if (p != PrecisionType::kFP16) {
          UpdateTarget(inst, TargetType::kARM);
          UpdateTensor(inst, in, out, TargetType::kARM);
        }
@@ -134,8 +140,9 @@ class KernelPlaceCorrectPass : public DebugPass {
      // if (inst.op_type() == "elementwise_mul") {
      //   for (auto* x_in : x->inlinks) {
      //     std::string in_name = get_argname(x_in->AsArg().name,
      //     inst.op_info()->inputs());
      //     std::cout << "name: " << x_in->AsArg().name << std::endl;
      //     std::cout << "in_name: " << in_name << std::endl;
      //     if (in_name == "Y") {
@@ -150,7 +157,6 @@ class KernelPlaceCorrectPass : public DebugPass {
      //       UpdateTensor(inst, in, out, TargetType::kARM);
      //     }
      //   }

      std::vector<TargetType> in_types;
      std::vector<TargetType> out_types;
@@ -164,11 +170,13 @@ class KernelPlaceCorrectPass : public DebugPass {
        auto type = inst.picked_kernel().GetInputDeclType(arg_name);
        // std::cout << arg_name << " is weight:: "
        //           << std::to_string(x_in->AsArg().is_weight)
        //           << " is persist: "
        //           << std::to_string(x_in->AsArg().is_persist) << std::endl;
        // std::cout << " type: " << inst.op_type() << std::endl;
        if (!x_in->AsArg().is_weight) {
          auto p = x_in->AsArg().type->precision();
          auto t = x_in->AsArg().type->target();
@@ -224,10 +232,10 @@ class KernelPlaceCorrectPass : public DebugPass {
    }
  }

  // Update me's kUnk fields by other's fields.
  void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) {  // NOLINT
    // std::cout << "1 kernels: " << std::to_string(inst.kernels().size())
    //           << std::endl;
    auto new_place = inst.place();
    new_place.target = new_target;
@@ -244,25 +252,30 @@ class KernelPlaceCorrectPass : public DebugPass {
    std::vector<Place> places;
    places.push_back(new_place);
    inst.ResetKernels(places);
    // std::cout << "2 kernels: " << std::to_string(inst.kernels().size())
    //           << std::endl;
  }

  void UpdateTensor(mir::Node::Stmt& inst,
                    Node* in,
                    Node* out,
                    TargetType new_target = TargetType::kUnk) {
    auto get_argname = [&](
        const std::string& node_name,
        const std::map<std::string, std::vector<std::string>>& argname_map)
        -> std::string {
      for (auto& ele : argname_map) {
        auto it = std::find(ele.second.begin(), ele.second.end(), node_name);
        if (it != ele.second.end()) return ele.first;
      }
      return "";
    };

    std::string arg_name =
        get_argname(out->AsArg().name, inst.op_info()->outputs());
    std::string in_name =
        get_argname(in->AsArg().name, inst.op_info()->inputs());

    auto type = inst.picked_kernel().GetInputDeclType(in_name);
    auto tmp_ptype = in->AsArg().type->precision();
@@ -281,7 +294,8 @@ class KernelPlaceCorrectPass : public DebugPass {
      tmp_layout = DataLayoutType::kNCHW;
    }
    out->AsArg().type =
        LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout);
  }
};
......
@@ -80,6 +80,8 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
    std::sort(scored.begin(), scored.end(), KernelScoreCmp);
    instruct.kernels().clear();

    VLOG(2) << "picking kernel " << scored.front().second->name() << "\n\n";

    if (!instruct.op_info()->HasAttr("enable_int8")) {
      // Move kernel back
      // Just keep a single best kernel.
......
@@ -157,7 +157,21 @@ class StaticKernelPickPass : public mir::StmtPass {
          }
        }
        if (in_match) {
          final_score += 1000;
        }
bool out_match = true;
for (size_t i = 0; i < out_names.size(); ++i) {
std::string tmp;
CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp));
if (out_types.count(out_names[i]) &&
out_types.at(out_names[i]) !=
kernel.GetOutputDeclType(tmp)->precision()) {
out_match = false;
}
}
if (out_match) {
final_score += 1000;
        }
      }
......
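Review note: the scoring change above replaces the old "in_match forces final_score = 5000" rule with additive bonuses, so a kernel whose inputs and outputs both match the declared precisions now outranks one that matches only inputs. A toy sketch of the effect (illustrative numbers only):

  int score_kernel(int base_score, bool in_match, bool out_match) {
    int final_score = base_score;
    if (in_match) final_score += 1000;   // input precisions match declarations
    if (out_match) final_score += 1000;  // output precisions match declarations
    return final_score;
  }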
@@ -53,7 +53,6 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
      ComplementInputs(graph.get(), node, in, &copied_nodes);
    }
  }
}

void TypeTargetTransformPass::ComplementInputs(
@@ -74,6 +73,7 @@ void TypeTargetTransformPass::ComplementInputs(
  auto in_arg_name = in->AsArg().name;
  std::string tmp;
  CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp));
  VLOG(4) << "in_arg_name: " << in_arg_name << " tmp:" << tmp;
  auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp);
  CHECK(in->AsArg().type);
  if (!TargetCompatibleTo(*in->AsArg().type, *decl_arg_type)) {
......
@@ -141,6 +141,7 @@ class VariablePlaceInferencePass : public DebugPass {
          x_in->AsArg().type = type;
        } else {
          PrecisionType tmp_ptype = x_in->AsArg().type->precision();
          VLOG(4) << "tmp_ptype:" << PrecisionToStr(tmp_ptype);
          x_in->AsArg().type = LiteType::GetTensorTy(
              type->target(), tmp_ptype, type->layout());
        }
@@ -172,6 +173,9 @@ class VariablePlaceInferencePass : public DebugPass {
          x_out->AsArg().type = type;
        } else {
          PrecisionType tmp_ptype = x_out->AsArg().type->precision();
          tmp_ptype = type->precision();
          // inst.picked_kernel().precision();
          VLOG(4) << "tmp_ptype:" << PrecisionToStr(tmp_ptype);
          x_out->AsArg().type = LiteType::GetTensorTy(
              type->target(), tmp_ptype, type->layout());
        }
......
@@ -134,7 +134,6 @@ class Optimizer {
         "mlu_postprocess_pass"}};

    if (passes.size() == 1) {
      // multi_stream_analysis_pass must be in the front of
      // runtime_context_assign_pass
......
@@ -42,6 +42,8 @@ add_kernel(layout_compute_fpga FPGA basic SRCS layout_compute.cc DEPS ${fpga_deps})
add_kernel(feed_compute_fpga FPGA basic SRCS feed_compute.cc DEPS ${fpga_deps})
add_kernel(fetch_compute_fpga FPGA basic SRCS fetch_compute.cc DEPS ${fpga_deps})
add_kernel(yolo_box_compute_fpga FPGA basic SRCS yolo_box_compute.cc DEPS ${fpga_deps})

# add_kernel(while_compute_fpga FPGA extra SRCS while_compute.cc DEPS ${fpga_deps})
# add_kernel(write_to_array_compute_fpga FPGA extra SRCS write_to_array_compute.cc DEPS ${fpga_deps})
......
@@ -35,6 +35,28 @@ void ReluCompute::PrepareForRun() {
void ReluCompute::Run() { pe_.dispatch(); }
void SigmoidCompute::Run() {
// TODO(chonwhite) use fpga and arm implementation;
auto& param = this->Param<param_t>();
auto output_data = param.Out->mutable_data<float16>();
int numel = param.Out->numel();
float16* in_data = param.X->ZynqTensor()->data<float16>();
float16* out_data = param.Out->ZynqTensor()->data<float16>();
param.X->ZynqTensor()->syncToCPU();
float max = 0.0f;
  for (int i = 0; i < numel; i++) {
    float value = zynqmp::half_to_float(in_data[i]);
    value = 1.0f / (1.0f + std::exp(-value));
    out_data[i] = zynqmp::float_to_half(value);
    max = std::max(std::abs(value), max);
  }
param.Out->ZynqTensor()->scale()[0] = max / 127.0;
param.Out->ZynqTensor()->scale()[1] = 127.0 / max;
param.Out->ZynqTensor()->flush();
}
}  // namespace fpga
}  // namespace kernels
}  // namespace lite
@@ -51,3 +73,19 @@ REGISTER_LITE_KERNEL(
        PRECISION(kFP16),
        DATALAYOUT(kNHWC))})
    .Finalize();
REGISTER_LITE_KERNEL(sigmoid,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::SigmoidCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
@@ -49,6 +49,16 @@ class ReluCompute
  zynqmp::Tensor output_;
};
class SigmoidCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~SigmoidCompute() = default;
};
}  // namespace fpga
}  // namespace kernels
}  // namespace lite
......
@@ -49,7 +49,7 @@ void CalibComputeFloat2Int::Run() {
  const auto* din = param.input->data<float>();
  auto* dout = param.output->mutable_data<int>();
  // param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
  // TODO
  auto out_lod = param.output->mutable_lod();
  *out_lod = param.input->lod();
  return;
......
@@ -45,11 +45,11 @@ void ConcatCompute::PrepareForRun() {
void ConcatCompute::Run() {
  pe_.dispatch();

  // #ifdef FPGA_PRINT_TENSOR
  zynqmp::ConcatParam& concat_param = pe_.param();
  concat_param.output->flush();
  Debugger::get_instance().registerOutput("concat", concat_param.output);
  // #endif
}

}  // namespace fpga
......
@@ -53,7 +53,8 @@ void ConvCompute::PrepareForRun() {
    if (param.activation_param.Leaky_relu_alpha > 0.001) {
      conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU;
      conv_param.activeParam.leaky_relu_factor =
          param.activation_param.Leaky_relu_alpha;
    }

    dw_conv_pe_.init();
@@ -79,13 +80,15 @@ void ConvCompute::PrepareForRun() {
    if (param.activation_param.Leaky_relu_alpha > 0.001) {
      conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU;
      conv_param.activeParam.leaky_relu_factor =
          param.activation_param.Leaky_relu_alpha;
    }

    conv_pe_.init();
    conv_pe_.apply();
  }
  // std::cout << "Leaky_relu_alpha:" << param.activation_param.Leaky_relu_alpha
  //           << std::endl;
}

void ConvCompute::Run() {
......
@@ -96,11 +96,12 @@ void ElementwiseMulCompute::PrepareForRun() {
    scale_value = param.Y->data<zynqmp::float16>()[0];
    // std::cout << "FP16 \n";
  }

  // std::cout << "channel:" << channel << std::endl;
  // std::cout << "production:" << param.Y->dims().production() << std::endl;
  // std::cout << "scale_value:"
  //           << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
  // exit(-1);

  for (int i = 0; i < channel; i++) {
@@ -112,7 +113,8 @@ void ElementwiseMulCompute::PrepareForRun() {
        scale_value = param.Y->data<zynqmp::float16>()[i];
      }
    }
    // std::cout << "scale_value:"
    //           << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
    // exit(-1);
    scale_data[i] = scale_value;
    bias_data[i] = zero_;
@@ -128,13 +130,13 @@ void ElementwiseMulCompute::Run() {
  if (!param.Y->persistable()) {
    // TODO
    scale_.copyFrom(param.Y->ZynqTensor());
    scale_.flush();  // TODO
  }
  pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
  zynqmp::ScaleParam& scale_param = pe_.param();
  // Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input);
  // Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
#endif
}
@@ -214,8 +216,7 @@ REGISTER_LITE_KERNEL(elementwise_mul,
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
......
@@ -28,7 +28,14 @@ void FeedCompute::PrepareForRun() {
  auto& param = this->Param<param_t>();
  Tensor& x = param.feed_list->at(param.col);
  param.out->Resize(x.dims());
  auto in_type = x.ZynqTensor()->dataType();
  if (in_type == zynqmp::FP32 || in_type == zynqmp::FP16) {
    param.out->mutable_data<float16>();
  }
  if (in_type == zynqmp::INT32) {
    param.out->mutable_data<int32_t>();
  }
  // ====================================================
  zynqmp::InputParam& feed_param = pe_.param();
  feed_param.input = x.ZynqTensor();
@@ -68,12 +75,18 @@ REGISTER_LITE_KERNEL(
        DATALAYOUT(kNHWC))})
    .Finalize();

REGISTER_LITE_KERNEL(feed,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::FeedCompute,
                     feed_int32)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kHost),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kAny))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kHost),
                                       PRECISION(kInt32),
                                       DATALAYOUT(kNCHW))})
    .Finalize();
\ No newline at end of file
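Review note: the previously commented-out host feed kernel is replaced by a live feed_int32 variant, and PrepareForRun() now chooses the output type from the input tensor (float16 for FP32/FP16 feeds, int32_t for INT32 feeds), presumably so integer inputs such as yolo_box's ImgSize can be fed without an FP16 round-trip.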
@@ -49,7 +49,6 @@ void BilinearInterpCompute::Run() {
  //                interp_method);
}

void nearest_interp(const float16* src,
                    int w_in,
                    int h_in,
@@ -85,7 +84,7 @@ void nearest_interp(const float16* src,
    int near_y = static_cast<int>(scale_h_new * h);
    for (int w = 0; w < w_out; ++w) {
      int near_x = static_cast<int>(scale_w_new * w);
      const float16* src_n = src + (near_y * w_in + near_x) * c;
      memcpy(dst_p, src_n, c * sizeof(float16));
      dst_p += c;
@@ -133,8 +132,6 @@ inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
  return vec_new_data;
}

void interpolate(lite::Tensor* X,
                 lite::Tensor* OutSize,
                 std::vector<const lite::Tensor*> SizeTensor,
@@ -188,19 +185,18 @@ void interpolate(lite::Tensor* X,
  int spatial_in = in_h * in_w;
  int spatial_out = out_h * out_w;

  for (int i = 0; i < count; ++i) {
    nearest_interp(din + spatial_in * i,
                   in_w,
                   in_h,
                   out_c,
                   dout + spatial_out * i,
                   out_w,
                   out_h,
                   1.f / width_scale,
                   1.f / height_scale,
                   with_align);
  }
}

void NearestInterpCompute::Run() {
@@ -215,27 +211,24 @@ void NearestInterpCompute::Run() {
  int out_h = param.out_h;
  bool align_corners = param.align_corners;
  std::string interp_method = "";
  X->ZynqTensor()->invalidate();  // TODO
  X->ZynqTensor()->saveToFile("n_in", true);
  interpolate(X,
              OutSize,
              SizeTensor,
              Scale,
              Out,
              out_h,
              out_w,
              scale,
              align_corners,
              interp_method);
  Out->ZynqTensor()->flush();
  Out->ZynqTensor()->copyScaleFrom(X->ZynqTensor());
  Out->ZynqTensor()->saveToFile("n_out", true);
}

} /* namespace fpga */
@@ -249,15 +242,17 @@ REGISTER_LITE_KERNEL(bilinear_interp,
                     kNHWC,
                     paddle::lite::kernels::fpga::BilinearInterpCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindInput("OutSize",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindInput("SizeTensor",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .Finalize();
@@ -268,15 +263,17 @@ REGISTER_LITE_KERNEL(nearest_interp,
                     kNHWC,
                     paddle::lite::kernels::fpga::NearestInterpCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindInput("OutSize",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindInput("SizeTensor",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .Finalize();
@@ -14,9 +14,9 @@
#pragma once

#include "lite/backends/fpga/KD/pes/resize_pe.hpp"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
@@ -34,12 +34,12 @@ class BilinearInterpCompute
class NearestInterpCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  void PrepareForRun() override;
  void Run() override;
  virtual ~NearestInterpCompute() = default;

 private:
  zynqmp::ResizePE pe_;
};
......
@@ -44,8 +44,6 @@ class IoCopyHostCHWToFpgaHWCCompute
          param.x->target() == TARGET(kFPGA));
    param.x->ZynqTensor()->flush();
    if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) {
      param.y->mutable_data<int>();
      param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
@@ -86,7 +84,7 @@ class IoCopyFpgaToHostCompute
    auto& param = Param<operators::IoCopyParam>();
    CHECK(param.x->target() == TARGET(kHost) ||
          param.x->target() == TARGET(kFPGA));
    param.x->ZynqTensor()->syncToDevice();
    param.y->mutable_data<float>();
    param.y->ZynqTensor()->setDataType(zynqmp::FP32);
@@ -104,7 +102,7 @@ class IoCopyFpgaToHostCompute
    } else {
      param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
    }
    param.y->ZynqTensor()->invalidate();
    copy_properties(param);
  }
@@ -141,16 +139,22 @@ class IoCopyFpgaToHostCHWCompute
    CHECK(param.x->target() == TARGET(kHost) ||
          param.x->target() == TARGET(kFPGA));

    param.x->ZynqTensor()->syncToDevice();
    if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) {
      param.y->mutable_data<int32_t>();
      param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
      return;
    }

    Tensor hwc;
    hwc.Resize(param.y->dims());
    float* hwc_data = hwc.mutable_data<float>();
    float* chw_data = param.y->mutable_data<float>();
    param.y->ZynqTensor()->setDataType(zynqmp::FP32);

    hwc.ZynqTensor()->setDataLocation(zynqmp::CPU);
    param.y->ZynqTensor()->setDataLocation(zynqmp::CPU);

    if (param.x->ZynqTensor()->aligned() &&
        param.x->ZynqTensor()->shape().shouldAlign()) {
      zynqmp::Tensor tempTensor;
@@ -158,15 +162,15 @@ class IoCopyFpgaToHostCHWCompute
                         param.x->ZynqTensor()->shape());
      tempTensor.copyFrom(param.x->ZynqTensor());
      tempTensor.setAligned(true);
      tempTensor.saveToFile("temp_1", true);
      // tempTensor.unalignImage();
      tempTensor.saveToFile("temp_2", true);
      hwc.ZynqTensor()->copyFrom(&tempTensor);
    } else {
      // hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
      float16* in_data = param.x->ZynqTensor()->data<float16>();
      // float* f_data =
      param.x->ZynqTensor()->flush();
      float max = 0;
@@ -198,6 +202,7 @@ class IoCopyFpgaToHostCHWCompute
                 dims.height(),
                 dims.width());

    param.y->ZynqTensor()->copyFrom(hwc.ZynqTensor());
    // param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
    param.y->ZynqTensor()->flush();
    copy_properties(param);
@@ -205,8 +210,8 @@ class IoCopyFpgaToHostCHWCompute
    param.x->ZynqTensor()->invalidate();
    param.x->ZynqTensor()->flush();
    // hwc.ZynqTensor()->saveToFile("hwc", true);
    param.x->ZynqTensor()->saveToFile("io2_x", true);
    param.y->ZynqTensor()->saveToFile("io2_y", true);
  }
  std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
@@ -238,15 +243,16 @@ REGISTER_LITE_KERNEL(io_copy,
                     kAny,
                     paddle::lite::kernels::fpga::IoCopyHostCHWToFpgaHWCCompute,
                     host_float_chw_to_device_fp16_hwc)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kHost),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kNCHW))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .Finalize();

REGISTER_LITE_KERNEL(io_copy,
                     kFPGA,
                     kAny,
@@ -311,25 +317,24 @@ REGISTER_LITE_KERNEL(io_copy,
//                                        DATALAYOUT(kAny))})
//     .Finalize();

// ==========================================================

// std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
//   std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
//   *res = [](const std::map<std::string, const Type*>& inputs,
//             const std::string& out) -> const Type* {
//     CHECK(!inputs.empty());
//     auto* type = inputs.at("Input");
//     CHECK(type->target() == TARGET(kHost));
//     auto out_place = type->place();
//     out_place.target = TARGET(kFPGA);
//     auto* out_type = Type::Get(type->id(),
//                                out_place.target,
//                                out_place.precision,
//                                out_place.layout,
//                                out_place.device);
//     return out_type;
//   };
//   return res;
// }
\ No newline at end of file
@@ -132,4 +132,3 @@ REGISTER_LITE_KERNEL(prior_box,
    .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
@@ -23,7 +23,6 @@ namespace fpga {
using float16 = zynqmp::float16;

void FlattenCompute::Run() {
  auto& param = Param<operators::ReshapeParam>();
  auto x = param.x;
@@ -45,12 +44,10 @@ void FlattenCompute::Run() {
  output->Resize(output_dims);
#ifdef FPGA_PRINT_TENSOR
  Debugger::get_instance().registerOutput("flatten", output->ZynqTensor());
#endif
}

void ReshapeCompute::Run() {
  auto& param = Param<operators::ReshapeParam>();
  auto x = param.x;
@@ -69,17 +66,14 @@ void ReshapeCompute::Run() {
  } else {
    // output->CopyDataFrom(*x);
  }
  output->ZynqTensor()->copyFrom(x->ZynqTensor());
  // output->ZynqTensor()->saveToFile("ro", true);
  output->ZynqTensor()->flush();
  output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned());
#ifdef FPGA_PRINT_TENSOR
  Debugger::get_instance().registerOutput("reshape", output->ZynqTensor());
#endif
}
@@ -163,7 +157,7 @@ REGISTER_LITE_KERNEL(flatten2,
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindInput("Shape",
               {LiteType::GetTensorTy(TARGET(kHost),
                                      PRECISION(kAny),
                                      DATALAYOUT(kAny))})
    .BindOutput("Out",
......
@@ -38,9 +38,9 @@ void SoftmaxCompute::Run() {
  zynqmp::SoftmaxParam& softmax_param = pe_.param();
  // softmax_param.input->saveToFile("softmax_in", true);
  pe_.dispatch();
  softmax_param.output->flush();
  // softmax_param.output->saveToFile("softmax", true);
#ifdef FPGA_PRINT_TENSOR
  Debugger::get_instance().registerOutput("softmax", softmax_param.output);
#endif
@@ -61,17 +61,9 @@ REGISTER_LITE_KERNEL(softmax,
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
// .BindOutput("Out",
//             {LiteType::GetTensorTy(TARGET(kFPGA),
//                                    PRECISION(kFP16),
......
@@ -104,7 +104,7 @@ void Transpose2Compute::Run() {
  // param.x->ZynqTensor()->saveToFile("t_unaligned", true);
  param.x->ZynqTensor()->flush();
  param.x->ZynqTensor()->invalidate();

  if (param.x->dims().size() != 4) {
    transposeCompute(param);
    param.output->ZynqTensor()->setAligned(param.x->ZynqTensor()->aligned());
@@ -115,7 +115,7 @@ void Transpose2Compute::Run() {
  // param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
  param.output->ZynqTensor()->flush();
  param.output->ZynqTensor()->saveToFile("Transpose2", true);
}

}  // namespace fpga
@@ -151,8 +151,9 @@ REGISTER_LITE_KERNEL(transpose2,
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/yolo_box_compute.h"
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
void YoloBoxCompute::PrepareForRun() {
auto& param = Param<operators::YoloBoxParam>();
lite::Tensor* X = param.X;
lite::Tensor* ImgSize = param.ImgSize;
lite::Tensor* Boxes = param.Boxes;
lite::Tensor* Scores = param.Scores;
Boxes->mutable_data<float>();
Scores->mutable_data<float>();
zynqmp::YoloBoxParam& yolobox_param = pe_.param();
yolobox_param.input = X->ZynqTensor();
yolobox_param.imgSize = ImgSize->ZynqTensor();
yolobox_param.outputBoxes = Boxes->ZynqTensor();
yolobox_param.outputScores = Scores->ZynqTensor();
yolobox_param.downsampleRatio = param.downsample_ratio;
yolobox_param.anchors = param.anchors;
yolobox_param.classNum = param.class_num;
yolobox_param.confThresh = param.conf_thresh;
pe_.init();
pe_.apply();
}
void YoloBoxCompute::Run() {
pe_.dispatch();
zynqmp::YoloBoxParam& yolobox_param = pe_.param();
yolobox_param.imgSize->saveToFile("img_size", true);
// exit(-1);
yolobox_param.outputBoxes->saveToFile("yolo_boxes", true);
yolobox_param.outputScores->saveToFile("yolo_scores", true);
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
// REGISTER_LITE_KERNEL(yolo_box,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::fpga::YoloBoxCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindInput("ImgSize",
// {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
// .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
// .BindOutput("Scores", {LiteType::GetTensorTy(TARGET(kARM))})
// .Finalize();
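Review note: yolo_box_compute.cc is added to the build (see the add_kernel line in the CMake hunk above), but this registration block is left commented out, so the FPGA yolo_box kernel is not yet selectable; the YoloBoxPE path appears to still be under validation via the saveToFile dumps in Run().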
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/yolobox_pe.hpp"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
class YoloBoxCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
  virtual ~YoloBoxCompute() = default;
private:
zynqmp::YoloBoxPE pe_;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle