提交 d3d793c7 编写于 作者: C chonwhite

YOLOV3 works

上级 a59d6fab
......@@ -18,19 +18,14 @@ namespace paddle {
namespace zynqmp {
class Action {
public:
void readScale(float* scale) {
public:
void readScale(float* scale) {}
}
void writeScale(float* scale) {}
void writeScale(float* scale) {
}
private:
private:
int id_ = -1;
int scaleIndex_ = -1;
}
}
}
\ No newline at end of file
}
......@@ -21,20 +21,16 @@ namespace paddle {
namespace zynqmp {
class Transaction {
public:
void appendAction(Action* action) { actions_.push_back(action); };
public:
void appendAction(Action* action) {
actions_.push_back(action);
};
void startTraction(){
void startTraction() {
};
private:
private:
std::std::vector<Action*> actions_;
int id_ = -1;
}
}
}
\ No newline at end of file
}
......@@ -20,7 +20,7 @@ namespace paddle {
namespace zynqmp {
class TransactionManager {
public:
public:
static TransactionManager& get_instance() {
static TransactionManager s_instance;
return s_instance;
......@@ -34,14 +34,11 @@ public:
return currentTransaction_;
};
void endTransaction() {
currentTransaction_ = nullptr;
}
void endTransaction() { currentTransaction_ = nullptr; }
private:
private:
Transaction* currentTransaction_ = nullptr;
std::vector<Transaction*> transactions_;
}
}
}
\ No newline at end of file
}
......@@ -15,8 +15,5 @@ limitations under the License. */
#include "io.hpp"
namespace paddle {
namespace zynqmp {
} // namespace zynqmp
namespace zynqmp {} // namespace zynqmp
} // namespace paddle
......@@ -70,6 +70,10 @@ class ConvPE : public PE {
param_.input->shape().channel() >= 2048) {
use_cpu_ = true;
}
if (param_.filter->shape().width() == 1 &&
param_.filter->shape().num() % 16 != 0) {
use_cpu_ = true;
}
if (!use_cpu_) {
// param_.filter->releaseData();
}
......@@ -93,34 +97,38 @@ class ConvPE : public PE {
float* filter_data = param_.filter->data<float>();
float* mi = new float[in_channel];
int wh = input->shape().width() * input->shape().height();
float max = 0;
for (int i = 0; i < out_channel; i++) {
float* image = image_addr;
float* filter_ptr = filter_data + i * in_channel;
float* out_ptr = mi;
#pragma omp parallel for
for (int j = 0; j < in_channel; j++) {
// float32x4_t x0 = vld1q_f32(image);
// float32x4_t x1 = vld1q_f32(filter_ptr);
// float32x4_t r = vmulq_f32(x0, x1);
// vst1q_f32(out_ptr, r);
// image += 4;
// filter_ptr += 4;
// out_ptr += 4;
float value = image_addr[j] * filter_ptr[j];
mi[j] = value;
// #pragma omp parallel for
for (int k = 0; k < wh; k++) {
float* image = image_addr;
float* out_ptr = mi;
for (int j = 0; j < in_channel; j++) {
float value = image_addr[k * in_channel + j] * filter_ptr[j];
mi[j] = value;
}
float sum = 0;
for (int j = 0; j < in_channel; j++) {
sum += mi[j];
}
sum *= param_.scale()->data<float>()[i];
sum += param_.bias()->data<float>()[i];
out[i * wh + k] = sum;
max = std::max(max, std::abs(sum));
}
float sum = 0;
for (int j = 0; j < in_channel; j++) {
sum += mi[j];
}
out[i] = sum;
}
delete[] mi;
float_output.flush();
output->copyFrom(&float_output);
output->scale()[0] = max / 127.0;
output->scale()[1] = 127.0 / max;
// output->saveToFile("cpu", true);
}
bool dispatch() {
......@@ -206,7 +214,6 @@ class ConvPE : public PE {
// std::cout << "\n ================== EW ================== \n";
// }
}
return ret == 0;
}
......
......@@ -262,7 +262,7 @@ bool PriorBoxPE::dispatch() {
param_.outputBoxes->copyFrom(this->cachedBoxes_);
param_.outputVariances->copyFrom(this->cachedVariances_);
param_.outputBoxes->flush();
// param_.outputBoxes->syncToCPU();
param_.outputVariances->flush();
......
......@@ -84,28 +84,26 @@ class ResizePE : public PE {
param_.input->syncToCPU();
for (int h = 0; h < in_height; h++) {
for (int w = 0; w < in_width; w++) {
int src_index = in_width * channel * h + w * channel;
float16* src = param_.input->data<float16>() + src_index;
// std::cout << "src_index:" << src_index << std::endl;
for (int v = 0; v < factor; v++) {
for (int i =0; i < factor; i++) {
int dst_index = out_width * channel * h * factor +
out_width * channel * v +
w * channel * factor +
for (int h = 0; h < in_height; h++) {
for (int w = 0; w < in_width; w++) {
int src_index = in_width * channel * h + w * channel;
float16* src = param_.input->data<float16>() + src_index;
// std::cout << "src_index:" << src_index << std::endl;
for (int v = 0; v < factor; v++) {
for (int i = 0; i < factor; i++) {
int dst_index = out_width * channel * h * factor +
out_width * channel * v + w * channel * factor +
channel * i;
float16* dst = param_.output->data<float16>() + dst_index;
memcpy(dst, src, channel * sizeof(float16));
// std::cout << "dst_index:" << dst_index << std::endl;
}
}
}
float16* dst = param_.output->data<float16>() + dst_index;
memcpy(dst, src, channel * sizeof(float16));
// std::cout << "dst_index:" << dst_index << std::endl;
}
}
param_.output->flush();
param_.output->copyScaleFrom(param_.input);
}
}
param_.output->flush();
param_.output->copyScaleFrom(param_.input);
}
bool dispatch() {
cpu_compute();
......
......@@ -158,8 +158,9 @@ class ScalePE : public PE {
int index = i * input->shape().channel() + c;
float x = image_addr[index];
float y = half_to_float(scale_data[c]);
float value = x * y;
// std::cout << " x = " << std::to_string(x) << " y = " << std::to_string(y) << " v = " << std::to_string(value) << std::endl;
float value = x * y;
// std::cout << " x = " << std::to_string(x) << " y = " <<
// std::to_string(y) << " v = " << std::to_string(value) << std::endl;
// float value = half_to_float(in_data[index]) * 19.3598f;
data_out[index] = float_to_half(value);
......@@ -188,9 +189,9 @@ class ScalePE : public PE {
// dw_param.quantizedFilter()->flush();
// }
// param_.input->syncToDevice();
// return dw_pe_.dispatch();
return dw_pe_.dispatch();
cpu_compute();
// cpu_compute();
return true;
}
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
namespace paddle {
namespace zynqmp {
float sigmoid(float x) {
return 1.0 / (1.0 + std::exp(-x));
}
inline void GetYoloBox(float* box, const float* x, const int* anchors, int w,
int h, int an_idx, int grid_size,
int input_size, int index,
int img_height, int img_width) {
box[0] = (w + sigmoid(x[index])) * img_width * 1.0f/ grid_size;
box[1] = (h + sigmoid(x[index + 1])) * img_height * 1.0f / grid_size;
box[2] = std::exp(x[index + 2 ]) * anchors[2 * an_idx] * img_width * 1.0f/
input_size;
box[3] = std::exp(x[index + 3]) * anchors[2 * an_idx + 1] *
img_height * 1.0f / input_size;
}
inline int GetEntryIndex(int batch, int an_idx, int hw_idx,
int an_num, int an_stride, int stride,
int entry) {
return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
}
inline void CalcDetectionBox(float* boxes, float* box, const int box_idx,
const int img_height,
const int img_width) {
boxes[box_idx] = box[0] - box[2] / 2;
boxes[box_idx + 1] = box[1] - box[3] / 2;
boxes[box_idx + 2] = box[0] + box[2] / 2;
boxes[box_idx + 3] = box[1] + box[3] / 2;
boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : 0;
boxes[box_idx + 1] =
boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : 0;
boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1
? boxes[box_idx + 2]
: (img_width - 1);
boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1
? boxes[box_idx + 3]
: (img_height - 1);
}
inline void CalcLabelScore(float* scores, const float* input,
const int label_idx, const int score_idx,
const int class_num, const float conf) {
for (int i = 0; i < class_num; i++) {
scores[score_idx + i] = conf * sigmoid(input[label_idx + i]);
// std::cout << scores[score_idx + i] << " ";
}
// std::cout << std::endl;
}
class YoloBoxPE : public PE {
public:
bool init() {
param_.outputBoxes->setAligned(false);
param_.outputScores->setAligned(false);
param_.outputBoxes->setDataLocation(CPU);
param_.outputScores->setDataLocation(CPU);
return true;
}
bool dispatch() {
auto* input = param_.input;
auto* imgsize = param_.imgSize;
auto* boxes = param_.outputBoxes;
auto* scores = param_.outputScores;
auto anchors = param_.anchors;
int class_num = param_.classNum;
float conf_thresh = param_.confThresh;
int downsample_ratio = param_.downsampleRatio;
const int num = input->shape().num();
const int height = input->shape().height();
const int width = input->shape().width();
const int box_num = boxes->shape().channel();
const int an_num = anchors.size() / 2;
int input_size = downsample_ratio * height;
const int stride = height * width;
const int an_stride = (class_num + 5) * stride;
Tensor anchors_;
Shape anchors_shape(N, {an_num * 2});
auto anchors_data = anchors_.mutableData<int32_t>(INT32, anchors_shape);
std::copy(anchors.begin(), anchors.end(), anchors_data);
input->syncToCPU();
input->unalignImage();
// input->setAligned(false);
Tensor input_float;
input_float.setDataLocation(CPU);
float* input_data = input_float.mutableData<float>(FP32, input->shape());
input_float.copyFrom(input);
// input_float.saveToFile("input_yolobox_half", "true");
// input_float.setAligned(input->aligned());
// input_float.unalignImage();
// std::cout << "-------------unalignImage-----------------" << std::endl;
// for (int i = 0; i < input_float.shape().numel(); ++i)
// {
// std::cout << input_data[i] << " ";
// }
// std::cout << "-" << std::endl;
// std::cout << "-------------unalignImage-----------------" << std::endl;
// input_float.setAligned(false);
// input_float.saveToFile("input_yolobox_float", "true");
// input_float.syncToCPU();
// input_float.invalidate();
imgsize->saveToFile("img_size", true);
const int32_t* imgsize_data = imgsize->data<int32_t>();
Tensor boxes_float;
Tensor scores_float;
boxes_float.setDataLocation(CPU);
float* boxes_float_data = boxes_float.mutableData<float>(FP32, boxes->shape());
memset(boxes_float_data, 0, boxes->shape().numel() * sizeof(float));
scores_float.setDataLocation(CPU);
float* scores_float_data = scores_float.mutableData<float>(FP32, scores->shape());
memset(scores_float_data, 0, scores->shape().numel() * sizeof(float));
// float* boxes_data = boxes->mutableData<float>();
// memset(boxes_data, 0, boxes->shape().numel() * sizeof(float));
// float* scores_data = scores->mutableData<float>();
// memset(scores_data, 0, scores->shape().numel() * sizeof(float));
float box[4];
// for (int n = 0; n < num; n++) {
// int img_height = imgsize_data[2 * i];
// int img_width = imgsize_data[2 * i + 1];
int img_height = imgsize_data[0];
int img_width = imgsize_data[1];
std::cout << "YoloBoxPE imgsize:" << img_height << "," << img_width << std::endl;
int channel = input_float.shape().channel();
int count = 0;
for (int h = 0; h < height; h++) {
for (int w = 0; w < width ; w++) {
for (int n = 0; n < an_num; n++) {
int obj_idx = channel * width * h + channel * w + n * (5 + class_num) + 4;
// std::cout << obj_idx << " ";
float conf = sigmoid(input_data[obj_idx]);
if (conf < conf_thresh) {
count++;
continue;
}
int box_idx = channel * width * h + channel * w + n * (5 + class_num) + 0;
GetYoloBox(box, input_data, anchors_data, w, h, n, height, input_size,
box_idx, img_height, img_width);
box_idx = h * an_num * 4 * width + an_num * 4 * w + n * 4;
CalcDetectionBox(boxes_float_data, box, box_idx, img_height,img_width);
int label_idx = channel * width * h + channel * w + n * (5 + class_num) + 5;
int score_idx = h * an_num * class_num * width + an_num * class_num * w + n * class_num;
CalcLabelScore(scores_float_data, input_data, label_idx, score_idx, class_num, conf);
}
}
}
boxes->copyFrom(&boxes_float);
scores->copyFrom(&scores_float);
input->setAligned(true);
}
void apply(){};
YoloBoxParam& param() { return param_; }
private:
YoloBoxParam param_;
};
} // namespace zynqmp
} // namespace paddle
......@@ -266,23 +266,21 @@ class Tensor {
return;
}
BypassArgs args;
args.input_data_type = src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
args.input_data_type =
src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
args.output_data_type = dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
args.input_layout_type = LAYOUT_HWC;
args.output_layout_type = LAYOUT_HWC;
args.image = {
.address = src->data<void>(),
.scale_address = src->scale(),
.channels = (uint32_t)src->shape().numel(),
.width = 1,
.height = 1,
.pad_width = 0U,
.pad_height = 0U
};
args.image = {.address = src->data<void>(),
.scale_address = src->scale(),
.channels = (uint32_t)src->shape().numel(),
.width = 1,
.height = 1,
.pad_width = 0U,
.pad_height = 0U};
ImageOutputArgs output = {
.address = data<void>(),
.scale_address = scale(),
.address = data<void>(), .scale_address = scale(),
};
args.output = output;
......@@ -385,10 +383,11 @@ class Tensor {
void save_file_with_name(std::string path) {
// std::cout << "saving file: " << path << std::endl;
void* add = (void*)this;
// printf("tensor @: %p data: %p \n", (void *)add, (void*)data<void>());
// printf("tensor @: %p data: %p \n", (void *)add, (void*)data<void>());
// return;
std::ofstream ofs;
ofs.open(path);
ofs << "data type: " << dataType() << std::endl;
ofs << scale()[0] << " / " << scale()[1] << std::endl;
for (int i = 0; i < shape_->numel(); i++) {
......@@ -406,13 +405,14 @@ class Tensor {
if (dataType_ == INT32) {
value = data<int32_t>()[i];
}
if (i < 10) {
std::cout << value << ",";
}
// if (i > 1000) {
// break;
// }
ofs << value << std::endl;
}
usleep(30000);
ofs.close();
......@@ -465,7 +465,6 @@ class Tensor {
value = half_to_float(tensor.data<float16>()[i]);
}
os << value << " ";
}
os << "\n";
return os;
......
......@@ -166,6 +166,9 @@ class TensorLite {
void clear() {
// zynq_tensor_->releaseData();
if (zynq_tensor_) {
memset(zynq_tensor_->data<void>(), 0, zynq_tensor_->memorySize());
}
}
template <typename T>
......
......@@ -105,7 +105,6 @@ class ChannelWiseDequantOpFuser : public FuseBase {
*/
class DeleteQuantDequantOpFuser : public FuseBase {
public:
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
......
......@@ -84,13 +84,13 @@ class KernelPlaceCorrectPass : public DebugPass {
std::string node_name = out->AsArg().name;
std::string arg_name = get_argname(node_name, inst.op_info()->outputs());
auto op_type = inst.op_type();
if (op_type == "reshape" || op_type == "reshape2") {
for (auto* x_in : x->inlinks) {
std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs());
std::string in_name =
get_argname(x_in->AsArg().name, inst.op_info()->inputs());
// std::cout << "name: " << x_in->AsArg().name << std::endl;
// std::cout << "in_name: " << in_name << std::endl;
if (in_name == "X") {
......@@ -101,9 +101,11 @@ class KernelPlaceCorrectPass : public DebugPass {
}
p = in->AsArg().type->precision();
if ( p != PrecisionType::kFP16) {
// std::cout << "found an arm ............... : " << inst.kernels().size() << std::endl;
// std::cout << "tt:" << TargetRepr(inst.kernels()[0]->target()) << std::endl;
if (p != PrecisionType::kFP16) {
// std::cout << "found an arm ............... : " <<
// inst.kernels().size() << std::endl;
// std::cout << "tt:" << TargetRepr(inst.kernels()[0]->target()) <<
// std::endl;
UpdateTarget(inst, TargetType::kHost);
UpdateTensor(inst, in, out, TargetType::kHost);
}
......@@ -113,8 +115,9 @@ class KernelPlaceCorrectPass : public DebugPass {
UpdateTarget(inst, TargetType::kFPGA);
}
if (inst.op_type() == "split" || inst.op_type() == "transpose") {
if ( p != PrecisionType::kFP16) {
if (inst.op_type() == "split" || inst.op_type() == "transpose" ||
inst.op_type() == "transpose2") {
if (p != PrecisionType::kFP16) {
UpdateTarget(inst, TargetType::kARM);
for (auto* x_out : x->outlinks) {
UpdateTensor(inst, in, x_out, TargetType::kARM);
......@@ -123,9 +126,12 @@ class KernelPlaceCorrectPass : public DebugPass {
}
if (inst.op_type() == "concat") {
std::cout << "concat target:" << TargetRepr(inst.kernels()[0]->target()) << std::endl;
std::cout << "concat p:" << PrecisionToStr(inst.kernels()[0]->precision()) << std::endl;
if ( p != PrecisionType::kFP16) {
std::cout << "concat target:" << TargetRepr(inst.kernels()[0]->target())
<< std::endl;
std::cout << "concat p:"
<< PrecisionToStr(inst.kernels()[0]->precision())
<< std::endl;
if (p != PrecisionType::kFP16) {
UpdateTarget(inst, TargetType::kARM);
UpdateTensor(inst, in, out, TargetType::kARM);
}
......@@ -134,8 +140,9 @@ class KernelPlaceCorrectPass : public DebugPass {
// if (inst.op_type() == "elementwise_mul") {
// for (auto* x_in : x->inlinks) {
// std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs());
// std::string in_name = get_argname(x_in->AsArg().name,
// inst.op_info()->inputs());
// std::cout << "name: " << x_in->AsArg().name << std::endl;
// std::cout << "in_name: " << in_name << std::endl;
// if (in_name == "Y") {
......@@ -150,7 +157,6 @@ class KernelPlaceCorrectPass : public DebugPass {
// UpdateTensor(inst, in, out, TargetType::kARM);
// }
// }
std::vector<TargetType> in_types;
std::vector<TargetType> out_types;
......@@ -164,11 +170,13 @@ class KernelPlaceCorrectPass : public DebugPass {
auto type = inst.picked_kernel().GetInputDeclType(arg_name);
// std::cout << arg_name <<" is weight:: " << std::to_string(x_in->AsArg().is_weight)
// << " is persist: " << std::to_string(x_in->AsArg().is_persist) << std::endl;
// std::cout << arg_name <<" is weight:: " <<
// std::to_string(x_in->AsArg().is_weight)
// << " is persist: " <<
// std::to_string(x_in->AsArg().is_persist) << std::endl;
// std::cout << " type: "<< inst.op_type() << std::endl;
if (!x_in->AsArg().is_weight) {
auto p = x_in->AsArg().type->precision();
auto t = x_in->AsArg().type->target();
......@@ -224,10 +232,10 @@ class KernelPlaceCorrectPass : public DebugPass {
}
}
// Update me's kUnk fields by other's fields.
void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) { // NOLINT
// std::cout << "1 kernels: " << std::to_string(inst.kernels().size()) << std::endl;
// std::cout << "1 kernels: " << std::to_string(inst.kernels().size()) <<
// std::endl;
auto new_place = inst.place();
new_place.target = new_target;
......@@ -244,25 +252,30 @@ class KernelPlaceCorrectPass : public DebugPass {
std::vector<Place> places;
places.push_back(new_place);
inst.ResetKernels(places);
// std::cout << "2 kernels: " << std::to_string(inst.kernels().size()) << std::endl;
// std::cout << "2 kernels: " << std::to_string(inst.kernels().size()) <<
// std::endl;
}
void UpdateTensor(mir::Node::Stmt& inst, Node* in, Node* out, TargetType new_target = TargetType::kUnk) {
void UpdateTensor(mir::Node::Stmt& inst,
Node* in,
Node* out,
TargetType new_target = TargetType::kUnk) {
auto get_argname = [&](
const std::string& node_name,
const std::map<std::string, std::vector<std::string>>& argname_map)
-> std::string {
for (auto& ele : argname_map) {
auto it =
std::find(ele.second.begin(), ele.second.end(), node_name);
if (it != ele.second.end()) return ele.first;
}
return "";
};
const std::string& node_name,
const std::map<std::string, std::vector<std::string>>& argname_map)
-> std::string {
for (auto& ele : argname_map) {
auto it =
std::find(ele.second.begin(), ele.second.end(), node_name);
if (it != ele.second.end()) return ele.first;
}
return "";
};
std::string arg_name = get_argname(out->AsArg().name, inst.op_info()->outputs());
std::string in_name = get_argname(in->AsArg().name, inst.op_info()->inputs());
std::string arg_name =
get_argname(out->AsArg().name, inst.op_info()->outputs());
std::string in_name =
get_argname(in->AsArg().name, inst.op_info()->inputs());
auto type = inst.picked_kernel().GetInputDeclType(in_name);
auto tmp_ptype = in->AsArg().type->precision();
......@@ -281,7 +294,8 @@ class KernelPlaceCorrectPass : public DebugPass {
tmp_layout = DataLayoutType::kNCHW;
}
out->AsArg().type = LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout);
out->AsArg().type =
LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout);
}
};
......
......@@ -80,6 +80,8 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::sort(scored.begin(), scored.end(), KernelScoreCmp);
instruct.kernels().clear();
VLOG(2) << "picking kernel " << scored.front().second->name() << "\n\n";
if (!instruct.op_info()->HasAttr("enable_int8")) {
// Move kernel back
// Just keep a single best kernel.
......
......@@ -157,7 +157,21 @@ class StaticKernelPickPass : public mir::StmtPass {
}
}
if (in_match) {
final_score = 5000;
final_score += 1000;
}
bool out_match = true;
for (size_t i = 0; i < out_names.size(); ++i) {
std::string tmp;
CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp));
if (out_types.count(out_names[i]) &&
out_types.at(out_names[i]) !=
kernel.GetOutputDeclType(tmp)->precision()) {
out_match = false;
}
}
if (out_match) {
final_score += 1000;
}
}
......
......@@ -53,7 +53,6 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
ComplementInputs(graph.get(), node, in, &copied_nodes);
}
}
}
void TypeTargetTransformPass::ComplementInputs(
......@@ -74,6 +73,7 @@ void TypeTargetTransformPass::ComplementInputs(
auto in_arg_name = in->AsArg().name;
std::string tmp;
CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp));
VLOG(4) << "in_arg_name: " << in_arg_name << " tmp:" << tmp;
auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp);
CHECK(in->AsArg().type);
if (!TargetCompatibleTo(*in->AsArg().type, *decl_arg_type)) {
......
......@@ -141,6 +141,7 @@ class VariablePlaceInferencePass : public DebugPass {
x_in->AsArg().type = type;
} else {
PrecisionType tmp_ptype = x_in->AsArg().type->precision();
VLOG(4) << "tmp_ptype:" << PrecisionToStr(tmp_ptype);
x_in->AsArg().type = LiteType::GetTensorTy(
type->target(), tmp_ptype, type->layout());
}
......@@ -172,6 +173,9 @@ class VariablePlaceInferencePass : public DebugPass {
x_out->AsArg().type = type;
} else {
PrecisionType tmp_ptype = x_out->AsArg().type->precision();
tmp_ptype = type->precision();
// inst.picked_kernel().precision();
VLOG(4) << "tmp_ptype:" << PrecisionToStr(tmp_ptype);
x_out->AsArg().type = LiteType::GetTensorTy(
type->target(), tmp_ptype, type->layout());
}
......
......@@ -134,7 +134,6 @@ class Optimizer {
"mlu_postprocess_pass"}};
if (passes.size() == 1) {
// multi_stream_analysis_pass must be in the front of
// runtime_context_assign_pass
......
......@@ -42,6 +42,8 @@ add_kernel(layout_compute_fpga FPGA basic SRCS layout_compute.cc DEPS ${fpga_dep
add_kernel(feed_compute_fpga FPGA basic SRCS feed_compute.cc DEPS ${fpga_deps})
add_kernel(fetch_compute_fpga FPGA basic SRCS fetch_compute.cc DEPS ${fpga_deps})
add_kernel(yolo_box_compute_fpga FPGA basic SRCS yolo_box_compute.cc DEPS ${fpga_deps})
# add_kernel(while_compute_fpga FPGA extra SRCS while_compute.cc DEPS ${fpga_deps})
# add_kernel(write_to_array_compute_fpga FPGA extra SRCS write_to_array_compute.cc DEPS ${fpga_deps})
......
......@@ -35,6 +35,28 @@ void ReluCompute::PrepareForRun() {
void ReluCompute::Run() { pe_.dispatch(); }
void SigmoidCompute::Run() {
// TODO(chonwhite) use fpga and arm implementation;
auto& param = this->Param<param_t>();
auto output_data = param.Out->mutable_data<float16>();
int numel = param.Out->numel();
float16* in_data = param.X->ZynqTensor()->data<float16>();
float16* out_data = param.Out->ZynqTensor()->data<float16>();
param.X->ZynqTensor()->syncToCPU();
float max = 0.0f;
for (int i = 0; i < numel; i++) {
/* code */
float value = zynqmp::half_to_float(in_data[i]);
value = 1 / (1 + exp(-value));
out_data[i] = zynqmp::float_to_half(value);
max = std::max(std::abs(value), max);
}
param.Out->ZynqTensor()->scale()[0] = max / 127.0;
param.Out->ZynqTensor()->scale()[1] = 127.0 / max;
param.Out->ZynqTensor()->flush();
}
} // namespace fpga
} // namespace kernels
} // namespace lite
......@@ -51,3 +73,19 @@ REGISTER_LITE_KERNEL(
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(sigmoid,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::SigmoidCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
......@@ -49,6 +49,16 @@ class ReluCompute
zynqmp::Tensor output_;
};
class SigmoidCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~SigmoidCompute() = default;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
......
......@@ -49,7 +49,7 @@ void CalibComputeFloat2Int::Run() {
const auto* din = param.input->data<float>();
auto* dout = param.output->mutable_data<int>();
// param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
//TODO
// TODO
auto out_lod = param.output->mutable_lod();
*out_lod = param.input->lod();
return;
......
......@@ -45,11 +45,11 @@ void ConcatCompute::PrepareForRun() {
void ConcatCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
// #ifdef FPGA_PRINT_TENSOR
zynqmp::ConcatParam& concat_param = pe_.param();
concat_param.output->flush();
// Debugger::get_instance().registerOutput("concat", concat_param.output);
#endif
Debugger::get_instance().registerOutput("concat", concat_param.output);
// #endif
}
} // namespace fpga
......
......@@ -53,7 +53,8 @@ void ConvCompute::PrepareForRun() {
if (param.activation_param.Leaky_relu_alpha > 0.001) {
conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU;
conv_param.activeParam.leaky_relu_factor = param.activation_param.Leaky_relu_alpha;
conv_param.activeParam.leaky_relu_factor =
param.activation_param.Leaky_relu_alpha;
}
dw_conv_pe_.init();
......@@ -79,13 +80,15 @@ void ConvCompute::PrepareForRun() {
if (param.activation_param.Leaky_relu_alpha > 0.001) {
conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU;
conv_param.activeParam.leaky_relu_factor = param.activation_param.Leaky_relu_alpha;
conv_param.activeParam.leaky_relu_factor =
param.activation_param.Leaky_relu_alpha;
}
conv_pe_.init();
conv_pe_.apply();
}
// std::cout << "Leaky_relu_alpha:" << param.activation_param.Leaky_relu_alpha << std::endl;
// std::cout << "Leaky_relu_alpha:" << param.activation_param.Leaky_relu_alpha
// << std::endl;
}
void ConvCompute::Run() {
......
......@@ -96,11 +96,12 @@ void ElementwiseMulCompute::PrepareForRun() {
scale_value = param.Y->data<zynqmp::float16>()[0];
// std::cout << "FP16 \n";
}
// std::cout << "channel:" << channel << std::endl;
// std::cout << "production:" << param.Y->dims().production() << std::endl;
// std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
// std::cout << "scale_value:" <<
// std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
// exit(-1);
for (int i = 0; i < channel; i++) {
......@@ -112,7 +113,8 @@ void ElementwiseMulCompute::PrepareForRun() {
scale_value = param.Y->data<zynqmp::float16>()[i];
}
}
// std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
// std::cout << "scale_value:" <<
// std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
// exit(-1);
scale_data[i] = scale_value;
bias_data[i] = zero_;
......@@ -128,13 +130,13 @@ void ElementwiseMulCompute::Run() {
if (!param.Y->persistable()) {
// TODO
scale_.copyFrom(param.Y->ZynqTensor());
scale_.flush();//TODO
scale_.flush(); // TODO
}
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ScaleParam& scale_param = pe_.param();
// Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input);
// Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
// Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input);
// Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
#endif
}
......@@ -214,8 +216,7 @@ REGISTER_LITE_KERNEL(elementwise_mul,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......
......@@ -28,7 +28,14 @@ void FeedCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
Tensor& x = param.feed_list->at(param.col);
param.out->Resize(x.dims());
param.out->mutable_data<float16>();
auto in_type = x.ZynqTensor()->dataType();
if (in_type == zynqmp::FP32 || in_type == zynqmp::FP16) {
param.out->mutable_data<float16>();
}
if (in_type == zynqmp::INT32) {
param.out->mutable_data<int32_t>();
}
// ====================================================
zynqmp::InputParam& feed_param = pe_.param();
feed_param.input = x.ZynqTensor();
......@@ -68,12 +75,18 @@ REGISTER_LITE_KERNEL(
DATALAYOUT(kNHWC))})
.Finalize();
// REGISTER_LITE_KERNEL(feed,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::fpga::FeedCompute,
// def_host)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
// .Finalize();
REGISTER_LITE_KERNEL(feed,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::FeedCompute,
feed_int32)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt32),
DATALAYOUT(kNCHW))})
.Finalize();
\ No newline at end of file
......@@ -49,7 +49,6 @@ void BilinearInterpCompute::Run() {
// interp_method);
}
void nearest_interp(const float16* src,
int w_in,
int h_in,
......@@ -85,7 +84,7 @@ void nearest_interp(const float16* src,
int near_y = static_cast<int>(scale_h_new * h);
for (int w = 0; w < w_out; ++w) {
int near_x = static_cast<int>(scale_w_new * w);
const float16* src_n = src + (near_y * w_in + near_x) * c;
memcpy(dst_p, src_n, c * sizeof(float16));
dst_p += c;
......@@ -133,8 +132,6 @@ inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
return vec_new_data;
}
void interpolate(lite::Tensor* X,
lite::Tensor* OutSize,
std::vector<const lite::Tensor*> SizeTensor,
......@@ -188,19 +185,18 @@ void interpolate(lite::Tensor* X,
int spatial_in = in_h * in_w;
int spatial_out = out_h * out_w;
for (int i = 0; i < count; ++i) {
nearest_interp(din + spatial_in * i,
in_w,
in_h,
out_c,
dout + spatial_out * i,
out_w,
out_h,
1.f / width_scale,
1.f / height_scale,
with_align);
}
nearest_interp(din + spatial_in * i,
in_w,
in_h,
out_c,
dout + spatial_out * i,
out_w,
out_h,
1.f / width_scale,
1.f / height_scale,
with_align);
}
}
void NearestInterpCompute::Run() {
......@@ -215,27 +211,24 @@ void NearestInterpCompute::Run() {
int out_h = param.out_h;
bool align_corners = param.align_corners;
std::string interp_method = "";
X->ZynqTensor()->invalidate();//TODO
X->ZynqTensor()->invalidate(); // TODO
X->ZynqTensor()->saveToFile("n_in", true);
interpolate(X,
OutSize,
SizeTensor,
Scale,
Out,
out_h,
out_w,
scale,
align_corners,
interp_method);
OutSize,
SizeTensor,
Scale,
Out,
out_h,
out_w,
scale,
align_corners,
interp_method);
Out->ZynqTensor()->flush();
Out->ZynqTensor()->copyScaleFrom(X->ZynqTensor());
Out->ZynqTensor()->saveToFile("n_out", true);
}
} /* namespace fpga */
......@@ -249,15 +242,17 @@ REGISTER_LITE_KERNEL(bilinear_interp,
kNHWC,
paddle::lite::kernels::fpga::BilinearInterpCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
......@@ -268,15 +263,17 @@ REGISTER_LITE_KERNEL(nearest_interp,
kNHWC,
paddle::lite::kernels::fpga::NearestInterpCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
......@@ -14,9 +14,9 @@
#pragma once
#include <string>
#include "lite/backends/fpga/KD/pes/resize_pe.hpp"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/backends/fpga/KD/pes/resize_pe.hpp"
namespace paddle {
namespace lite {
......@@ -34,12 +34,12 @@ class BilinearInterpCompute
class NearestInterpCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
virtual ~NearestInterpCompute() = default;
private:
zynqmp::ResizePE pe_;
};
......
......@@ -44,8 +44,6 @@ class IoCopyHostCHWToFpgaHWCCompute
param.x->target() == TARGET(kFPGA));
param.x->ZynqTensor()->flush();
if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) {
param.y->mutable_data<int>();
param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
......@@ -86,7 +84,7 @@ class IoCopyFpgaToHostCompute
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kFPGA));
param.x->ZynqTensor()->syncToDevice();
param.y->mutable_data<float>();
param.y->ZynqTensor()->setDataType(zynqmp::FP32);
......@@ -104,7 +102,7 @@ class IoCopyFpgaToHostCompute
} else {
param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
param.y->ZynqTensor()->invalidate();
copy_properties(param);
}
......@@ -141,16 +139,22 @@ class IoCopyFpgaToHostCHWCompute
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kFPGA));
Tensor hwc;
param.x->ZynqTensor()->syncToDevice();
if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) {
param.y->mutable_data<int32_t>();
param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
return;
}
Tensor hwc;
hwc.Resize(param.y->dims());
float* hwc_data = hwc.mutable_data<float>();
float* chw_data = param.y->mutable_data<float>();
param.y->ZynqTensor()->setDataType(zynqmp::FP32);
param.x->ZynqTensor()->syncToDevice();
hwc.ZynqTensor()->setDataLocation(zynqmp::CPU);
param.y->ZynqTensor()->setDataLocation(zynqmp::CPU);
if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) {
zynqmp::Tensor tempTensor;
......@@ -158,15 +162,15 @@ class IoCopyFpgaToHostCHWCompute
param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor());
tempTensor.setAligned(true);
// tempTensor.saveToFile("temp_1", true);
tempTensor.unalignImage();
// tempTensor.saveToFile("temp_2", true);
tempTensor.saveToFile("temp_1", true);
// tempTensor.unalignImage();
tempTensor.saveToFile("temp_2", true);
hwc.ZynqTensor()->copyFrom(&tempTensor);
} else {
// hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
float16* in_data = param.x->ZynqTensor()->data<float16>();
// float* f_data =
// float* f_data =
param.x->ZynqTensor()->flush();
float max = 0;
......@@ -198,6 +202,7 @@ class IoCopyFpgaToHostCHWCompute
dims.height(),
dims.width());
param.y->ZynqTensor()->copyFrom(hwc.ZynqTensor());
// param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
param.y->ZynqTensor()->flush();
copy_properties(param);
......@@ -205,8 +210,8 @@ class IoCopyFpgaToHostCHWCompute
param.x->ZynqTensor()->invalidate();
param.x->ZynqTensor()->flush();
// hwc.ZynqTensor()->saveToFile("hwc", true);
// param.x->ZynqTensor()->saveToFile("io2_x", true);
// param.y->ZynqTensor()->saveToFile("io2_y", true);
param.x->ZynqTensor()->saveToFile("io2_x", true);
param.y->ZynqTensor()->saveToFile("io2_y", true);
}
std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
......@@ -238,15 +243,16 @@ REGISTER_LITE_KERNEL(io_copy,
kAny,
paddle::lite::kernels::fpga::IoCopyHostCHWToFpgaHWCCompute,
host_float_chw_to_device_fp16_hwc)
.BindInput("Input", {LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))})
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy,
kFPGA,
kAny,
......@@ -311,25 +317,24 @@ REGISTER_LITE_KERNEL(io_copy,
// DATALAYOUT(kAny))})
// .Finalize();
// ==========================================================
// std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
// std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
// *res = [](const std::map<std::string, const Type*>& inputs,
// const std::string& out) -> const Type* {
// CHECK(!inputs.empty());
// auto* type = inputs.at("Input");
// CHECK(type->target() == TARGET(kHost));
// auto out_place = type->place();
// out_place.target = TARGET(kFPGA);
// auto* out_type = Type::Get(type->id(),
// out_place.target,
// out_place.precision,
// out_place.layout,
// out_place.device);
// return out_type;
// };
// return res;
// }
\ No newline at end of file
// std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
// std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
// *res = [](const std::map<std::string, const Type*>& inputs,
// const std::string& out) -> const Type* {
// CHECK(!inputs.empty());
// auto* type = inputs.at("Input");
// CHECK(type->target() == TARGET(kHost));
// auto out_place = type->place();
// out_place.target = TARGET(kFPGA);
// auto* out_type = Type::Get(type->id(),
// out_place.target,
// out_place.precision,
// out_place.layout,
// out_place.device);
// return out_type;
// };
// return res;
// }
\ No newline at end of file
......@@ -132,4 +132,3 @@ REGISTER_LITE_KERNEL(prior_box,
.BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
......@@ -23,7 +23,6 @@ namespace fpga {
using float16 = zynqmp::float16;
void FlattenCompute::Run() {
auto& param = Param<operators::ReshapeParam>();
auto x = param.x;
......@@ -45,12 +44,10 @@ void FlattenCompute::Run() {
output->Resize(output_dims);
#ifdef FPGA_PRINT_TENSOR
Debugger::get_instance().registerOutput("flatten",
output->ZynqTensor());
Debugger::get_instance().registerOutput("flatten", output->ZynqTensor());
#endif
}
void ReshapeCompute::Run() {
auto& param = Param<operators::ReshapeParam>();
auto x = param.x;
......@@ -69,17 +66,14 @@ void ReshapeCompute::Run() {
} else {
// output->CopyDataFrom(*x);
}
output->ZynqTensor()->copyFrom(x->ZynqTensor());
// output->ZynqTensor()->saveToFile("ro", true);
output->ZynqTensor()->flush();
output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned());
#ifdef FPGA_PRINT_TENSOR
Debugger::get_instance().registerOutput("reshape",
output->ZynqTensor());
Debugger::get_instance().registerOutput("reshape", output->ZynqTensor());
#endif
}
......@@ -163,7 +157,7 @@ REGISTER_LITE_KERNEL(flatten2,
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Shape",
{LiteType::GetTensorTy(TARGET(kHost),
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
......
......@@ -38,9 +38,9 @@ void SoftmaxCompute::Run() {
zynqmp::SoftmaxParam& softmax_param = pe_.param();
// softmax_param.input->saveToFile("softmax_in", true);
pe_.dispatch();
softmax_param.output->flush();
// softmax_param.output->saveToFile("softmax", true);
// softmax_param.output->saveToFile("softmax", true);
#ifdef FPGA_PRINT_TENSOR
Debugger::get_instance().registerOutput("softmax", softmax_param.output);
#endif
......@@ -61,17 +61,9 @@ REGISTER_LITE_KERNEL(softmax,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
......
......@@ -104,7 +104,7 @@ void Transpose2Compute::Run() {
// param.x->ZynqTensor()->saveToFile("t_unaligned", true);
param.x->ZynqTensor()->flush();
param.x->ZynqTensor()->invalidate();
if (param.x->dims().size() != 4) {
transposeCompute(param);
param.output->ZynqTensor()->setAligned(param.x->ZynqTensor()->aligned());
......@@ -115,7 +115,7 @@ void Transpose2Compute::Run() {
// param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
param.output->ZynqTensor()->flush();
// param.output->ZynqTensor()->saveToFile("Transpose2", true);
param.output->ZynqTensor()->saveToFile("Transpose2", true);
}
} // namespace fpga
......@@ -151,8 +151,9 @@ REGISTER_LITE_KERNEL(transpose2,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/yolo_box_compute.h"
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
void YoloBoxCompute::PrepareForRun() {
auto& param = Param<operators::YoloBoxParam>();
lite::Tensor* X = param.X;
lite::Tensor* ImgSize = param.ImgSize;
lite::Tensor* Boxes = param.Boxes;
lite::Tensor* Scores = param.Scores;
Boxes->mutable_data<float>();
Scores->mutable_data<float>();
zynqmp::YoloBoxParam& yolobox_param = pe_.param();
yolobox_param.input = X->ZynqTensor();
yolobox_param.imgSize = ImgSize->ZynqTensor();
yolobox_param.outputBoxes = Boxes->ZynqTensor();
yolobox_param.outputScores = Scores->ZynqTensor();
yolobox_param.downsampleRatio = param.downsample_ratio;
yolobox_param.anchors = param.anchors;
yolobox_param.classNum = param.class_num;
yolobox_param.confThresh = param.conf_thresh;
pe_.init();
pe_.apply();
}
void YoloBoxCompute::Run() {
pe_.dispatch();
zynqmp::YoloBoxParam& yolobox_param = pe_.param();
yolobox_param.imgSize->saveToFile("img_size", true);
// exit(-1);
yolobox_param.outputBoxes->saveToFile("yolo_boxes", true);
yolobox_param.outputScores->saveToFile("yolo_scores", true);
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
// REGISTER_LITE_KERNEL(yolo_box,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::fpga::YoloBoxCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindInput("ImgSize",
// {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
// .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
// .BindOutput("Scores", {LiteType::GetTensorTy(TARGET(kARM))})
// .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/yolobox_pe.hpp"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
class YoloBoxCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
virtual ~YoloBoxCompute() {
};
private:
zynqmp::YoloBoxPE pe_;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册