Commit a59d6fab authored by C chonwhite

ARM & FPGA kernels work together

Parent 2b32484a
......@@ -73,6 +73,7 @@ class Debugger {
op_config["nms"] = true;
op_config["pb_boxes"] = true;
op_config["pb_variances"] = true;
op_config["reshape"] = true;
op_config["softmax"] = true;
op_config["split"] = true;
}
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace zynqmp {
class Action {
public:
void readScale(float* scale) {
}
void writeScale(float* scale) {
}
private:
int id_ = -1;
int scaleIndex_ = -1;
};
}  // namespace zynqmp
}  // namespace paddle
\ No newline at end of file
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/dispatch/action.hpp"
#include <algorithm>
#include <vector>
namespace paddle {
namespace zynqmp {
class Transaction {
public:
void appendAction(Action* action) {
actions_.push_back(action);
}
void startTransaction() {
}
private:
std::vector<Action*> actions_;
int id_ = -1;
};
}  // namespace zynqmp
}  // namespace paddle
\ No newline at end of file
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "lite/backends/fpga/KD/dispatch/transaction.hpp"  // assumed path for the Transaction header above
namespace paddle {
namespace zynqmp {
class TransactionManager {
public:
static TransactionManager& get_instance() {
static TransactionManager s_instance;
return s_instance;
}
Transaction* getTransaction() {
if (currentTransaction_ == nullptr) {
currentTransaction_ = new Transaction();
transactions_.push_back(currentTransaction_);
}
return currentTransaction_;
}
void endTransaction() {
currentTransaction_ = nullptr;
}
private:
Transaction* currentTransaction_ = nullptr;
std::vector<Transaction*> transactions_;
};
}  // namespace zynqmp
}  // namespace paddle
\ No newline at end of file
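The three files above add a minimal action/transaction layer for coordinating FPGA dispatch. A hedged usage sketch of how the pieces compose (only the classes and methods shown above come from the commit; the wiring and the header name are illustrative assumptions):

#include "lite/backends/fpga/KD/dispatch/transaction_manager.hpp"  // assumed header name

using paddle::zynqmp::Action;
using paddle::zynqmp::Transaction;
using paddle::zynqmp::TransactionManager;

void schedule_actions() {
  // The manager is a process-wide singleton; getTransaction() lazily
  // creates the current transaction and reuses it until endTransaction().
  TransactionManager& manager = TransactionManager::get_instance();
  Transaction* transaction = manager.getTransaction();

  Action read_action;   // readScale()/writeScale() are still empty stubs here
  Action write_action;
  transaction->appendAction(&read_action);
  transaction->appendAction(&write_action);

  transaction->startTransaction();  // run the queued actions
  manager.endTransaction();         // the next getTransaction() starts fresh
}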
......@@ -240,8 +240,8 @@ int8_t* format_filter(float* data_in,
for (int n = 0; n < num; n++) {
float* filter_start = data_in + n * chw;
int8_t* quantized_start = quantized_data + n * chw;
// float f_max = find_max(filter_start, chw);
float f_max = max;
float f_max = find_max(filter_start, chw);
// float f_max = max;
quantize(filter_start, quantized_start, chw, f_max);
filter_max.push_back(f_max);
}
......
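This hunk switches filter quantization from one global maximum (`max`) to a per-filter maximum (`find_max(filter_start, chw)`), so each filter uses its full int8 range. A minimal sketch of the symmetric int8 scheme this implies; the helper names mirror the ones above, but these bodies are illustrative, not the library's:

#include <cmath>
#include <cstdint>

// Largest absolute value within one filter (chw elements).
float find_max(const float* data, int len) {
  float max = 0.f;
  for (int i = 0; i < len; i++) max = std::fmax(max, std::fabs(data[i]));
  return max;
}

// Symmetric quantization: x -> round(x * 127 / f_max).
void quantize(const float* in, int8_t* out, int len, float f_max) {
  for (int i = 0; i < len; i++) {
    out[i] = static_cast<int8_t>(std::round(in[i] * 127.f / f_max));
  }
}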
......@@ -264,10 +264,10 @@ inline void format_filter(Tensor* filter,
quantized_filter->flush();
fpga_free(quantized_data);
// for (size_t i = 0; i < max_values.size(); i++) {
// // scales.push_back(max_values[i] / max_value);
// scales.push_back(1.0f);
// }
for (size_t i = 0; i < max_values.size(); i++) {
scales.push_back(max_values[i] / max_value);
// scales.push_back(1.0f);
}
// filter->saveToFile("filter.txt");
// std::ofstream ofs;
......@@ -374,17 +374,15 @@ inline void split_filter_num(const ConvParam& c_param) {
std::vector<float> v; // TODO(chonwhite) change variable name;
format_filter(&new_filter, &(conv_param->filter), param.groups, v, max);
conv_param->filter.setDataType(INT8);
Tensor scale;
Tensor bias;
int chnnnel_start = i * filter_num_per_div;
Shape s_shape(NC, {1, filter_num});
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_num; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
}
for (int n = 0; n < filter_num; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
......@@ -513,7 +511,7 @@ inline void pack_channel_filter(const ConvParam& c_param) {
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_current_pack; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
}
for (int n = 0; n < filter_current_pack; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
......
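Because each filter is now quantized against its own maximum, the per-filter ratio `v[n] = max_values[n] / max_value` has to be folded back into the per-channel scale, which is what multiplying `scale_data[n]` by `v[n]` does in both hunks above. A worked illustration with hypothetical numbers:

// Filter 0 has |max| 2.0, filter 1 has |max| 8.0, global max is 8.0:
//   v[0] = 2.0 / 8.0 = 0.25,  v[1] = 8.0 / 8.0 = 1.0
// so dequantizing channel n needs scale[n] * v[n], not scale[n] alone.
float scale[2] = {1.5f, 0.5f};  // hypothetical per-channel scales
float v[2] = {0.25f, 1.0f};     // per-filter max ratios as above
for (int n = 0; n < 2; n++) {
  scale[n] *= v[n];             // same folding as scale_data[n] * v[n]
}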
......@@ -41,7 +41,9 @@ class InputPE : public PE {
src = &half_tensor;
}
output->mutableData<void>();
src->alignImage(output, true);
src->alignImage();
output->copyFrom(src);
// src->alignImage(output, true);
return true;
}
......
......@@ -103,6 +103,7 @@ class NormPE : public PE {
float_out.flush();
// float_out.saveToFile("normalize_", true);
param_.output->copyFrom(&float_out);
param_.output->flush();
}
bool dispatch() {
......
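The added `param_.output->flush()` after `copyFrom()` follows a pattern repeated throughout this commit: CPU-side writes are flushed before the device reads them, and device-side results are invalidated or synced before the CPU reads them. A hedged sketch of that coherence convention (the method names are from the Tensor API in this diff; the pairing is inferred, not documented):

void cpu_to_device(paddle::zynqmp::Tensor* t) {
  float* dst = t->mutableData<float>();  // CPU-side write...
  dst[0] = 1.0f;
  t->flush();       // ...flushed so the FPGA sees real bytes, not stale cache
}

void device_to_cpu(paddle::zynqmp::Tensor* t) {
  t->invalidate();  // drop stale CPU cache lines (or syncToCPU())
  float first = t->data<float>()[0];
  (void)first;
}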
......@@ -56,8 +56,8 @@ class OutputPE : public PE {
fpga_reset();
auto max = fpga_get_memory_size_max();
std::cout << "PL ===== Max: ===== :: " << max << std::endl;
// auto max = fpga_get_memory_size_max();
// std::cout << "PL ===== Max: ===== :: " << max << std::endl;
return true;
}
......
......@@ -241,7 +241,7 @@ void PriorBoxPE::compute_prior_box() {
}
boxes.flush();
boxes.syncToCPU();
// boxes.syncToCPU();
variances.flush();
output_boxes->copyFrom(&boxes);
output_variances->copyFrom(&variances);
......@@ -261,11 +261,12 @@ bool PriorBoxPE::dispatch() {
}
param_.outputBoxes->copyFrom(this->cachedBoxes_);
param_.outputVariances->copyFrom(this->cachedVariances_);
param_.outputBoxes->flush();
param_.outputBoxes->syncToCPU();
// param_.outputBoxes->syncToCPU();
param_.outputVariances->flush();
return true;
}
} // namespace zynqmp
......
......@@ -35,6 +35,13 @@ class PriorBoxPE : public PE {
PriorBoxParam& param() { return param_; }
~PriorBoxPE() {
if (cachedBoxes_ != nullptr) {
delete cachedBoxes_;
delete cachedVariances_;
}
}
private:
PriorBoxParam param_;
Tensor* cachedBoxes_ = nullptr;
......
......@@ -73,9 +73,43 @@ class ResizePE : public PE {
scale[0] = max / 127.0;
scale[1] = 127.0 / max;
}
void cpu_compute() {
Shape& in_shape = param_.input->shape();
Shape& out_shape = param_.output->shape();
int channel = in_shape.channel();
int in_height = in_shape.height();
int in_width = in_shape.width();
int out_width = out_shape.width();
int factor = out_shape.width() / in_shape.width();
param_.input->syncToCPU();
for (int h = 0; h < in_height; h++) {
for (int w = 0; w < in_width; w++) {
int src_index = in_width * channel * h + w * channel;
float16* src = param_.input->data<float16>() + src_index;
// std::cout << "src_index:" << src_index << std::endl;
for (int v = 0; v < factor; v++) {
for (int i = 0; i < factor; i++) {
int dst_index = out_width * channel * h * factor +
out_width * channel * v +
w * channel * factor +
channel * i;
float16* dst = param_.output->data<float16>() + dst_index;
memcpy(dst, src, channel * sizeof(float16));
// std::cout << "dst_index:" << dst_index << std::endl;
}
}
}
}
param_.output->flush();
param_.output->copyScaleFrom(param_.input);
}
bool dispatch() {
bool ret = compute_fpga_resize(args_) == 0;
cpu_compute();
// bool ret = compute_fpga_resize(args_) == 0;
return true;
}
......
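The new `cpu_compute()` implements nearest-neighbor upsampling by an integer `factor`, replicating each NHWC input pixel into a factor x factor output block. A worked check of the destination index arithmetic, using hypothetical sizes:

// in 2x2, factor 2, channel 1, out_width 4: input pixel (h=0, w=1) must fill
// output pixels (0,2), (0,3), (1,2), (1,3).
//   dst = out_width*channel*h*factor + out_width*channel*v
//       + w*channel*factor + channel*i
// (h=0, w=1): v=0,i=0 -> 2;  v=0,i=1 -> 3;  v=1,i=0 -> 6;  v=1,i=1 -> 7,
// i.e. exactly (0,2), (0,3), (1,2), (1,3) in row-major order.
int dst_index(int out_width, int channel, int factor,
              int h, int w, int v, int i) {
  return out_width * channel * h * factor + out_width * channel * v +
         w * channel * factor + channel * i;
}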
......@@ -141,22 +141,26 @@ class ScalePE : public PE {
Tensor* output = param_.output;
Tensor float_input;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
input->syncToCPU();
// input->syncToCPU();
// input->invalidate();
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
float* scale_data = param_.scale->data<float>();
float16* scale_data = param_.scale->data<float16>();
int wh = input->shape().width() * input->shape().height();
float16* in_data = input->data<float16>();
float max = 0;
for (int i = 0; i < wh; i++) {
for (int c = 0; c < input->shape().channel(); c++) {
int index = i * input->shape().channel() + c;
float value = half_to_float(in_data[index]) * scale_data[c];
float x = image_addr[index];
float y = half_to_float(scale_data[c]);
float value = x * y;
// std::cout << " x = " << std::to_string(x) << " y = " << std::to_string(y) << " v = " << std::to_string(value) << std::endl;
// float value = half_to_float(in_data[index]) * 19.3598f;
data_out[index] = float_to_half(value);
if (value < 0) {
......@@ -167,24 +171,27 @@ class ScalePE : public PE {
}
}
}
// exit(-1);
output->flush();
output->scale()[0] = max / 127.0f;
output->scale()[1] = 127.0f / max;
}
bool dispatch() {
if (param_.scale->dataType() == FP16) {
DepthwiseConvParam& dw_param = dw_pe_.param();
memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
param_.scale->data<float16>(),
param_.scale->shape().numel() * sizeof(float16));
dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
dw_param.quantizedFilter()->flush();
}
param_.input->syncToDevice();
return dw_pe_.dispatch();
// if (param_.scale->dataType() == FP16) {
// DepthwiseConvParam& dw_param = dw_pe_.param();
// memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
// param_.scale->data<float16>(),
// param_.scale->shape().numel() * sizeof(float16));
// dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
// dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
// dw_param.quantizedFilter()->flush();
// }
// param_.input->syncToDevice();
// return dw_pe_.dispatch();
cpu_compute();
return true;
}
ScaleParam& param() { return param_; }
......
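After `cpu_compute()` finishes the per-channel multiply, it re-derives the output quantization scale from the running `max`, the same two-entry convention used elsewhere in this commit. A hedged restatement of that bookkeeping:

// scale()[0] = max / 127 converts int8-domain values back to float;
// scale()[1] = 127 / max converts float into the int8 domain; keeping the
// reciprocal avoids a divide on the hot path.
void update_output_scale(float max, float* scale) {
  scale[0] = max / 127.0f;
  scale[1] = 127.0f / max;
}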
......@@ -154,6 +154,7 @@ bool SoftmaxPE::dispatch() {
float_output.flush();
output->copyFrom(&float_output);
output->flush();
return true;
}
......
......@@ -105,7 +105,7 @@ class SplitPE : public PE {
in_stride,
out_stride[axis]);
input_offset += out_stride[axis];
// out->flush();
out->flush();
}
return true;
}
......
......@@ -266,22 +266,25 @@ class Tensor {
return;
}
BypassArgs args;
args.input_data_type =
src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
args.input_data_type = src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
args.output_data_type = dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
args.input_layout_type = LAYOUT_HWC;
args.output_layout_type = LAYOUT_HWC;
args.image = {.address = src->data<void>(),
.scale_address = src->scale(),
.channels = (uint32_t)src->shape().numel(),
.width = 1,
.height = 1,
.pad_width = 0u,
.pad_height = 0u};
args.image = {
.address = src->data<void>(),
.scale_address = src->scale(),
.channels = (uint32_t)src->shape().numel(),
.width = 1,
.height = 1,
.pad_width = 0U,
.pad_height = 0U
};
ImageOutputArgs output = {
.address = data<void>(), .scale_address = scale(),
.address = data<void>(),
.scale_address = scale(),
};
args.output = output;
size_t aligned_remainder = src->shape().numel() % 16;
if (aligned_remainder > 0) {
......@@ -380,6 +383,10 @@ class Tensor {
}
void save_file_with_name(std::string path) {
// std::cout << "saving file: " << path << std::endl;
void* add = (void*)this;
// printf("tensor @: %p data: %p \n", (void *)add, (void*)data<void>());
// return;
std::ofstream ofs;
ofs.open(path);
ofs << scale()[0] << " / " << scale()[1] << std::endl;
......@@ -399,8 +406,15 @@ class Tensor {
if (dataType_ == INT32) {
value = data<int32_t>()[i];
}
if (i < 10) {
std::cout << value << ",";
}
ofs << value << std::endl;
}
usleep(30000);
ofs.close();
}
......@@ -451,6 +465,7 @@ class Tensor {
value = half_to_float(tensor.data<float16>()[i]);
}
os << value << " ";
}
os << "\n";
return os;
......
......@@ -102,6 +102,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) {
Resize(other.dims());
auto shape = other.zynq_tensor_->shape();
zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape);
precision_ = other.precision_;
// this->ZynqTensor()->copyFrom(other.ZynqTensor());
memcpy(this->ZynqTensor()->data<void>(),
......
......@@ -109,6 +109,7 @@ class TensorLite {
template <typename T, typename R = T>
const R *data() const {
return zynq_tensor_->data<R>() + offset_;
// return zynq_tensor_->data<R>();
}
void Resize(const DDimLite &ddim) { dims_ = ddim; }
......@@ -198,7 +199,8 @@ class TensorLite {
// set values of precision_ and persistable_ after updating it.
// If your tensor is just a temp tensor, such as activations,
// you can ignore these two attributes.
PrecisionType precision_{PrecisionType::kUnk};
// PrecisionType precision_{PrecisionType::kUnk};
PrecisionType precision_{PrecisionType::kFloat};
bool persistable_{false};
DDimLite dims_;
......@@ -235,6 +237,28 @@ zynqmp::DataType get_date_type() {
return data_type;
}
template <typename T>
PrecisionType get_precistion_type() {
PrecisionType data_type = PrecisionType::kUnk;
if (typeid(T) == typeid(float)) {
data_type = PrecisionType::kFloat;
}
if (typeid(T) == typeid(zynqmp::float16)) {
data_type = PrecisionType::kFP16;
}
if (typeid(T) == typeid(int)) {
data_type = PrecisionType::kInt32;
}
if (typeid(T) == typeid(int32_t)) {
data_type = PrecisionType::kInt32;
}
if (typeid(T) == typeid(int8_t)) {
data_type = PrecisionType::kInt8;
}
return data_type;
}
template <typename T, typename R>
R *TensorLite::mutable_data() {
std::vector<int> v;
......@@ -261,6 +285,7 @@ R *TensorLite::mutable_data() {
}
zynqmp::Shape input_shape(layout_type, v);
zynqmp::DataType data_type = get_date_type<T>();
precision_ = get_precistion_type<T>();
if (zynq_tensor_.get() == nullptr) {
zynq_tensor_.reset(new zynqmp::Tensor());
......
......@@ -50,6 +50,7 @@ class KernelPlaceCorrectPass : public DebugPass {
VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
// std::cout << ""
for (auto& x : graph->StmtTopologicalOrder()) {
auto& inst = x->AsStmt();
// The IoCopyOp is a tool operator, it won't support the type inference.
......@@ -77,6 +78,80 @@ class KernelPlaceCorrectPass : public DebugPass {
bool need_correct_place = true;
auto in = x->inlinks.front();
auto out = x->outlinks.front();
auto p = in->AsArg().type->precision();
std::string node_name = out->AsArg().name;
std::string arg_name = get_argname(node_name, inst.op_info()->outputs());
auto op_type = inst.op_type();
if (op_type == "reshape" || op_type == "reshape2") {
for (auto* x_in : x->inlinks) {
std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs());
// std::cout << "name: " << x_in->AsArg().name << std::endl;
// std::cout << "in_name: " << in_name << std::endl;
if (in_name == "X") {
in = x_in;
std::cout << "found input \n";
// exit(-1);
}
}
p = in->AsArg().type->precision();
if (p != PrecisionType::kFP16) {
// std::cout << "found an arm ............... : " << inst.kernels().size() << std::endl;
// std::cout << "tt:" << TargetRepr(inst.kernels()[0]->target()) << std::endl;
UpdateTarget(inst, TargetType::kHost);
UpdateTensor(inst, in, out, TargetType::kHost);
}
}
if (inst.op_type() == "fetch") {
UpdateTarget(inst, TargetType::kFPGA);
}
if (inst.op_type() == "split" || inst.op_type() == "transpose") {
if (p != PrecisionType::kFP16) {
UpdateTarget(inst, TargetType::kARM);
for (auto* x_out : x->outlinks) {
UpdateTensor(inst, in, x_out, TargetType::kARM);
}
}
}
if (inst.op_type() == "concat") {
std::cout << "concat target:" << TargetRepr(inst.kernels()[0]->target()) << std::endl;
std::cout << "concat p:" << PrecisionToStr(inst.kernels()[0]->precision()) << std::endl;
if (p != PrecisionType::kFP16) {
UpdateTarget(inst, TargetType::kARM);
UpdateTensor(inst, in, out, TargetType::kARM);
}
}
// if (inst.op_type() == "elementwise_mul") {
// for (auto* x_in : x->inlinks) {
// std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs());
// std::cout << "name: " << x_in->AsArg().name << std::endl;
// std::cout << "in_name: " << in_name << std::endl;
// if (in_name == "Y") {
// in = x_in;
// std::cout << "found y \n";
// // exit(-1);
// }
// }
// if ( p != PrecisionType::kFP16) {
// UpdateTarget(inst, TargetType::kARM);
// UpdateTensor(inst, in, out, TargetType::kARM);
// }
// }
std::vector<TargetType> in_types;
std::vector<TargetType> out_types;
for (auto* x_in : x->inlinks) {
......@@ -88,6 +163,21 @@ class KernelPlaceCorrectPass : public DebugPass {
<< "-- node name:" << node_name;
auto type = inst.picked_kernel().GetInputDeclType(arg_name);
// std::cout << arg_name <<" is weight:: " << std::to_string(x_in->AsArg().is_weight)
// << " is persist: " << std::to_string(x_in->AsArg().is_persist) << std::endl;
// std::cout << " type: "<< inst.op_type() << std::endl;
if (!x_in->AsArg().is_weight) {
auto p = x_in->AsArg().type->precision();
auto t = x_in->AsArg().type->target();
auto l = x_in->AsArg().type->layout();
// std::cout << "p:" << PrecisionToStr(p) << std::endl;
// std::cout << "t:" << TargetRepr(t) << std::endl;
// std::cout << "layout:" << DataLayoutToStr(l) << std::endl;
}
if (!x_in->AsArg().type) {
need_correct_place &= false;
} else {
......@@ -129,18 +219,69 @@ class KernelPlaceCorrectPass : public DebugPass {
need_correct_place &= (io_target_same && (in_types[0] != this_type));
if (need_correct_place) {
// update this kernel's valid place;
UpdateTarget(inst, in_types[0]);
// UpdateTarget(inst, in_types[0]);
}
}
}
// Update me's kUnk fields by other's fields.
void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) { // NOLINT
// std::cout << "1 kernels: " << std::to_string(inst.kernels().size()) << std::endl;
auto new_place = inst.place();
new_place.target = new_target;
if (new_target == TargetType::kARM) {
new_place.precision = PrecisionType::kFloat;
new_place.layout = DataLayoutType::kNCHW;
}
if (new_target == TargetType::kHost) {
new_place.precision = PrecisionType::kFloat;
new_place.layout = DataLayoutType::kNCHW;
}
std::vector<Place> places;
places.push_back(new_place);
inst.ResetKernels(places);
// std::cout << "2 kernels: " << std::to_string(inst.kernels().size()) << std::endl;
}
void UpdateTensor(mir::Node::Stmt& inst, Node* in, Node* out, TargetType new_target = TargetType::kUnk) {
auto get_argname = [&](
const std::string& node_name,
const std::map<std::string, std::vector<std::string>>& argname_map)
-> std::string {
for (auto& ele : argname_map) {
auto it =
std::find(ele.second.begin(), ele.second.end(), node_name);
if (it != ele.second.end()) return ele.first;
}
return "";
};
std::string arg_name = get_argname(out->AsArg().name, inst.op_info()->outputs());
std::string in_name = get_argname(in->AsArg().name, inst.op_info()->inputs());
auto type = inst.picked_kernel().GetInputDeclType(in_name);
auto tmp_ptype = in->AsArg().type->precision();
auto tmp_target = type->target();
auto tmp_layout = type->layout();
if (new_target == TargetType::kARM) {
tmp_target = TargetType::kARM;
tmp_ptype = PrecisionType::kFloat;
tmp_layout = DataLayoutType::kNCHW;
}
if (new_target == TargetType::kHost) {
tmp_target = TargetType::kHost;
tmp_ptype = PrecisionType::kFloat;
tmp_layout = DataLayoutType::kNCHW;
}
out->AsArg().type = LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout);
}
};
......
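`get_argname()` above maps a variable's graph name back to the operator argument slot it occupies. A standalone illustration of the same lookup (the example names in the comment are hypothetical):

#include <algorithm>
#include <map>
#include <string>
#include <vector>

// For a map like {"Out": {"fetch_0"}}, get_argname("fetch_0", m) == "Out";
// unknown names return "".
std::string get_argname(
    const std::string& node_name,
    const std::map<std::string, std::vector<std::string>>& argname_map) {
  for (auto& ele : argname_map) {
    auto it = std::find(ele.second.begin(), ele.second.end(), node_name);
    if (it != ele.second.end()) return ele.first;
  }
  return "";
}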
......@@ -144,6 +144,23 @@ class StaticKernelPickPass : public mir::StmtPass {
}
}
if (kernel.target() == TARGET(kFPGA)) {
final_score = 4000;
bool in_match = true;
for (size_t i = 0; i < in_names.size(); ++i) {
std::string tmp;
CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp));
if (in_types.count(in_names[i]) &&
in_types.at(in_names[i]) !=
kernel.GetInputDeclType(tmp)->precision()) {
in_match = false;
}
}
if (in_match) {
final_score = 5000;
}
}
VLOG(4) << "[score(final)]:" << final_score;
VLOG(2) << "-------- pick summary for " << instruct.op_type()
<< " --------";
......
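The new scoring branch makes FPGA kernels win the pick by default (base score 4000) and rewards an exact input-precision match with 5000. A condensed sketch of the rule, with the graph plumbing abstracted away:

int score_fpga_kernel(bool all_input_precisions_match) {
  int final_score = 4000;  // base preference for any FPGA kernel
  if (all_input_precisions_match) {
    final_score = 5000;    // exact precision match wins outright
  }
  return final_score;
}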
......@@ -134,6 +134,12 @@ void PrecisionCastPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Start from inputs of the graph, those should have place set.
std::list<Node*> nodes;
for (auto& node : graph->StmtTopologicalOrder()) {
// if (node->IsStmt()) {
// auto& s = node->AsStmt();
// std::cout << "type_precision type:" << s.op_type() << std::endl;
// }
// type_precision_cast_pass
nodes.push_back(node);
}
......@@ -231,6 +237,10 @@ void PrecisionCastPass::AddCastInst(
// create Op and kernels.
bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
std::string cast_type = in_persist ? "calib_once" : "calib";
// TODO
cast_type = "calib";
cast_op_output_arg->AsArg().is_persist = in_persist;
auto cast_op = LiteOpRegistry::Global().Create(cast_type);
CHECK(cast_op) << "create op [" << cast_op << "] failed";
......
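The pass normally emits `calib_once` for weights and other persistent tensors (calibrated a single time) and `calib` for activations (recalibrated every run). The `cast_type = "calib";` line above overrides that choice. A condensed sketch of the logic being short-circuited:

#include <string>

std::string pick_cast_kernel(bool is_weight, bool is_persist) {
  bool in_persist = is_weight || is_persist;
  std::string cast_type = in_persist ? "calib_once" : "calib";
  cast_type = "calib";  // this commit forces recalibration on every run
  return cast_type;
}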
......@@ -32,6 +32,12 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Start from inputs of the graph, those should have place set.
std::list<Node*> nodes;
for (auto& node : graph->StmtTopologicalOrder()) {
// if (node->IsStmt()) {
// auto& s = node->AsStmt();
// // std::cout << "type_target type:" << s.op_type() << std::endl;
// }else {
// // std::cout << "type_target not a statement \n";
// }
nodes.push_back(node);
}
......@@ -47,6 +53,7 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
ComplementInputs(graph.get(), node, in, &copied_nodes);
}
}
}
void TypeTargetTransformPass::ComplementInputs(
......@@ -127,7 +134,8 @@ void TypeTargetTransformPass::AddIoCopyInst(
auto* io_copy_inst = graph->NewInstructNode();
bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy";
// std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy";
std::string io_copy_type = "io_copy";
io_copy_output_arg->AsArg().is_persist = in_persist;
// create Op and kernels.
auto io_copy_op = LiteOpRegistry::Global().Create(io_copy_type);
......@@ -147,6 +155,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
// fix(MyPandaShaoxiang): select kernel that input_dcl_type same as in.type
bool is_found = false;
std::vector<std::unique_ptr<KernelBase>> selected_kernels;
std::cout << "kernels:" << std::to_string(kernels.size()) << std::endl;
for (auto& kernel : kernels) {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
......
......@@ -64,6 +64,7 @@ void ConcatCompute::Run() {
auto& param = Param<operators::ConcatParam>();
std::vector<lite::Tensor*> inputs = param.x;
CHECK_GE(inputs.size(), 1);
// std::cout << "concat size:" << std::to_string(inputs.size()) << std::endl;
auto* out = param.output;
int axis = param.axis;
auto* axis_tensor = param.axis_tensor;
......@@ -72,21 +73,22 @@ void ConcatCompute::Run() {
axis = axis_tensor_data[0];
}
switch (inputs.front()->precision()) {
case PRECISION(kFloat):
ConcatFunc<float>(inputs, axis, out);
break;
case PRECISION(kInt32):
ConcatFunc<int32_t>(inputs, axis, out);
break;
case PRECISION(kInt64):
ConcatFunc<int64_t>(inputs, axis, out);
break;
default:
LOG(FATAL) << "Concat does not implement for the "
<< "input type:"
<< static_cast<int>(inputs.front()->precision());
}
ConcatFunc<float>(inputs, axis, out);
// switch (inputs.front()->precision()) {
// case PRECISION(kFloat):
// ConcatFunc<float>(inputs, axis, out);
// break;
// case PRECISION(kInt32):
// ConcatFunc<int32_t>(inputs, axis, out);
// break;
// case PRECISION(kInt64):
// ConcatFunc<int64_t>(inputs, axis, out);
// break;
// default:
// LOG(FATAL) << "Concat does not implement for the "
// << "input type:"
// << static_cast<int>(inputs.front()->precision());
// }
}
} // namespace arm
......
......@@ -17,6 +17,8 @@ add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
add_kernel(interpolate_compute_fpga FPGA basic SRCS interpolate_compute.cc DEPS ${fpga_deps})
add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
......
......@@ -44,6 +44,17 @@ void CalibComputeFP16ToFp32::Run() {
return;
}
void CalibComputeFloat2Int::Run() {
auto& param = this->Param<operators::CalibParam>();
const auto* din = param.input->data<float>();
auto* dout = param.output->mutable_data<int>();
// param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
// TODO
auto out_lod = param.output->mutable_lod();
*out_lod = param.input->lod();
return;
}
} // namespace fpga
} // namespace kernels
} // namespace lite
......@@ -65,12 +76,28 @@ REGISTER_LITE_KERNEL(calib,
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(calib,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::CalibComputeFloat2Int,
float_2_int_fpga)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kARM),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kARM),
PRECISION(kInt32),
DATALAYOUT(kNCHW))})
.Finalize();
REGISTER_LITE_KERNEL(calib,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::CalibComputeFP16ToFp32,
fp16_to_fp32_fpga)
float_to_int_fpga)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......
......@@ -45,6 +45,18 @@ class CalibComputeFP16ToFp32
private:
};
class CalibComputeFloat2Int
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::CalibParam;
void Run() override;
~CalibComputeFloat2Int() override{};
private:
};
} // namespace fpga
} // namespace kernels
} // namespace lite
......
......@@ -47,7 +47,8 @@ void ConcatCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ConcatParam& concat_param = pe_.param();
Debugger::get_instance().registerOutput("concat", concat_param.output);
concat_param.output->flush();
// Debugger::get_instance().registerOutput("concat", concat_param.output);
#endif
}
......
......@@ -51,6 +51,11 @@ void ConvCompute::PrepareForRun() {
conv_param.activeParam.type = zynqmp::TYPE_RELU;
}
if (param.activation_param.Leaky_relu_alpha > 0.001) {
conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU;
conv_param.activeParam.leaky_relu_factor = param.activation_param.Leaky_relu_alpha;
}
dw_conv_pe_.init();
dw_conv_pe_.apply();
} else {
......@@ -72,9 +77,15 @@ void ConvCompute::PrepareForRun() {
conv_param.activeParam.type = zynqmp::TYPE_RELU;
}
if (param.activation_param.Leaky_relu_alpha > 0.001) {
conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU;
conv_param.activeParam.leaky_relu_factor = param.activation_param.Leaky_relu_alpha;
}
conv_pe_.init();
conv_pe_.apply();
}
// std::cout << "Leaky_relu_alpha:" << param.activation_param.Leaky_relu_alpha << std::endl;
}
void ConvCompute::Run() {
......
......@@ -88,13 +88,33 @@ void ElementwiseMulCompute::PrepareForRun() {
scale_.mutableData<zynqmp::float16>(zynqmp::FP16, shape);
zynqmp::float16* bias_data =
bias_.mutableData<zynqmp::float16>(zynqmp::FP16, shape);
float scale_value = param.Y->data<float>()[0];
zynqmp::float16 scale_value = 0;
if (param.Y->ZynqTensor()->dataType() == zynqmp::FP32) {
scale_value = zynqmp::float_to_half(param.Y->data<float>()[0]);
// std::cout << "FP32 \n";
} else {
scale_value = param.Y->data<zynqmp::float16>()[0];
// std::cout << "FP16 \n";
}
// std::cout << "channel:" << channel << std::endl;
// std::cout << "production:" << param.Y->dims().production() << std::endl;
// std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
// exit(-1);
for (int i = 0; i < channel; i++) {
if (param.Y->dims().production() != 1) {
scale_value = param.Y->ZynqTensor()->data<float>()[i];
// scale_value = param.Y->ZynqTensor()->data<zynqmp::float16>()[i];
if (param.Y->ZynqTensor()->dataType() == zynqmp::FP32) {
scale_value = zynqmp::float_to_half(param.Y->data<float>()[i]);
} else {
scale_value = param.Y->data<zynqmp::float16>()[i];
}
}
scale_data[i] = zynqmp::float_to_half(scale_value);
// std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
// exit(-1);
scale_data[i] = scale_value;
bias_data[i] = zero_;
}
......@@ -104,15 +124,17 @@ void ElementwiseMulCompute::PrepareForRun() {
void ElementwiseMulCompute::Run() {
auto& param = Param<operators::ElementwiseParam>();
// std::cout << "param.Y :" << param.Y->persistable() << std::endl;
if (!param.Y->persistable()) {
// TODO
scale_.copyFrom(param.Y->ZynqTensor());
scale_.invalidate();
scale_.flush();  // TODO
}
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ScaleParam& scale_param = pe_.param();
Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input);
Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
// Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input);
// Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
#endif
}
......@@ -181,3 +203,21 @@ REGISTER_LITE_KERNEL(elementwise_mul,
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_mul,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::ElementwiseMulCompute,
ew_mul_y_arm)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
\ No newline at end of file
......@@ -64,18 +64,18 @@ void FetchCompute::Run() {
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(fetch,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::FetchCompute,
fpga_host)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
// REGISTER_LITE_KERNEL(fetch,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::fpga::FetchCompute,
// fpga_host)
// .BindInput("X",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
// .Finalize();
REGISTER_LITE_KERNEL(fetch,
kFPGA,
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/interpolate_compute.h"
#include <string>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
void BilinearInterpCompute::Run() {
// auto& param = Param<operators::InterpolateParam>();
// lite::Tensor* X = param.X;
// lite::Tensor* OutSize = param.OutSize;
// auto SizeTensor = param.SizeTensor;
// auto Scale = param.Scale;
// lite::Tensor* Out = param.Out;
// float scale = param.scale;
// int out_w = param.out_w;
// int out_h = param.out_h;
// bool align_corners = param.align_corners;
// std::string interp_method = "Bilinear";
// lite::arm::math::interpolate(X,
// OutSize,
// SizeTensor,
// Scale,
// Out,
// out_h,
// out_w,
// scale,
// align_corners,
// interp_method);
}
void nearest_interp(const float16* src,
int w_in,
int h_in,
int c,
float16* dst,
int w_out,
int h_out,
float scale_x,
float scale_y,
bool with_align) {
float scale_w_new = (with_align)
? (static_cast<float>(w_in - 1) / (w_out - 1))
: (static_cast<float>(w_in) / (w_out));
float scale_h_new = (with_align)
? (static_cast<float>(h_in - 1) / (h_out - 1))
: (static_cast<float>(h_in) / (h_out));
if (with_align) {
for (int h = 0; h < h_out; ++h) {
float16* dst_p = dst + h * w_out * c;
int near_y = static_cast<int>(scale_h_new * h + 0.5);
for (int w = 0; w < w_out; ++w) {
int near_x = static_cast<int>(scale_w_new * w + 0.5);
// *dst_p++ = src[near_y * w_in + near_x];
const float16* src_n = src + (near_y * w_in + near_x) * c;
memcpy(dst_p, src_n, c * sizeof(float16));
dst_p += c;
}
}
} else {
for (int h = 0; h < h_out; ++h) {
float16* dst_p = dst + h * w_out * c;  // row stride includes channels (NHWC)
int near_y = static_cast<int>(scale_h_new * h);
for (int w = 0; w < w_out; ++w) {
int near_x = static_cast<int>(scale_w_new * w);
const float16* src_n = src + (near_y * w_in + near_x) * c;
memcpy(dst_p, src_n, c * sizeof(float16));
dst_p += c;
}
}
}
}
void NearestInterpCompute::PrepareForRun() {
auto& param = Param<operators::InterpolateParam>();
lite::Tensor* X = param.X;
lite::Tensor* OutSize = param.OutSize;
lite::Tensor* Out = param.Out;
Out->mutable_data<float16>();
zynqmp::ResizeParam& norm_param = pe_.param();
norm_param.input = X->ZynqTensor();
norm_param.output = Out->ZynqTensor();
pe_.init();
pe_.apply();
}
// TODO
inline std::vector<int> get_new_shape(
std::vector<const lite::Tensor*> list_new_shape_tensor) {
// get tensor from
std::vector<int> vec_new_shape;
for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
auto tensor = list_new_shape_tensor[i];
vec_new_shape.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
}
return vec_new_shape;
}
template <typename T>
inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
std::vector<T> vec_new_data;
auto* new_data = new_data_tensor->data<T>();
lite::Tensor cpu_starts_tensor;
vec_new_data =
std::vector<T>(new_data, new_data + new_data_tensor->dims().production());
return vec_new_data;
}
void interpolate(lite::Tensor* X,
lite::Tensor* OutSize,
std::vector<const lite::Tensor*> SizeTensor,
lite::Tensor* Scale,
lite::Tensor* Out,
int out_height,
int out_width,
float scale,
bool with_align,
std::string interpolate_type) {
int in_h = X->dims()[2];
int in_w = X->dims()[3];
if (SizeTensor.size() > 0) {
auto new_size = get_new_shape(SizeTensor);
out_height = new_size[0];
out_width = new_size[1];
} else {
auto scale_tensor = Scale;
if (scale_tensor != nullptr) {
auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
scale = scale_data[0];
}
if (scale > 0) {
out_height = static_cast<int>(in_h * scale);
out_width = static_cast<int>(in_w * scale);
}
auto out_size = OutSize;
if (out_size != nullptr) {
auto out_size_data = get_new_data_from_tensor<int>(out_size);
out_height = out_size_data[0];
out_width = out_size_data[1];
}
}
float height_scale = scale;
float width_scale = scale;
if (out_width > 0 && out_height > 0) {
// cast before dividing to avoid integer truncation
height_scale = static_cast<float>(out_height) / X->dims()[2];
width_scale = static_cast<float>(out_width) / X->dims()[3];
}
int num_cout = X->dims()[0];
int c_cout = X->dims()[1];
Out->Resize({num_cout, c_cout, out_height, out_width});
float16* dout = Out->mutable_data<float16>();
const float16* din = X->data<float16>();
int out_num = Out->dims()[0];
int out_c = Out->dims()[1];
int count = out_num;
int out_h = Out->dims()[2];
int out_w = Out->dims()[3];
int spatial_in = in_h * in_w;
int spatial_out = out_h * out_w;
for (int i = 0; i < count; ++i) {
nearest_interp(din + spatial_in * i,
in_w,
in_h,
out_c,
dout + spatial_out * i,
out_w,
out_h,
1.f / width_scale,
1.f / height_scale,
with_align);
}
}
void NearestInterpCompute::Run() {
auto& param = Param<operators::InterpolateParam>();
lite::Tensor* X = param.X;
lite::Tensor* OutSize = param.OutSize;
auto SizeTensor = param.SizeTensor;
auto Scale = param.Scale;
lite::Tensor* Out = param.Out;
float scale = param.scale;
int out_w = param.out_w;
int out_h = param.out_h;
bool align_corners = param.align_corners;
std::string interp_method = "";
X->ZynqTensor()->invalidate();  // TODO
X->ZynqTensor()->saveToFile("n_in", true);
interpolate(X,
OutSize,
SizeTensor,
Scale,
Out,
out_h,
out_w,
scale,
align_corners,
interp_method);
Out->ZynqTensor()->flush();
Out->ZynqTensor()->copyScaleFrom(X->ZynqTensor());
Out->ZynqTensor()->saveToFile("n_out", true);
}
} /* namespace fpga */
} /* namespace kernels */
} /* namespace lite */
} /* namespace paddle */
REGISTER_LITE_KERNEL(bilinear_interp,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::BilinearInterpCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(nearest_interp,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::NearestInterpCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
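A worked check of the `scale_w_new` formula in `nearest_interp()` above, with hypothetical sizes w_in = 4, w_out = 6:

// with_align:    scale_w_new = (4 - 1) / (6 - 1) = 0.6
//   near_x = int(0.6 * w + 0.5) for w = 0..5 -> 0, 1, 1, 2, 2, 3
//   (both endpoints of the input row are sampled)
// without align: scale_w_new = 4 / 6 ~= 0.667
//   near_x = int(0.667 * w)     for w = 0..5 -> 0, 0, 1, 2, 2, 3
float scale_w(bool with_align, int w_in, int w_out) {
  return with_align ? static_cast<float>(w_in - 1) / (w_out - 1)
                    : static_cast<float>(w_in) / w_out;
}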
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/backends/fpga/KD/pes/resize_pe.hpp"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class BilinearInterpCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void Run() override;
virtual ~BilinearInterpCompute() = default;
};
class NearestInterpCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
virtual ~NearestInterpCompute() = default;
private:
zynqmp::ResizePE pe_;
};
} /* namespace fpga */
} /* namespace kernels */
} /* namespace lite */
} /* namespace paddle */
......@@ -25,10 +25,17 @@ namespace fpga {
using float16 = zynqmp::float16;
void copy_properties(operators::IoCopyParam& param) {
param.y->set_persistable(param.x->persistable());
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
}
/*
* This kernel copies a tensor from host to FPGA space.
*/
class IoCopyHostToFpgaCompute
class IoCopyHostCHWToFpgaHWCCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override {
......@@ -37,52 +44,33 @@ class IoCopyHostToFpgaCompute
param.x->target() == TARGET(kFPGA));
param.x->ZynqTensor()->flush();
if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) {
param.y->mutable_data<int>();
param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
param.y->ZynqTensor()->flush();
copy_properties(param);
return;
}
if (param.x->ZynqTensor()->dataType() == zynqmp::FP32) {
param.y->mutable_data<float16>();
if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) {
zynqmp::Tensor tempTensor;
tempTensor.mutableData<float16>(zynqmp::FP16,
param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor());
tempTensor.setAligned(true);
tempTensor.unalignImage();
param.y->ZynqTensor()->copyFrom(&tempTensor);
} else {
param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
param.y->ZynqTensor()->invalidate();
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
param.y->mutable_data<float16>();
param.y->ZynqTensor()->setDataLocation(zynqmp::Device);
if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) {
zynqmp::Tensor tempTensor;
tempTensor.mutableData<float16>(zynqmp::FP16,
param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor());
tempTensor.setAligned(true);
tempTensor.unalignImage();
tempTensor.flush();
param.y->ZynqTensor()->copyFrom(&tempTensor);
} else {
param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
}
std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
*res = [](const std::map<std::string, const Type*>& inputs,
const std::string& out) -> const Type* {
CHECK(!inputs.empty());
auto* type = inputs.at("Input");
CHECK(type->target() == TARGET(kHost));
auto out_place = type->place();
out_place.target = TARGET(kFPGA);
auto* out_type = Type::Get(type->id(),
out_place.target,
out_place.precision,
out_place.layout,
out_place.device);
return out_type;
};
return res;
copy_properties(param);
param.y->ZynqTensor()->invalidate();
}
std::string doc() const override { return "Copy IO from HOST to FPGA"; }
......@@ -98,10 +86,11 @@ class IoCopyFpgaToHostCompute
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kFPGA));
param.x->ZynqTensor()->syncToDevice();
param.y->mutable_data<float>();
param.y->ZynqTensor()->setDataType(zynqmp::FP32);
param.x->ZynqTensor()->syncToDevice();
param.y->ZynqTensor()->setDataLocation(zynqmp::CPU);
if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) {
......@@ -115,10 +104,9 @@ class IoCopyFpgaToHostCompute
} else {
param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
param.y->ZynqTensor()->flush();
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
param.y->ZynqTensor()->invalidate();
copy_properties(param);
}
std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
......@@ -153,14 +141,16 @@ class IoCopyFpgaToHostCHWCompute
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kFPGA));
Tensor hwc;
hwc.Resize(param.y->dims());
float* hwc_data = hwc.mutable_data<float>();
float* chw_data = param.y->mutable_data<float>();
param.y->ZynqTensor()->setDataType(zynqmp::FP32);
param.x->ZynqTensor()->syncToDevice();
hwc.ZynqTensor()->setDataLocation(zynqmp::CPU);
param.y->ZynqTensor()->setDataLocation(zynqmp::CPU);
if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) {
zynqmp::Tensor tempTensor;
......@@ -168,10 +158,30 @@ class IoCopyFpgaToHostCHWCompute
param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor());
tempTensor.setAligned(true);
// tempTensor.saveToFile("temp_1", true);
tempTensor.unalignImage();
// tempTensor.saveToFile("temp_2", true);
hwc.ZynqTensor()->copyFrom(&tempTensor);
} else {
hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
// hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
float16* in_data = param.x->ZynqTensor()->data<float16>();
// float* f_data =
param.x->ZynqTensor()->flush();
float max = 0;
for (int i = 0; i < param.x->dims().production(); i++) {
float value = zynqmp::half_to_float(in_data[i]);
hwc_data[i] = value;
if (value < 0) {
value = -value;
}
if (value > max) {
max = value;
}
}
param.x->ZynqTensor()->scale()[0] = max / 127;
param.x->ZynqTensor()->scale()[1] = 127 / max;
}
int num = 1;
......@@ -188,10 +198,15 @@ class IoCopyFpgaToHostCHWCompute
dims.height(),
dims.width());
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
// param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
param.y->ZynqTensor()->flush();
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
copy_properties(param);
param.x->ZynqTensor()->invalidate();
param.x->ZynqTensor()->flush();
// hwc.ZynqTensor()->saveToFile("hwc", true);
// param.x->ZynqTensor()->saveToFile("io2_x", true);
// param.y->ZynqTensor()->saveToFile("io2_y", true);
}
std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
......@@ -201,52 +216,36 @@ class IoCopyFpgaToHostCHWCompute
} // namespace lite
} // namespace paddle
// REGISTER_LITE_KERNEL(io_copy,
// kFPGA,
// kAny,
// kAny,
// paddle::lite::kernels::fpga::IoCopyHostToFpgaCompute,
// host_to_device)
// .BindInput("Input",
// {LiteType::GetTensorTy(TARGET(kHost),
// PRECISION(kAny),
// DATALAYOUT(kAny))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kAny),
// DATALAYOUT(kAny))})
// .Finalize();
REGISTER_LITE_KERNEL(io_copy,
kFPGA,
kAny,
kAny,
paddle::lite::kernels::fpga::IoCopyHostToFpgaCompute,
host_to_device_any_any)
paddle::lite::kernels::fpga::IoCopyHostCHWToFpgaHWCCompute,
host_to_device)
.BindInput("Input",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt32),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy,
kFPGA,
kAny,
kAny,
paddle::lite::kernels::fpga::IoCopyHostCHWToFpgaHWCCompute,
host_float_chw_to_device_fp16_hwc)
.BindInput("Input", {LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// REGISTER_LITE_KERNEL(io_copy,
// kFPGA,
// kAny,
// kAny,
// paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute,
// device_to_host)
// .BindInput("Input",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kHost),
// PRECISION(kFloat),
// DATALAYOUT(kNHWC))})
// .Finalize();
REGISTER_LITE_KERNEL(io_copy,
kFPGA,
......@@ -311,3 +310,26 @@ REGISTER_LITE_KERNEL(io_copy,
// PRECISION(kAny),
// DATALAYOUT(kAny))})
// .Finalize();
// ==========================================================
// std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
// std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
// *res = [](const std::map<std::string, const Type*>& inputs,
// const std::string& out) -> const Type* {
// CHECK(!inputs.empty());
// auto* type = inputs.at("Input");
// CHECK(type->target() == TARGET(kHost));
// auto out_place = type->place();
// out_place.target = TARGET(kFPGA);
// auto* out_type = Type::Get(type->id(),
// out_place.target,
// out_place.precision,
// out_place.layout,
// out_place.device);
// return out_type;
// };
// return res;
// }
\ No newline at end of file
......@@ -94,6 +94,7 @@ T PolyIoU(const T* box1,
const size_t box_size,
const bool normalized) {
LOG(FATAL) << "PolyIoU not implemented.";
return *box1;
}
template <class T>
......@@ -128,34 +129,44 @@ void NMSFast(const Tensor& bbox,
std::vector<int>* selected_indices,
const bool normalized) {
// The total boxes for each instance.
// std::cout << "1\n";
int64_t num_boxes = bbox.dims()[0];
// std::cout << "1,1\n";
// 4: [xmin ymin xmax ymax]
// 8: [x1 y1 x2 y2 x3 y3 x4 y4]
// 16, 24, or 32: [x1 y1 x2 y2 ... xn yn], n = 8, 12 or 16
int64_t box_size = bbox.dims()[1];
// std::cout << "1,2\n";
std::vector<T> scores_data(num_boxes);
std::copy_n(scores.data<T>(), num_boxes, scores_data.begin());
// std::cout << "1,3\n";
std::vector<std::pair<T, int>> sorted_indices;
// std::cout << "1,4\n";
GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
// std::cout << "2\n";
selected_indices->clear();
T adaptive_threshold = nms_threshold;
const T* bbox_data = bbox.data<T>();
// std::cout << "3\n";
while (sorted_indices.size() != 0) {
const int idx = sorted_indices.front().second;
// std::cout << "4\n";
bool keep = true;
for (size_t k = 0; k < selected_indices->size(); ++k) {
// std::cout << "5\n";
if (keep) {
const int kept_idx = (*selected_indices)[k];
T overlap = T(0.);
// std::cout << "6\n";
// 4: [xmin ymin xmax ymax]
if (box_size == 4) {
overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size,
normalized);
}
// std::cout << "7\n";
// 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32
if (box_size == 8 || box_size == 16 || box_size == 24 ||
box_size == 32) {
......@@ -168,10 +179,13 @@ void NMSFast(const Tensor& bbox,
} else {
break;
}
// std::cout << "8\n";
}
// std::cout << "9\n";
if (keep) {
selected_indices->push_back(idx);
}
// std::cout << "10\n";
sorted_indices.erase(sorted_indices.begin());
if (keep && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta;
......@@ -195,21 +209,25 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
T score_threshold = static_cast<T>(param.score_threshold);
int num_det = 0;
int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1];
Tensor bbox_slice, score_slice;
for (int64_t c = 0; c < class_num; ++c) {
Tensor bbox_slice, score_slice;
if (c == background_label) continue;
// std::cout << "------ 1 \n";
if (scores_size == 3) {
// std::cout << "------ scores_size = 3 \n";
scores.Slice<T>(score_slice, c, c + 1);
bbox_slice = bboxes;
// bbox_slice = bboxes;
} else {
// std::cout << "------ scores_size != 3 \n";
score_slice.Resize({scores.dims()[0], 1});
bbox_slice.Resize({scores.dims()[0], 4});
SliceOneClass<T>(scores, c, &score_slice);
SliceOneClass<T>(bboxes, c, &bbox_slice);
}
NMSFast(bboxes,
NMSFast(bboxes,  // TODO
score_slice,
score_threshold,
nms_threshold,
......@@ -226,8 +244,6 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
*num_nmsed_out = num_det;
const T* scores_data = scores.data<T>();
if (keep_top_k > -1 && num_det > keep_top_k) {
Tensor score_slice;
const T* sdata;
std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
for (const auto& it : *indices) {
......@@ -275,7 +291,9 @@ void MultiClassOutput(const Tensor& scores,
const Tensor& bboxes,
const std::map<int, std::vector<int>>& selected_indices,
const int scores_size,
Tensor* outs) {
Tensor* outs,
int* oindices = nullptr,
const int offset = 0) {
int64_t class_num = scores.dims()[1];
int64_t predict_dim = scores.dims()[1];
int64_t box_size = bboxes.dims()[1];
......@@ -305,9 +323,15 @@ void MultiClassOutput(const Tensor& scores,
if (scores_size == 3) {
bdata = bboxes_data + idx * box_size;
odata[count * out_dim + 1] = sdata[idx]; // score
if (oindices != nullptr) {
oindices[count] = offset + idx;
}
} else {
bdata = bbox.data<T>() + idx * box_size;
odata[count * out_dim + 1] = *(scores_data + idx * class_num + label);
if (oindices != nullptr) {
oindices[count] = offset + idx * class_num + label;
}
}
// xmin, ymin, xmax, ymax or multi-points coordinates
std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T));
......@@ -318,36 +342,18 @@ void MultiClassOutput(const Tensor& scores,
void MulticlassNmsCompute::Run() {
auto& param = Param<operators::MulticlassNmsParam>();
auto* boxes_in = param.bboxes;
auto* scores_in = param.scores;
auto* boxes = param.bboxes;
auto* scores = param.scores;
auto* outs = param.out;
outs->mutable_data<float>();
auto score_dims = boxes_in->dims();
bool return_index = param.index ? true : false;
auto* index = param.index;
auto score_dims = scores->dims();
auto score_size = score_dims.size();
Tensor boxes_float;
Tensor scores_float;
boxes_float.Resize(boxes_in->dims());
scores_float.Resize(scores_in->dims());
boxes_float.mutable_data<float>();
scores_float.mutable_data<float>();
boxes_float.ZynqTensor()->copyFrom(boxes_in->ZynqTensor());
scores_float.ZynqTensor()->copyFrom(scores_in->ZynqTensor());
Tensor* boxes = &boxes_float;
Tensor* scores = &scores_float;
auto box_dims = boxes->dims();
int64_t box_dim = boxes->dims()[2];
std::vector<std::map<int, std::vector<int>>> all_indices;
std::vector<uint64_t> batch_starts = {0};
int64_t batch_size = score_dims[0];
int64_t box_dim = boxes->dims()[2];
int64_t out_dim = box_dim + 2;
int num_nmsed_out = 0;
Tensor boxes_slice, scores_slice;
......@@ -372,79 +378,104 @@ void MulticlassNmsCompute::Run() {
uint64_t num_kept = batch_starts.back();
if (num_kept == 0) {
outs->Resize({1, 1});
float* od = outs->mutable_data<float>();
od[0] = -1;
batch_starts = {0, 1};
if (return_index) {
outs->Resize({0, out_dim});
index->Resize({0, 1});
} else {
outs->Resize({1, 1});
float* od = outs->mutable_data<float>();
od[0] = -1;
batch_starts = {0, 1};
}
} else {
outs->Resize({static_cast<int64_t>(num_kept), out_dim});
outs->mutable_data<float>();
int offset = 0;
int* oindices = nullptr;
for (int i = 0; i < n; ++i) {
if (score_size == 3) {
scores->Slice<float>(scores_slice, i, i + 1);
boxes->Slice<float>(boxes_slice, i, i + 1);
scores_slice.Resize({score_dims[1], score_dims[2]});
boxes_slice.Resize({score_dims[2], box_dim});
if (return_index) {
offset = i * score_dims[2];
}
} else {
auto boxes_lod = boxes->lod().back();
scores->Slice<float>(scores_slice, boxes_lod[i], boxes_lod[i + 1]);
boxes->Slice<float>(boxes_slice, boxes_lod[i], boxes_lod[i + 1]);
if (return_index) {
offset = boxes_lod[i] * score_dims[1];
}
}
int64_t s = static_cast<int64_t>(batch_starts[i]);
int64_t e = static_cast<int64_t>(batch_starts[i + 1]);
if (e > s) {
Tensor out;
outs->Slice<float>(out, s, e);
MultiClassOutput<float>(
scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
if (return_index) {
index->Resize({static_cast<int64_t>(num_kept), 1});
int* output_idx = index->mutable_data<int>();
oindices = output_idx + s;
}
MultiClassOutput<float>(scores_slice,
boxes_slice,
all_indices[i],
score_dims.size(),
&out,
oindices,
offset);
// out.ZynqTensor()->saveToFile("nms_o", true);
outs->ZynqTensor()->copyFrom(out.ZynqTensor());
out.ZynqTensor()->saveToFile("nms_oo", true);
outs->ZynqTensor()->flush();
}
outs->Resize({static_cast<int64_t>(e - s), out_dim});
}
}
LoD lod;
lod.emplace_back(batch_starts);
if (return_index) {
index->set_lod(lod);
}
outs->set_lod(lod);
#ifdef FPGA_PRINT_TENSOR
Debugger::get_instance().registerOutput("boxes", boxes->ZynqTensor());
Debugger::get_instance().registerOutput("scores", scores->ZynqTensor());
Debugger::get_instance().registerOutput("nms", outs->ZynqTensor());
#endif
// boxes->ZynqTensor()->saveToFile("boxes", true);
// scores->ZynqTensor()->saveToFile("scores", true);
// outs->ZynqTensor()->saveToFile("nms", true);
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
// REGISTER_LITE_KERNEL(multiclass_nms,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::fpga::MulticlassNmsCompute,
// def)
// .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
// .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
// .Finalize();
REGISTER_LITE_KERNEL(multiclass_nms,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::MulticlassNmsCompute,
def2)
.BindInput("BBoxes",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Scores",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
def)
.BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Scores", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// REGISTER_LITE_KERNEL(multiclass_nms,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::fpga::MulticlassNmsCompute,
// def2)
// .BindInput("BBoxes",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindInput("Scores",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFloat),
// DATALAYOUT(kNHWC))})
// .Finalize();
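`NMSFast()` above shrinks the IoU threshold by `eta` after each kept box, but only while the threshold stays above 0.5. A worked trace with hypothetical values nms_threshold = 0.7, eta = 0.9:

// after keep #1: 0.7     > 0.5 -> 0.63
// after keep #2: 0.63    > 0.5 -> 0.567
// after keep #3: 0.567   > 0.5 -> 0.5103
// after keep #4: 0.5103  > 0.5 -> 0.45927
// after keep #5: 0.45927 <= 0.5 -> threshold stops adapting
float adaptive_threshold(float threshold, float eta, int kept) {
  for (int k = 0; k < kept; k++) {
    if (eta < 1 && threshold > 0.5f) threshold *= eta;
  }
  return threshold;
}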
......@@ -64,7 +64,7 @@ void PriorBoxCompute::PrepareForRun() {
float offset = param.offset;
std::vector<float> aspect_ratios_vec;
ExpandAspectRatios(aspect_ratio, is_flip, &aspect_ratios_vec);
int prior_num = aspect_ratios_vec.size() * min_size.size();
prior_num += max_size.size();
std::vector<std::string> order = param.order;
bool min_max_aspect_ratios_order = param.min_max_aspect_ratios_order;
......@@ -78,6 +78,7 @@ void PriorBoxCompute::PrepareForRun() {
param.boxes->mutable_data<float>();
param.variances->mutable_data<float>();
zynqmp::PriorBoxParam& priobox_param = pe_.param();
priobox_param.input = param.input->ZynqTensor();
priobox_param.image = param.image->ZynqTensor();
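// The feature-map tensor is now wired into the PE alongside the image,
// presumably so box generation can read the feature-map extents directly.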
......
......@@ -23,31 +23,64 @@ namespace fpga {
using float16 = zynqmp::float16;
void FlattenCompute::Run() {
auto& param = Param<operators::ReshapeParam>();
param.output->mutable_data<float16>();
auto x = param.x;
// auto actual_shape = param.actual_shape;
Tensor* actual_shape = nullptr;  // TODO(chonwhite) change it.
auto output = param.output;
auto x_dims = x->dims();
auto output_dims = output->dims();
if (actual_shape) {
auto actual_shape_dims = actual_shape->dims();
auto* actual_shape_data = actual_shape->data<int>();
auto shape = std::vector<int>(
actual_shape_data, actual_shape_data + actual_shape_dims.production());
// output_dims = lite::operators::ValidateShape(shape, x_dims); // TODO(chonwhite)
output->Resize(output_dims);
}
// if (param.inplace) {
// output->ShareDataWith(*x);
// } else {
// output->CopyDataFrom(*x);
// }
x->ZynqTensor()->unalignImage();
// x->ZynqTensor()->saveToFile("fi", true);
output->ZynqTensor()->copyFrom(x->ZynqTensor());
// output->ZynqTensor()->saveToFile("fo", true);
output->ZynqTensor()->flush();
output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned());
output->Resize(output_dims);
#ifdef FPGA_PRINT_TENSOR
Debugger::get_instance().registerOutput("flatten", output->ZynqTensor());
#endif
}
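// Reshape on FPGA is also realized as a copy: the source image is first
// de-aligned, then copied into an output that carries the new dimensions.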
void ReshapeCompute::Run() {
auto& param = Param<operators::ReshapeParam>();
auto x = param.x;
auto output = param.output;
auto output_dims = output->dims();
x->ZynqTensor()->unalignImage();
// x->ZynqTensor()->saveToFile("ri", true);
output->Resize(output_dims);
output->mutable_data<float16>();
// if (param.inplace) {
// output->ShareDataWith(*x);
// } else {
// output->CopyDataFrom(*x);
// }
output->ZynqTensor()->copyFrom(x->ZynqTensor());
// output->ZynqTensor()->saveToFile("ro", true);
output->ZynqTensor()->flush();
output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned());
#ifdef FPGA_PRINT_TENSOR
Debugger::get_instance().registerOutput("reshape",
output->ZynqTensor());
#endif
}
} // namespace fpga
......@@ -66,9 +99,9 @@ REGISTER_LITE_KERNEL(reshape,
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
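// "Shape" is a small host-side integer tensor, so it is bound to kHost/kAny
// rather than an FPGA FP16 image (same change in the registrations below).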
.BindInput("Shape",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......@@ -86,9 +119,9 @@ REGISTER_LITE_KERNEL(reshape2,
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Shape",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......@@ -103,16 +136,16 @@ REGISTER_LITE_KERNEL(flatten,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::FlattenCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Shape",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......@@ -123,16 +156,16 @@ REGISTER_LITE_KERNEL(flatten2,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::FlattenCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Shape",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......
......@@ -30,6 +30,14 @@ class ReshapeCompute
virtual ~ReshapeCompute() = default;
};
class FlattenCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void Run() override;
virtual ~FlattenCompute() = default;
};
class ReshapeComputeFpgaToHost
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
......
......@@ -29,8 +29,8 @@ void ScaleCompute::PrepareForRun() {
scale_param.output = param.output->ZynqTensor();
int channel = scale_param.input->shape().channel();
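// scale_ and bias_ are now kernel members (see the header change below), so
// their lifetime matches the kernel instead of leaking heap allocations.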
zynqmp::Tensor* scale = &scale_;
zynqmp::Tensor* bias = &bias_;
zynqmp::Shape shape(zynqmp::N, {channel});
float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape);
float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape);
......
......@@ -37,6 +37,8 @@ class ScaleCompute
private:
zynqmp::ScalePE pe_;
zynqmp::Tensor scale_;
zynqmp::Tensor bias_;
};
} // namespace fpga
......
......@@ -26,7 +26,8 @@ void SoftmaxCompute::PrepareForRun() {
zynqmp::SoftmaxParam& softmax_param = pe_.param();
auto& param = Param<operators::SoftmaxParam>();
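// The output is allocated as FP32, presumably so host/ARM consumers can
// read the probabilities without an extra FP16 conversion.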
// param.output->mutable_data<float16>();
param.output->mutable_data<float>();
softmax_param.input = param.x->ZynqTensor();
softmax_param.output = param.output->ZynqTensor();
pe_.init();
......@@ -34,9 +35,13 @@ void SoftmaxCompute::PrepareForRun() {
}
void SoftmaxCompute::Run() {
zynqmp::SoftmaxParam& softmax_param = pe_.param();
// softmax_param.input->saveToFile("softmax_in", true);
pe_.dispatch();
softmax_param.output->flush();
// softmax_param.output->saveToFile("softmax", true);
#ifdef FPGA_PRINT_TENSOR
Debugger::get_instance().registerOutput("softmax", softmax_param.output);
#endif
}
......@@ -57,7 +62,17 @@ REGISTER_LITE_KERNEL(softmax,
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
\ No newline at end of file
......@@ -34,17 +34,17 @@ void transposeCompute(operators::TransposeParam param) {
input_x->ZynqTensor()->invalidate();
input_x->ZynqTensor()->unalignImage();
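// The FP32 staging copy below is retired; the transpose now reads the FP16
// data in place.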
// Tensor float_input;
// float_input.Resize(input_x_dims);
// float_input.mutable_data<float>();
// float_input.ZynqTensor()->copyFrom(input_x->ZynqTensor());
const auto* input_x_data = input_x->data<float16>();
auto* out = param.output;
const auto axis = param.axis;
auto* out_data = out->mutable_data<float16>();
size_t ndim = axis.size();
std::vector<int> xdim(ndim);
......@@ -84,10 +84,11 @@ void transposeCompute(operators::TransposeParam param) {
void TransposeCompute::Run() {
auto& param = this->Param<param_t>();
param.output->mutable_data<zynqmp::float16>();
// param.x->ZynqTensor()->invalidate();
param.x->ZynqTensor()->unalignImage();
if (param.x->dims().size() != 4) {
transposeCompute(param);
param.output->ZynqTensor()->setAligned(param.x->ZynqTensor()->aligned());
} else {
param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
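// Non-4D tensors take the element-wise transpose above; 4D NHWC tensors are
// currently passed through as a plain copy.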
......@@ -96,14 +97,25 @@ void TransposeCompute::Run() {
// Transpose2
void Transpose2Compute::Run() {
auto& param = this->Param<param_t>();
param.output->mutable_data<float16>();
// param.x->ZynqTensor()->syncToCPU();
// param.x->ZynqTensor()->saveToFile("t_in", true);
param.x->ZynqTensor()->unalignImage();
// param.x->ZynqTensor()->saveToFile("t_unaligned", true);
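// flush() writes back any dirty CPU cache lines, invalidate() then discards
// them so the CPU-side transpose reads what the device actually produced.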
param.x->ZynqTensor()->flush();
param.x->ZynqTensor()->invalidate();
if (param.x->dims().size() != 4) {
transposeCompute(param);
param.output->ZynqTensor()->setAligned(param.x->ZynqTensor()->aligned());
} else {
param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
// param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
param.output->ZynqTensor()->flush();
// param.output->ZynqTensor()->saveToFile("Transpose2", true);
}
} // namespace fpga
......@@ -139,6 +151,8 @@ REGISTER_LITE_KERNEL(transpose2,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();