提交 a59d6fab 编写于 作者: C chonwhite

arm & fpga kernel works together

上级 2b32484a
...@@ -73,6 +73,7 @@ class Debugger { ...@@ -73,6 +73,7 @@ class Debugger {
op_config["nms"] = true; op_config["nms"] = true;
op_config["pb_boxes"] = true; op_config["pb_boxes"] = true;
op_config["pb_variances"] = true; op_config["pb_variances"] = true;
op_config["reshape"] = true;
op_config["softmax"] = true; op_config["softmax"] = true;
op_config["split"] = true; op_config["split"] = true;
} }
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace zynqmp {
/// A single dispatchable action.
///
/// Both scale accessors are empty stubs in the visible code: they accept a
/// pointer and do nothing with it.
class Action {
 public:
  /// Stub. Currently leaves `scale` untouched.
  /// @param scale caller-provided buffer; not dereferenced here.
  void readScale(float* scale) { (void)scale; }

  /// Stub. Currently leaves `scale` untouched.
  /// @param scale caller-provided buffer; not dereferenced here.
  void writeScale(float* scale) { (void)scale; }

 private:
  // Neither field is read or written anywhere in this class yet; -1 is the
  // initial value for both.
  int id_ = -1;
  int scaleIndex_ = -1;
};  // the terminating ';' is required after a class definition
}
}
\ No newline at end of file
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/dispatch/action.hpp"
#include <algorithm>
#include <vector>
namespace paddle {
namespace zynqmp {
class Transaction {
public:
void appendAction(Action* action) {
actions_.push_back(action);
};
void startTraction() {
};
private:
std::std::vector<Action*> actions_;
int id_ = -1;
}
}
}
\ No newline at end of file
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
namespace paddle {
namespace zynqmp {
class TransactionManager {
public:
static TransactionManager& get_instance() {
static TransactionManager s_instance;
return s_instance;
}
Transaction* getTransaction() {
if (currentTransaction_ == nullptr) {
currentTransaction_ = new Transaction();
transactions_.push_back(currentTransaction_);
}
return currentTransaction_;
};
void endTransaction() {
currentTransaction_ = nullptr;
}
private:
Transaction* currentTransaction_ = nullptr;
std::vector<Transaction*> transactions_;
}
}
}
\ No newline at end of file
...@@ -240,8 +240,8 @@ int8_t* format_filter(float* data_in, ...@@ -240,8 +240,8 @@ int8_t* format_filter(float* data_in,
for (int n = 0; n < num; n++) { for (int n = 0; n < num; n++) {
float* filter_start = data_in + n * chw; float* filter_start = data_in + n * chw;
int8_t* quantized_start = quantized_data + n * chw; int8_t* quantized_start = quantized_data + n * chw;
// float f_max = find_max(filter_start, chw); float f_max = find_max(filter_start, chw);
float f_max = max; // float f_max = max;
quantize(filter_start, quantized_start, chw, f_max); quantize(filter_start, quantized_start, chw, f_max);
filter_max.push_back(f_max); filter_max.push_back(f_max);
} }
......
...@@ -264,10 +264,10 @@ inline void format_filter(Tensor* filter, ...@@ -264,10 +264,10 @@ inline void format_filter(Tensor* filter,
quantized_filter->flush(); quantized_filter->flush();
fpga_free(quantized_data); fpga_free(quantized_data);
// for (size_t i = 0; i < max_values.size(); i++) { for (size_t i = 0; i < max_values.size(); i++) {
// // scales.push_back(max_values[i] / max_value); scales.push_back(max_values[i] / max_value);
// scales.push_back(1.0f); // scales.push_back(1.0f);
// } }
// filter->saveToFile("filter.txt"); // filter->saveToFile("filter.txt");
// std::ofstream ofs; // std::ofstream ofs;
...@@ -374,17 +374,15 @@ inline void split_filter_num(const ConvParam& c_param) { ...@@ -374,17 +374,15 @@ inline void split_filter_num(const ConvParam& c_param) {
std::vector<float> v; // TODO(chonwhite) change variable name; std::vector<float> v; // TODO(chonwhite) change variable name;
format_filter(&new_filter, &(conv_param->filter), param.groups, v, max); format_filter(&new_filter, &(conv_param->filter), param.groups, v, max);
conv_param->filter.setDataType(INT8); conv_param->filter.setDataType(INT8);
Tensor scale; Tensor scale;
Tensor bias; Tensor bias;
int chnnnel_start = i * filter_num_per_div; int chnnnel_start = i * filter_num_per_div;
Shape s_shape(NC, {1, filter_num}); Shape s_shape(NC, {1, filter_num});
float* scale_data = scale.mutableData<float>(FP32, s_shape); float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape); float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_num; n++) { for (int n = 0; n < filter_num; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start]; scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
} }
for (int n = 0; n < filter_num; n++) { for (int n = 0; n < filter_num; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start]; bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
...@@ -513,7 +511,7 @@ inline void pack_channel_filter(const ConvParam& c_param) { ...@@ -513,7 +511,7 @@ inline void pack_channel_filter(const ConvParam& c_param) {
float* scale_data = scale.mutableData<float>(FP32, s_shape); float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape); float* bias_data = bias.mutableData<float>(FP32, s_shape);
for (int n = 0; n < filter_current_pack; n++) { for (int n = 0; n < filter_current_pack; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start]; scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
} }
for (int n = 0; n < filter_current_pack; n++) { for (int n = 0; n < filter_current_pack; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start]; bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
......
...@@ -41,7 +41,9 @@ class InputPE : public PE { ...@@ -41,7 +41,9 @@ class InputPE : public PE {
src = &half_tensor; src = &half_tensor;
} }
output->mutableData<void>(); output->mutableData<void>();
src->alignImage(output, true); src->alignImage();
output->copyFrom(src);
// src->alignImage(output, true);
return true; return true;
} }
......
...@@ -103,6 +103,7 @@ class NormPE : public PE { ...@@ -103,6 +103,7 @@ class NormPE : public PE {
float_out.flush(); float_out.flush();
// float_out.saveToFile("normalize_", true); // float_out.saveToFile("normalize_", true);
param_.output->copyFrom(&float_out); param_.output->copyFrom(&float_out);
param_.output->flush();
} }
bool dispatch() { bool dispatch() {
......
...@@ -56,8 +56,8 @@ class OutputPE : public PE { ...@@ -56,8 +56,8 @@ class OutputPE : public PE {
fpga_reset(); fpga_reset();
auto max = fpga_get_memory_size_max(); // auto max = fpga_get_memory_size_max();
std::cout << "PL ===== Max: ===== :: " << max << std::endl; // std::cout << "PL ===== Max: ===== :: " << max << std::endl;
return true; return true;
} }
......
...@@ -241,7 +241,7 @@ void PriorBoxPE::compute_prior_box() { ...@@ -241,7 +241,7 @@ void PriorBoxPE::compute_prior_box() {
} }
boxes.flush(); boxes.flush();
boxes.syncToCPU(); // boxes.syncToCPU();
variances.flush(); variances.flush();
output_boxes->copyFrom(&boxes); output_boxes->copyFrom(&boxes);
output_variances->copyFrom(&variances); output_variances->copyFrom(&variances);
...@@ -261,11 +261,12 @@ bool PriorBoxPE::dispatch() { ...@@ -261,11 +261,12 @@ bool PriorBoxPE::dispatch() {
} }
param_.outputBoxes->copyFrom(this->cachedBoxes_); param_.outputBoxes->copyFrom(this->cachedBoxes_);
param_.outputVariances->copyFrom(this->cachedVariances_); param_.outputVariances->copyFrom(this->cachedVariances_);
param_.outputBoxes->flush(); param_.outputBoxes->flush();
param_.outputBoxes->syncToCPU(); // param_.outputBoxes->syncToCPU();
param_.outputVariances->flush(); param_.outputVariances->flush();
return true;
} }
} // namespace zynqmp } // namespace zynqmp
......
...@@ -35,6 +35,13 @@ class PriorBoxPE : public PE { ...@@ -35,6 +35,13 @@ class PriorBoxPE : public PE {
PriorBoxParam& param() { return param_; } PriorBoxParam& param() { return param_; }
// Frees both cached tensors, but only when cachedBoxes_ has been set;
// cachedVariances_ is released under that same condition.
~PriorBoxPE() {
  if (cachedBoxes_ == nullptr) {
    return;
  }
  delete cachedBoxes_;
  delete cachedVariances_;
}
private: private:
PriorBoxParam param_; PriorBoxParam param_;
Tensor* cachedBoxes_ = nullptr; Tensor* cachedBoxes_ = nullptr;
......
...@@ -73,9 +73,43 @@ class ResizePE : public PE { ...@@ -73,9 +73,43 @@ class ResizePE : public PE {
scale[0] = max / 127.0; scale[0] = max / 127.0;
scale[1] = 127.0 / max; scale[1] = 127.0 / max;
} }
void cpu_compute() {
Shape& in_shape = param_.input->shape();
Shape& out_shape = param_.output->shape();
int channel = in_shape.channel();
int in_height = in_shape.height();
int in_width = in_shape.width();
int out_width = out_shape.width();
int factor = out_shape.width() / in_shape.width();
param_.input->syncToCPU();
for (int h = 0; h < in_height; h++) {
for (int w = 0; w < in_width; w++) {
int src_index = in_width * channel * h + w * channel;
float16* src = param_.input->data<float16>() + src_index;
// std::cout << "src_index:" << src_index << std::endl;
for (int v = 0; v < factor; v++) {
for (int i =0; i < factor; i++) {
int dst_index = out_width * channel * h * factor +
out_width * channel * v +
w * channel * factor +
channel * i;
float16* dst = param_.output->data<float16>() + dst_index;
memcpy(dst, src, channel * sizeof(float16));
// std::cout << "dst_index:" << dst_index << std::endl;
}
}
}
}
param_.output->flush();
param_.output->copyScaleFrom(param_.input);
}
bool dispatch() { bool dispatch() {
bool ret = compute_fpga_resize(args_) == 0; cpu_compute();
// bool ret = compute_fpga_resize(args_) == 0;
return true; return true;
} }
......
...@@ -141,22 +141,26 @@ class ScalePE : public PE { ...@@ -141,22 +141,26 @@ class ScalePE : public PE {
Tensor* output = param_.output; Tensor* output = param_.output;
Tensor float_input; Tensor float_input;
float* image_addr = float_input.mutableData<float>(FP32, input->shape()); float* image_addr = float_input.mutableData<float>(FP32, input->shape());
input->syncToCPU(); // input->syncToCPU();
// input->invalidate();
float_input.copyFrom(input); float_input.copyFrom(input);
float16* data_out = output->data<float16>(); float16* data_out = output->data<float16>();
float* scale_data = param_.scale->data<float>(); float16* scale_data = param_.scale->data<float16>();
int wh = input->shape().width() * input->shape().height(); int wh = input->shape().width() * input->shape().height();
float16* in_data = input->data<float16>(); float16* in_data = input->data<float16>();
float max = 0; float max = 0;
for (int i = 0; i < wh; i++) { for (int i = 0; i < wh; i++) {
for (int c = 0; c < input->shape().channel(); c++) { for (int c = 0; c < input->shape().channel(); c++) {
int index = i * input->shape().channel() + c; int index = i * input->shape().channel() + c;
float value = half_to_float(in_data[index]) * scale_data[c]; float x = image_addr[index];
float y = half_to_float(scale_data[c]);
float value = x * y;
// std::cout << " x = " << std::to_string(x) << " y = " << std::to_string(y) << " v = " << std::to_string(value) << std::endl;
// float value = half_to_float(in_data[index]) * 19.3598f;
data_out[index] = float_to_half(value); data_out[index] = float_to_half(value);
if (value < 0) { if (value < 0) {
...@@ -167,24 +171,27 @@ class ScalePE : public PE { ...@@ -167,24 +171,27 @@ class ScalePE : public PE {
} }
} }
} }
// exit(-1);
output->flush(); output->flush();
output->scale()[0] = max / 127.0f; output->scale()[0] = max / 127.0f;
output->scale()[1] = 127.0f / max; output->scale()[1] = 127.0f / max;
} }
bool dispatch() { bool dispatch() {
if (param_.scale->dataType() == FP16) { // if (param_.scale->dataType() == FP16) {
DepthwiseConvParam& dw_param = dw_pe_.param(); // DepthwiseConvParam& dw_param = dw_pe_.param();
memcpy(dw_param.quantizedFilter()->mutableData<float16>(), // memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
param_.scale->data<float16>(), // param_.scale->data<float16>(),
param_.scale->shape().numel() * sizeof(float16)); // param_.scale->shape().numel() * sizeof(float16));
dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0]; // dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1]; // dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
// dw_param.quantizedFilter()->flush();
dw_param.quantizedFilter()->flush(); // }
} // param_.input->syncToDevice();
param_.input->syncToDevice(); // return dw_pe_.dispatch();
return dw_pe_.dispatch();
cpu_compute();
return true;
} }
ScaleParam& param() { return param_; } ScaleParam& param() { return param_; }
......
...@@ -154,6 +154,7 @@ bool SoftmaxPE::dispatch() { ...@@ -154,6 +154,7 @@ bool SoftmaxPE::dispatch() {
float_output.flush(); float_output.flush();
output->copyFrom(&float_output); output->copyFrom(&float_output);
output->flush();
return true; return true;
} }
......
...@@ -105,7 +105,7 @@ class SplitPE : public PE { ...@@ -105,7 +105,7 @@ class SplitPE : public PE {
in_stride, in_stride,
out_stride[axis]); out_stride[axis]);
input_offset += out_stride[axis]; input_offset += out_stride[axis];
// out->flush(); out->flush();
} }
return true; return true;
} }
......
...@@ -266,22 +266,25 @@ class Tensor { ...@@ -266,22 +266,25 @@ class Tensor {
return; return;
} }
BypassArgs args; BypassArgs args;
args.input_data_type = args.input_data_type = src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
args.output_data_type = dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; args.output_data_type = dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16;
args.input_layout_type = LAYOUT_HWC; args.input_layout_type = LAYOUT_HWC;
args.output_layout_type = LAYOUT_HWC; args.output_layout_type = LAYOUT_HWC;
args.image = {.address = src->data<void>(), args.image = {
.scale_address = src->scale(), .address = src->data<void>(),
.channels = (uint32_t)src->shape().numel(), .scale_address = src->scale(),
.width = 1, .channels = (uint32_t)src->shape().numel(),
.height = 1, .width = 1,
.pad_width = 0u, .height = 1,
.pad_height = 0u}; .pad_width = 0U,
.pad_height = 0U
};
ImageOutputArgs output = { ImageOutputArgs output = {
.address = data<void>(), .scale_address = scale(), .address = data<void>(),
.scale_address = scale(),
}; };
args.output = output; args.output = output;
size_t aligned_remainder = src->shape().numel() % 16; size_t aligned_remainder = src->shape().numel() % 16;
if (aligned_remainder > 0) { if (aligned_remainder > 0) {
...@@ -380,6 +383,10 @@ class Tensor { ...@@ -380,6 +383,10 @@ class Tensor {
} }
void save_file_with_name(std::string path) { void save_file_with_name(std::string path) {
// std::cout << "saving file: " << path << std::endl;
void* add = (void*)this;
// printf("tensor @: %p data: %p \n", (void *)add, (void*)data<void>());
// return;
std::ofstream ofs; std::ofstream ofs;
ofs.open(path); ofs.open(path);
ofs << scale()[0] << " / " << scale()[1] << std::endl; ofs << scale()[0] << " / " << scale()[1] << std::endl;
...@@ -399,8 +406,15 @@ class Tensor { ...@@ -399,8 +406,15 @@ class Tensor {
if (dataType_ == INT32) { if (dataType_ == INT32) {
value = data<int32_t>()[i]; value = data<int32_t>()[i];
} }
if (i < 10) {
std::cout << value << ",";
}
ofs << value << std::endl; ofs << value << std::endl;
} }
usleep(30000);
ofs.close(); ofs.close();
} }
...@@ -451,6 +465,7 @@ class Tensor { ...@@ -451,6 +465,7 @@ class Tensor {
value = half_to_float(tensor.data<float16>()[i]); value = half_to_float(tensor.data<float16>()[i]);
} }
os << value << " "; os << value << " ";
} }
os << "\n"; os << "\n";
return os; return os;
......
...@@ -102,6 +102,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) { ...@@ -102,6 +102,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) {
Resize(other.dims()); Resize(other.dims());
auto shape = other.zynq_tensor_->shape(); auto shape = other.zynq_tensor_->shape();
zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape); zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape);
precision_ = other.precision_;
// this->ZynqTensor()->copyFrom(other.ZynqTensor()); // this->ZynqTensor()->copyFrom(other.ZynqTensor());
memcpy(this->ZynqTensor()->data<void>(), memcpy(this->ZynqTensor()->data<void>(),
......
...@@ -109,6 +109,7 @@ class TensorLite { ...@@ -109,6 +109,7 @@ class TensorLite {
template <typename T, typename R = T> template <typename T, typename R = T>
const R *data() const { const R *data() const {
return zynq_tensor_->data<R>() + offset_; return zynq_tensor_->data<R>() + offset_;
// return zynq_tensor_->data<R>();
} }
void Resize(const DDimLite &ddim) { dims_ = ddim; } void Resize(const DDimLite &ddim) { dims_ = ddim; }
...@@ -198,7 +199,8 @@ class TensorLite { ...@@ -198,7 +199,8 @@ class TensorLite {
// set values of precision_ and persistable_ after updating it. // set values of precision_ and persistable_ after updating it.
// If your tensor is just a temp tensor, such as activations, // If your tensor is just a temp tensor, such as activations,
// you can ignore these two attributes. // you can ignore these two attributes.
PrecisionType precision_{PrecisionType::kUnk}; // PrecisionType precision_{PrecisionType::kUnk};
PrecisionType precision_{PrecisionType::kFloat};
bool persistable_{false}; bool persistable_{false};
DDimLite dims_; DDimLite dims_;
...@@ -235,6 +237,28 @@ zynqmp::DataType get_date_type() { ...@@ -235,6 +237,28 @@ zynqmp::DataType get_date_type() {
return data_type; return data_type;
} }
// Maps the element type T to a PrecisionType:
//   float -> kFloat, zynqmp::float16 -> kFP16, int / int32_t -> kInt32,
//   int8_t -> kInt8, anything else -> kUnk.
template <typename T>
PrecisionType get_precistion_type() {
  const std::type_info& requested = typeid(T);

  if (requested == typeid(int8_t)) {
    return PrecisionType::kInt8;
  }
  if (requested == typeid(int32_t) || requested == typeid(int)) {
    return PrecisionType::kInt32;
  }
  if (requested == typeid(zynqmp::float16)) {
    return PrecisionType::kFP16;
  }
  if (requested == typeid(float)) {
    return PrecisionType::kFloat;
  }
  return PrecisionType::kUnk;
}
template <typename T, typename R> template <typename T, typename R>
R *TensorLite::mutable_data() { R *TensorLite::mutable_data() {
std::vector<int> v; std::vector<int> v;
...@@ -261,6 +285,7 @@ R *TensorLite::mutable_data() { ...@@ -261,6 +285,7 @@ R *TensorLite::mutable_data() {
} }
zynqmp::Shape input_shape(layout_type, v); zynqmp::Shape input_shape(layout_type, v);
zynqmp::DataType data_type = get_date_type<T>(); zynqmp::DataType data_type = get_date_type<T>();
precision_ = get_precistion_type<T>();
if (zynq_tensor_.get() == nullptr) { if (zynq_tensor_.get() == nullptr) {
zynq_tensor_.reset(new zynqmp::Tensor()); zynq_tensor_.reset(new zynqmp::Tensor());
......
...@@ -50,6 +50,7 @@ class KernelPlaceCorrectPass : public DebugPass { ...@@ -50,6 +50,7 @@ class KernelPlaceCorrectPass : public DebugPass {
VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"]; VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global(); VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
// std::cout << ""
for (auto& x : graph->StmtTopologicalOrder()) { for (auto& x : graph->StmtTopologicalOrder()) {
auto& inst = x->AsStmt(); auto& inst = x->AsStmt();
// The IoCopyOp is a tool operator, it won't support the type inference. // The IoCopyOp is a tool operator, it won't support the type inference.
...@@ -77,6 +78,80 @@ class KernelPlaceCorrectPass : public DebugPass { ...@@ -77,6 +78,80 @@ class KernelPlaceCorrectPass : public DebugPass {
bool need_correct_place = true; bool need_correct_place = true;
auto in = x->inlinks.front();
auto out = x->outlinks.front();
auto p = in->AsArg().type->precision();
std::string node_name = out->AsArg().name;
std::string arg_name = get_argname(node_name, inst.op_info()->outputs());
auto op_type = inst.op_type();
if (op_type == "reshape" || op_type == "reshape2") {
for (auto* x_in : x->inlinks) {
std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs());
// std::cout << "name: " << x_in->AsArg().name << std::endl;
// std::cout << "in_name: " << in_name << std::endl;
if (in_name == "X") {
in = x_in;
std::cout << "found input \n";
// exit(-1);
}
}
p = in->AsArg().type->precision();
if ( p != PrecisionType::kFP16) {
// std::cout << "found an arm ............... : " << inst.kernels().size() << std::endl;
// std::cout << "tt:" << TargetRepr(inst.kernels()[0]->target()) << std::endl;
UpdateTarget(inst, TargetType::kHost);
UpdateTensor(inst, in, out, TargetType::kHost);
}
}
if (inst.op_type() == "fetch") {
UpdateTarget(inst, TargetType::kFPGA);
}
if (inst.op_type() == "split" || inst.op_type() == "transpose") {
if ( p != PrecisionType::kFP16) {
UpdateTarget(inst, TargetType::kARM);
for (auto* x_out : x->outlinks) {
UpdateTensor(inst, in, x_out, TargetType::kARM);
}
}
}
if (inst.op_type() == "concat") {
std::cout << "concat target:" << TargetRepr(inst.kernels()[0]->target()) << std::endl;
std::cout << "concat p:" << PrecisionToStr(inst.kernels()[0]->precision()) << std::endl;
if ( p != PrecisionType::kFP16) {
UpdateTarget(inst, TargetType::kARM);
UpdateTensor(inst, in, out, TargetType::kARM);
}
}
// if (inst.op_type() == "elementwise_mul") {
// for (auto* x_in : x->inlinks) {
// std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs());
// std::cout << "name: " << x_in->AsArg().name << std::endl;
// std::cout << "in_name: " << in_name << std::endl;
// if (in_name == "Y") {
// in = x_in;
// std::cout << "found y \n";
// // exit(-1);
// }
// }
// if ( p != PrecisionType::kFP16) {
// UpdateTarget(inst, TargetType::kARM);
// UpdateTensor(inst, in, out, TargetType::kARM);
// }
// }
std::vector<TargetType> in_types; std::vector<TargetType> in_types;
std::vector<TargetType> out_types; std::vector<TargetType> out_types;
for (auto* x_in : x->inlinks) { for (auto* x_in : x->inlinks) {
...@@ -88,6 +163,21 @@ class KernelPlaceCorrectPass : public DebugPass { ...@@ -88,6 +163,21 @@ class KernelPlaceCorrectPass : public DebugPass {
<< "-- node name:" << node_name; << "-- node name:" << node_name;
auto type = inst.picked_kernel().GetInputDeclType(arg_name); auto type = inst.picked_kernel().GetInputDeclType(arg_name);
// std::cout << arg_name <<" is weight:: " << std::to_string(x_in->AsArg().is_weight)
// << " is persist: " << std::to_string(x_in->AsArg().is_persist) << std::endl;
// std::cout << " type: "<< inst.op_type() << std::endl;
if (!x_in->AsArg().is_weight) {
auto p = x_in->AsArg().type->precision();
auto t = x_in->AsArg().type->target();
auto l = x_in->AsArg().type->layout();
// std::cout << "p:" << PrecisionToStr(p) << std::endl;
// std::cout << "t:" << TargetRepr(t) << std::endl;
// std::cout << "layout:" << DataLayoutToStr(l) << std::endl;
}
if (!x_in->AsArg().type) { if (!x_in->AsArg().type) {
need_correct_place &= false; need_correct_place &= false;
} else { } else {
...@@ -129,18 +219,69 @@ class KernelPlaceCorrectPass : public DebugPass { ...@@ -129,18 +219,69 @@ class KernelPlaceCorrectPass : public DebugPass {
need_correct_place &= (io_target_same && (in_types[0] != this_type)); need_correct_place &= (io_target_same && (in_types[0] != this_type));
if (need_correct_place) { if (need_correct_place) {
// update this kernel's valid place; // update this kernel's valid place;
UpdateTarget(inst, in_types[0]); // UpdateTarget(inst, in_types[0]);
} }
} }
} }
// Update me's kUnk fields by other's fields. // Update me's kUnk fields by other's fields.
void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) { // NOLINT void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) { // NOLINT
// std::cout << "1 kernels: " << std::to_string(inst.kernels().size()) << std::endl;
auto new_place = inst.place(); auto new_place = inst.place();
new_place.target = new_target; new_place.target = new_target;
if (new_target == TargetType::kARM) {
new_place.precision = PrecisionType::kFloat;
new_place.layout = DataLayoutType::kNCHW;
}
if (new_target == TargetType::kHost) {
new_place.precision = PrecisionType::kFloat;
new_place.layout = DataLayoutType::kNCHW;
}
std::vector<Place> places; std::vector<Place> places;
places.push_back(new_place); places.push_back(new_place);
inst.ResetKernels(places); inst.ResetKernels(places);
// std::cout << "2 kernels: " << std::to_string(inst.kernels().size()) << std::endl;
}
// Rewrites the type attached to the `out` argument node.
//
// The new type starts from the target and layout that the picked kernel
// declares for the input slot `in` is bound to, plus the precision currently
// recorded on `in`. When `new_target` is kARM or kHost, all three are
// overridden with (new_target, kFloat, kNCHW). For any other value, including
// the default kUnk, the starting values are used unchanged.
//
// @param inst        statement whose op_info and picked kernel are consulted.
// @param in          input argument node; its `type` is dereferenced.
// @param out         output argument node whose `type` is replaced.
// @param new_target  target to force; only kARM and kHost have an effect.
void UpdateTensor(mir::Node::Stmt& inst, Node* in, Node* out,
                  TargetType new_target = TargetType::kUnk) {
  // Finds the slot name (map key) whose variable list contains `node_name`;
  // returns "" when no slot lists it.
  auto get_argname = [](
      const std::string& node_name,
      const std::map<std::string, std::vector<std::string>>& argname_map)
      -> std::string {
    for (auto& ele : argname_map) {
      auto it = std::find(ele.second.begin(), ele.second.end(), node_name);
      if (it != ele.second.end()) return ele.first;
    }
    return "";
  };

  std::string in_name =
      get_argname(in->AsArg().name, inst.op_info()->inputs());

  auto type = inst.picked_kernel().GetInputDeclType(in_name);
  auto tmp_ptype = in->AsArg().type->precision();
  auto tmp_target = type->target();
  auto tmp_layout = type->layout();

  // ARM and Host are handled identically: float precision, NCHW layout.
  if (new_target == TargetType::kARM || new_target == TargetType::kHost) {
    tmp_target = new_target;
    tmp_ptype = PrecisionType::kFloat;
    tmp_layout = DataLayoutType::kNCHW;
  }

  out->AsArg().type = LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout);
}
}; };
......
...@@ -144,6 +144,23 @@ class StaticKernelPickPass : public mir::StmtPass { ...@@ -144,6 +144,23 @@ class StaticKernelPickPass : public mir::StmtPass {
} }
} }
if (kernel.target() == TARGET(kFPGA)) {
final_score = 4000;
bool in_match = true;
for (size_t i = 0; i < in_names.size(); ++i) {
std::string tmp;
CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp));
if (in_types.count(in_names[i]) &&
in_types.at(in_names[i]) !=
kernel.GetInputDeclType(tmp)->precision()) {
in_match = false;
}
}
if (in_match) {
final_score = 5000;
}
}
VLOG(4) << "[score(final)]:" << final_score; VLOG(4) << "[score(final)]:" << final_score;
VLOG(2) << "-------- pick summary for " << instruct.op_type() VLOG(2) << "-------- pick summary for " << instruct.op_type()
<< " --------"; << " --------";
......
...@@ -134,6 +134,12 @@ void PrecisionCastPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -134,6 +134,12 @@ void PrecisionCastPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Start from inputs of the graph, those should have place set. // Start from inputs of the graph, those should have place set.
std::list<Node*> nodes; std::list<Node*> nodes;
for (auto& node : graph->StmtTopologicalOrder()) { for (auto& node : graph->StmtTopologicalOrder()) {
// if (node->IsStmt()) {
// auto& s = node->AsStmt();
// std::cout << "type_precision type:" << s.op_type() << std::endl;
// }
// type_precision_cast_pass
nodes.push_back(node); nodes.push_back(node);
} }
...@@ -231,6 +237,10 @@ void PrecisionCastPass::AddCastInst( ...@@ -231,6 +237,10 @@ void PrecisionCastPass::AddCastInst(
// create Op and kernels. // create Op and kernels.
bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
std::string cast_type = in_persist ? "calib_once" : "calib"; std::string cast_type = in_persist ? "calib_once" : "calib";
// TODO
cast_type = "calib";
cast_op_output_arg->AsArg().is_persist = in_persist; cast_op_output_arg->AsArg().is_persist = in_persist;
auto cast_op = LiteOpRegistry::Global().Create(cast_type); auto cast_op = LiteOpRegistry::Global().Create(cast_type);
CHECK(cast_op) << "create op [" << cast_op << "] failed"; CHECK(cast_op) << "create op [" << cast_op << "] failed";
......
...@@ -32,6 +32,12 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -32,6 +32,12 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Start from inputs of the graph, those should have place set. // Start from inputs of the graph, those should have place set.
std::list<Node*> nodes; std::list<Node*> nodes;
for (auto& node : graph->StmtTopologicalOrder()) { for (auto& node : graph->StmtTopologicalOrder()) {
// if (node->IsStmt()) {
// auto& s = node->AsStmt();
// // std::cout << "type_target type:" << s.op_type() << std::endl;
// }else {
// // std::cout << "type_target not a statement \n";
// }
nodes.push_back(node); nodes.push_back(node);
} }
...@@ -47,6 +53,7 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -47,6 +53,7 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
ComplementInputs(graph.get(), node, in, &copied_nodes); ComplementInputs(graph.get(), node, in, &copied_nodes);
} }
} }
} }
void TypeTargetTransformPass::ComplementInputs( void TypeTargetTransformPass::ComplementInputs(
...@@ -127,7 +134,8 @@ void TypeTargetTransformPass::AddIoCopyInst( ...@@ -127,7 +134,8 @@ void TypeTargetTransformPass::AddIoCopyInst(
auto* io_copy_inst = graph->NewInstructNode(); auto* io_copy_inst = graph->NewInstructNode();
bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy"; // std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy";
std::string io_copy_type = "io_copy";
io_copy_output_arg->AsArg().is_persist = in_persist; io_copy_output_arg->AsArg().is_persist = in_persist;
// create Op and kernels. // create Op and kernels.
auto io_copy_op = LiteOpRegistry::Global().Create(io_copy_type); auto io_copy_op = LiteOpRegistry::Global().Create(io_copy_type);
...@@ -147,6 +155,7 @@ void TypeTargetTransformPass::AddIoCopyInst( ...@@ -147,6 +155,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
// fix(MyPandaShaoxiang): select kernel that input_dcl_type same as in.type // fix(MyPandaShaoxiang): select kernel that input_dcl_type same as in.type
bool is_found = false; bool is_found = false;
std::vector<std::unique_ptr<KernelBase>> selected_kernels; std::vector<std::unique_ptr<KernelBase>> selected_kernels;
std::cout << "kernels:" << std::to_string(kernels.size()) << std::endl;
for (auto& kernel : kernels) { for (auto& kernel : kernels) {
const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
......
...@@ -64,6 +64,7 @@ void ConcatCompute::Run() { ...@@ -64,6 +64,7 @@ void ConcatCompute::Run() {
auto& param = Param<operators::ConcatParam>(); auto& param = Param<operators::ConcatParam>();
std::vector<lite::Tensor*> inputs = param.x; std::vector<lite::Tensor*> inputs = param.x;
CHECK_GE(inputs.size(), 1); CHECK_GE(inputs.size(), 1);
// std::cout << "concat size:" << std::to_string(inputs.size()) << std::endl;
auto* out = param.output; auto* out = param.output;
int axis = param.axis; int axis = param.axis;
auto* axis_tensor = param.axis_tensor; auto* axis_tensor = param.axis_tensor;
...@@ -72,21 +73,22 @@ void ConcatCompute::Run() { ...@@ -72,21 +73,22 @@ void ConcatCompute::Run() {
axis = axis_tensor_data[0]; axis = axis_tensor_data[0];
} }
switch (inputs.front()->precision()) { ConcatFunc<float>(inputs, axis, out);
case PRECISION(kFloat): // switch (inputs.front()->precision()) {
ConcatFunc<float>(inputs, axis, out); // case PRECISION(kFloat):
break; // ConcatFunc<float>(inputs, axis, out);
case PRECISION(kInt32): // break;
ConcatFunc<int32_t>(inputs, axis, out); // case PRECISION(kInt32):
break; // ConcatFunc<int32_t>(inputs, axis, out);
case PRECISION(kInt64): // break;
ConcatFunc<int64_t>(inputs, axis, out); // case PRECISION(kInt64):
break; // ConcatFunc<int64_t>(inputs, axis, out);
default: // break;
LOG(FATAL) << "Concat does not implement for the " // default:
<< "input type:" // LOG(FATAL) << "Concat does not implement for the "
<< static_cast<int>(inputs.front()->precision()); // << "input type:"
} // << static_cast<int>(inputs.front()->precision());
// }
} }
} // namespace arm } // namespace arm
......
...@@ -17,6 +17,8 @@ add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps}) ...@@ -17,6 +17,8 @@ add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps}) add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps}) add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
add_kernel(interpolate_compute_fpga FPGA basic SRCS interpolate_compute.cc DEPS ${fpga_deps})
add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps}) add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps}) add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
......
...@@ -44,6 +44,17 @@ void CalibComputeFP16ToFp32::Run() { ...@@ -44,6 +44,17 @@ void CalibComputeFP16ToFp32::Run() {
return; return;
} }
// Run() of the calib kernel registered under the float_2_int_fpga alias.
// As written it fetches the float input pointer, requests a mutable int
// output buffer and copies the input LoD to the output; no element values
// are written to `dout` in this function.
// NOTE(review): the float -> int conversion itself looks unimplemented (see
// the TODO below) - confirm before relying on this kernel's output data.
void CalibComputeFloat2Int::Run() {
auto& param = this->Param<operators::CalibParam>();
// `din` is currently unused; it is only read here.
const auto* din = param.input->data<float>();
// `dout` is obtained but never filled.
auto* dout = param.output->mutable_data<int>();
// param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
// TODO: convert the values of `din` into `dout`.
// Propagate the level-of-detail information from input to output.
auto out_lod = param.output->mutable_lod();
*out_lod = param.input->lod();
return;
}
} // namespace fpga } // namespace fpga
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
...@@ -65,12 +76,28 @@ REGISTER_LITE_KERNEL(calib, ...@@ -65,12 +76,28 @@ REGISTER_LITE_KERNEL(calib,
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.Finalize(); .Finalize();
// Registers CalibComputeFloat2Int as a "calib" kernel under the alias
// float_2_int_fpga. "Input" is bound to an ARM float NCHW tensor and "Out" to
// an ARM int32 NCHW tensor; the kernel itself is declared as kFPGA/kFP16/kNHWC.
REGISTER_LITE_KERNEL(calib,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::CalibComputeFloat2Int,
float_2_int_fpga)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kARM),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kARM),
PRECISION(kInt32),
DATALAYOUT(kNCHW))})
.Finalize();
REGISTER_LITE_KERNEL(calib, REGISTER_LITE_KERNEL(calib,
kFPGA, kFPGA,
kFP16, kFP16,
kNHWC, kNHWC,
paddle::lite::kernels::fpga::CalibComputeFP16ToFp32, paddle::lite::kernels::fpga::CalibComputeFP16ToFp32,
fp16_to_fp32_fpga) float_to_int_fpga)
.BindInput("Input", .BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA), {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16), PRECISION(kFP16),
......
...@@ -45,6 +45,18 @@ class CalibComputeFP16ToFp32 ...@@ -45,6 +45,18 @@ class CalibComputeFP16ToFp32
private: private:
}; };
// Kernel class for the calib op variant whose Run() takes a float input and
// an int output. Declared on the FPGA target with FP16 precision and NHWC
// layout; the operator parameters are operators::CalibParam.
class CalibComputeFloat2Int
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::CalibParam;

  ~CalibComputeFloat2Int() override = default;

  // Defined in the matching .cc file.
  void Run() override;
};
} // namespace fpga } // namespace fpga
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
...@@ -47,7 +47,8 @@ void ConcatCompute::Run() { ...@@ -47,7 +47,8 @@ void ConcatCompute::Run() {
pe_.dispatch(); pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR #ifdef FPGA_PRINT_TENSOR
zynqmp::ConcatParam& concat_param = pe_.param(); zynqmp::ConcatParam& concat_param = pe_.param();
Debugger::get_instance().registerOutput("concat", concat_param.output); concat_param.output->flush();
// Debugger::get_instance().registerOutput("concat", concat_param.output);
#endif #endif
} }
......
...@@ -51,6 +51,11 @@ void ConvCompute::PrepareForRun() { ...@@ -51,6 +51,11 @@ void ConvCompute::PrepareForRun() {
conv_param.activeParam.type = zynqmp::TYPE_RELU; conv_param.activeParam.type = zynqmp::TYPE_RELU;
} }
if (param.activation_param.Leaky_relu_alpha > 0.001) {
conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU;
conv_param.activeParam.leaky_relu_factor = param.activation_param.Leaky_relu_alpha;
}
dw_conv_pe_.init(); dw_conv_pe_.init();
dw_conv_pe_.apply(); dw_conv_pe_.apply();
} else { } else {
...@@ -72,9 +77,15 @@ void ConvCompute::PrepareForRun() { ...@@ -72,9 +77,15 @@ void ConvCompute::PrepareForRun() {
conv_param.activeParam.type = zynqmp::TYPE_RELU; conv_param.activeParam.type = zynqmp::TYPE_RELU;
} }
if (param.activation_param.Leaky_relu_alpha > 0.001) {
conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU;
conv_param.activeParam.leaky_relu_factor = param.activation_param.Leaky_relu_alpha;
}
conv_pe_.init(); conv_pe_.init();
conv_pe_.apply(); conv_pe_.apply();
} }
// std::cout << "Leaky_relu_alpha:" << param.activation_param.Leaky_relu_alpha << std::endl;
} }
void ConvCompute::Run() { void ConvCompute::Run() {
......
...@@ -88,13 +88,33 @@ void ElementwiseMulCompute::PrepareForRun() { ...@@ -88,13 +88,33 @@ void ElementwiseMulCompute::PrepareForRun() {
scale_.mutableData<zynqmp::float16>(zynqmp::FP16, shape); scale_.mutableData<zynqmp::float16>(zynqmp::FP16, shape);
zynqmp::float16* bias_data = zynqmp::float16* bias_data =
bias_.mutableData<zynqmp::float16>(zynqmp::FP16, shape); bias_.mutableData<zynqmp::float16>(zynqmp::FP16, shape);
float scale_value = param.Y->data<float>()[0]; zynqmp::float16 scale_value = 0;
if (param.Y->ZynqTensor()->dataType() == zynqmp::FP32) {
scale_value = zynqmp::float_to_half(param.Y->data<float>()[0]);
// std::cout << "FP32 \n";
} else {
scale_value = param.Y->data<zynqmp::float16>()[0];
// std::cout << "FP16 \n";
}
// std::cout << "channel:" << channel << std::endl;
// std::cout << "production:" << param.Y->dims().production() << std::endl;
// std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
// exit(-1);
for (int i = 0; i < channel; i++) { for (int i = 0; i < channel; i++) {
if (param.Y->dims().production() != 1) { if (param.Y->dims().production() != 1) {
scale_value = param.Y->ZynqTensor()->data<float>()[i]; // scale_value = param.Y->ZynqTensor()->data<zynqmp::float16>()[i];
if (param.Y->ZynqTensor()->dataType() == zynqmp::FP32) {
scale_value = zynqmp::float_to_half(param.Y->data<float>()[i]);
} else {
scale_value = param.Y->data<zynqmp::float16>()[i];
}
} }
scale_data[i] = zynqmp::float_to_half(scale_value); // std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
// exit(-1);
scale_data[i] = scale_value;
bias_data[i] = zero_; bias_data[i] = zero_;
} }
...@@ -104,15 +124,17 @@ void ElementwiseMulCompute::PrepareForRun() { ...@@ -104,15 +124,17 @@ void ElementwiseMulCompute::PrepareForRun() {
void ElementwiseMulCompute::Run() { void ElementwiseMulCompute::Run() {
auto& param = Param<operators::ElementwiseParam>(); auto& param = Param<operators::ElementwiseParam>();
// std::cout << "param.Y :" << param.Y->persistable() << std::endl;
if (!param.Y->persistable()) { if (!param.Y->persistable()) {
// TODO
scale_.copyFrom(param.Y->ZynqTensor()); scale_.copyFrom(param.Y->ZynqTensor());
scale_.invalidate(); scale_.flush();//TODO
} }
pe_.dispatch(); pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR #ifdef FPGA_PRINT_TENSOR
zynqmp::ScaleParam& scale_param = pe_.param(); zynqmp::ScaleParam& scale_param = pe_.param();
Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input); // Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input);
Debugger::get_instance().registerOutput("ew_mul", scale_param.output); // Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
#endif #endif
} }
...@@ -181,3 +203,21 @@ REGISTER_LITE_KERNEL(elementwise_mul, ...@@ -181,3 +203,21 @@ REGISTER_LITE_KERNEL(elementwise_mul,
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.Finalize(); .Finalize();
// Additional elementwise_mul registration (alias ew_mul_y_arm) for
// ElementwiseMulCompute: "X" and "Out" are FPGA FP16 NHWC tensors, while "Y"
// is bound to an ARM-target tensor with only the target specified (the other
// type attributes are whatever GetTensorTy defaults to).
REGISTER_LITE_KERNEL(elementwise_mul,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::ElementwiseMulCompute,
ew_mul_y_arm)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
\ No newline at end of file
...@@ -64,18 +64,18 @@ void FetchCompute::Run() { ...@@ -64,18 +64,18 @@ void FetchCompute::Run() {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_LITE_KERNEL(fetch, // REGISTER_LITE_KERNEL(fetch,
kFPGA, // kFPGA,
kFP16, // kFP16,
kNHWC, // kNHWC,
paddle::lite::kernels::fpga::FetchCompute, // paddle::lite::kernels::fpga::FetchCompute,
fpga_host) // fpga_host)
.BindInput("X", // .BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA), // {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kAny), // PRECISION(kFP16),
DATALAYOUT(kAny))}) // DATALAYOUT(kNHWC))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) // .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize(); // .Finalize();
REGISTER_LITE_KERNEL(fetch, REGISTER_LITE_KERNEL(fetch,
kFPGA, kFPGA,
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/interpolate_compute.h"
#include <string>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
// Run() for the bilinear_interp kernel. The whole body is commented out, so
// this function currently does nothing: it neither reads the parameters nor
// writes the output tensor.
// NOTE(review): the kernel is nevertheless registered for "bilinear_interp"
// at the bottom of this file - confirm whether a model using that op is
// expected to reach this no-op.
void BilinearInterpCompute::Run() {
// Disabled draft below: it would forward the parameters to the ARM math
// implementation with interp_method "Bilinear".
// auto& param = Param<operators::InterpolateParam>();
// lite::Tensor* X = param.X;
// lite::Tensor* OutSize = param.OutSize;
// auto SizeTensor = param.SizeTensor;
// auto Scale = param.Scale;
// lite::Tensor* Out = param.Out;
// float scale = param.scale;
// int out_w = param.out_w;
// int out_h = param.out_h;
// bool align_corners = param.align_corners;
// std::string interp_method = "Bilinear";
// lite::arm::math::interpolate(X,
// OutSize,
// SizeTensor,
// Scale,
// Out,
// out_h,
// out_w,
// scale,
// align_corners,
// interp_method);
}
// Nearest-neighbour resize of a single image whose channel values are stored
// contiguously per pixel (element (y, x, ch) lives at (y * w + x) * c + ch).
//
// src        : input image, h_in * w_in pixels of c float16 values each.
// w_in, h_in : input width / height in pixels.
// c          : number of channel values per pixel.
// dst        : output image, h_out * w_out pixels of c float16 values each.
// w_out,h_out: output width / height in pixels.
// scale_x/y  : unused; the sampling ratios are recomputed from the sizes.
// with_align : when true the corner pixels of input and output are aligned
//              (ratio (in - 1) / (out - 1), rounded to nearest); otherwise
//              the ratio is in / out and the source index is truncated.
void nearest_interp(const float16* src,
                    int w_in,
                    int h_in,
                    int c,
                    float16* dst,
                    int w_out,
                    int h_out,
                    float scale_x,
                    float scale_y,
                    bool with_align) {
  float scale_w_new = 0.f;
  float scale_h_new = 0.f;
  if (with_align) {
    // A single output row/column would make (out - 1) zero; 0/0 yields NaN
    // and converting NaN to int is undefined, so sample index 0 instead.
    if (w_out > 1) {
      scale_w_new = static_cast<float>(w_in - 1) / (w_out - 1);
    }
    if (h_out > 1) {
      scale_h_new = static_cast<float>(h_in - 1) / (h_out - 1);
    }
  } else {
    scale_w_new = static_cast<float>(w_in) / (w_out);
    scale_h_new = static_cast<float>(h_in) / (h_out);
  }

  // Aligned mode rounds to the nearest source pixel, unaligned mode truncates.
  const double rounding = with_align ? 0.5 : 0.0;

  for (int h = 0; h < h_out; ++h) {
    // Every output row holds w_out pixels of c values each. The unaligned
    // path previously used `dst + h * w_out`, dropping the channel factor and
    // making consecutive rows overwrite each other.
    float16* dst_p = dst + h * w_out * c;
    const int near_y = static_cast<int>(scale_h_new * h + rounding);
    for (int w = 0; w < w_out; ++w) {
      const int near_x = static_cast<int>(scale_w_new * w + rounding);
      const float16* src_n = src + (near_y * w_in + near_x) * c;
      // Copy all channel values of the selected source pixel at once.
      memcpy(dst_p, src_n, c * sizeof(float16));
      dst_p += c;
    }
  }
}
void NearestInterpCompute::PrepareForRun() {
auto& param = Param<operators::InterpolateParam>();
lite::Tensor* X = param.X;
lite::Tensor* OutSize = param.OutSize;
lite::Tensor* Out = param.Out;
Out->mutable_data<float16>();
zynqmp::ResizeParam& norm_param = pe_.param();
norm_param.input = X->ZynqTensor();
norm_param.output = Out->ZynqTensor();
pe_.init();
pe_.apply();
}
// TODO
// Builds a shape vector from a list of tensors by taking the first int32
// element of each tensor, in list order.
inline std::vector<int> get_new_shape(
    std::vector<const lite::Tensor*> list_new_shape_tensor) {
  std::vector<int> shape;
  shape.reserve(list_new_shape_tensor.size());
  for (const lite::Tensor* dim_tensor : list_new_shape_tensor) {
    const int32_t dim = *dim_tensor->data<int32_t>();
    shape.push_back(dim);
  }
  return shape;
}
// Copies every element of `new_data_tensor` (dims().production() values of
// type T, read through data<T>()) into a freshly allocated std::vector<T>.
template <typename T>
inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
  // The previous version also constructed an unused temporary lite::Tensor on
  // every call and copy-assigned the result vector; both were dead work.
  const auto* first = new_data_tensor->data<T>();
  return std::vector<T>(first, first + new_data_tensor->dims().production());
}
// Resizes X into Out with nearest-neighbour sampling on float16 data.
//
// X dims are read as [N, C, H, W]; each image is handed to nearest_interp(),
// which addresses the data with the channel values contiguous per pixel.
//
// The output size is chosen with this precedence:
//   1. SizeTensor (if non-empty): first two entries are height and width;
//   2. otherwise the Scale tensor (if non-null) overrides `scale`, a positive
//      `scale` derives the size from the input size, and OutSize (if
//      non-null) finally overrides height and width;
//   3. otherwise the incoming out_height / out_width are used as given.
//
// Out is resized to [N, C, out_height, out_width]. `interpolate_type` is
// accepted for signature compatibility but not consulted here.
void interpolate(lite::Tensor* X,
                 lite::Tensor* OutSize,
                 std::vector<const lite::Tensor*> SizeTensor,
                 lite::Tensor* Scale,
                 lite::Tensor* Out,
                 int out_height,
                 int out_width,
                 float scale,
                 bool with_align,
                 std::string interpolate_type) {
  int in_h = X->dims()[2];
  int in_w = X->dims()[3];

  if (SizeTensor.size() > 0) {
    auto new_size = get_new_shape(SizeTensor);
    out_height = new_size[0];
    out_width = new_size[1];
  } else {
    if (Scale != nullptr) {
      auto scale_data = get_new_data_from_tensor<float>(Scale);
      scale = scale_data[0];
    }
    if (scale > 0) {
      out_height = static_cast<int>(in_h * scale);
      out_width = static_cast<int>(in_w * scale);
    }
    if (OutSize != nullptr) {
      auto out_size_data = get_new_data_from_tensor<int>(OutSize);
      out_height = out_size_data[0];
      out_width = out_size_data[1];
    }
  }

  float height_scale = scale;
  float width_scale = scale;
  if (out_width > 0 && out_height > 0) {
    // Convert before dividing: the old integer division truncated the ratio
    // (e.g. to 0 whenever the output was smaller than the input).
    height_scale = static_cast<float>(out_height) / in_h;
    width_scale = static_cast<float>(out_width) / in_w;
  }

  int num_cout = X->dims()[0];
  int c_cout = X->dims()[1];
  Out->Resize({num_cout, c_cout, out_height, out_width});

  float16* dout = Out->mutable_data<float16>();
  const float16* din = X->data<float16>();

  int batch = Out->dims()[0];
  int out_c = Out->dims()[1];
  int out_h = Out->dims()[2];
  int out_w = Out->dims()[3];

  // One image spans H * W pixels of out_c values each, so the per-batch stride
  // must include the channel count; without it every batch after the first
  // started inside the previous image.
  int image_in = in_h * in_w * out_c;
  int image_out = out_h * out_w * out_c;

  for (int i = 0; i < batch; ++i) {
    nearest_interp(din + image_in * i,
                   in_w,
                   in_h,
                   out_c,
                   dout + image_out * i,
                   out_w,
                   out_h,
                   1.f / width_scale,
                   1.f / height_scale,
                   with_align);
  }
}
// Executes nearest_interp by calling the CPU-side interpolate() helper on the
// float16 buffers of X and Out; the PE configured in PrepareForRun() is not
// dispatched from here. Afterwards the output is flushed and inherits the
// scale of the input.
void NearestInterpCompute::Run() {
  auto& param = Param<operators::InterpolateParam>();
  lite::Tensor* X = param.X;
  lite::Tensor* OutSize = param.OutSize;
  auto SizeTensor = param.SizeTensor;
  auto Scale = param.Scale;
  lite::Tensor* Out = param.Out;
  float scale = param.scale;
  int out_w = param.out_w;
  int out_h = param.out_h;
  bool align_corners = param.align_corners;
  std::string interp_method = "";

  // NOTE(review): presumably refreshes the CPU view of the input before it is
  // read below - confirm against zynqmp::Tensor::invalidate(). (Original TODO.)
  X->ZynqTensor()->invalidate();
#ifdef FPGA_PRINT_TENSOR
  // Debug dump only; writing a file on every inference is not wanted in
  // regular builds, so it follows the same guard the other kernels use.
  X->ZynqTensor()->saveToFile("n_in", true);
#endif

  interpolate(X,
              OutSize,
              SizeTensor,
              Scale,
              Out,
              out_h,
              out_w,
              scale,
              align_corners,
              interp_method);

  Out->ZynqTensor()->flush();
  Out->ZynqTensor()->copyScaleFrom(X->ZynqTensor());
#ifdef FPGA_PRINT_TENSOR
  Out->ZynqTensor()->saveToFile("n_out", true);
#endif
}
} /* namespace fpga */
} /* namespace kernels */
} /* namespace lite */
} /* namespace paddle */
// Registers BilinearInterpCompute for the "bilinear_interp" op on
// kFPGA/kFP16/kNHWC (alias def). "X" and "Out" are FPGA FP16 NHWC tensors;
// "OutSize" and "SizeTensor" are ARM int32 tensors; "Scale" is an ARM tensor
// with only the target specified.
// NOTE(review): BilinearInterpCompute::Run() in this file has an empty body -
// confirm this registration is intended to be active.
REGISTER_LITE_KERNEL(bilinear_interp,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::BilinearInterpCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Registers NearestInterpCompute for the "nearest_interp" op on
// kFPGA/kFP16/kNHWC (alias def). "X" and "Out" are FPGA FP16 NHWC tensors;
// "OutSize" and "SizeTensor" are ARM int32 tensors; "Scale" is an ARM tensor
// with only the target specified.
REGISTER_LITE_KERNEL(nearest_interp,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::NearestInterpCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/backends/fpga/KD/pes/resize_pe.hpp"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
// Kernel class for the bilinear_interp op on the FPGA target (FP16, NHWC).
// Only Run() is overridden; its definition lives in the matching .cc file.
class BilinearInterpCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  ~BilinearInterpCompute() override = default;

  void Run() override;
};
// Kernel class for the nearest_interp op on the FPGA target (FP16, NHWC).
// Owns a zynqmp::ResizePE that PrepareForRun() configures.
class NearestInterpCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  ~NearestInterpCompute() override = default;

  // One-time setup before the first Run().
  void PrepareForRun() override;

  // Performs the resize for the current inputs.
  void Run() override;

 private:
  zynqmp::ResizePE pe_;
};
} /* namespace fpga */
} /* namespace kernels */
} /* namespace lite */
} /* namespace paddle */
...@@ -25,10 +25,17 @@ namespace fpga { ...@@ -25,10 +25,17 @@ namespace fpga {
using float16 = zynqmp::float16; using float16 = zynqmp::float16;
// Carries tensor metadata across an io_copy: the persistable flag, the LoD
// and the Zynq tensor scale of param.x are all copied onto param.y. The
// tensor data itself is not touched here.
void copy_properties(operators::IoCopyParam& param) {
  const bool keep_persistable = param.x->persistable();
  param.y->set_persistable(keep_persistable);

  *param.y->mutable_lod() = param.x->lod();

  param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
}
/* /*
* This kernel copies a tensor from host to FPGA space. * This kernel copies a tensor from host to FPGA space.
*/ */
class IoCopyHostToFpgaCompute class IoCopyHostCHWToFpgaHWCCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> { : public KernelLite<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> {
public: public:
void Run() override { void Run() override {
...@@ -37,52 +44,33 @@ class IoCopyHostToFpgaCompute ...@@ -37,52 +44,33 @@ class IoCopyHostToFpgaCompute
param.x->target() == TARGET(kFPGA)); param.x->target() == TARGET(kFPGA));
param.x->ZynqTensor()->flush(); param.x->ZynqTensor()->flush();
if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) { if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) {
param.y->mutable_data<int>(); param.y->mutable_data<int>();
param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
param.y->ZynqTensor()->flush();
copy_properties(param);
return; return;
} }
if (param.x->ZynqTensor()->dataType() == zynqmp::FP32) { param.y->mutable_data<float16>();
param.y->mutable_data<float16>(); param.y->ZynqTensor()->setDataLocation(zynqmp::Device);
if (param.x->ZynqTensor()->aligned() && if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) { param.x->ZynqTensor()->shape().shouldAlign()) {
zynqmp::Tensor tempTensor; zynqmp::Tensor tempTensor;
tempTensor.mutableData<float16>(zynqmp::FP16, tempTensor.mutableData<float16>(zynqmp::FP16,
param.x->ZynqTensor()->shape()); param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor()); tempTensor.copyFrom(param.x->ZynqTensor());
tempTensor.setAligned(true); tempTensor.setAligned(true);
tempTensor.unalignImage(); tempTensor.unalignImage();
param.y->ZynqTensor()->copyFrom(&tempTensor); tempTensor.flush();
} else { param.y->ZynqTensor()->copyFrom(&tempTensor);
param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); } else {
} param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
param.y->ZynqTensor()->invalidate();
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
} }
copy_properties(param);
auto out_lod = param.y->mutable_lod(); param.y->ZynqTensor()->invalidate();
*out_lod = param.x->lod();
}
std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
*res = [](const std::map<std::string, const Type*>& inputs,
const std::string& out) -> const Type* {
CHECK(!inputs.empty());
auto* type = inputs.at("Input");
CHECK(type->target() == TARGET(kHost));
auto out_place = type->place();
out_place.target = TARGET(kFPGA);
auto* out_type = Type::Get(type->id(),
out_place.target,
out_place.precision,
out_place.layout,
out_place.device);
return out_type;
};
return res;
} }
std::string doc() const override { return "Copy IO from HOST to FPGA"; } std::string doc() const override { return "Copy IO from HOST to FPGA"; }
...@@ -98,10 +86,11 @@ class IoCopyFpgaToHostCompute ...@@ -98,10 +86,11 @@ class IoCopyFpgaToHostCompute
auto& param = Param<operators::IoCopyParam>(); auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) || CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kFPGA)); param.x->target() == TARGET(kFPGA));
param.x->ZynqTensor()->syncToDevice();
param.y->mutable_data<float>(); param.y->mutable_data<float>();
param.y->ZynqTensor()->setDataType(zynqmp::FP32); param.y->ZynqTensor()->setDataType(zynqmp::FP32);
param.x->ZynqTensor()->syncToDevice(); param.y->ZynqTensor()->setDataLocation(zynqmp::CPU);
if (param.x->ZynqTensor()->aligned() && if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) { param.x->ZynqTensor()->shape().shouldAlign()) {
...@@ -115,10 +104,9 @@ class IoCopyFpgaToHostCompute ...@@ -115,10 +104,9 @@ class IoCopyFpgaToHostCompute
} else { } else {
param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
} }
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
param.y->ZynqTensor()->flush(); param.y->ZynqTensor()->invalidate();
auto out_lod = param.y->mutable_lod(); copy_properties(param);
*out_lod = param.x->lod();
} }
std::string doc() const override { return "Copy IO from FPGA to HOST"; } std::string doc() const override { return "Copy IO from FPGA to HOST"; }
}; };
...@@ -153,14 +141,16 @@ class IoCopyFpgaToHostCHWCompute ...@@ -153,14 +141,16 @@ class IoCopyFpgaToHostCHWCompute
CHECK(param.x->target() == TARGET(kHost) || CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kFPGA)); param.x->target() == TARGET(kFPGA));
Tensor hwc; Tensor hwc;
hwc.Resize(param.y->dims()); hwc.Resize(param.y->dims());
float* hwc_data = hwc.mutable_data<float>(); float* hwc_data = hwc.mutable_data<float>();
float* chw_data = param.y->mutable_data<float>(); float* chw_data = param.y->mutable_data<float>();
param.y->ZynqTensor()->setDataType(zynqmp::FP32); param.y->ZynqTensor()->setDataType(zynqmp::FP32);
param.x->ZynqTensor()->syncToDevice(); param.x->ZynqTensor()->syncToDevice();
hwc.ZynqTensor()->setDataLocation(zynqmp::CPU);
param.y->ZynqTensor()->setDataLocation(zynqmp::CPU);
if (param.x->ZynqTensor()->aligned() && if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) { param.x->ZynqTensor()->shape().shouldAlign()) {
zynqmp::Tensor tempTensor; zynqmp::Tensor tempTensor;
...@@ -168,10 +158,30 @@ class IoCopyFpgaToHostCHWCompute ...@@ -168,10 +158,30 @@ class IoCopyFpgaToHostCHWCompute
param.x->ZynqTensor()->shape()); param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor()); tempTensor.copyFrom(param.x->ZynqTensor());
tempTensor.setAligned(true); tempTensor.setAligned(true);
// tempTensor.saveToFile("temp_1", true);
tempTensor.unalignImage(); tempTensor.unalignImage();
// tempTensor.saveToFile("temp_2", true);
hwc.ZynqTensor()->copyFrom(&tempTensor); hwc.ZynqTensor()->copyFrom(&tempTensor);
} else { } else {
hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor()); // hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
float16* in_data = param.x->ZynqTensor()->data<float16>();
// float* f_data =
param.x->ZynqTensor()->flush();
float max = 0;
for (int i = 0; i < param.x->dims().production(); i++) {
float value = zynqmp::half_to_float(in_data[i]);
hwc_data[i] = value;
if (value < 0) {
value = -value;
}
if (value > max) {
max = value;
}
}
param.x->ZynqTensor()->scale()[0] = max / 127;
param.x->ZynqTensor()->scale()[1] = 127 / max;
} }
int num = 1; int num = 1;
...@@ -188,10 +198,15 @@ class IoCopyFpgaToHostCHWCompute ...@@ -188,10 +198,15 @@ class IoCopyFpgaToHostCHWCompute
dims.height(), dims.height(),
dims.width()); dims.width());
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); // param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
param.y->ZynqTensor()->flush(); param.y->ZynqTensor()->flush();
auto out_lod = param.y->mutable_lod(); copy_properties(param);
*out_lod = param.x->lod();
param.x->ZynqTensor()->invalidate();
param.x->ZynqTensor()->flush();
// hwc.ZynqTensor()->saveToFile("hwc", true);
// param.x->ZynqTensor()->saveToFile("io2_x", true);
// param.y->ZynqTensor()->saveToFile("io2_y", true);
} }
std::string doc() const override { return "Copy IO from FPGA to HOST"; } std::string doc() const override { return "Copy IO from FPGA to HOST"; }
}; };
...@@ -201,52 +216,36 @@ class IoCopyFpgaToHostCHWCompute ...@@ -201,52 +216,36 @@ class IoCopyFpgaToHostCHWCompute
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// REGISTER_LITE_KERNEL(io_copy,
// kFPGA,
// kAny,
// kAny,
// paddle::lite::kernels::fpga::IoCopyHostToFpgaCompute,
// host_to_device)
// .BindInput("Input",
// {LiteType::GetTensorTy(TARGET(kHost),
// PRECISION(kAny),
// DATALAYOUT(kAny))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kAny),
// DATALAYOUT(kAny))})
// .Finalize();
REGISTER_LITE_KERNEL(io_copy, REGISTER_LITE_KERNEL(io_copy,
kFPGA, kFPGA,
kAny, kAny,
kAny, kAny,
paddle::lite::kernels::fpga::IoCopyHostToFpgaCompute, paddle::lite::kernels::fpga::IoCopyHostCHWToFpgaHWCCompute,
host_to_device_any_any) host_to_device)
.BindInput("Input", .BindInput("Input",
{LiteType::GetTensorTy( {LiteType::GetTensorTy(TARGET(kHost),
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) PRECISION(kInt32),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy,
kFPGA,
kAny,
kAny,
paddle::lite::kernels::fpga::IoCopyHostCHWToFpgaHWCCompute,
host_float_chw_to_device_fp16_hwc)
.BindInput("Input", {LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA), {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.Finalize(); .Finalize();
// REGISTER_LITE_KERNEL(io_copy,
// kFPGA,
// kAny,
// kAny,
// paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute,
// device_to_host)
// .BindInput("Input",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kHost),
// PRECISION(kFloat),
// DATALAYOUT(kNHWC))})
// .Finalize();
REGISTER_LITE_KERNEL(io_copy, REGISTER_LITE_KERNEL(io_copy,
kFPGA, kFPGA,
...@@ -311,3 +310,26 @@ REGISTER_LITE_KERNEL(io_copy, ...@@ -311,3 +310,26 @@ REGISTER_LITE_KERNEL(io_copy,
// PRECISION(kAny), // PRECISION(kAny),
// DATALAYOUT(kAny))}) // DATALAYOUT(kAny))})
// .Finalize(); // .Finalize();
// ==========================================================
// std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
// std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
// *res = [](const std::map<std::string, const Type*>& inputs,
// const std::string& out) -> const Type* {
// CHECK(!inputs.empty());
// auto* type = inputs.at("Input");
// CHECK(type->target() == TARGET(kHost));
// auto out_place = type->place();
// out_place.target = TARGET(kFPGA);
// auto* out_type = Type::Get(type->id(),
// out_place.target,
// out_place.precision,
// out_place.layout,
// out_place.device);
// return out_type;
// };
// return res;
// }
\ No newline at end of file
...@@ -94,6 +94,7 @@ T PolyIoU(const T* box1, ...@@ -94,6 +94,7 @@ T PolyIoU(const T* box1,
const size_t box_size, const size_t box_size,
const bool normalized) { const bool normalized) {
LOG(FATAL) << "PolyIoU not implement."; LOG(FATAL) << "PolyIoU not implement.";
return *box1;
} }
template <class T> template <class T>
...@@ -128,34 +129,44 @@ void NMSFast(const Tensor& bbox, ...@@ -128,34 +129,44 @@ void NMSFast(const Tensor& bbox,
std::vector<int>* selected_indices, std::vector<int>* selected_indices,
const bool normalized) { const bool normalized) {
// The total boxes for each instance. // The total boxes for each instance.
// std::cout << "1\n";
int64_t num_boxes = bbox.dims()[0]; int64_t num_boxes = bbox.dims()[0];
// std::cout << "1,1\n";
// 4: [xmin ymin xmax ymax] // 4: [xmin ymin xmax ymax]
// 8: [x1 y1 x2 y2 x3 y3 x4 y4] // 8: [x1 y1 x2 y2 x3 y3 x4 y4]
// 16, 24, or 32: [x1 y1 x2 y2 ... xn yn], n = 8, 12 or 16 // 16, 24, or 32: [x1 y1 x2 y2 ... xn yn], n = 8, 12 or 16
int64_t box_size = bbox.dims()[1]; int64_t box_size = bbox.dims()[1];
// std::cout << "1,2\n";
std::vector<T> scores_data(num_boxes); std::vector<T> scores_data(num_boxes);
std::copy_n(scores.data<T>(), num_boxes, scores_data.begin()); std::copy_n(scores.data<T>(), num_boxes, scores_data.begin());
// std::cout << "1,3\n";
std::vector<std::pair<T, int>> sorted_indices; std::vector<std::pair<T, int>> sorted_indices;
// std::cout << "1,4\n";
GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
// std::cout << "2\n";
selected_indices->clear(); selected_indices->clear();
T adaptive_threshold = nms_threshold; T adaptive_threshold = nms_threshold;
const T* bbox_data = bbox.data<T>(); const T* bbox_data = bbox.data<T>();
// std::cout << "3\n";
while (sorted_indices.size() != 0) { while (sorted_indices.size() != 0) {
const int idx = sorted_indices.front().second; const int idx = sorted_indices.front().second;
// std::cout << "4\n";
bool keep = true; bool keep = true;
for (size_t k = 0; k < selected_indices->size(); ++k) { for (size_t k = 0; k < selected_indices->size(); ++k) {
// std::cout << "5\n";
if (keep) { if (keep) {
const int kept_idx = (*selected_indices)[k]; const int kept_idx = (*selected_indices)[k];
T overlap = T(0.); T overlap = T(0.);
// std::cout << "6\n";
// 4: [xmin ymin xmax ymax] // 4: [xmin ymin xmax ymax]
if (box_size == 4) { if (box_size == 4) {
overlap = JaccardOverlap<T>(bbox_data + idx * box_size, overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, bbox_data + kept_idx * box_size,
normalized); normalized);
} }
// std::cout << "7\n";
// 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32 // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32
if (box_size == 8 || box_size == 16 || box_size == 24 || if (box_size == 8 || box_size == 16 || box_size == 24 ||
box_size == 32) { box_size == 32) {
...@@ -168,10 +179,13 @@ void NMSFast(const Tensor& bbox, ...@@ -168,10 +179,13 @@ void NMSFast(const Tensor& bbox,
} else { } else {
break; break;
} }
// std::cout << "8\n";
} }
// std::cout << "9\n";
if (keep) { if (keep) {
selected_indices->push_back(idx); selected_indices->push_back(idx);
} }
// std::cout << "10\n";
sorted_indices.erase(sorted_indices.begin()); sorted_indices.erase(sorted_indices.begin());
if (keep && eta < 1 && adaptive_threshold > 0.5) { if (keep && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta; adaptive_threshold *= eta;
...@@ -195,21 +209,25 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param, ...@@ -195,21 +209,25 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
T score_threshold = static_cast<T>(param.score_threshold); T score_threshold = static_cast<T>(param.score_threshold);
int num_det = 0; int num_det = 0;
int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1];
int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1];
Tensor bbox_slice, score_slice;
for (int64_t c = 0; c < class_num; ++c) { for (int64_t c = 0; c < class_num; ++c) {
Tensor bbox_slice, score_slice;
if (c == background_label) continue; if (c == background_label) continue;
// std::cout << "------ 1 \n";
if (scores_size == 3) { if (scores_size == 3) {
// std::cout << "------ scores_size = 3 \n";
scores.Slice<T>(score_slice, c, c + 1); scores.Slice<T>(score_slice, c, c + 1);
bbox_slice = bboxes; // bbox_slice = bboxes;
} else { } else {
// std::cout << "------ scores_size != 3 \n";
score_slice.Resize({scores.dims()[0], 1}); score_slice.Resize({scores.dims()[0], 1});
bbox_slice.Resize({scores.dims()[0], 4}); bbox_slice.Resize({scores.dims()[0], 4});
SliceOneClass<T>(scores, c, &score_slice); SliceOneClass<T>(scores, c, &score_slice);
SliceOneClass<T>(bboxes, c, &bbox_slice); SliceOneClass<T>(bboxes, c, &bbox_slice);
} }
NMSFast(bboxes, NMSFast(bboxes,// TODO
score_slice, score_slice,
score_threshold, score_threshold,
nms_threshold, nms_threshold,
...@@ -226,8 +244,6 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param, ...@@ -226,8 +244,6 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
*num_nmsed_out = num_det; *num_nmsed_out = num_det;
const T* scores_data = scores.data<T>(); const T* scores_data = scores.data<T>();
if (keep_top_k > -1 && num_det > keep_top_k) { if (keep_top_k > -1 && num_det > keep_top_k) {
Tensor score_slice;
const T* sdata; const T* sdata;
std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs; std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
for (const auto& it : *indices) { for (const auto& it : *indices) {
...@@ -275,7 +291,9 @@ void MultiClassOutput(const Tensor& scores, ...@@ -275,7 +291,9 @@ void MultiClassOutput(const Tensor& scores,
const Tensor& bboxes, const Tensor& bboxes,
const std::map<int, std::vector<int>>& selected_indices, const std::map<int, std::vector<int>>& selected_indices,
const int scores_size, const int scores_size,
Tensor* outs) { Tensor* outs,
int* oindices = nullptr,
const int offset = 0) {
int64_t class_num = scores.dims()[1]; int64_t class_num = scores.dims()[1];
int64_t predict_dim = scores.dims()[1]; int64_t predict_dim = scores.dims()[1];
int64_t box_size = bboxes.dims()[1]; int64_t box_size = bboxes.dims()[1];
...@@ -305,9 +323,15 @@ void MultiClassOutput(const Tensor& scores, ...@@ -305,9 +323,15 @@ void MultiClassOutput(const Tensor& scores,
if (scores_size == 3) { if (scores_size == 3) {
bdata = bboxes_data + idx * box_size; bdata = bboxes_data + idx * box_size;
odata[count * out_dim + 1] = sdata[idx]; // score odata[count * out_dim + 1] = sdata[idx]; // score
if (oindices != nullptr) {
oindices[count] = offset + idx;
}
} else { } else {
bdata = bbox.data<T>() + idx * box_size; bdata = bbox.data<T>() + idx * box_size;
odata[count * out_dim + 1] = *(scores_data + idx * class_num + label); odata[count * out_dim + 1] = *(scores_data + idx * class_num + label);
if (oindices != nullptr) {
oindices[count] = offset + idx * class_num + label;
}
} }
// xmin, ymin, xmax, ymax or multi-points coordinates // xmin, ymin, xmax, ymax or multi-points coordinates
std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T)); std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T));
...@@ -318,36 +342,18 @@ void MultiClassOutput(const Tensor& scores, ...@@ -318,36 +342,18 @@ void MultiClassOutput(const Tensor& scores,
void MulticlassNmsCompute::Run() { void MulticlassNmsCompute::Run() {
auto& param = Param<operators::MulticlassNmsParam>(); auto& param = Param<operators::MulticlassNmsParam>();
auto* boxes_in = param.bboxes; auto* boxes = param.bboxes;
auto* scores_in = param.scores; auto* scores = param.scores;
auto* outs = param.out; auto* outs = param.out;
outs->mutable_data<float>(); bool return_index = param.index ? true : false;
auto* index = param.index;
auto score_dims = boxes_in->dims(); auto score_dims = scores->dims();
auto score_size = score_dims.size(); auto score_size = score_dims.size();
Tensor boxes_float;
Tensor scores_float;
boxes_float.Resize(boxes_in->dims());
scores_float.Resize(scores_in->dims());
boxes_float.mutable_data<float>();
scores_float.mutable_data<float>();
boxes_float.ZynqTensor()->copyFrom(boxes_in->ZynqTensor());
scores_float.ZynqTensor()->copyFrom(scores_in->ZynqTensor());
Tensor* boxes = &boxes_float;
Tensor* scores = &scores_float;
auto box_dims = boxes->dims();
int64_t box_dim = boxes->dims()[2];
std::vector<std::map<int, std::vector<int>>> all_indices; std::vector<std::map<int, std::vector<int>>> all_indices;
std::vector<uint64_t> batch_starts = {0}; std::vector<uint64_t> batch_starts = {0};
int64_t batch_size = score_dims[0]; int64_t batch_size = score_dims[0];
int64_t box_dim = boxes->dims()[2];
int64_t out_dim = box_dim + 2; int64_t out_dim = box_dim + 2;
int num_nmsed_out = 0; int num_nmsed_out = 0;
Tensor boxes_slice, scores_slice; Tensor boxes_slice, scores_slice;
...@@ -372,79 +378,104 @@ void MulticlassNmsCompute::Run() { ...@@ -372,79 +378,104 @@ void MulticlassNmsCompute::Run() {
uint64_t num_kept = batch_starts.back(); uint64_t num_kept = batch_starts.back();
if (num_kept == 0) { if (num_kept == 0) {
outs->Resize({1, 1}); if (return_index) {
float* od = outs->mutable_data<float>(); outs->Resize({0, out_dim});
od[0] = -1; index->Resize({0, 1});
batch_starts = {0, 1}; } else {
outs->Resize({1, 1});
float* od = outs->mutable_data<float>();
od[0] = -1;
batch_starts = {0, 1};
}
} else { } else {
outs->Resize({static_cast<int64_t>(num_kept), out_dim}); outs->Resize({static_cast<int64_t>(num_kept), out_dim});
outs->mutable_data<float>();
int offset = 0;
int* oindices = nullptr;
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
if (score_size == 3) { if (score_size == 3) {
scores->Slice<float>(scores_slice, i, i + 1); scores->Slice<float>(scores_slice, i, i + 1);
boxes->Slice<float>(boxes_slice, i, i + 1); boxes->Slice<float>(boxes_slice, i, i + 1);
scores_slice.Resize({score_dims[1], score_dims[2]}); scores_slice.Resize({score_dims[1], score_dims[2]});
boxes_slice.Resize({score_dims[2], box_dim}); boxes_slice.Resize({score_dims[2], box_dim});
if (return_index) {
offset = i * score_dims[2];
}
} else { } else {
auto boxes_lod = boxes->lod().back(); auto boxes_lod = boxes->lod().back();
scores->Slice<float>(scores_slice, boxes_lod[i], boxes_lod[i + 1]); scores->Slice<float>(scores_slice, boxes_lod[i], boxes_lod[i + 1]);
boxes->Slice<float>(boxes_slice, boxes_lod[i], boxes_lod[i + 1]); boxes->Slice<float>(boxes_slice, boxes_lod[i], boxes_lod[i + 1]);
if (return_index) {
offset = boxes_lod[i] * score_dims[1];
}
} }
int64_t s = static_cast<int64_t>(batch_starts[i]); int64_t s = static_cast<int64_t>(batch_starts[i]);
int64_t e = static_cast<int64_t>(batch_starts[i + 1]); int64_t e = static_cast<int64_t>(batch_starts[i + 1]);
if (e > s) { if (e > s) {
Tensor out; Tensor out;
outs->Slice<float>(out, s, e); outs->Slice<float>(out, s, e);
MultiClassOutput<float>( if (return_index) {
scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out); index->Resize({static_cast<int64_t>(num_kept), 1});
int* output_idx = index->mutable_data<int>();
oindices = output_idx + s;
}
MultiClassOutput<float>(scores_slice,
boxes_slice,
all_indices[i],
score_dims.size(),
&out,
oindices,
offset);
// out.ZynqTensor()->saveToFile("nms_o", true);
outs->ZynqTensor()->copyFrom(out.ZynqTensor()); outs->ZynqTensor()->copyFrom(out.ZynqTensor());
out.ZynqTensor()->saveToFile("nms_oo", true); outs->ZynqTensor()->flush();
} }
outs->Resize({static_cast<int64_t>(e - s), out_dim});
} }
} }
LoD lod; LoD lod;
lod.emplace_back(batch_starts); lod.emplace_back(batch_starts);
if (return_index) {
index->set_lod(lod);
}
outs->set_lod(lod); outs->set_lod(lod);
#ifdef FPGA_PRINT_TENSOR // boxes->ZynqTensor()->saveToFile("boxes", true);
Debugger::get_instance().registerOutput("boxes", boxes->ZynqTensor()); // scores->ZynqTensor()->saveToFile("scores", true);
Debugger::get_instance().registerOutput("scores", scores->ZynqTensor()); // outs->ZynqTensor()->saveToFile("nms", true);
Debugger::get_instance().registerOutput("nms", outs->ZynqTensor());
#endif
} }
} // namespace fpga } // namespace fpga
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// REGISTER_LITE_KERNEL(multiclass_nms,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::fpga::MulticlassNmsCompute,
// def)
// .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
// .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
// .Finalize();
REGISTER_LITE_KERNEL(multiclass_nms, REGISTER_LITE_KERNEL(multiclass_nms,
kFPGA, kFPGA,
kFP16, kFP16,
kNHWC, kNHWC,
paddle::lite::kernels::fpga::MulticlassNmsCompute, paddle::lite::kernels::fpga::MulticlassNmsCompute,
def2) def)
.BindInput("BBoxes", .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kARM))})
{LiteType::GetTensorTy(TARGET(kFPGA), .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kARM))})
PRECISION(kFP16), .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
DATALAYOUT(kNHWC))})
.BindInput("Scores",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.Finalize(); .Finalize();
// REGISTER_LITE_KERNEL(multiclass_nms,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::fpga::MulticlassNmsCompute,
// def2)
// .BindInput("BBoxes",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindInput("Scores",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFloat),
// DATALAYOUT(kNHWC))})
// .Finalize();
...@@ -64,7 +64,7 @@ void PriorBoxCompute::PrepareForRun() { ...@@ -64,7 +64,7 @@ void PriorBoxCompute::PrepareForRun() {
float offset = param.offset; float offset = param.offset;
std::vector<float> aspect_ratios_vec; std::vector<float> aspect_ratios_vec;
ExpandAspectRatios(aspect_ratio, is_flip, &aspect_ratios_vec); ExpandAspectRatios(aspect_ratio, is_flip, &aspect_ratios_vec);
size_t prior_num = aspect_ratios_vec.size() * min_size.size(); int prior_num = aspect_ratios_vec.size() * min_size.size();
prior_num += max_size.size(); prior_num += max_size.size();
std::vector<std::string> order = param.order; std::vector<std::string> order = param.order;
bool min_max_aspect_ratios_order = param.min_max_aspect_ratios_order; bool min_max_aspect_ratios_order = param.min_max_aspect_ratios_order;
...@@ -78,6 +78,7 @@ void PriorBoxCompute::PrepareForRun() { ...@@ -78,6 +78,7 @@ void PriorBoxCompute::PrepareForRun() {
param.boxes->mutable_data<float>(); param.boxes->mutable_data<float>();
param.variances->mutable_data<float>(); param.variances->mutable_data<float>();
zynqmp::PriorBoxParam& priobox_param = pe_.param(); zynqmp::PriorBoxParam& priobox_param = pe_.param();
priobox_param.input = param.input->ZynqTensor(); priobox_param.input = param.input->ZynqTensor();
priobox_param.image = param.image->ZynqTensor(); priobox_param.image = param.image->ZynqTensor();
......
...@@ -23,31 +23,64 @@ namespace fpga { ...@@ -23,31 +23,64 @@ namespace fpga {
using float16 = zynqmp::float16; using float16 = zynqmp::float16;
void ReshapeCompute::Run() {
void FlattenCompute::Run() {
auto& param = Param<operators::ReshapeParam>(); auto& param = Param<operators::ReshapeParam>();
param.output->mutable_data<float16>();
auto x = param.x; auto x = param.x;
// auto actual_shape = param.actual_shape;
Tensor* actual_shape = nullptr; // TODO(chonwhite) change it.
auto output = param.output; auto output = param.output;
bool inplace = param.inplace; output->mutable_data<float16>();
auto x_dims = x->dims();
auto output_dims = output->dims(); auto output_dims = output->dims();
if (actual_shape) { if (param.inplace) {
auto actual_shape_dims = actual_shape->dims(); output->ShareDataWith(*x);
auto* actual_shape_data = actual_shape->data<int>(); } else {
auto shape = std::vector<int>( // output->CopyDataFrom(*x);
actual_shape_data, actual_shape_data + actual_shape_dims.production());
// output_dims = lite::operators::ValidateShape(shape, x_dims); //TODO
output->Resize(output_dims);
} }
// if (inplace) { x->ZynqTensor()->unalignImage();
// output->ShareDataWith(*x); // x->ZynqTensor()->saveToFile("fi", true);
// } else {
// output->CopyDataFrom(*x);
// }
output->ZynqTensor()->copyFrom(x->ZynqTensor()); output->ZynqTensor()->copyFrom(x->ZynqTensor());
// output->ZynqTensor()->saveToFile("fo", true);
output->ZynqTensor()->flush();
output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned());
output->Resize(output_dims); output->Resize(output_dims);
#ifdef FPGA_PRINT_TENSOR
Debugger::get_instance().registerOutput("flatten",
output->ZynqTensor());
#endif
}
void ReshapeCompute::Run() {
auto& param = Param<operators::ReshapeParam>();
auto x = param.x;
auto output = param.output;
auto output_dims = output->dims();
x->ZynqTensor()->unalignImage();
// x->ZynqTensor()->saveToFile("ri", true);
output->Resize(output_dims);
output->mutable_data<float16>();
if (param.inplace) {
output->ShareDataWith(*x);
} else {
// output->CopyDataFrom(*x);
}
output->ZynqTensor()->copyFrom(x->ZynqTensor());
// output->ZynqTensor()->saveToFile("ro", true);
output->ZynqTensor()->flush();
output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned());
#ifdef FPGA_PRINT_TENSOR
Debugger::get_instance().registerOutput("reshape",
output->ZynqTensor());
#endif
} }
} // namespace fpga } // namespace fpga
...@@ -66,9 +99,9 @@ REGISTER_LITE_KERNEL(reshape, ...@@ -66,9 +99,9 @@ REGISTER_LITE_KERNEL(reshape,
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.BindInput("Shape", .BindInput("Shape",
{LiteType::GetTensorTy(TARGET(kFPGA), {LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16), PRECISION(kAny),
DATALAYOUT(kNHWC))}) DATALAYOUT(kAny))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA), {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16), PRECISION(kFP16),
...@@ -86,9 +119,9 @@ REGISTER_LITE_KERNEL(reshape2, ...@@ -86,9 +119,9 @@ REGISTER_LITE_KERNEL(reshape2,
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.BindInput("Shape", .BindInput("Shape",
{LiteType::GetTensorTy(TARGET(kFPGA), {LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16), PRECISION(kAny),
DATALAYOUT(kNHWC))}) DATALAYOUT(kAny))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA), {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16), PRECISION(kFP16),
...@@ -103,16 +136,16 @@ REGISTER_LITE_KERNEL(flatten, ...@@ -103,16 +136,16 @@ REGISTER_LITE_KERNEL(flatten,
kFPGA, kFPGA,
kFP16, kFP16,
kNHWC, kNHWC,
paddle::lite::kernels::fpga::ReshapeCompute, paddle::lite::kernels::fpga::FlattenCompute,
def) def)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA), {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.BindInput("Shape", .BindInput("Shape",
{LiteType::GetTensorTy(TARGET(kFPGA), {LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16), PRECISION(kAny),
DATALAYOUT(kNHWC))}) DATALAYOUT(kAny))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA), {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16), PRECISION(kFP16),
...@@ -123,16 +156,16 @@ REGISTER_LITE_KERNEL(flatten2, ...@@ -123,16 +156,16 @@ REGISTER_LITE_KERNEL(flatten2,
kFPGA, kFPGA,
kFP16, kFP16,
kNHWC, kNHWC,
paddle::lite::kernels::fpga::ReshapeCompute, paddle::lite::kernels::fpga::FlattenCompute,
def) def)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA), {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.BindInput("Shape", .BindInput("Shape",
{LiteType::GetTensorTy(TARGET(kFPGA), {LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16), PRECISION(kAny),
DATALAYOUT(kNHWC))}) DATALAYOUT(kAny))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA), {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16), PRECISION(kFP16),
......
...@@ -30,6 +30,14 @@ class ReshapeCompute ...@@ -30,6 +30,14 @@ class ReshapeCompute
virtual ~ReshapeCompute() = default; virtual ~ReshapeCompute() = default;
}; };
class FlattenCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void Run() override;
virtual ~FlattenCompute() = default;
};
class ReshapeComputeFpgaToHost class ReshapeComputeFpgaToHost
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> { : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public: public:
......
...@@ -29,8 +29,8 @@ void ScaleCompute::PrepareForRun() { ...@@ -29,8 +29,8 @@ void ScaleCompute::PrepareForRun() {
scale_param.output = param.output->ZynqTensor(); scale_param.output = param.output->ZynqTensor();
int channel = scale_param.input->shape().channel(); int channel = scale_param.input->shape().channel();
zynqmp::Tensor* scale = new zynqmp::Tensor(); zynqmp::Tensor* scale = &scale_;
zynqmp::Tensor* bias = new zynqmp::Tensor(); zynqmp::Tensor* bias = &bias_;
zynqmp::Shape shape(zynqmp::N, {channel}); zynqmp::Shape shape(zynqmp::N, {channel});
float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape); float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape);
float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape); float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape);
......
...@@ -37,6 +37,8 @@ class ScaleCompute ...@@ -37,6 +37,8 @@ class ScaleCompute
private: private:
zynqmp::ScalePE pe_; zynqmp::ScalePE pe_;
zynqmp::Tensor scale_;
zynqmp::Tensor bias_;
}; };
} // namespace fpga } // namespace fpga
......
...@@ -26,7 +26,8 @@ void SoftmaxCompute::PrepareForRun() { ...@@ -26,7 +26,8 @@ void SoftmaxCompute::PrepareForRun() {
zynqmp::SoftmaxParam& softmax_param = pe_.param(); zynqmp::SoftmaxParam& softmax_param = pe_.param();
auto& param = Param<operators::SoftmaxParam>(); auto& param = Param<operators::SoftmaxParam>();
param.output->mutable_data<float16>(); // param.output->mutable_data<float16>();
param.output->mutable_data<float>();
softmax_param.input = param.x->ZynqTensor(); softmax_param.input = param.x->ZynqTensor();
softmax_param.output = param.output->ZynqTensor(); softmax_param.output = param.output->ZynqTensor();
pe_.init(); pe_.init();
...@@ -34,9 +35,13 @@ void SoftmaxCompute::PrepareForRun() { ...@@ -34,9 +35,13 @@ void SoftmaxCompute::PrepareForRun() {
} }
void SoftmaxCompute::Run() { void SoftmaxCompute::Run() {
zynqmp::SoftmaxParam& softmax_param = pe_.param();
// softmax_param.input->saveToFile("softmax_in", true);
pe_.dispatch(); pe_.dispatch();
softmax_param.output->flush();
// softmax_param.output->saveToFile("softmax", true);
#ifdef FPGA_PRINT_TENSOR #ifdef FPGA_PRINT_TENSOR
zynqmp::SoftmaxParam& softmax_param = pe_.param();
Debugger::get_instance().registerOutput("softmax", softmax_param.output); Debugger::get_instance().registerOutput("softmax", softmax_param.output);
#endif #endif
} }
...@@ -57,7 +62,17 @@ REGISTER_LITE_KERNEL(softmax, ...@@ -57,7 +62,17 @@ REGISTER_LITE_KERNEL(softmax,
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA), {LiteType::GetTensorTy(TARGET(kARM))})
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize(); .Finalize();
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
\ No newline at end of file
...@@ -34,17 +34,17 @@ void transposeCompute(operators::TransposeParam param) { ...@@ -34,17 +34,17 @@ void transposeCompute(operators::TransposeParam param) {
input_x->ZynqTensor()->invalidate(); input_x->ZynqTensor()->invalidate();
input_x->ZynqTensor()->unalignImage(); input_x->ZynqTensor()->unalignImage();
Tensor float_input; // Tensor float_input;
float_input.Resize(input_x_dims); // float_input.Resize(input_x_dims);
float_input.mutable_data<float>(); // float_input.mutable_data<float>();
float_input.ZynqTensor()->copyFrom(input_x->ZynqTensor()); // float_input.ZynqTensor()->copyFrom(input_x->ZynqTensor());
const auto* input_x_data = float_input.data<float>(); const auto* input_x_data = input_x->data<float16>();
auto* out = param.output; auto* out = param.output;
const auto axis = param.axis; const auto axis = param.axis;
auto* out_data = out->mutable_data<float>(); auto* out_data = out->mutable_data<float16>();
size_t ndim = axis.size(); size_t ndim = axis.size();
std::vector<int> xdim(ndim); std::vector<int> xdim(ndim);
...@@ -84,10 +84,11 @@ void transposeCompute(operators::TransposeParam param) { ...@@ -84,10 +84,11 @@ void transposeCompute(operators::TransposeParam param) {
void TransposeCompute::Run() { void TransposeCompute::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
param.output->mutable_data<zynqmp::float16>(); param.output->mutable_data<zynqmp::float16>();
param.x->ZynqTensor()->invalidate(); // param.x->ZynqTensor()->invalidate();
param.x->ZynqTensor()->unalignImage(); param.x->ZynqTensor()->unalignImage();
if (param.x->dims().size() != 4) { if (param.x->dims().size() != 4) {
transposeCompute(param); transposeCompute(param);
param.output->ZynqTensor()->setAligned(param.x->ZynqTensor()->aligned());
} else { } else {
param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor()); param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
} }
...@@ -96,14 +97,25 @@ void TransposeCompute::Run() { ...@@ -96,14 +97,25 @@ void TransposeCompute::Run() {
// Transpose2 // Transpose2
void Transpose2Compute::Run() { void Transpose2Compute::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
param.output->mutable_data<float>(); param.output->mutable_data<float16>();
param.x->ZynqTensor()->invalidate(); // param.x->ZynqTensor()->syncToCPU();
// param.x->ZynqTensor()->saveToFile("t_in", true);
param.x->ZynqTensor()->unalignImage(); param.x->ZynqTensor()->unalignImage();
// param.x->ZynqTensor()->saveToFile("t_unaligned", true);
param.x->ZynqTensor()->flush();
param.x->ZynqTensor()->invalidate();
if (param.x->dims().size() != 4) { if (param.x->dims().size() != 4) {
transposeCompute(param); transposeCompute(param);
param.output->ZynqTensor()->setAligned(param.x->ZynqTensor()->aligned());
} else { } else {
param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor()); param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
} }
// param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
param.output->ZynqTensor()->flush();
// param.output->ZynqTensor()->saveToFile("Transpose2", true);
} }
} // namespace fpga } // namespace fpga
...@@ -139,6 +151,8 @@ REGISTER_LITE_KERNEL(transpose2, ...@@ -139,6 +151,8 @@ REGISTER_LITE_KERNEL(transpose2,
{LiteType::GetTensorTy(TARGET(kFPGA), {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册