Commit 945aa36f authored by: C chonwhite

removed some comments

Parent c6f500fd
......@@ -15,6 +15,7 @@ USE_LITE_KERNEL(floor, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(rsqrt, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(prior_box_fpga, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(logical_xor, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(logical_and, kARM, kFloat, kNCHW, def);
......
......@@ -135,16 +135,16 @@ TEST(ResNet50, test) {
// std::cout << ":" << out1->data<float>()[i] << std::endl;
// }
// std::string file = "output/" + FLAGS_input_file.substr (6);
// std::cout << "file:::" << file << std::endl;
// std::ofstream ofs;
// ofs.open(file);
// for (int i = 0; i < out->dims().production(); i++) {
// float value = out->data<float>()[i];
// ofs << value << std::endl;
// }
// ofs.close();
std::string file = "output/" + FLAGS_input_file.substr(6);
std::cout << "file:::" << file << std::endl;
std::ofstream ofs;
ofs.open(file);
for (int i = 0; i < out->dims().production(); i++) {
float value = out->data<float>()[i];
ofs << value << std::endl;
}
ofs.close();
LOG(INFO) << "================== Speed Report ===================";
}
......
......@@ -5,6 +5,8 @@
namespace paddle {
namespace lite {
#define FPGA_PRINT_TENSOR
class Debugger {
public:
static Debugger& get_instance() {
......@@ -12,7 +14,7 @@ class Debugger {
return s_instance;
}
void registerOutput(std::string op_type, Tensor* tensor) {
void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
// tensor->printScale();
// tensor->saveToFile(op_type, true);
}
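For context, kernels in this commit funnel their debug dumps through this hook, guarded by FPGA_PRINT_TENSOR. A minimal sketch of the calling pattern, as it appears in the conv kernel changes later in this commit:
#ifdef FPGA_PRINT_TENSOR
  zynqmp::ConvParam& conv_param = conv_pe_.param();
  Debugger::get_instance().registerOutput("conv", conv_param.output);
#endif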
......@@ -101,8 +103,6 @@ inline void save_float(float* data, const std::string& name, int len) {
}
inline void save_tensor(lite::Tensor* t, const std::string& name, bool convert = true) {
float* data = const_cast<float*>(t->data<float>());
float* dst = new float[t->numel()];
if (convert) {
......@@ -111,7 +111,6 @@ inline void save_tensor(lite::Tensor* t,const std::string& name, bool convert =
}
save_float(data, name, t->numel());
delete[] dst;
}
......
......@@ -86,10 +86,8 @@ void format_bias_array(float **bias_array, int num) {
(int16_t *)fpga_malloc(num_after_align * sizeof(int16_t)); // NOLINT
memset(ptr_aligned, 0, num_after_align * sizeof(int16_t));
std::cout << "bias::" << std::endl;
for (int i = 0; i < num_before_align; i++) {
float value = ptr_aligned[i];
std::cout << "@:" << i << " = " << value << std::endl;
ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]);
}
*bias_array = (float *)ptr_aligned; // NOLINT
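The loop above packs each fp32 bias value into a 16-bit half via fp32_2_fp16. For reference, a minimal illustrative conversion to IEEE 754 binary16 (the helper name is hypothetical, and the backend's actual fp32_2_fp16 may round to nearest and treat NaN differently rather than truncating):
#include <cstdint>
#include <cstring>
// Illustrative only: truncates the mantissa and flushes subnormals to zero.
static int16_t fp32_to_fp16_sketch(float value) {  // hypothetical helper
  uint32_t bits = 0;
  std::memcpy(&bits, &value, sizeof(bits));  // bit-cast safely
  uint16_t sign = static_cast<uint16_t>((bits >> 16) & 0x8000);  // move sign bit
  int32_t exp = static_cast<int32_t>((bits >> 23) & 0xFF) - 127 + 15;  // re-bias exponent
  uint16_t mantissa = static_cast<uint16_t>((bits >> 13) & 0x03FF);  // keep top 10 mantissa bits
  if (exp <= 0) return static_cast<int16_t>(sign);            // underflow: flush to signed zero
  if (exp >= 31) return static_cast<int16_t>(sign | 0x7C00);  // overflow: +/- infinity
  return static_cast<int16_t>(sign | (exp << 10) | mantissa);
}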
......
......@@ -28,7 +28,7 @@ limitations under the License. */
namespace paddle {
namespace zynqmp {
#define PADDLE_MOBILE_OS_LINUX
#define PADDLE_OS_LINUX
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
......@@ -38,7 +38,7 @@ static size_t memory_size_max = 0;
static size_t memory_size = 0;
static inline int do_ioctl(uint64_t req, const void *arg) {
#ifdef PADDLE_MOBILE_OS_LINUX
#ifdef PADDLE_OS_LINUX
return ioctl(fd, req, arg);
#else
return -1;
......@@ -46,11 +46,9 @@ static inline int do_ioctl(uint64_t req, const void *arg) {
}
int open_device() {
// std::cout << "open_device" << std::endl;
if (fd == -1) {
fd = open(device_path, O_RDWR);
}
// std::cout << "open_device fd:" << fd << std::endl;
return fd;
}
......@@ -68,7 +66,7 @@ void *fpga_malloc(size_t size) {
#ifdef ENABLE_DEBUG
// std::cout << "fpga_malloc:" << size << std::endl;
#endif
#ifdef PADDLE_MOBILE_OS_LINUX
#ifdef PADDLE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
if (ptr == NULL) {
......@@ -113,7 +111,7 @@ void fpga_free(void *ptr) {
memory_size -= size;
#ifdef PADDLE_MOBILE_OS_LINUX
#ifdef PADDLE_OS_LINUX
munmap(ptr, size);
#else
......
......@@ -64,14 +64,11 @@ class ConvPE : public PE {
if (!use_cpu_) {
// param_.filter->releaseData();
}
// exit(-1);
}
void cpu_conv_hwc() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
......
......@@ -324,7 +324,7 @@ inline void split_filter_num(const ConvParam& c_param) {
Shape s_shape(N, {filter_num});
float* scale_data = scale.mutableData<float>(FP32, s_shape);
float* bias_data = bias.mutableData<float>(FP32, s_shape);
std::cout << "v size: " << v.size() << std::endl;
// std::cout << "v size: " << v.size() << std::endl;
for (int n = 0; n < filter_num; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
// scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
......
......@@ -94,21 +94,7 @@ class FullyConnectedPE : public PE {
}
bool dispatch() {
// return
return convPE_.dispatch();
// convPE_.dispatch();
// if (num_ == 1) {
// return true;
// }
// Tensor* output = param_.output;
// int size = output->shape().numel() * sizeof(float16);
// memcpy(output->data<void>(), tempOut_->data<void>(), size);
// for (int i = 1;i < num_;i ++) {
// memcpy(output->data<void>(), tempOut_->data<void>(), size);
// }
// return true;
}
FullyConnectedParam& param() { return param_; }
......
......@@ -395,7 +395,7 @@ class Tensor {
}
void save_file_with_name(std::string path) {
// return;
return;
invalidate();
// usleep(20000);
// return;
......
......@@ -92,34 +92,17 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
}
void TensorLite::CopyDataFrom(const TensorLite &other) {
// std::cout << "1\n";
dims_ = other.dims_;
// std::cout << "2\n";
target_ = other.target_;
// std::cout << "3\n";
lod_ = other.lod_;
auto dt = zynq_tensor_->dataType();
// std::cout << "4\n";
// std::cout << "dt:" << dt << std::endl;
auto shape = other.zynq_tensor_->shape();
Resize(other.dims());
// mutable_data<float>();
zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape);
// std::cout << "copy Data From: \n";
// std::cout << "ss" << (void*)(other.ZynqTensor()) << "\n";
this->ZynqTensor()->copyFrom(other.ZynqTensor());
// set_lod(other.lod());
}
// template <typename T>
// void TensorLite::mutable_data_internal() {
// }
} // namespace lite
} // namespace paddle
......@@ -293,23 +293,7 @@ void TensorLite::Slice(TensorLite& dst, int64_t begin, int64_t end) const {
int64_t base = numel() / dims_[0];
T* src_data = const_cast<T*>(data<T>());
std::cout << "end:" << end << " begin:" << begin << std::endl;
std::cout << "base:" << base << std::endl;
std::cout << "production:" << dst_dims.production() << std::endl;
memcpy(dst_data, src_data + static_cast<size_t>(begin * dst_dims.production()), dst_dims.production() * sizeof(T));
// dst.ZynqTensor()->saveToFile("_slice", true);
// if (dims_[0] == 1) {
// dst-
// return;
// } else {
// // dst.offset_ = offset_ + static_cast<size_t>(begin * base) * sizeof(T);
// return dst;
// }
}
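// Worked example of the slice above, in the single-row form used by the NMS
// kernel (scores.Slice<T>(score_slice, c, c + 1)): with dims_ = {4, 5},
// begin = 2 and end = 3, base = 20 / 4 = 5 and dst_dims.production() = 1 * 5,
// so the memcpy copies 5 elements starting at src_data + 2 * 5. Note that the
// offset uses dst_dims.production(), which equals the row stride base only
// when end - begin == 1.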
template <typename TensorT>
......
......@@ -109,7 +109,8 @@ class Optimizer {
"runtime_context_assign_pass",
"argument_type_display_pass",
"memory_optimize_pass"}};
// "memory_optimize_pass"
}};
RunPasses(passes_local);
} else {
RunPasses(passes);
......
......@@ -98,6 +98,18 @@ REGISTER_LITE_KERNEL(prior_box,
kNCHW,
paddle::lite::kernels::arm::PriorBoxCompute,
def)
.BindInput("Input",{LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Image", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(prior_box_fpga,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::PriorBoxCompute,
def)
.BindInput("Input",{LiteType::GetTensorTy(
TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny))})
.BindInput("Image", {LiteType::GetTensorTy(
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/beam_search_decode_compute.h"
#include <algorithm>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
using LoDTensor = lite::Tensor;
using LoDTensorArray = std::vector<lite::Tensor>;
// All the LoDs have 2 levels.
// The first is the source level, the second is the sentence level.
// The source level describes how many prefixes (branches) each source sentence
// (beam) has; the sentence level describes how these candidates belong to the prefixes.
const size_t kSourceLevel = 0;
const size_t kSentenceLevel = 1;
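// Worked example of the two-level LoD: with 2 source sentences where the
// first has 3 prefixes and the second has 2, the source level could be
// {0, 3, 5}; if those 5 prefixes hold 2, 1, 3, 2 and 2 candidate words, the
// sentence level would be {0, 2, 3, 6, 8, 10}.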
template <typename T>
struct Sentence {
std::vector<float> word_ids;
std::vector<T> scores;
};
template <typename T>
using SentenceVector = std::vector<Sentence<T>>;
template <typename T>
struct BeamSearchDecoder {
BeamSearchDecoder(size_t beam_size, int end_id)
: beam_size_(beam_size), end_id_(end_id) {}
/**
* Convert the result sentence_vector for each source sentence into two
* LoDTensors: one holds all candidate sentences as word ids, the other holds
* the corresponding word scores.
* Param:
*   sentence_vector_list: sentence_vector for each source sentence.
*   id_tensor: result LoDTensor for sentence word ids.
*   score_tensor: result LoDTensor for sentence scores.
*   reverse: whether the ids of each sentence in sentence_vector_list are reversed.
*   sort_by_score: whether to sort the hypotheses of each sentence by score.
*/
void ConvertSentenceVectorToLodTensor(
std::vector<SentenceVector<T>> sentence_vector_list,
LoDTensor* id_tensor,
LoDTensor* score_tensor,
bool reverse = true,
bool sort_by_score = true) const {
size_t src_num = sentence_vector_list.size();
CHECK_GT(src_num, 0) << "src_num should not be 0";
std::vector<uint64_t> source_level_lod = {0};
std::vector<uint64_t> sentence_level_lod = {0};
std::vector<float> id_data;
std::vector<T> score_data;
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
if (sort_by_score) {
sort(sentence_vector_list[src_idx].begin(),
sentence_vector_list[src_idx].end(),
[reverse](const Sentence<T>& a, const Sentence<T>& b) {
if (reverse)
return a.scores.front() > b.scores.front();
else
return a.scores.back() > b.scores.back();
});
}
for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
if (reverse) {
id_data.insert(id_data.end(),
sentence.word_ids.rbegin(),
sentence.word_ids.rend());
score_data.insert(score_data.end(),
sentence.scores.rbegin(),
sentence.scores.rend());
} else {
id_data.insert(id_data.end(),
sentence.word_ids.begin(),
sentence.word_ids.end());
score_data.insert(
score_data.end(), sentence.scores.begin(), sentence.scores.end());
}
sentence_level_lod.push_back(sentence_level_lod.back() +
sentence.word_ids.size());
}
source_level_lod.push_back(source_level_lod.back() +
sentence_vector_list[src_idx].size());
}
LoD lod;
lod.push_back(source_level_lod);
lod.push_back(sentence_level_lod);
*(id_tensor->mutable_lod()) = lod;
id_tensor->Resize({static_cast<int64_t>(id_data.size())});
auto id_ptr = id_tensor->mutable_data<float>();
TargetCopy(
TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(float));
*(score_tensor->mutable_lod()) = lod;
score_tensor->Resize({static_cast<int64_t>(score_data.size())});
auto score_ptr = score_tensor->mutable_data<T>();
TargetCopy(TARGET(kARM),
score_ptr,
score_data.data(),
score_data.size() * sizeof(T));
}
/**
* Gather the hypotheses for each source sentence by backtracing through the
* LoDTensorArray step_ids, whose LoDs preserve the path in the tree.
*/
void Backtrace(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor,
LoDTensor* score_tensor) const {
CHECK(!step_ids.empty()) << "step num should be larger than 0";
CHECK_EQ(step_ids.size(), step_scores.size())
<< "step_ids and step_scores should be the same";
const size_t step_num = step_ids.size();
const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
std::vector<SentenceVector<T>> sentence_vector_list(
src_num, SentenceVector<T>(beam_size_));
std::vector<std::vector<size_t>> prefix_idx_vector_list(src_num);
for (int step_id = step_num - 1; step_id >= 0; --step_id) {
auto& cur_ids = step_ids.at(step_id);
auto& cur_scores = step_scores.at(step_id);
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
// for each source sentence
auto& sentence_vector = sentence_vector_list.at(src_idx);
auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx);
size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx];
size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
if (prefix_idx_vector.empty()) { // be finished and pruned at this step
// or the last time step
for (size_t prefix_idx = src_prefix_start;
prefix_idx < src_prefix_end;
++prefix_idx) {
size_t candidate_start =
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
size_t candidate_end =
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1];
for (size_t candidate_idx = candidate_start;
candidate_idx < candidate_end;
++candidate_idx) {
prefix_idx_vector.push_back(prefix_idx);
size_t idx = prefix_idx_vector.size() - 1;
auto cur_id = cur_ids.data<float>()[candidate_idx];
auto cur_score = cur_scores.data<T>()[candidate_idx];
sentence_vector.at(idx).word_ids.push_back(cur_id);
sentence_vector.at(idx).scores.push_back(cur_score);
}
}
} else { // use prefix_idx_vector to backtrace
size_t src_candidate_start =
cur_ids.lod().at(kSentenceLevel)[src_prefix_start];
size_t prefix_idx = src_prefix_start;
size_t candidate_num =
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) {
auto candidate_idx = prefix_idx_vector.at(idx);
auto cur_id = cur_ids.data<float>()[candidate_idx];
auto cur_score = cur_scores.data<T>()[candidate_idx];
if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) {
// to skip redundant end tokens
sentence_vector.at(idx).word_ids.push_back(cur_id);
sentence_vector.at(idx).scores.push_back(cur_score);
}
while (src_candidate_start + candidate_num <=
candidate_idx) { // search the corresponding prefix
prefix_idx++;
candidate_num +=
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
}
prefix_idx_vector.at(idx) = prefix_idx;
}
}
}
}
ConvertSentenceVectorToLodTensor(
sentence_vector_list, id_tensor, score_tensor, true, true);
}
size_t beam_size_;
int end_id_;
};
struct BeamSearchDecodeFunctor {
BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor,
LoDTensor* score_tensor,
size_t beam_size,
int end_id)
: beam_size_(beam_size),
end_id_(end_id),
step_ids_(step_ids),
step_scores_(step_scores),
id_tensor_(id_tensor),
score_tensor_(score_tensor) {}
template <typename T>
void apply() const {
BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
beam_search_decoder.Backtrace(
step_ids_, step_scores_, id_tensor_, score_tensor_);
}
size_t beam_size_;
int end_id_;
const LoDTensorArray& step_ids_;
const LoDTensorArray& step_scores_;
LoDTensor* id_tensor_;
LoDTensor* score_tensor_;
};
template <>
void BeamSearchDecodeFunctor::apply<bool>() const {
LOG(FATAL) << "beam search decode op does not support bool!";
}
void BeamSearchDecodeCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
// inputs
auto ids = param.ids;
auto scores = param.scores;
// outputs
auto sentence_ids = param.sentence_ids;
auto sentence_scores = param.sentence_scores;
const size_t step_num = ids->size();
CHECK_GT(step_num, 0UL) << "beam search steps should be larger than 0";
const size_t source_num = ids->at(0).lod().at(0).size() - 1;
CHECK_GT(source_num, 0UL) << "source num should be larger than 0";
for (size_t i = 0; i < step_num; ++i) {
CHECK_EQ(ids->at(i).lod().size(), 2UL) << "Level of LodTensor should be 2";
}
//! fixme
// only support float score now
BeamSearchDecodeFunctor func(*ids,
*scores,
sentence_ids,
sentence_scores,
param.beam_size,
param.end_id);
func.apply<float>();
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(beam_search_decode,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::BeamSearchDecodeCompute,
def)
.BindInput("Ids", {LiteType::GetTensorListTy(TARGET(kARM))})
.BindInput("Scores", {LiteType::GetTensorListTy(TARGET(kARM))})
.BindOutput("SentenceIds", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("SentenceScores", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class BeamSearchDecodeCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::BeamSearchDecodeParam;
BeamSearchDecodeCompute() = default;
void Run() override;
virtual ~BeamSearchDecodeCompute() = default;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/box_coder_compute.h"
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/fpga/KD/float16.hpp"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
void BoxCoderCompute::Run() {
auto& param = Param<operators::ReshapeParam>();
param.output->mutable_data<float16>();
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(box_coder,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::BoxCoderCompute,
def)
.BindInput("PriorBox",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("PriorBoxVar",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("TargetBox",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("OutputBox",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class BoxCoderCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::BoxCoderParam;
void Run() override;
virtual ~BoxCoderCompute() = default;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -33,13 +33,6 @@ void CalibComputeFp32ToFP16::Run() {
const auto* din = param.input->data<float>();
param.output->mutable_data<float16>();
param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
// for (int i = 0; i < param.input->numel(); ++i) {
// dout[i] = zynqmp::float_to_half(din[i]);
// }
param.input->ZynqTensor()->saveToFile("calib_input.txt");
param.output->ZynqTensor()->saveToFile("ouput_31.txt");
param.output->ZynqTensor()->printScale("calib");
auto out_lod = param.output->mutable_lod();
*out_lod = param.input->lod();
return;
......@@ -53,13 +46,7 @@ void CalibComputeFP16ToFp32::Run() {
auto& param = this->Param<operators::CalibParam>();
const auto* din = param.input->data<float16>();
auto* dout = param.output->mutable_data<float>();
// for (int i = 0; i < param.input->numel(); ++i) {
// dout[i] = zynqmp::half_to_float(din[i]);
// }
param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
param.output->ZynqTensor()->saveToFile("ouput_13.txt");
auto out_lod = param.output->mutable_lod();
*out_lod = param.input->lod();
return;
......
......@@ -12,13 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/concat_compute.h"
#include <string>
#include <vector>
#include "lite/kernels/fpga/concat_compute.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
namespace kernels {
......@@ -43,8 +45,10 @@ void ConcatCompute::PrepareForRun() {
void ConcatCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ConcatParam& concat_param = pe_.param();
concat_param.output->saveToFile("concat", true);
Debugger::get_instance().registerOutput("concat", concat_param.output);
#endif
}
} // namespace fpga
......
......@@ -12,10 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/conv_compute.h"
#include <vector>
#include "lite/kernels/fpga/conv_compute.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
namespace kernels {
......@@ -60,14 +63,9 @@ void ConvCompute::PrepareForRun() {
fill_scale_bias_const(&conv_param);
if (param.bias != nullptr) {
conv_param.bias()->copyFrom(param.bias->ZynqTensor());
std::cout << "copy bias \n";
}
conv_param.relu.enabled = param.fuse_relu;
// conv_param.filter->saveToFile("filter", true);
// conv_param.bias()->saveToFile("bias", true);
// conv_param.scale()->saveToFile("scale", true);
conv_pe_.init();
conv_pe_.apply();
}
......@@ -75,18 +73,15 @@ void ConvCompute::PrepareForRun() {
void ConvCompute::Run() {
auto& param = this->Param<param_t>();
// std::cout << "in:" << param.x->ZynqTensor()->data<void>() << std::endl;
if (param.x->ZynqTensor()->shape().channel() != 1 &&
param.groups == param.x->ZynqTensor()->shape().channel()) {
dw_conv_pe_.dispatch();
// param.output->ZynqTensor()->saveToFile("dw", true);
} else {
zynqmp::ConvParam& conv_param = conv_pe_.param();
conv_pe_.dispatch();
// conv_param.input->saveToFile("_conv_in", true);
conv_param.output->printScale("conv");
param.output->ZynqTensor()->saveToFile("_conv", true);
// conv_param.output->saveToFile("_conv_param", true);
#ifdef FPGA_PRINT_TENSOR
zynqmp::ConvParam& conv_param = conv_pe_.param();
Debugger::get_instance().registerOutput("conv", conv_param.output);
#endif
}
}
......
......@@ -12,11 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/dropout_compute.h"
#include <string>
#include "lite/kernels/fpga/dropout_compute.h"
#include "lite/backends/fpga/KD/float16.hpp"
// #include "lite/backends/arm/math/funcs.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
......@@ -54,12 +54,11 @@ void DropoutCompute::PrepareForRun() {
}
void DropoutCompute::Run() {
auto& param = Param<operators::DropoutParam>();
zynqmp::ScaleParam& scale_param = pe_.param();
// scale_param.input->saveToFile("drop_in.txt");
pe_.dispatch();
// scale_param.output->saveToFile("drop_out.txt");
// std::cout << "prob:" << param.dropout_prob << std::endl;
#ifdef FPGA_PRINT_TENSOR
zynqmp::ScaleParam& scale_param = pe_.param();
Debugger::get_instance().registerOutput("dropout", scale_param.output);
#endif
}
} // namespace fpga
......
......@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/elementwise_compute.h"
#include <string>
#include "lite/kernels/fpga/elementwise_compute.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
......@@ -39,8 +40,10 @@ void ElementwiseAddCompute::PrepareForRun() {
}
void ElementwiseAddCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ElementwiseAddParam& ew_param = pe_.param();
// ew_param.output->saveToFile("ew", true);
Debugger::get_instance().registerOutput("ew_add", ew_param.output);
#endif
}
void ElementwiseAddActivationCompute::PrepareForRun() {
......@@ -59,6 +62,10 @@ void ElementwiseAddActivationCompute::PrepareForRun() {
}
void ElementwiseAddActivationCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ElementwiseAddParam& ew_param = pe_.param();
Debugger::get_instance().registerOutput("ew_add", ew_param.output);
#endif
}
void ElementwiseMulCompute::PrepareForRun() {
......@@ -66,14 +73,8 @@ void ElementwiseMulCompute::PrepareForRun() {
auto& param = Param<operators::ElementwiseParam>();
param.Out->mutable_data<float16>();
scale_param.input = param.X->ZynqTensor();
scale_param.output = param.Out->ZynqTensor();
// param.Y->ZynqTensor()->saveToFile("scale_y", true);
std::cout << "y_production:" << param.Y->dims().production() << std::endl;
// exit(-1);
scale_param.relu.enabled = false;
......@@ -85,39 +86,26 @@ void ElementwiseMulCompute::PrepareForRun() {
zynqmp::Shape shape(zynqmp::N, {channel});
float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape);
float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape);
float scale_value = param.Y->data<float>()[0];
std::cout << "scale_value:" << scale_value << std::endl;
std::cout << "channel:" << channel << std::endl;
std::cout << "data_type:" << param.Y->ZynqTensor()->dataType() << std::endl;
// exit(-1);
for (int i = 0; i < channel; ++i) {
if (param.Y->dims().production() != 1) {
scale_value = param.Y->ZynqTensor()->data<float>()[i];
}
scale_data[i] = scale_value;
bias_data[i] = 0;
}
pe_.init();
pe_.apply();
// scale_param.input->saveToFile("scale_input", true);
// scale_param.scale->saveToFile("scale_scale", true);
param.Y->ZynqTensor()->saveToFile("ew_y", true);
// exit(-1);
}
void ElementwiseMulCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ScaleParam& scale_param = pe_.param();
// scale_param.output->saveToFile("ew_mul", true);
// exit(-1);
Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
#endif
}
} // namespace fpga
......
......@@ -15,6 +15,7 @@
#include "lite/kernels/fpga/fc_compute.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
......@@ -30,7 +31,6 @@ void FcCompute::PrepareForRun() {
zynqmp::FullyConnectedParam& fc_param = pe_.param();
param.output->mutable_data<float16>();
fc_param.input = param.input->ZynqTensor();
fc_param.output = param.output->ZynqTensor();
fc_param.filter = param.w->ZynqTensor();
......@@ -42,8 +42,10 @@ void FcCompute::PrepareForRun() {
void FcCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::FullyConnectedParam& fc_param = pe_.param();
// fc_param.output->saveToFile("fc", true);
Debugger::get_instance().registerOutput("fc", fc_param.output);
#endif
}
} // namespace fpga
......
......@@ -15,6 +15,7 @@
#include "lite/kernels/fpga/feed_compute.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
......@@ -37,24 +38,17 @@ void FeedCompute::PrepareForRun() {
}
void FeedCompute::Run() {
std::cout << "================= FeedCompute ================= \n";
auto& param = this->Param<param_t>();
Tensor& x = param.feed_list->at(param.col);
zynqmp::InputParam& feed_param = pe_.param();
if (x.dims().production() == 7590) {
feed_param.input->readFromFile("position_encoding.data");
feed_param.input->saveToFile("read.txt");
}
pe_.dispatch();
auto out_lod = param.out->mutable_lod();
*out_lod = x.lod();
feed_param.input->saveToFile("feed_in.txt");
feed_param.output->saveToFile("feed.txt");
#ifdef FPGA_PRINT_TENSOR
zynqmp::InputParam& feed_param = pe_.param();
Debugger::get_instance().registerOutput("feed", feed_param.output);
#endif
}
} // namespace fpga
......
......@@ -14,6 +14,7 @@
#include "lite/kernels/fpga/fetch_compute.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
......@@ -25,7 +26,7 @@ using float16 = zynqmp::float16;
void FetchCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
// ====================================================
zynqmp::OutputParam& conv_param = pe_.param();
zynqmp::OutputParam& fetch_param = pe_.param();
auto fetch_list = param.fetch_list;
if (fetch_list->size() <= static_cast<size_t>(param.col)) {
fetch_list->resize(param.col + 1);
......@@ -34,8 +35,8 @@ void FetchCompute::PrepareForRun() {
out.Resize(param.input->dims());
out.mutable_data<float>();
conv_param.input = param.input->ZynqTensor();
conv_param.output = out.ZynqTensor();
fetch_param.input = param.input->ZynqTensor();
fetch_param.output = out.ZynqTensor();
pe_.init();
pe_.apply();
......@@ -44,8 +45,11 @@ void FetchCompute::PrepareForRun() {
void FetchCompute::Run() {
pe_.dispatch();
auto& param = this->Param<param_t>();
zynqmp::OutputParam& conv_param = pe_.param();
conv_param.output->saveToFile("fetch", true);
#ifdef FPGA_PRINT_TENSOR
zynqmp::OutputParam& fetch_param = pe_.param();
Debugger::get_instance().registerOutput("fetch", fetch_param.output);
#endif
}
} // namespace fpga
......
......@@ -12,12 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/mul_compute.h"
#include <vector>
// #include "lite/backends/arm/math/funcs.h"
#include "lite/kernels/fpga/mul_compute.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
namespace kernels {
......@@ -37,7 +38,6 @@ void MulCompute::PrepareForRun() {
fc_param.output = param.output->ZynqTensor();
fc_param.filter = param.y->ZynqTensor();
// fc_param.bias = param.bias->ZynqTensor();
fc_param.bias = &bias_;
int channel = fc_param.filter->shape().channel();
......@@ -59,15 +59,7 @@ void mul(MulCompute* k) {
int fn = param.y->dims()[1];
std::cout << "num: " << num << std::endl;
std::cout << "channel: " << channel << std::endl;
std::cout << "fn: " << fn << std::endl;
param.y->ZynqTensor()->saveToFile("filter.txt");
float16* out_data = param.output->mutable_data<float16>();
// int si = 0;
int g_index = 0;
for (int n = 0; n < 1; n++) {
......@@ -77,12 +69,10 @@ void mul(MulCompute* k) {
for (int c = 0; c < channel; c++) {
float value = zynqmp::half_to_float(param.x->data<float16>()[si]);
int index = c * fn + on;
// std::cout << "index: " << index << std::endl;
float weight = param.y->data<float>()[index];
sum += value * weight;
si++;
}
std::cout << sum << "\n";
out_data[g_index] = zynqmp::float_to_half(sum);
g_index++;
}
......@@ -91,37 +81,12 @@ void mul(MulCompute* k) {
void MulCompute::Run() {
// auto& param = Param<param_t>();
zynqmp::FullyConnectedParam& fc_param = pe_.param();
std::cout << "1\n";
// fc_param.input->readFromFile("arm_8_im_in.data");
// fc_param.input->flush();
float16* data_in = fc_param.input->data<float16>();
// float16 one = zynqmp::float_to_half(1.0f);
// for (int i = 0; i < fc_param.input->shape().alignedElementCount(); i++) {
// data_in[i] = one;
// }
// fc_param.input->scale()[0] = 1.0 / 127;
// fc_param.input->scale()[1] = 127;
pe_.dispatch();
// std::cout << "2\n";
// fc_param.input->printScale("mul");
// std::cout << "3\n";
fc_param.input->saveToFile("mul_in.txt");
// std::cout << "4\n";
// mul(this);
// std::cout << "5\n";
fc_param.output->saveToFile("mul_out.txt");
// exit(-1);
// exit(-1);
// fc_param.output->saveToFile("mul.txt");
// Tensor* output = const_cast<Tensor*>(param.output);
// const auto* x_data = param.x->data<float>();
// param.y->mutable_data<float16>();
// param.output->mutable_data<float16>();
#ifdef FPGA_PRINT_TENSOR
zynqmp::FullyConnectedParam& fc_param = pe_.param();
Debugger::get_instance().registerOutput("mul", fc_param.output);
#endif
}
} // namespace fpga
......
......@@ -195,17 +195,13 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
T score_threshold = static_cast<T>(param.score_threshold);
int num_det = 0;
int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1];
// scores.ZynqTensor()->saveToFile("nms_scores", true);
for (int64_t c = 0; c < class_num; ++c) {
Tensor bbox_slice, score_slice;
if (c == background_label) continue;
if (scores_size == 3) {
scores.Slice<T>(score_slice, c, c + 1);
// score_slice.ZynqTensor()->saveToFile("nms_slice", true);
bbox_slice = bboxes;
} else {
score_slice.Resize({scores.dims()[0], 1});
......@@ -387,27 +383,19 @@ void MulticlassNmsCompute::Run() {
if (e > s) {
Tensor out;
outs->Slice<float>(out, s, e);
// scores_slice.ZynqTensor()->saveToFile("scores_slice", true);
MultiClassOutput<float>(
scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
out.ZynqTensor()->saveToFile("out", true);
outs->ZynqTensor()->copyFrom(out.ZynqTensor());
}
}
}
// save_tensor(param.scores, "_scores.txt", false);
// save_tensor(param.bboxes, "_bboxes.txt", false);
boxes->ZynqTensor()->saveToFile("_boxes", true);
scores->ZynqTensor()->saveToFile("_scores", true);
outs->ZynqTensor()->saveToFile("_outs", true);
LoD lod;
lod.emplace_back(batch_starts);
outs->set_lod(lod);
#ifdef FPGA_PRINT_TENSOR
Debugger::get_instance().registerOutput("nms", outs->ZynqTensor());
#endif
}
} // namespace host
} // namespace kernels
......
......@@ -13,8 +13,7 @@
// limitations under the License.
#include "lite/kernels/fpga/norm_compute.h"
// #include "lite/backends/arm/math/funcs.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
......@@ -27,7 +26,6 @@ void NormCompute::PrepareForRun() {
auto& param = this->Param<operators::NormParam>();
param.Out->mutable_data<float16>();
zynqmp::NormParam& norm_param = pe_.param();
norm_param.input = param.X->ZynqTensor();
norm_param.output = param.Out->ZynqTensor();
......@@ -39,20 +37,10 @@ void NormCompute::PrepareForRun() {
void NormCompute::Run() {
pe_.dispatch();
pe_.param().output->saveToFile("norm.txt", true);
// auto& ctx = this->ctx_->template As<ARMContext>();
// auto& param = this->Param<operators::NormParam>();
// auto input_dims = param.X->dims();
// int dim_size = param.X->dims().size();
// auto axis = (param.axis < 0) ? param.axis + dim_size : param.axis;
// const auto* x_data = param.X->data<float>();
// auto* o_data = param.Out->mutable_data<float>();
// int pre_n = input_dims.count(0, axis);
// int post_n = input_dims.count(axis + 1, dim_size);
// int n = input_dims[axis];
// lite::arm::math::norm(x_data, pre_n, n, post_n, param.epsilon, o_data, &ctx);
#ifdef FPGA_PRINT_TENSOR
zynqmp::NormParam& norm_param = pe_.param();
Debugger::get_instance().registerOutput("norm", norm_param.output);
#endif
}
} // namespace fpga
......
......@@ -12,12 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/pooling_compute.h"
#include <string>
#include <vector>
#include "lite/kernels/fpga/pooling_compute.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
namespace kernels {
......@@ -47,9 +49,10 @@ void PoolCompute::PrepareForRun() {
void PoolCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::PoolingParam& pool_param = pe_.param();
pool_param.output->printScale("pooling");
pool_param.output->saveToFile("pool", true);
Debugger::get_instance().registerOutput("pooling", pool_param.output);
#endif
}
} // namespace fpga
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/while_compute.h"
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
void WhileCompute::PrepareForRun() {
auto &param = Param<operators::WhileParam>();
auto cur_scope = param.scope;
executor_ =
std::make_shared<StepExecutor>(param.sub_block, cur_scope, place());
}
void WhileCompute::Run() {
auto &param = Param<operators::WhileParam>();
while (param.cond->data<bool>()[0]) {
executor_->Run();
}
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
while, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::WhileCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Condition",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
.BindOutput("Out",{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("StepScopes", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <memory>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/operators/while_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class StepExecutor {
typedef std::shared_ptr<OpLite> OpPtr;
public:
StepExecutor(cpp::BlockDesc *block, Scope *scope, Place place)
: scope_(scope), place_(place) {
int32_t op_size = block->OpsSize();
for (int32_t i = 0; i < op_size; ++i) {
auto &op_desc = *block->template GetOp<cpp::OpDesc>(i);
auto op_type = op_desc.Type();
auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type());
VLOG(LOG_INFO) << "while: creating Op [" << op_type << "]";
op_handler->Attach(op_desc, scope);
auto hostplace = place_;
hostplace.target = TARGET(kHost);
auto kernels = op_handler->CreateKernels({place_, hostplace});
CHECK_GT(kernels.size(), 0) << "cannot create kernel";
op_handler->AttachKernel(kernels[0].get());
op_handler->SetKernel(kernels);
ops_of_block_.push_back(op_handler);
}
}
void Run() {
for (auto &op_handler : ops_of_block_) {
// VLOG(4) << op_handler->op_info()->Repr();
op_handler->InferShape();
// VLOG(4) << "while: infered shape";
op_handler->Run();
}
}
private:
Scope *scope_;
Place place_;
std::vector<OpPtr> ops_of_block_;
};
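// Usage sketch (as in WhileCompute::PrepareForRun in while_compute.cc above):
// build one executor for the loop's sub-block and re-run it on every iteration:
//   auto executor =
//       std::make_shared<StepExecutor>(param.sub_block, cur_scope, place());
//   while (param.cond->data<bool>()[0]) executor->Run();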
class WhileCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::WhileParam;
void Run() override;
void PrepareForRun() override;
virtual ~WhileCompute() = default;
private:
std::shared_ptr<StepExecutor> executor_;
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/write_to_array_compute.h"
#include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
void WriteToArrayCompute::PrepareForRun() {}
void WriteToArrayCompute::Run() {
auto& ctx = this->ctx_->template As<ARMContext>();
auto& param = this->Param<operators::WriteToArrayParam>();
CHECK_EQ(param.I->numel(), 1) << "input2 should have only one element";
const auto* x_data = param.X->data<float>();
int id = param.I->data<int>()[0];
int id_test = param.I->data<int64_t>()[0];
if (id >= param.Out->size()) {
for (int i = param.Out->size(); i < id + 1; i++) {
lite::Tensor tmp;
param.Out->push_back(tmp);
}
}
(*param.Out)[id].Resize(param.X->dims());
auto out_lod = (*param.Out)[id].mutable_lod();
*out_lod = param.X->lod();
auto* o_data = (*param.Out)[id].mutable_data<float>(TARGET(kHost));
int input_size = param.X->numel();
memcpy(o_data, x_data, sizeof(float) * input_size);
}
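// For example, if I holds {3} and Out currently has two tensors, the loop
// above appends two empty tensors so that Out[3] exists before it is resized
// and filled from X.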
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(write_to_array,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::WriteToArrayCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("I", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include "lite/backends/arm/math/type_trans.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
class WriteToArrayCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::WriteToArrayParam;
void PrepareForRun() override;
void Run() override;
~WriteToArrayCompute() {}
private:
};
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle