Commit c6d82e0e authored by chonwhite

merge attention_diff into fpga_pr

......@@ -104,3 +104,10 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
# generated files
lite/api/paddle_use_kernels.h
lite/api/paddle_use_ops.h
lite/backends/arm/math/dotprod/gemm_sdot.h
lite/tools/cmake_tools/ast.pyc
......@@ -22,6 +22,8 @@ if (WITH_PADDLE_MOBILE)
return()
endif(WITH_PADDLE_MOBILE)
# set(CMAKE_BUILD_TYPE DEBUG)
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(CMAKE_CXX_STANDARD 11)
......
./lite/tools/build.sh \
--arm_os=armlinux \
--arm_abi=armv8 \
--arm_lang=gcc \
test
......@@ -198,14 +198,24 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_ssd_fpga SRCS test_ssd_fpga.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_inceptionv3_fpga SRCS inceptionv3_test_fpga.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_inceptionv4 SRCS inceptionv4_test.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
lite_cc_test(test_ocr_attention_fpga SRCS ocr_attention_test_fpga.cc
DEPS ${lite_model_test_DEPS})
# lite_cc_test(model_run_test_image SRCS model_run_test_image.cc
# DEPS ${lite_model_test_DEPS}
......
......@@ -121,6 +121,7 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
<< kpf_path;
}
#ifndef LITE_WITH_FPGA
lite::Tensor *Predictor::GetInput(size_t offset) {
CHECK(input_names_.size() > offset)
<< "The network has " << input_names_.size() << " inputs"
......@@ -130,6 +131,17 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
<< " in exec_scope";
return in_var->GetMutable<lite::Tensor>();
}
#else
lite::Tensor *Predictor::GetInput(size_t offset) {
auto *_feed_list = exec_scope_->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto *feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
#endif
// get inputs names
std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
......@@ -167,6 +179,8 @@ void Predictor::PrepareFeedFetch() {
}
}
#ifndef LITE_WITH_FPGA
const lite::Tensor *Predictor::GetOutput(size_t offset) const {
CHECK(output_names_.size() > offset)
<< "The network has " << output_names_.size() << " outputs"
......@@ -186,6 +200,29 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
}
return outputs;
}
#else
const lite::Tensor *Predictor::GetOutput(size_t offset) const {
auto *_fetch_list = exec_scope_->FindVar("fetch");
CHECK(_fetch_list) << "no fetch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
auto *_fetch_list = exec_scope_->FindVar("fetch");
CHECK(_fetch_list) << "no fetch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
std::vector<const lite::Tensor *> outputs;
for (auto& out : fetch_list) {
outputs.push_back(&out);
}
return outputs;
}
#endif
const cpp::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
......
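For reference: on the FPGA path, feeds and fetches are stored in the exec scope as plain std::vector<lite::Tensor> variables named "feed" and "fetch", and GetInput grows the feed list on demand. A minimal standalone sketch of that resize-on-demand accessor pattern (the Tensor stand-in below is hypothetical, not the real lite::Tensor):

```cpp
#include <cstddef>
#include <vector>

struct Tensor {};  // stand-in for lite::Tensor, for illustration only

// Mirrors the FPGA GetInput above: index into a growable feed list,
// resizing so that `offset` always refers to a valid slot.
Tensor* GetInput(std::vector<Tensor>* feed_list, size_t offset) {
  if (offset >= feed_list->size()) {
    feed_list->resize(offset + 1);
  }
  return &feed_list->at(offset);
}
```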
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
#ifdef LITE_WITH_FPGA
TEST(ResNet50, test) {
lite::Predictor predictor;
std::vector<Place> valid_places({
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
// std::vector<Place> valid_places(
// {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}});
predictor.Build("",
FLAGS_model_dir + "/model",
FLAGS_model_dir + "/params",
valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < 2; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
}
#endif
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <fstream>
#include <iostream>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_file, "", "input_file");
namespace paddle {
namespace lite {
void read_from_file(const std::string& path, float* data, int num) {
std::ifstream file_stream;
file_stream.open(path);
if (!file_stream) {
exit(-1);
}
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
data[i] = value;
}
}
void chw_to_hwc(float* src, float* dst, int channel, int height, int width) {
int amount_per_row = width * channel;
int index = 0;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
int dst_index = offset_height + w * channel + c;
dst[dst_index] = src[index];
index = index + 1;
}
}
}
}
void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place,
bool use_npu = false) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
// predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places);
predictor.Build("", "attention/model", "attention/params", valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 1, 100, 200})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
read_from_file(FLAGS_input_file, data, 100 * 200);
//=============================================
auto* init_ids = predictor.GetInput(1);
init_ids->Resize(DDim(std::vector<DDim::value_type>({1, 1})));
auto* data_ids = init_ids->mutable_data<float>();
auto ids_size = init_ids->dims().production();
for (int i = 0; i < ids_size; i++) {
data_ids[i] = 0;
}
auto lod_ids = init_ids->mutable_lod();
std::vector<std::vector<uint64_t>> lod_i{{0, 1}, {0, 1}};
*lod_ids = lod_i;
//=============================================
auto* init_scores = predictor.GetInput(2);
init_scores->Resize(DDim(std::vector<DDim::value_type>({1, 1})));
auto* data_scores = init_scores->mutable_data<float>();
auto scores_size = init_scores->dims().production();
for (int i = 0; i < scores_size; i++) {
data_scores[i] = 0;
}
auto lod_scores = init_scores->mutable_lod();
std::vector<std::vector<uint64_t>> lod_s{{0, 1}, {0, 1}};
*lod_scores = lod_s;
//=============================================
auto* position_encoding = predictor.GetInput(3);
position_encoding->Resize(
DDim(std::vector<DDim::value_type>({1, 33, 10, 23})));
auto* position_encoding_data = position_encoding->mutable_data<float>();
float* temp_data = position_encoding_data;
for (int i = 0; i < position_encoding->dims().production(); ++i) {
temp_data[i] = 0;
}
int index = 0;
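// Channels 0-9 below are one-hot row-position planes (channel i marks
// row i); channels 10-32 are one-hot column-position planes (channel i
// marks column i), giving the 33 channels of the {1, 33, 10, 23} shape.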
for (int i = 0; i < 10; i++) {
for (int row = 0; row < 10; row++) {
for (int col = 0; col < 23; col++) {
if (i == row) {
temp_data[index] = 1.0f;
} else {
temp_data[index] = 0.0f;
}
index++;
}
}
}
for (int i = 0; i < 23; i++) {
for (int row = 0; row < 10; row++) {
for (int col = 0; col < 23; col++) {
if (i == col) {
temp_data[index] = 1.0f;
} else {
temp_data[index] = 0.0f;
}
index++;
}
}
}
// chw_to_hwc(temp_data, position_encoding_data, 33, 10, 23);
// delete[] temp_data;
// read_from_file("position_encoding.data", position_encoding_data, 33 * 10 *
// 23);
auto start = GetCurrentUS();
for (int i = 0; i < 2; ++i) {
predictor.Run();
}
std::cout << "================== Speed Report ===================";
std::cout << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
std::string file = "plate_data/" + FLAGS_input_file.substr(9);
std::cout << "file:::" << file << std::endl;
std::ofstream ofs;
ofs.open(file);
for (int i = 0; i < out->dims().production(); i++) {
float value = out->data<float>()[i];
ofs << value << std::endl;
}
ofs.close();
}
TEST(OcrAttention, test_arm) {
std::vector<Place> valid_places({
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
TestModel(valid_places, Place{TARGET(kARM), PRECISION(kFloat)});
}
} // namespace lite
} // namespace paddle
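A side note on the chw_to_hwc helper above: it interleaves the channel values of each pixel. A tiny self-contained check of that behavior (the helper is re-declared here so the sketch compiles on its own):

```cpp
#include <cassert>

// Same CHW -> HWC layout conversion as in the test above.
void chw_to_hwc(const float* src, float* dst, int channel, int height,
                int width) {
  int index = 0;
  for (int c = 0; c < channel; c++) {
    for (int h = 0; h < height; h++) {
      for (int w = 0; w < width; w++) {
        dst[(h * width + w) * channel + c] = src[index++];
      }
    }
  }
}

int main() {
  // CHW input, 2 channels x 1 row x 2 cols: channel 0 = {1, 2}, channel 1 = {3, 4}.
  float src[] = {1, 2, 3, 4};
  float dst[4] = {0};
  chw_to_hwc(src, dst, /*channel=*/2, /*height=*/1, /*width=*/2);
  // HWC output interleaves channels per pixel: {1, 3, 2, 4}.
  assert(dst[0] == 1 && dst[1] == 3 && dst[2] == 2 && dst[3] == 4);
  return 0;
}
```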
......@@ -31,11 +31,7 @@ TEST(ResNet50, test) {
std::vector<Place> valid_places(
{Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}});
predictor.Build(FLAGS_model_dir,
"",
"",
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
valid_places);
predictor.Build(FLAGS_model_dir, "", "", valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <dirent.h>
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <cerrno>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_file, "", "input_file");
namespace paddle {
namespace lite {
std::vector<std::string> GetDirectoryFiles(const std::string& dir) {
std::vector<std::string> files;
std::shared_ptr<DIR> directory_ptr(opendir(dir.c_str()),
[](DIR* dir) { dir && closedir(dir); });
struct dirent* dirent_ptr;
if (!directory_ptr) {
std::cout << "Error opening : " << std::strerror(errno) << dir << std::endl;
return files;
}
while ((dirent_ptr = readdir(directory_ptr.get())) != nullptr) {
files.push_back(std::string(dirent_ptr->d_name));
}
return files;
}
void readFromFile(int num, std::string path, float* data) {
std::ifstream file_stream(path);
// file_stream.open(path);
if (!file_stream.good()) {
std::cout << "file: " << path << " dones not exist!\n";
exit(-1);
}
// float* data = mutableData<float>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
data[i] = value;
}
file_stream.close();
}
// #ifdef LITE_WITH_FPGA
TEST(ResNet50, test) {
lite::Predictor predictor;
std::vector<Place> valid_places({
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
// predictor.Build(FLAGS_model_dir, "", "", valid_places);
predictor.Build("",
FLAGS_model_dir + "/model",
FLAGS_model_dir + "/params",
valid_places);
auto* input_tensor = predictor.GetInput(0);
int width = 300;
int height = 300;
// std::ifstream file_stream(FLAGS_input_file);
// if (!file_stream.good()) {
// std::cout << "file: " << FLAGS_input_file << " dones not exist!\n";
// exit(-1);
// return;
// }
// file_stream >> height;
// file_stream >> width;
input_tensor->Resize(
DDim(std::vector<DDim::value_type>({1, 3, height, width})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
// readFromFile(item_size, "car.data", data);
int num = 3 * width * height;
// for (int i = 0; i < num; ++i) {
// float value = 0;
// file_stream >> value;
// data[i] = value;
// }
// file_stream.close();
for (int i = 0; i < 2; ++i) {
predictor.Run();
}
auto* out = predictor.GetOutput(0);
for (int i = 0; i < out->dims().production(); i++) {
std::cout << ":" << out->data<float>()[i] << std::endl;
}
std::string file = "output/" + FLAGS_input_file.substr(6);
std::cout << "file:::" << file << std::endl;
std::ofstream ofs;
ofs.open(file);
for (int i = 0; i < out->dims().production(); i++) {
float value = out->data<float>()[i];
ofs << value << std::endl;
}
ofs.close();
LOG(INFO) << "================== Speed Report ===================";
}
// #endif
} // namespace lite
} // namespace paddle
......@@ -32,7 +32,8 @@ class Debugger {
}
void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
if (op_type != "conv") { // NOLINT
if (op_config[op_type]) {
tensor->saveToFile(op_type, true);
}
}
......@@ -40,8 +41,19 @@ class Debugger {
std::unordered_map<std::string, bool> op_config;
Debugger() {
op_config["concat"] = true;
op_config["pooling"] = true;
op_config["conv"] = true;
op_config["crop"] = true;
op_config["feed"] = true;
op_config["mul"] = true;
op_config["fetch"] = true;
op_config["boxes"] = true;
op_config["scores"] = true;
op_config["nms"] = true;
op_config["pb_boxes"] = true;
op_config["pb_variances"] = true;
// op_config["fc"] = true;
op_config["softmax"] = true;
}
};
......@@ -131,9 +143,7 @@ inline void save_tensor(const lite::Tensor* t,
chw_to_hwc(const_cast<lite::Tensor*>(t), dst);
data = dst;
}
save_float(data, name, t->numel());
delete[] dst;
}
} // namespace lite
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/fpga_cv.hpp"
using paddle::zynqmp::float16;
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height) {
paddle::zynqmp::InplaceArgs inplace_args = {0, 0, 0};
paddle::zynqmp::config_inplace(inplace_args);
paddle::zynqmp::ImageInputArgs input_args = {nullptr};
input_args.address = nullptr;
input_args.scale_address = nullptr;
float16* input_image_address =
reinterpret_cast<float16*>(paddle::zynqmp::fpga_malloc(
input_width * input_height * input_channel * sizeof(float16)));
int index = 0;
for (int i = 0; i < input_width * input_height * input_channel; i++) {
input_image_address[i] = float16(1.0 * input[i]);
}
paddle::zynqmp::ResizeArgs resize_args = {0};
resize_args.input_width = input_width;
resize_args.input_height = input_height;
resize_args.image_channel = input_channel;
resize_args.output_width = output_width;
resize_args.output_height = output_height;
float height_ratio = static_cast<float>(input_height) /
static_cast<float>(resize_args.output_height);
float width_ratio = static_cast<float>(input_width) /
static_cast<float>(resize_args.output_width);
resize_args.height_ratio = *reinterpret_cast<uint32_t*>(&height_ratio);
resize_args.width_ratio = *reinterpret_cast<uint32_t*>(&width_ratio);
int output_size =
resize_args.output_width * resize_args.output_height * input_channel;
float16* fpga_output = reinterpret_cast<float16*>(
paddle::zynqmp::fpga_malloc(output_size * sizeof(float16)));
resize_args.input_image_address = input_image_address;
resize_args.output_image_address = fpga_output;
memset(fpga_output, 0, output_size * sizeof(float16));
paddle::zynqmp::fpga_flush(
input_image_address,
input_width * input_height * input_channel * sizeof(float16));
paddle::zynqmp::fpga_flush(resize_args.output_image_address,
output_size * sizeof(float16));
int ret = paddle::zynqmp::compute_fpga_resize(resize_args);
if (ret == 0) {
paddle::zynqmp::fpga_invalidate(resize_args.output_image_address,
output_size * sizeof(float16));
}
for (int i = 0; i < output_size; i++) {
output[i] = fpga_output[i];
}
}
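A note on the height_ratio/width_ratio lines above: they push the raw IEEE-754 bit pattern of a float into a uint32_t register field via reinterpret_cast, which formally violates strict aliasing. A memcpy-based equivalent, shown as a sketch rather than part of this commit:

```cpp
#include <cstdint>
#include <cstring>

// Copy a float's raw IEEE-754 bit pattern into a uint32_t without
// violating strict aliasing; equivalent to the reinterpret_cast above.
inline uint32_t float_bits(float value) {
  static_assert(sizeof(uint32_t) == sizeof(float), "expect 32-bit float");
  uint32_t bits = 0;
  std::memcpy(&bits, &value, sizeof(bits));
  return bits;
}
```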
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdlib.h>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/pe.hpp"
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define PADDLE_LITE_ZU5
#define FPGA_PRINT_MODE
#define PADDLE_LITE_PROFILE
......@@ -31,7 +31,7 @@ void saveToFile(std::string name, void* data_in, int size) {
std::ofstream ofs;
ofs.open(name);
int8_t* data = static_cast<int8_t*> data_in;
int8_t* data = static_cast<int8_t*>(data_in);
for (int i = 0; i < size; i++) {
float value = data[i];
ofs << value << std::endl;
......@@ -221,6 +221,7 @@ int8_t* format_filter(float* data_in,
align_to_x(num_per_div_before_alignment, filter_num_alignment);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
// int num_after_alignment = num_per_div_after_alignment * div_num;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
......
File mode changed from 100644 to 100755
......@@ -62,6 +62,7 @@ void reset_device() {
// memory management;
void *fpga_malloc(size_t size) {
#ifdef ENABLE_DEBUG
std::cout << "fpga_malloc:" << size << std::endl;
#endif
#ifdef PADDLE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
......
......@@ -62,6 +62,7 @@ class ConvPE : public PE {
param_.filter->shape().height() == 1) { // NOLINT
}
if (!use_cpu_) { // NOLINT
// param_.filter->releaseData();
}
}
......@@ -92,6 +93,7 @@ class ConvPE : public PE {
int kernel_width = param_.filter->shape().width();
int kernel_step_h = param_.strides[0];
int kernel_step_w = param_.strides[1];
int pooled_height_ = output->shape().height();
int pooled_width_ = out_width;
int filter_chw = image_channels * kernel_height * kernel_width;
......
......@@ -266,8 +266,8 @@ inline void split_filter_num(const ConvParam& c_param) {
int filter_num_alignment = filter::get_filter_num_alignment();
int aligned_num =
align_to_x(num / param.groups, filter_num_alignment) * param.groups;
split_num = filter::calc_split_num(aligned_num, div_capacity);
split_num = filter::calc_split_num(aligned_num, div_capacity);
Shape& out_shape = out->shape();
for (int i = 0; i < split_num; i++) {
BasicConvParam* conv_param = new BasicConvParam();
......@@ -364,6 +364,7 @@ inline void split_filter_num(const ConvParam& c_param) {
args.image.height = input->shape().height();
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
args.dilation = param.dilations[0];
args.output.address = out_address;
......@@ -419,6 +420,7 @@ inline void split_channel(const ConvParam& c_param) {
}
scale.flush();
bias.flush();
// Shape sb_shape(N, {2 * channel});
format_scale_bias(&scale,
&bias,
&conv_param->filter,
......@@ -446,6 +448,7 @@ inline void split_channel(const ConvParam& c_param) {
args.image.height = conv_param->input.shape().height();
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
args.dilation = param.dilations[0];
args.output.address = conv_param->output.mutableData<void>();
args.output.scale_address = conv_param->output.scale();
......@@ -476,6 +479,7 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
}
size_t size = params.size();
if (ret == 0 && size > 1) {
// Tensor* output = conv_params.output;
Tensor& img = params[0]->output;
for (int i = 0; i < 1; i++) {
for (int i = 0; i < img.shape().numel(); i++) {
......
......@@ -62,6 +62,7 @@ class DepthwiseConvPE : public PE {
float16* scale_data = param_.scale()->data<float16>();
float16* filter_data = param.quantizedFilter()->mutableData<float16>(
FP16, param.filter->shape());
memcpy(filter_data,
scale_data,
param.filter->shape().numel() * sizeof(float16));
......
File mode changed from 100644 to 100755
......@@ -47,8 +47,10 @@ class GRUPE : public PE {
zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
float16* prev_hidden_data =
prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
// set previous hidden data to 0;
memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
// copy 2/3 weight from param.weight;
zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memset(weight_data, 0, weight_shape.numel() * sizeof(float));
......@@ -115,11 +117,9 @@ class GRUPE : public PE {
if (hidden_prev) {
// TODO(chonwhite): change to pre_out;
prev_hidden_.copyFrom(value.pre_output);
prev_hidden_.saveToFile("prev_.txt");
}
mul_pe_.dispatch();
reset_hidden_.saveToFile("reset_hidden_.txt");
// reset_hidden_.saveToFile("reset_hidden_.txt");
update_gate_data += stride_update;
reset_gate_data += stride_update;
......@@ -170,6 +170,7 @@ class GRUPE : public PE {
zynqmp::Tensor bias_;
zynqmp::Tensor weight_;
zynqmp::Tensor state_weight_;
zynqmp::Tensor update_gate_;
zynqmp::Tensor reset_gate_;
zynqmp::Tensor cell_state_;
......
......@@ -66,7 +66,7 @@ class PoolingPE : public PE {
param_.poolingArgs = args;
use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
(k_width > 7 || k_height > 7);
(k_width > 255 || k_height > 255);
use_cpu_ = param_.type == AVERAGE;
}
......@@ -76,6 +76,7 @@ class PoolingPE : public PE {
input->syncToCPU();
Tensor float_input;
// Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
......@@ -188,7 +189,9 @@ class PoolingPE : public PE {
bool dispatch() {
if (use_cpu_) {
// cpu_compute();
compute();
// exit(-1);
return true;
}
param_.input->syncToDevice();
......
......@@ -89,7 +89,6 @@ class ScalePE : public PE {
}
}
}
float* scale_data_float = param_.scale->data<float>();
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
......
......@@ -348,9 +348,19 @@ class Tensor {
if (placeHolder_ == nullptr) {
return;
}
std::cout << scale()[0] << " , " << scale()[1] << std::endl;
}
void printScale(std::string type) { printScale(); }
void printScale(std::string type) {
std::cout << type << " : "
<< std::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" +
std::to_string(shape_->width())
<< std::endl;
std::cout << type << " \n";
printScale();
}
std::string dimsFileName() {
return std::to_string(shape_->num()) + "_" +
......@@ -378,6 +388,7 @@ class Tensor {
static int counter = 0;
std::string npath = std::to_string(counter) + "_" + path;
counter++;
std::cout << "======== saving file:" << npath << " ============\n";
save_file_with_name(npath);
}
......
......@@ -165,6 +165,9 @@ class TensorLite {
TargetType target() const { return target_; }
// template <typename T>
// TensorLite Slice(int64_t begin, int64_t end) const;
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
......@@ -254,6 +257,7 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
int64_t base = numel() / dims_[0];
TensorLite dst;
dst.target_ = target_;
auto dst_dims = dims_;
dst_dims[0] = end - begin;
......
File mode changed from 100644 to 100755
......@@ -101,7 +101,6 @@ void TypeTargetTransformPass::AddIoCopyInst(
auto io_copy_output_name =
string_format("%s/target_trans", in->AsArg().name.c_str());
// string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id());
if (copied_nodes->count(in->AsArg().name)) {
// Remove the old link
RemoveDirectedLink(in, inst_node);
......
File mode changed from 100644 to 100755
......@@ -138,11 +138,16 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
void RuntimeProgram::Run() {
for (auto& inst : instructions_) {
std::string op_type = inst.op()->op_info()->Type();
#ifndef LITE_WITH_FPGA
if (op_type == "feed" || op_type == "fetch") continue;
#endif
inst.Run();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
LITE_PRECISION_PROFILE(inst)
#ifndef LITE_WITH_FPGA
// LITE_PRECISION_PROFILE(inst)
#endif
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
......
File mode changed from 100644 to 100755
......@@ -46,7 +46,7 @@ class Tensor {
*/
class PaddlePredictor {
public:
void Init();
void Init() {}
std::unique_ptr<Tensor> GetTensor(const std::string &id) const;
std::unique_ptr<Tensor> GetMutableTensor(const std::string &id);
......
......@@ -62,6 +62,10 @@ void CastCompute::Run() {
int32_t* out_data = param.Out->mutable_data<int32_t>();
std::transform(
x_data_begin, x_data_end, out_data, TransOp<int64_t, int32_t>);
} else if (param.in_dtype == 3 && param.out_dtype == 5) {
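// dtype codes here follow fluid's VarType enum (assumed from
// framework.proto: 3 = INT64, 5 = FP32); this branch simply
// bit-copies the payload as float.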
const auto* x_data = param.X->data<float>();
auto* o_data = param.Out->mutable_data<float>();
memcpy(o_data, x_data, sizeof(float) * param.X->numel());
} else {
LOG(FATAL) << "other has not been implemented";
}
......
......@@ -60,25 +60,10 @@ class FillConstantCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>();
if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
auto data = param.Out->template mutable_data<float>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.Out->template mutable_data<int32_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.Out->template mutable_data<int8_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
// auto data = param.Out->template mutable_data<T>();
auto data = param.Out->template mutable_data<float>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
}
......@@ -94,32 +79,38 @@ class FillConstantBatchLikeCompute
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>();
if (param.input->lod().size() && param.input_dim_idx == 0) {
auto odims = param.out->dims();
odims[param.output_dim_idx] = param.input->lod().back().size() - 1;
param.out->Resize(odims);
// auto data = param.out->template mutable_data<T>();
auto data = param.out->template mutable_data<float>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
auto data = param.out->template mutable_data<float>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.out->template mutable_data<int32_t>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.out->template mutable_data<int8_t>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
}
// if (param.input->lod().size() && param.input_dim_idx == 0) {
// auto odims = param.out->dims();
// odims[param.output_dim_idx] = param.input->lod().back().size() - 1;
// param.out->Resize(odims);
// }
// if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
// auto data = param.out->template mutable_data<float>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else if (param.dtype ==
// static_cast<int32_t>(lite::core::FluidType::INT32)) {
// auto data = param.out->template mutable_data<int32_t>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else if (param.dtype ==
// static_cast<int32_t>(lite::core::FluidType::INT8)) {
// auto data = param.out->template mutable_data<int8_t>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else {
// LOG(FATAL) << "not supported dtype " << param.dtype;
// }
}
virtual ~FillConstantBatchLikeCompute() = default;
......@@ -144,6 +135,7 @@ REGISTER_LITE_KERNEL(fill_constant,
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(fill_constant_batch_size_like,
kARM,
kAny,
......@@ -153,3 +145,4 @@ REGISTER_LITE_KERNEL(fill_constant_batch_size_like,
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();
......@@ -59,6 +59,8 @@ namespace arm {
template <>
void NCHWToNHWCCompute<PRECISION(kFloat)>::Run() {
NCHWTONHWC(float);
// auto& param = this->template Param<param_t>();
// param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
template <>
......@@ -69,6 +71,9 @@ void NCHWToNHWCCompute<PRECISION(kInt8)>::Run() {
template <>
void NHWCToNCHWCompute<PRECISION(kFloat)>::Run() {
NHWCTONCHW(float);
// auto& param = this->template Param<param_t>();
// param.y->mutable_data<float>();
// param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
template <>
......
......@@ -28,6 +28,7 @@ namespace arm {
void LookupTableCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
// inputs
auto w = param.W;
auto ids = param.Ids;
......@@ -36,7 +37,7 @@ void LookupTableCompute::Run() {
auto table_dim = w->dims();
int64_t ids_numel = ids->numel();
auto ids_data = ids->data<int64_t>();
auto ids_data = ids->data<float>();
int64_t row_number = table_dim[0];
int64_t row_width = table_dim[1];
......@@ -75,14 +76,3 @@ REGISTER_LITE_KERNEL(lookup_table,
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(lookup_table_v2,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
......@@ -7,7 +7,9 @@ set(fpga_deps fpga_target_wrapper kernel_fpga)
# add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
# add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
# add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
......@@ -16,9 +18,11 @@ add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS
add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
# add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps})
add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps})
add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps})
# add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps})
add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps})
......
File mode changed from 100644 to 100755
......@@ -67,3 +67,13 @@ REGISTER_LITE_KERNEL(
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(feed,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::FeedCompute,
def_host)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
......@@ -43,8 +43,14 @@ void FetchCompute::PrepareForRun() {
}
void FetchCompute::Run() {
pe_.dispatch();
auto& param = this->Param<param_t>();
auto fetch_list = param.fetch_list;
if (fetch_list->size() <= static_cast<size_t>(param.col)) {
fetch_list->resize(param.col + 1);
}
Tensor& out = param.fetch_list->at(param.col);
out.Resize(param.input->dims());
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::OutputParam& fetch_param = pe_.param();
......@@ -67,10 +73,7 @@ REGISTER_LITE_KERNEL(fetch,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
REGISTER_LITE_KERNEL(fetch,
......@@ -79,12 +82,6 @@ REGISTER_LITE_KERNEL(fetch,
kNHWC,
paddle::lite::kernels::fpga::FetchCompute,
host_host)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <unistd.h>
#include <iostream>
#include <string>
#include <vector>
......@@ -83,6 +84,7 @@ void GRUCompute::PrepareForRun() {
void GRUCompute::Run() {
auto& param = this->Param<param_t>();
param.hidden->mutable_data<float>();
// inputs
auto input = param.input;
auto h0 = param.h0;
......@@ -130,6 +132,7 @@ void GRUCompute::Run() {
// //3.
gru_value.prev_out_value = ordered_h0.mutable_data<float>();
gru_tensors.pre_output = ordered_h0.ZynqTensor();
} else {
gru_value.prev_out_value = nullptr;
gru_tensors.pre_output = nullptr;
......@@ -169,6 +172,7 @@ void GRUCompute::Run() {
float* hidden_data =
hidden_out.mutableData<float>(zynqmp::FP32, float_input_shape);
gru_tensors.gate = &float_input;
gru_tensors.output = &hidden_out;
......@@ -187,11 +191,6 @@ void GRUCompute::Run() {
*(batch_hidden->mutable_lod()) = batch_gate->lod();
batch_hidden->mutable_data<float>();
to_seq(*batch_hidden, hidden);
save_tensor(const_cast<Tensor*>(input), "_input.txt");
save_tensor(hidden, "_gru.txt");
exit(-1);
}
} // namespace fpga
......
File mode changed from 100644 to 100755
......@@ -119,7 +119,81 @@ class IoCopyFpgaToHostCompute
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
}
std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
void hwc_to_chw(float* chw_data,
float* hwc_data,
int num,
int channel,
int height,
int width) {
int chw = channel * height * width;
int wc = width * channel;
int wh = width * height;
int index = 0;
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
for (int c = 0; c < channel; c++) {
chw_data[n * chw + c * wh + h * width + w] = hwc_data[index];
index++;
}
}
}
}
}
class IoCopyFpgaToHostCHWCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override {
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kFPGA));
Tensor hwc;
hwc.Resize(param.y->dims());
float* hwc_data = hwc.mutable_data<float>();
float* chw_data = param.y->mutable_data<float>();
param.y->ZynqTensor()->setDataType(zynqmp::FP32);
param.x->ZynqTensor()->syncToDevice();
if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) {
zynqmp::Tensor tempTensor;
tempTensor.mutableData<float16>(zynqmp::FP16,
param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor());
tempTensor.setAligned(true);
tempTensor.unalignImage();
hwc.ZynqTensor()->copyFrom(&tempTensor);
} else {
hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
int num = 1;
int channel = 1;
int height = 1;
int width = 1;
auto dims = param.y->ZynqTensor()->shape();
hwc_to_chw(chw_data,
hwc_data,
dims.num(),
dims.channel(),
dims.height(),
dims.width());
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
param.y->ZynqTensor()->flush();
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
// param.x->ZynqTensor()->saveToFile("io_x", true);
// param.y->ZynqTensor()->saveToFile("io_y", true);
}
std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
......@@ -170,7 +244,7 @@ REGISTER_LITE_KERNEL(io_copy,
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kARM),
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.Finalize();
......@@ -179,8 +253,8 @@ REGISTER_LITE_KERNEL(io_copy,
kFPGA,
kAny,
kAny,
paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute,
device_to_host_22)
paddle::lite::kernels::fpga::IoCopyFpgaToHostCHWCompute,
device_to_host_chw)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......
......@@ -384,6 +384,7 @@ void MulticlassNmsCompute::Run() {
scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
outs->ZynqTensor()->copyFrom(out.ZynqTensor());
}
outs->Resize({static_cast<int64_t>(e - s), out_dim});
}
}
LoD lod;
......
......@@ -78,6 +78,7 @@ void PriorBoxCompute::PrepareForRun() {
param.boxes->mutable_data<float>();
param.variances->mutable_data<float>();
zynqmp::PriorBoxParam& priobox_param = pe_.param();
priobox_param.input = param.input->ZynqTensor();
priobox_param.image = param.image->ZynqTensor();
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_
add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(one_hot_compute_host Host basic SRCS one_hot_compute.cc DEPS ${lite_kernel_deps})
#lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
#lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any)
......@@ -392,7 +392,13 @@ REGISTER_LITE_KERNEL(multiclass_nms,
kNCHW,
paddle::lite::kernels::host::MulticlassNmsCompute,
def)
.BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("BBoxes",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindInput("Scores",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <map>
#include <utility>
#include <vector>
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/kernels/host/one_hot_compute.h"
#include "lite/utils/paddle_enforce.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
void OneHotCompute::Run() {
auto& param = Param<operators::OneHotParam>();
param.Out->mutable_data<float>();
int depth = param.depth;
if (param.depth_tensor) {
auto* depth_tensor = param.depth_tensor;
auto* depth_data = depth_tensor->data<int32_t>();
depth = depth_data[0];
auto in_dims = param.X->dims();
DDim out_dims(in_dims);
out_dims[out_dims.size() - 1] = depth;
param.Out->Resize(out_dims);
}
auto* p_in_data = param.X->data<float>();
auto numel = param.X->numel();
auto* p_out_data = param.Out->mutable_data<float>();
for (int i = 0; i < param.Out->numel(); ++i) {
p_out_data[i] = 0;
}
if (param.allow_out_of_range) {
for (int i = 0; i < numel; ++i) {
if (p_in_data[i] >= 0 && p_in_data[i] < depth) {
*(p_out_data + i * depth + (int)(p_in_data[i])) = 1.0; // NOLINT
}
}
} else {
for (int i = 0; i < numel; ++i) {
PADDLE_ENFORCE_GE(
p_in_data[i], 0, "Illegal index value, should be at least 0.");
PADDLE_ENFORCE_LT(p_in_data[i],
depth,
"Illegal index value, should be less than depth (%d).",
depth);
*(p_out_data + i * depth + (int)(p_in_data[i])) = 1.0; // NOLINT
}
}
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(one_hot,
kHost,
kFloat,
kNCHW,
paddle::lite::kernels::host::OneHotCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
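For context, a minimal standalone sketch of the expansion OneHotCompute::Run performs (float-typed indices in; one zero-filled, depth-wide row per index with a single 1.0 out):

```cpp
#include <vector>

// One-hot expansion as in OneHotCompute::Run above: input index i
// selects column int(i) of a depth-wide, zero-initialized output row.
std::vector<float> one_hot(const std::vector<float>& indices, int depth) {
  std::vector<float> out(indices.size() * depth, 0.0f);
  for (size_t i = 0; i < indices.size(); ++i) {
    int idx = static_cast<int>(indices[i]);
    if (idx >= 0 && idx < depth) out[i * depth + idx] = 1.0f;  // out-of-range indices are skipped
  }
  return out;
}
```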
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
class OneHotCompute
: public KernelLite<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override;
virtual ~OneHotCompute() = default;
};
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -46,19 +46,43 @@ REGISTER_LITE_KERNEL(reshape,
paddle::lite::kernels::host::ReshapeCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindInput("ShapeTensor",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindInput("Shape",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
// REGISTER_LITE_KERNEL(reshape,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::host::ReshapeCompute,
// def)
// .BindInput("X",
// {LiteType::GetTensorTy(
// TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
// .BindInput("ShapeTensor",
// {LiteType::GetTensorTy(
// TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
// .BindInput("Shape",
// {LiteType::GetTensorTy(
// TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(
// TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))})
// .Finalize();
REGISTER_LITE_KERNEL(reshape2,
kHost,
kAny,
......
......@@ -136,6 +136,8 @@ add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS})
add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS})
add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS})
add_operator(one_hot basic SRCS one_hot_op.cc DEPS ${op_DEPS})
if (NOT LITE_WITH_X86)
lite_cc_test(test_fc_op SRCS fc_op_test.cc
DEPS fc_op memory
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/one_hot_op.h"
#include "lite/core/op_registry.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
namespace operators {
bool OneHotOp::CheckShape() const {
CHECK_OR_FALSE(param_.X);
CHECK_OR_FALSE(param_.Out);
return true;
}
bool OneHotOp::InferShape() const {
CHECK_OR_FALSE(param_.Out);
// TODO(Superjomn) Enable data sharing.
auto out_dims = param_.X->dims();
out_dims[out_dims.size() - 1] = param_.depth;
param_.Out->Resize(out_dims);
return true;
}
bool OneHotOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
param_.X =
scope->FindVar(opdesc.Input("X").front())->GetMutable<lite::Tensor>();
param_.Out =
scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
if (opdesc.HasInput("depth_tensor")) {
auto depth_tensor = opdesc.Input("depth_tensor").front();
param_.depth_tensor =
scope->FindVar(depth_tensor)->GetMutable<lite::Tensor>();
}
CHECK(param_.X);
CHECK(param_.Out);
param_.depth = opdesc.GetAttr<int>("depth");
param_.dtype = opdesc.GetAttr<int>("dtype");
if (opdesc.HasAttr("allow_out_of_range")) {
param_.allow_out_of_range = opdesc.GetAttr<bool>("allow_out_of_range");
}
auto out_lod = param_.Out->mutable_lod();
*out_lod = param_.X->lod();
// param_.allow_out_of_range = opdesc.GetAttr<bool>("allow_out_of_range");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(one_hot, paddle::lite::operators::OneHotOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class OneHotOp : public OpLite {
public:
OneHotOp() {}
explicit OneHotOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "one_hot"; }
private:
mutable OneHotParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -1130,7 +1130,15 @@ struct GridSamplerParam {
lite::Tensor* out{};
lite::Tensor* grid{};
};
} // namespace operators
} // namespace lite
} // namespace paddle
/// --------------------- attention operators --------------
struct OneHotParam {
lite::Tensor* X{};
lite::Tensor* depth_tensor{nullptr};
lite::Tensor* Out{};
int depth{-1};
int dtype{};
bool allow_out_of_range{false};
};
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -13,7 +13,7 @@ readonly NUM_PROC=${LITE_BUILD_THREADS:-4}
# global variables
BUILD_EXTRA=OFF
BUILD_EXTRA=ON
BUILD_JAVA=ON
BUILD_PYTHON=OFF
BUILD_DIR=$(pwd)
......
......@@ -2,12 +2,16 @@
build_dir=build_fpga
mkdir -p ${build_dir}
cd ${build_dir}
GEN_CODE_PATH_PREFIX=lite/gen_code
mkdir -p ./${GEN_CODE_PATH_PREFIX}
touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
root_dir=$(pwd)
build_dir=${build_dir}
# in build directory
# 1. Prepare gen_code file
GEN_CODE_PATH_PREFIX=${build_dir}/lite/gen_code
mkdir -p ${GEN_CODE_PATH_PREFIX}
touch ${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
cd ${build_dir}
cmake .. \
-DWITH_GPU=OFF \
-DWITH_MKL=OFF \
......@@ -19,8 +23,9 @@ cmake .. \
-DLITE_WITH_OPENMP=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-DWITH_TESTING=OFF \
-DARM_TARGET_OS=armlinux
make -j8
-DARM_TARGET_OS=armlinux \
-DLITE_BUILD_EXTRA=ON \
-DLITE_WITH_PROFILE=OFF
make -j42
cd -