Commit c6d82e0e authored by chonwhite

merge attention_diff into fpga_pr

...@@ -104,3 +104,10 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
# generated files
lite/api/paddle_use_kernels.h
lite/api/paddle_use_ops.h
lite/backends/arm/math/dotprod/gemm_sdot.h
lite/tools/cmake_tools/ast.pyc
...@@ -22,6 +22,8 @@ if (WITH_PADDLE_MOBILE)
return()
endif(WITH_PADDLE_MOBILE)
# set(CMAKE_BUILD_TYPE DEBUG)
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(CMAKE_CXX_STANDARD 11)
......
./lite/tools/build.sh \
--arm_os=armlinux \
--arm_abi=armv8 \
--arm_lang=gcc \
test
...@@ -198,14 +198,24 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_ssd_fpga SRCS test_ssd_fpga.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_inceptionv3_fpga SRCS inceptionv3_test_fpga.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_inceptionv4 SRCS inceptionv4_test.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
lite_cc_test(test_ocr_attention_fpga SRCS ocr_attention_test_fpga.cc
DEPS ${lite_model_test_DEPS})
# lite_cc_test(model_run_test_image SRCS model_run_test_image.cc
# DEPS ${lite_model_test_DEPS}
......
...@@ -121,6 +121,7 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
<< kpf_path;
}
#ifndef LITE_WITH_FPGA
lite::Tensor *Predictor::GetInput(size_t offset) {
CHECK(input_names_.size() > offset)
<< "The network has " << input_names_.size() << " inputs"
...@@ -130,6 +131,17 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
<< " in exec_scope";
return in_var->GetMutable<lite::Tensor>();
}
#else
lite::Tensor *Predictor::GetInput(size_t offset) {
auto *_feed_list = exec_scope_->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto *feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
#endif
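A minimal usage sketch of the FPGA-specific GetInput above (illustrative only, not part of the commit; it assumes a predictor already built with FPGA-valid places): because the feed list grows on demand, any input slot can be requested up front without tripping the size CHECK used in the generic path.
lite::Predictor predictor;
predictor.Build("", "attention/model", "attention/params", valid_places);
lite::Tensor* in = predictor.GetInput(3);  // feed list silently grows to 4 entries
in->Resize(DDim(std::vector<DDim::value_type>({1, 1})));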
// get inputs names
std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
...@@ -167,6 +179,8 @@ void Predictor::PrepareFeedFetch() {
}
}
#ifndef LITE_WITH_FPGA
const lite::Tensor *Predictor::GetOutput(size_t offset) const {
CHECK(output_names_.size() > offset)
<< "The network has " << output_names_.size() << " outputs"
...@@ -186,6 +200,29 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
}
return outputs;
}
#else
const lite::Tensor *Predictor::GetOutput(size_t offset) const {
auto *_fetch_list = exec_scope_->FindVar("fetch");
CHECK(_fetch_list) << "no fatch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
auto *_fetch_list = exec_scope_->FindVar("fetch");
CHECK(_fetch_list) << "no fatch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
std::vector<const lite::Tensor *> outputs;
for (auto& out : fetch_list) {  // take a reference: &out on a by-value copy would dangle
outputs.push_back(&out);
}
return outputs;
}
#endif
const cpp::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
#ifdef LITE_WITH_FPGA
TEST(ResNet50, test) {
lite::Predictor predictor;
std::vector<Place> valid_places({
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
// std::vector<Place> valid_places(
// {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}});
predictor.Build("",
FLAGS_model_dir + "/model",
FLAGS_model_dir + "/params",
valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < 2; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
}
#endif
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <fstream>  // std::ifstream / std::ofstream used below
#include <iostream>
#include <string>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_file, "", "input_file");
namespace paddle {
namespace lite {
void read_from_file(const std::string& path, float* data, int num) {
std::ifstream file_stream;
file_stream.open(path);
if (!file_stream) {
exit(-1);
return;
}
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
data[i] = value;
}
}
void chw_to_hwc(float* src, float* dst, int channel, int height, int width) {
int amount_per_row = width * channel;
int index = 0;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
int dst_index = offset_height + w * channel + c;
dst[dst_index] = src[index];
index = index + 1;
}
}
}
}
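A quick illustration of the index math above (a hypothetical standalone check, not taken from the test): with channel=2, height=1, width=2, CHW data {1, 2, 3, 4} lands in HWC order as {1, 3, 2, 4}.
float src[4] = {1, 2, 3, 4};  // CHW: both width positions of channel 0, then channel 1
float dst[4] = {0};
chw_to_hwc(src, dst, /*channel=*/2, /*height=*/1, /*width=*/2);
// dst == {1, 3, 2, 4}: each (h, w) cell now holds its two channel values contiguously.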
void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place,
bool use_npu = false) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
// predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places);
predictor.Build("", "attention/model", "attention/params", valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 1, 100, 200})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
read_from_file(FLAGS_input_file, data, 100 * 200);
//=============================================
auto* init_ids = predictor.GetInput(1);
init_ids->Resize(DDim(std::vector<DDim::value_type>({1, 1})));
auto* data_ids = init_ids->mutable_data<float>();
auto ids_size = init_ids->dims().production();
for (int i = 0; i < ids_size; i++) {
data_ids[i] = 0;
}
auto lod_ids = init_ids->mutable_lod();
std::vector<std::vector<uint64_t>> lod_i{{0, 1}, {0, 1}};
*lod_ids = lod_i;
//=============================================
auto* init_scores = predictor.GetInput(2);
init_scores->Resize(DDim(std::vector<DDim::value_type>({1, 1})));
auto* data_scores = init_scores->mutable_data<float>();
auto scores_size = init_scores->dims().production();  // was input_tensor's dims, which would overflow the 1x1 scores buffer
for (int i = 0; i < scores_size; i++) {
data_scores[i] = 0;
}
auto lod_scores = init_scores->mutable_lod();
std::vector<std::vector<uint64_t>> lod_s{{0, 1}, {0, 1}};
*lod_scores = lod_s;
//=============================================
auto* position_encoding = predictor.GetInput(3);
position_encoding->Resize(
DDim(std::vector<DDim::value_type>({1, 33, 10, 23})));
auto* position_encoding_data = position_encoding->mutable_data<float>();
float* temp_data = position_encoding_data;
for (int i = 0; i < position_encoding->dims().production(); ++i) {
temp_data[i] = 0;
}
int index = 0;
for (int i = 0; i < 10; i++) {
for (int row = 0; row < 10; row++) {
for (int col = 0; col < 23; col++) {
if (i == row) {
temp_data[index] = 1.0f;
} else {
temp_data[index] = 0.0f;
}
index++;
}
}
}
for (int i = 0; i < 23; i++) {
for (int row = 0; row < 10; row++) {
for (int col = 0; col < 23; col++) {
if (i == col) {
temp_data[index] = 1.0f;
} else {
temp_data[index] = 0.0f;
}
index++;
}
}
}
// chw_to_hwc(temp_data, position_encoding_data, 33, 10, 23);
// delete[] temp_data;
// read_from_file("position_encoding.data", position_encoding_data, 33 * 10 *
// 23);
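// Note on the two fill loops above (an informal reading, not part of the
// diff): the first loop writes channels 0..9 as a one-hot of the row index,
// the second writes channels 10..32 as a one-hot of the column index, so
// every (row, col) cell of the 10x23 grid receives a 33-dim position code.
// Entry count check: 10*10*23 + 23*10*23 == 33*10*23 == 7590 == index.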
auto start = GetCurrentUS();
for (int i = 0; i < 2; ++i) {
predictor.Run();
}
std::cout << "================== Speed Report ===================";
std::cout << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
std::string file = "plate_data/" + FLAGS_input_file.substr(9);
std::cout << "file:::" << file << std::endl;
std::ofstream ofs;
ofs.open(file);
for (int i = 0; i < out->dims().production(); i++) {
float value = out->data<float>()[i];
ofs << value << std::endl;
}
ofs.close();
}
TEST(OcrAttention, test_arm) {
std::vector<Place> valid_places({
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
TestModel(valid_places, Place{TARGET(kARM), PRECISION(kFloat)});
}
} // namespace lite
} // namespace paddle
...@@ -31,11 +31,7 @@ TEST(ResNet50, test) {
std::vector<Place> valid_places(
{Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}});
predictor.Build(FLAGS_model_dir,
"",
"",
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
valid_places);
predictor.Build(FLAGS_model_dir, "", "", valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <dirent.h>
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <cerrno>   // errno in GetDirectoryFiles
#include <cstring>  // std::strerror
#include <fstream>
#include <iostream>
#include <memory>   // std::shared_ptr
#include <string>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_file, "", "input_file");
namespace paddle {
namespace lite {
std::vector<std::string> GetDirectoryFiles(const std::string& dir) {
std::vector<std::string> files;
std::shared_ptr<DIR> directory_ptr(opendir(dir.c_str()),
[](DIR* dir) { dir&& closedir(dir); });
struct dirent* dirent_ptr;
if (!directory_ptr) {
std::cout << "Error opening : " << std::strerror(errno) << dir << std::endl;
return files;
}
while ((dirent_ptr = readdir(directory_ptr.get())) != nullptr) {
files.push_back(std::string(dirent_ptr->d_name));
}
return files;
}
void readFromFile(int num, std::string path, float* data) {
std::ifstream file_stream(path);
// file_stream.open(path);
if (!file_stream.good()) {
std::cout << "file: " << path << " dones not exist!\n";
exit(-1);
return;
}
// float* data = mutableData<float>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
data[i] = value;
}
file_stream.close();
}
// #ifdef LITE_WITH_FPGA
TEST(ResNet50, test) {
lite::Predictor predictor;
std::vector<Place> valid_places({
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
// predictor.Build(FLAGS_model_dir, "", "", valid_places);
predictor.Build("",
FLAGS_model_dir + "/model",
FLAGS_model_dir + "/params",
valid_places);
auto* input_tensor = predictor.GetInput(0);
int width = 300;
int height = 300;
// std::ifstream file_stream(FLAGS_input_file);
// if (!file_stream.good()) {
// std::cout << "file: " << FLAGS_input_file << " dones not exist!\n";
// exit(-1);
// return;
// }
// file_stream >> height;
// file_stream >> width;
input_tensor->Resize(
DDim(std::vector<DDim::value_type>({1, 3, height, width})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
// readFromFile(item_size, "car.data", data);
int num = 3 * width * height;
// for (int i = 0; i < num; ++i) {
// float value = 0;
// file_stream >> value;
// data[i] = value;
// }
// file_stream.close();
for (int i = 0; i < 2; ++i) {
predictor.Run();
}
auto* out = predictor.GetOutput(0);
for (int i = 0; i < out->dims().production(); i++) {
std::cout << ":" << out->data<float>()[i] << std::endl;
}
std::string file = "output/" + FLAGS_input_file.substr(6);
std::cout << "file:::" << file << std::endl;
std::ofstream ofs;
ofs.open(file);
for (int i = 0; i < out->dims().production(); i++) {
float value = out->data<float>()[i];
ofs << value << std::endl;
}
ofs.close();
LOG(INFO) << "================== Speed Report ===================";
}
// #endif
} // namespace lite
} // namespace paddle
...@@ -32,7 +32,8 @@ class Debugger {
}
void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
if (op_type != "conv") { // NOLINT
if (op_config[op_type]) {
tensor->saveToFile(op_type, true);
}
}
...@@ -40,8 +41,19 @@ class Debugger {
std::unordered_map<std::string, bool> op_config;
Debugger() {
op_config["concat"] = true;
op_config["pooling"] = true;
op_config["conv"] = true; op_config["conv"] = true;
op_config["crop"] = true; op_config["crop"] = true;
op_config["feed"] = true;
op_config["mul"] = true;
op_config["fetch"] = true;
op_config["boxes"] = true;
op_config["scores"] = true;
op_config["nms"] = true;
op_config["pb_boxes"] = true;
op_config["pb_variances"] = true;
// op_config["fc"] = true;
op_config["softmax"] = true;
}
};
...@@ -131,9 +143,7 @@ inline void save_tensor(const lite::Tensor* t,
chw_to_hwc(const_cast<lite::Tensor*>(t), dst);
data = dst;
}
save_float(data, name, t->numel());
delete[] dst;
}
} // namespace lite
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/fpga_cv.hpp"
using paddle::zynqmp::float16;
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height) {
paddle::zynqmp::InplaceArgs inplace_args = {0, 0, 0};
paddle::zynqmp::config_inplace(inplace_args);
paddle::zynqmp::ImageInputArgs input_args = {nullptr};
input_args.address = nullptr;
input_args.scale_address = nullptr;
float16* input_image_address =
reinterpret_cast<float16*>(paddle::zynqmp::fpga_malloc(
input_width * input_height * input_channel * sizeof(float16)));
int index = 0;
for (int i = 0; i < input_width * input_height * input_channel; i++) {
input_image_address[i] = float16(1.0 * input[i]);
}
paddle::zynqmp::ResizeArgs resize_args = {0};
resize_args.input_width = input_width;
resize_args.input_height = input_height;
resize_args.image_channel = input_channel;
resize_args.output_width = output_width;
resize_args.output_height = output_height;
float height_ratio = static_cast<float>(input_height) /
static_cast<float>(resize_args.output_height);
float width_ratio = static_cast<float>(input_width) /
static_cast<float>(resize_args.output_width);
resize_args.height_ratio = *reinterpret_cast<uint32_t*>(&height_ratio);
resize_args.width_ratio = *reinterpret_cast<uint32_t*>(&width_ratio);
int output_size =
resize_args.output_width * resize_args.output_height * input_channel;
float16* fpga_output = reinterpret_cast<float16*>(
paddle::zynqmp::fpga_malloc(output_size * sizeof(float16)));
resize_args.input_image_address = input_image_address;
resize_args.output_image_address = fpga_output;
memset(fpga_output, 0, output_size * sizeof(float16));
paddle::zynqmp::fpga_flush(
input_image_address,
input_width * input_height * input_channel * sizeof(float16));
paddle::zynqmp::fpga_flush(resize_args.output_image_address,
output_size * sizeof(float16));
int ret = paddle::zynqmp::compute_fpga_resize(resize_args);
if (ret == 0) {
paddle::zynqmp::fpga_invalidate(resize_args.output_image_address,
output_size * sizeof(float16));
}
for (int i = 0; i < output_size; i++) {
output[i] = fpga_output[i];
}
}
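A hedged usage sketch for fpga_resize (illustrative; the 300x300-to-224x224 sizes and buffer handling are assumptions, not taken from this commit):
#include <vector>
std::vector<float> src(300 * 300 * 3, 0.5f);  // HWC float input image
std::vector<uint8_t> dst(224 * 224 * 3);      // resized 8-bit output
fpga_resize(src.data(), 300, 300, 3, dst.data(), 224, 224);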
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdlib.h>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/pe.hpp"
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define PADDLE_LITE_ZU5
#define FPGA_PRINT_MODE
#define PADDLE_LITE_PROFILE
...@@ -31,7 +31,7 @@ void saveToFile(std::string name, void* data_in, int size) {
std::ofstream ofs;
ofs.open(name);
int8_t* data = static_cast<int8_t*> data_in;
int8_t* data = static_cast<int8_t*>(data_in);
for (int i = 0; i < size; i++) {
float value = data[i];
ofs << value << std::endl;
...@@ -221,6 +221,7 @@ int8_t* format_filter(float* data_in,
align_to_x(num_per_div_before_alignment, filter_num_alignment);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
// int num_after_alignment = num_per_div_after_alignment * div_num;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
......
File mode changed from 100644 to 100755
...@@ -62,6 +62,7 @@ void reset_device() {
// memory management;
void *fpga_malloc(size_t size) {
#ifdef ENABLE_DEBUG
std::cout << "fpga_malloc:" << size << std::endl;
#endif
#ifdef PADDLE_OS_LINUX
void *ptr = reinterpret_cast<void *>(
......
...@@ -62,6 +62,7 @@ class ConvPE : public PE {
param_.filter->shape().height() == 1) { // NOLINT
}
if (!use_cpu_) { // NOLINT
// param_.filter->releaseData();
}
}
...@@ -92,6 +93,7 @@ class ConvPE : public PE {
int kernel_width = param_.filter->shape().width();
int kernel_step_h = param_.strides[0];
int kernel_step_w = param_.strides[1];
int pooled_height_ = output->shape().height();
int pooled_width_ = out_width;
int filter_chw = image_channels * kernel_height * kernel_width;
......
...@@ -266,8 +266,8 @@ inline void split_filter_num(const ConvParam& c_param) {
int filter_num_alignment = filter::get_filter_num_alignment();
int aligned_num =
align_to_x(num / param.groups, filter_num_alignment) * param.groups;
split_num = filter::calc_split_num(aligned_num, div_capacity);
Shape& out_shape = out->shape();
for (int i = 0; i < split_num; i++) {
BasicConvParam* conv_param = new BasicConvParam();
...@@ -364,6 +364,7 @@ inline void split_filter_num(const ConvParam& c_param) {
args.image.height = input->shape().height();
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
args.dilation = param.dilations[0];
args.output.address = out_address;
...@@ -419,6 +420,7 @@ inline void split_channel(const ConvParam& c_param) {
}
scale.flush();
bias.flush();
// Shape sb_shape(N, {2 * channel});
format_scale_bias(&scale,
&bias,
&conv_param->filter,
...@@ -446,6 +448,7 @@ inline void split_channel(const ConvParam& c_param) {
args.image.height = conv_param->input.shape().height();
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
args.dilation = param.dilations[0];
args.output.address = conv_param->output.mutableData<void>();
args.output.scale_address = conv_param->output.scale();
...@@ -476,6 +479,7 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
}
size_t size = params.size();
if (ret == 0 && size > 1) {
// Tensor* output = conv_params.output;
Tensor& img = params[0]->output;
for (int i = 0; i < 1; i++) {
for (int i = 0; i < img.shape().numel(); i++) {
......
...@@ -62,6 +62,7 @@ class DepthwiseConvPE : public PE {
float16* scale_data = param_.scale()->data<float16>();
float16* filter_data = param.quantizedFilter()->mutableData<float16>(
FP16, param.filter->shape());
memcpy(filter_data,
scale_data,
param.filter->shape().numel() * sizeof(float16));
......
File mode changed from 100644 to 100755
...@@ -47,8 +47,10 @@ class GRUPE : public PE {
zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
float16* prev_hidden_data =
prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
// set previous hidden data to 0;
memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
// copy 2/3 weight from param.weight;
zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memset(weight_data, 0, weight_shape.numel() * sizeof(float));
...@@ -115,11 +117,9 @@ class GRUPE : public PE {
if (hidden_prev) {
// TODO(chonwhite): change to pre_out;
prev_hidden_.copyFrom(value.pre_output);
prev_hidden_.saveToFile("prev_.txt");
}
mul_pe_.dispatch();
reset_hidden_.saveToFile("reset_hidden_.txt");
// reset_hidden_.saveToFile("reset_hidden_.txt");
update_gate_data += stride_update;
reset_gate_data += stride_update;
...@@ -170,6 +170,7 @@ class GRUPE : public PE {
zynqmp::Tensor bias_;
zynqmp::Tensor weight_;
zynqmp::Tensor state_weight_;
zynqmp::Tensor update_gate_;
zynqmp::Tensor reset_gate_;
zynqmp::Tensor cell_state_;
......
...@@ -66,7 +66,7 @@ class PoolingPE : public PE {
param_.poolingArgs = args;
use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
(k_width > 7 || k_height > 7);
(k_width > 255 || k_height > 255);
use_cpu_ = param_.type == AVERAGE;
}
...@@ -76,6 +76,7 @@ class PoolingPE : public PE {
input->syncToCPU();
Tensor float_input;
// Tensor float_output;
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
...@@ -188,7 +189,9 @@ class PoolingPE : public PE {
bool dispatch() {
if (use_cpu_) {
// cpu_compute();
compute();
// exit(-1);
return true;
}
param_.input->syncToDevice();
......
...@@ -89,7 +89,6 @@ class ScalePE : public PE {
}
}
}
float* scale_data_float = param_.scale->data<float>();
for (int i = 0; i < repeat; i++) {
for (int j = 0; j < length; j++) {
......
...@@ -348,9 +348,19 @@ class Tensor {
if (placeHolder_ == nullptr) {
return;
}
std::cout << scale()[0] << " , " << scale()[1] << std::endl;
}
void printScale(std::string type) { printScale(); }
void printScale(std::string type) {
std::cout << type << " : "
<< std::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" +
std::to_string(shape_->width())
<< std::endl;
std::cout << type << " \n";
printScale();
}
std::string dimsFileName() {
return std::to_string(shape_->num()) + "_" +
...@@ -378,6 +388,7 @@ class Tensor {
static int counter = 0;
std::string npath = std::to_string(counter) + "_" + path;
counter++;
std::cout << "======== saving file:" << npath << " ============\n";
save_file_with_name(npath);
}
......
...@@ -165,6 +165,9 @@ class TensorLite {
TargetType target() const { return target_; }
// template <typename T>
// TensorLite Slice(int64_t begin, int64_t end) const;
zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
...@@ -254,6 +257,7 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
int64_t base = numel() / dims_[0];
TensorLite dst;
dst.target_ = target_;
auto dst_dims = dims_;
dst_dims[0] = end - begin;
......
File mode changed from 100644 to 100755
...@@ -101,7 +101,6 @@ void TypeTargetTransformPass::AddIoCopyInst(
auto io_copy_output_name =
string_format("%s/target_trans", in->AsArg().name.c_str());
// string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id());
if (copied_nodes->count(in->AsArg().name)) {
// Remove the old link
RemoveDirectedLink(in, inst_node);
......
File mode changed from 100644 to 100755
...@@ -138,11 +138,16 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
void RuntimeProgram::Run() {
for (auto& inst : instructions_) {
std::string op_type = inst.op()->op_info()->Type();
#ifndef LITE_WITH_FPGA
if (op_type == "feed" || op_type == "fetch") continue; if (op_type == "feed" || op_type == "fetch") continue;
#endif
inst.Run();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
LITE_PRECISION_PROFILE(inst)
#ifndef LITE_WITH_FPGA
// LITE_PRECISION_PROFILE(inst)
#endif
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
......
File mode changed from 100644 to 100755
...@@ -46,7 +46,7 @@ class Tensor {
*/
class PaddlePredictor {
public:
void Init();
void Init() {}
std::unique_ptr<Tensor> GetTensor(const std::string &id) const;
std::unique_ptr<Tensor> GetMutableTensor(const std::string &id);
......
...@@ -62,6 +62,10 @@ void CastCompute::Run() {
int32_t* out_data = param.Out->mutable_data<int32_t>();
std::transform(
x_data_begin, x_data_end, out_data, TransOp<int64_t, int32_t>);
} else if (param.in_dtype == 3 && param.out_dtype == 5) {
const auto* x_data = param.X->data<float>();
auto* o_data = param.Out->mutable_data<float>();
memcpy(o_data, x_data, sizeof(float) * param.X->numel());
} else {
LOG(FATAL) << "other has not been implemented";
}
......
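For reference, the dtype codes compared in the new branch follow Paddle's framework::proto::VarType numbering (an assumption based on upstream Paddle, not stated in this diff), so in_dtype == 3 with out_dtype == 5 is an INT64-to-FP32 cast; note the branch memcpy's the buffer as float, which presumes the tensor already holds float data on the FPGA path.
enum FluidVarType {  // relevant subset of VarType::Type (assumed values)
  INT16 = 1,
  INT32 = 2,
  INT64 = 3,
  FP16 = 4,
  FP32 = 5,
};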
...@@ -60,26 +60,11 @@ class FillConstantCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>();
if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
auto data = param.Out->template mutable_data<float>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.Out->template mutable_data<int32_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.Out->template mutable_data<int8_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
}
// auto data = param.Out->template mutable_data<T>();
auto data = param.Out->template mutable_data<float>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
}
virtual ~FillConstantCompute() = default;
...@@ -94,32 +79,38 @@ class FillConstantBatchLikeCompute
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>();
if (param.input->lod().size() && param.input_dim_idx == 0) {
auto odims = param.out->dims();
odims[param.output_dim_idx] = param.input->lod().back().size() - 1;
param.out->Resize(odims);
}
if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
auto data = param.out->template mutable_data<float>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.out->template mutable_data<int32_t>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.out->template mutable_data<int8_t>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
}
// auto data = param.out->template mutable_data<T>();
auto data = param.out->template mutable_data<float>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
// if (param.input->lod().size() && param.input_dim_idx == 0) {
// auto odims = param.out->dims();
// odims[param.output_dim_idx] = param.input->lod().back().size() - 1;
// param.out->Resize(odims);
// }
// if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
// auto data = param.out->template mutable_data<float>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else if (param.dtype ==
// static_cast<int32_t>(lite::core::FluidType::INT32)) {
// auto data = param.out->template mutable_data<int32_t>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else if (param.dtype ==
// static_cast<int32_t>(lite::core::FluidType::INT8)) {
// auto data = param.out->template mutable_data<int8_t>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else {
// LOG(FATAL) << "not supported dtype " << param.dtype;
// }
}
virtual ~FillConstantBatchLikeCompute() = default;
...@@ -144,6 +135,7 @@ REGISTER_LITE_KERNEL(fill_constant,
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(fill_constant_batch_size_like,
kARM,
kAny,
...@@ -153,3 +145,4 @@ REGISTER_LITE_KERNEL(fill_constant_batch_size_like,
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();
...@@ -59,6 +59,8 @@ namespace arm {
template <>
void NCHWToNHWCCompute<PRECISION(kFloat)>::Run() {
NCHWTONHWC(float);
// auto& param = this->template Param<param_t>();
// param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
template <>
...@@ -69,6 +71,9 @@ void NCHWToNHWCCompute<PRECISION(kInt8)>::Run() {
template <>
void NHWCToNCHWCompute<PRECISION(kFloat)>::Run() {
NHWCTONCHW(float);
// auto& param = this->template Param<param_t>();
// param.y->mutable_data<float>();
// param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
template <>
......
...@@ -28,6 +28,7 @@ namespace arm {
void LookupTableCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
// inputs
auto w = param.W;
auto ids = param.Ids;
...@@ -36,7 +37,7 @@ void LookupTableCompute::Run() {
auto table_dim = w->dims();
int64_t ids_numel = ids->numel();
auto ids_data = ids->data<int64_t>();
auto ids_data = ids->data<float>();
int64_t row_number = table_dim[0];
int64_t row_width = table_dim[1];
...@@ -75,14 +76,3 @@ REGISTER_LITE_KERNEL(lookup_table,
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(lookup_table_v2,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
...@@ -7,7 +7,9 @@ set(fpga_deps fpga_target_wrapper kernel_fpga)
# add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
# add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
# add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
...@@ -16,9 +18,11 @@ add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS
add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
# add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps})
add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps})
add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps})
# add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps})
add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps})
......
File mode changed from 100644 to 100755
...@@ -67,3 +67,13 @@ REGISTER_LITE_KERNEL(
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(feed,
kFPGA,
kFP16,
kNHWC,
paddle::lite::kernels::fpga::FeedCompute,
def_host)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
...@@ -43,8 +43,14 @@ void FetchCompute::PrepareForRun() {
}
void FetchCompute::Run() {
pe_.dispatch();
auto& param = this->Param<param_t>();
auto fetch_list = param.fetch_list;
if (fetch_list->size() <= static_cast<size_t>(param.col)) {
fetch_list->resize(param.col + 1);
}
Tensor& out = param.fetch_list->at(param.col);
out.Resize(param.input->dims());
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::OutputParam& fetch_param = pe_.param();
...@@ -67,10 +73,7 @@ REGISTER_LITE_KERNEL(fetch,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
REGISTER_LITE_KERNEL(fetch,
...@@ -79,12 +82,6 @@ REGISTER_LITE_KERNEL(fetch,
kNHWC,
paddle::lite::kernels::fpga::FetchCompute,
host_host)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <unistd.h>
#include <iostream>
#include <string>
#include <vector>
...@@ -83,6 +84,7 @@ void GRUCompute::PrepareForRun() {
void GRUCompute::Run() {
auto& param = this->Param<param_t>();
param.hidden->mutable_data<float>();
// inputs
auto input = param.input;
auto h0 = param.h0;
...@@ -130,6 +132,7 @@ void GRUCompute::Run() {
// //3.
gru_value.prev_out_value = ordered_h0.mutable_data<float>();
gru_tensors.pre_output = ordered_h0.ZynqTensor();
} else {
gru_value.prev_out_value = nullptr;
gru_tensors.pre_output = nullptr;
...@@ -169,6 +172,7 @@ void GRUCompute::Run() {
float* hidden_data =
hidden_out.mutableData<float>(zynqmp::FP32, float_input_shape);
gru_tensors.gate = &float_input;
gru_tensors.output = &hidden_out;
...@@ -187,11 +191,6 @@ void GRUCompute::Run() {
*(batch_hidden->mutable_lod()) = batch_gate->lod();
batch_hidden->mutable_data<float>();
to_seq(*batch_hidden, hidden);
save_tensor(const_cast<Tensor*>(input), "_input.txt");
save_tensor(hidden, "_gru.txt");
exit(-1);
}
} // namespace fpga
......
File mode changed from 100644 to 100755
...@@ -119,7 +119,81 @@ class IoCopyFpgaToHostCompute
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
}
std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
void hwc_to_chw(float* chw_data,
float* hwc_data,
int num,
int channel,
int height,
int width) {
int chw = channel * height * width;
int wc = width * channel;
int wh = width * height;
int index = 0;
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
for (int c = 0; c < channel; c++) {
chw_data[n * chw + c * wh + h * width + w] = hwc_data[index];
index++;
}
}
}
}
}
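A round-trip sanity sketch (hypothetical, mirroring the chw_to_hwc helper in the OCR test earlier in this commit): feeding the HWC buffer from that example back through hwc_to_chw restores channel-major order.
float hwc[4] = {1, 3, 2, 4};  // HWC buffer (output of the chw_to_hwc example)
float chw[4] = {0};
hwc_to_chw(chw, hwc, /*num=*/1, /*channel=*/2, /*height=*/1, /*width=*/2);
// chw == {1, 2, 3, 4}: the channel 0 plane first, then channel 1.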
class IoCopyFpgaToHostCHWCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override {
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kFPGA));
Tensor hwc;
hwc.Resize(param.y->dims());
float* hwc_data = hwc.mutable_data<float>();
float* chw_data = param.y->mutable_data<float>();
param.y->ZynqTensor()->setDataType(zynqmp::FP32);
param.x->ZynqTensor()->syncToDevice();
if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) {
zynqmp::Tensor tempTensor;
tempTensor.mutableData<float16>(zynqmp::FP16,
param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor());
tempTensor.setAligned(true);
tempTensor.unalignImage();
hwc.ZynqTensor()->copyFrom(&tempTensor);
} else {
hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
int num = 1;
int channel = 1;
int height = 1;
int width = 1;
auto dims = param.y->ZynqTensor()->shape();
hwc_to_chw(chw_data,
hwc_data,
dims.num(),
dims.channel(),
dims.height(),
dims.width());
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
param.y->ZynqTensor()->flush();
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
// param.x->ZynqTensor()->saveToFile("io_x", true);
// param.y->ZynqTensor()->saveToFile("io_y", true);
}
std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
...@@ -170,7 +244,7 @@ REGISTER_LITE_KERNEL(io_copy,
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kARM),
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.Finalize();
...@@ -179,8 +253,8 @@ REGISTER_LITE_KERNEL(io_copy,
kFPGA,
kAny,
kAny,
paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute,
device_to_host_22)
paddle::lite::kernels::fpga::IoCopyFpgaToHostCHWCompute,
device_to_host_chw)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......
...@@ -384,6 +384,7 @@ void MulticlassNmsCompute::Run() {
scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
outs->ZynqTensor()->copyFrom(out.ZynqTensor());
}
outs->Resize({static_cast<int64_t>(e - s), out_dim});
}
}
LoD lod;
......
...@@ -78,6 +78,7 @@ void PriorBoxCompute::PrepareForRun() {
param.boxes->mutable_data<float>();
param.variances->mutable_data<float>();
zynqmp::PriorBoxParam& priobox_param = pe_.param();
priobox_param.input = param.input->ZynqTensor();
priobox_param.image = param.image->ZynqTensor();
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
...@@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_
add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(one_hot_compute_host Host basic SRCS one_hot_compute.cc DEPS ${lite_kernel_deps})
#lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any) #lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
#lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any) #lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any)
...@@ -392,7 +392,13 @@ REGISTER_LITE_KERNEL(multiclass_nms,
                     kNCHW,
                     paddle::lite::kernels::host::MulticlassNmsCompute,
                     def)
-    .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
-    .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("BBoxes",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
+    .BindInput("Scores",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
     .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <map>
#include <utility>
#include <vector>
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/kernels/host/one_hot_compute.h"
#include "lite/utils/paddle_enforce.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
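
// Expands each integer index in X into a one-hot float row of length
// `depth` along the last dimension of Out.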
void OneHotCompute::Run() {
  auto& param = Param<operators::OneHotParam>();

  // The `depth` attribute may be overridden at runtime by the optional
  // depth_tensor input; re-infer the output shape when that happens.
  int depth = param.depth;
  if (param.depth_tensor) {
    depth = param.depth_tensor->data<int32_t>()[0];
    auto in_dims = param.X->dims();
    DDim out_dims(in_dims);
    out_dims[out_dims.size() - 1] = depth;
    param.Out->Resize(out_dims);
  }
  auto* p_in_data = param.X->data<float>();
  auto numel = param.X->numel();

  // Allocate the output after any resize above, then zero-fill it.
  auto* p_out_data = param.Out->mutable_data<float>();
  for (int64_t i = 0; i < param.Out->numel(); ++i) {
    p_out_data[i] = 0;
  }

  if (param.allow_out_of_range) {
    // Out-of-range indices are silently skipped.
    for (int64_t i = 0; i < numel; ++i) {
      if (p_in_data[i] >= 0 && p_in_data[i] < depth) {
        p_out_data[i * depth + static_cast<int>(p_in_data[i])] = 1.0f;
      }
    }
  } else {
    for (int64_t i = 0; i < numel; ++i) {
      PADDLE_ENFORCE_GE(
          p_in_data[i], 0, "Illegal index value, should be at least 0.");
      PADDLE_ENFORCE_LT(p_in_data[i],
                        depth,
                        "Illegal index value, should be less than depth (%d).",
                        depth);
      p_out_data[i * depth + static_cast<int>(p_in_data[i])] = 1.0f;
    }
  }
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(one_hot,
kHost,
kFloat,
kNCHW,
paddle::lite::kernels::host::OneHotCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
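
// Illustrative example (not part of the commit): with X = {1.0, 3.0} and
// depth = 4, Out holds two one-hot rows:
//   row 0: {0, 1, 0, 0}
//   row 1: {0, 0, 0, 1}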
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
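
// Host-side one_hot kernel declaration; the encoding logic lives in
// one_hot_compute.cc.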
class OneHotCompute
: public KernelLite<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override;
virtual ~OneHotCompute() = default;
};
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -46,19 +46,43 @@ REGISTER_LITE_KERNEL(reshape,
                     paddle::lite::kernels::host::ReshapeCompute,
                     def)
     .BindInput("X",
-               {LiteType::GetTensorTy(
-                   TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
     .BindInput("ShapeTensor",
-               {LiteType::GetTensorTy(
-                   TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
     .BindInput("Shape",
-               {LiteType::GetTensorTy(
-                   TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
     .BindOutput("Out",
-               {LiteType::GetTensorTy(
-                   TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
     .Finalize();

+// REGISTER_LITE_KERNEL(reshape,
+//                      kFPGA,
+//                      kFP16,
+//                      kNHWC,
+//                      paddle::lite::kernels::host::ReshapeCompute,
+//                      def)
+//     .BindInput("X",
+//                {LiteType::GetTensorTy(
+//                    TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
+//     .BindInput("ShapeTensor",
+//                {LiteType::GetTensorTy(
+//                    TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
+//     .BindInput("Shape",
+//                {LiteType::GetTensorTy(
+//                    TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
+//     .BindOutput("Out",
+//                 {LiteType::GetTensorTy(
+//                     TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))})
+//     .Finalize();
+
 REGISTER_LITE_KERNEL(reshape2,
                      kHost,
                      kAny,
......
...@@ -136,6 +136,8 @@ add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS})
 add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS})
 add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS})
+add_operator(one_hot basic SRCS one_hot_op.cc DEPS ${op_DEPS})

 if (NOT LITE_WITH_X86)
     lite_cc_test(test_fc_op SRCS fc_op_test.cc
             DEPS fc_op memory
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/one_hot_op.h"
#include "lite/core/op_registry.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
namespace operators {
bool OneHotOp::CheckShape() const {
CHECK_OR_FALSE(param_.X);
CHECK_OR_FALSE(param_.Out);
return true;
}
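
// Shape inference relies on the `depth` attribute; when depth instead comes
// from a runtime depth_tensor, the host kernel re-infers the shape in Run().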
bool OneHotOp::InferShape() const {
CHECK_OR_FALSE(param_.Out);
// TODO(Superjomn) Enable data sharing.
auto out_dims = param_.X->dims();
out_dims[out_dims.size() - 1] = param_.depth;
param_.Out->Resize(out_dims);
return true;
}
bool OneHotOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
param_.X =
scope->FindVar(opdesc.Input("X").front())->GetMutable<lite::Tensor>();
param_.Out =
scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
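  // depth_tensor is an optional input that, when present, overrides the
  // `depth` attribute at runtime.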
if (opdesc.HasInput("depth_tensor")) {
auto depth_tensor = opdesc.Input("depth_tensor").front();
param_.depth_tensor =
scope->FindVar(depth_tensor)->GetMutable<lite::Tensor>();
}
CHECK(param_.X);
CHECK(param_.Out);
param_.depth = opdesc.GetAttr<int>("depth");
param_.dtype = opdesc.GetAttr<int>("dtype");
if (opdesc.HasAttr("allow_out_of_range")) {
param_.allow_out_of_range = opdesc.GetAttr<bool>("allow_out_of_range");
}
auto out_lod = param_.Out->mutable_lod();
*out_lod = param_.X->lod();
// param_.allow_out_of_range = opdesc.GetAttr<bool>("allow_out_of_range");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(one_hot, paddle::lite::operators::OneHotOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
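
// Operator wrapper for one_hot: validates inputs, infers the output shape,
// and binds OneHotParam from the op description.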
class OneHotOp : public OpLite {
public:
OneHotOp() {}
explicit OneHotOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "one_hot"; }
private:
mutable OneHotParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
...@@ -1130,7 +1130,15 @@ struct GridSamplerParam {
   lite::Tensor* out{};
   lite::Tensor* grid{};
 };
+
+/// --------------------- attentions operators --------------
+struct OneHotParam {
+  lite::Tensor* X{};
+  lite::Tensor* depth_tensor{nullptr};
+  lite::Tensor* Out{};
+  int depth{-1};
+  int dtype{};
+  bool allow_out_of_range{false};
+};
 }  // namespace operators
 }  // namespace lite
 }  // namespace paddle
...@@ -13,7 +13,7 @@ readonly NUM_PROC=${LITE_BUILD_THREADS:-4}

 # global variables
-BUILD_EXTRA=OFF
+BUILD_EXTRA=ON
 BUILD_JAVA=ON
 BUILD_PYTHON=OFF
 BUILD_DIR=$(pwd)
......
...@@ -2,12 +2,16 @@

 build_dir=build_fpga
 mkdir -p ${build_dir}
-cd ${build_dir}
-GEN_CODE_PATH_PREFIX=lite/gen_code
-mkdir -p ./${GEN_CODE_PATH_PREFIX}
-touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
+root_dir=$(pwd)
+build_dir=${build_dir}
+
+# in build directory
+# 1. Prepare gen_code file
+GEN_CODE_PATH_PREFIX=${build_dir}/lite/gen_code
+mkdir -p ${GEN_CODE_PATH_PREFIX}
+touch ${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
+
+cd ${build_dir}

 cmake .. \
     -DWITH_GPU=OFF \
     -DWITH_MKL=OFF \
...@@ -19,8 +23,9 @@ cmake .. \
     -DLITE_WITH_OPENMP=ON \
     -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
     -DWITH_TESTING=OFF \
-    -DARM_TARGET_OS=armlinux
+    -DARM_TARGET_OS=armlinux \
+    -DLITE_BUILD_EXTRA=ON \
+    -DLITE_WITH_PROFILE=OFF

-make -j8
+make -j42
 cd -