diff --git a/.gitignore b/.gitignore index 9db2912c07bc2d6abb01c322a25519ac0ff158fa..ce40fea2be877c09bb299781d8937c081843b50c 100644 --- a/.gitignore +++ b/.gitignore @@ -104,3 +104,10 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models metal/MobileNetDemo/MobileNetDemo/Resources + +# generated files +lite/api/paddle_use_kernels.h +lite/api/paddle_use_ops.h +lite/backends/arm/math/dotprod/gemm_sdot.h +lite/tools/cmake_tools/ast.pyc + diff --git a/CMakeLists.txt b/CMakeLists.txt old mode 100644 new mode 100755 index 77a94bea1efcdafaa67b4c078bfb0a756f7b1cec..786b1322b346631d1570a6ebd9c572302531db4e --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,8 @@ if (WITH_PADDLE_MOBILE) return() endif(WITH_PADDLE_MOBILE) +# set(CMAKE_BUILD_TYPE DEBUG) + set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(CMAKE_CXX_STANDARD 11) diff --git a/fpga.sh b/fpga.sh new file mode 100644 index 0000000000000000000000000000000000000000..e0501ac14b5269139688169017c057bd2458ab7c --- /dev/null +++ b/fpga.sh @@ -0,0 +1,5 @@ +./lite/tools/build.sh \ + --arm_os=armlinux \ + --arm_abi=armv8 \ + --arm_lang=gcc \ + test diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt old mode 100644 new mode 100755 index 70239e94e7a3064fb383246623d05a2079dda1fa..c3388350228207f843c9cbd2c1a3525ba0ef5645 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -198,14 +198,24 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels}) + lite_cc_test(test_ssd_fpga SRCS test_ssd_fpga.cc + DEPS ${lite_model_test_DEPS} + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels}) + + lite_cc_test(test_inceptionv3_fpga SRCS inceptionv3_test_fpga.cc + DEPS ${lite_model_test_DEPS} + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels}) + lite_cc_test(test_inceptionv4 SRCS inceptionv4_test.cc DEPS ${lite_model_test_DEPS} CL_DEPS ${opencl_kernels} ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz) - # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc - # DEPS ${lite_model_test_DEPS}) + lite_cc_test(test_ocr_attention_fpga SRCS ocr_attention_test_fpga.cc + DEPS ${lite_model_test_DEPS}) # lite_cc_test(model_run_test_image SRCS model_run_test_image.cc # DEPS ${lite_model_test_DEPS} diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 990d08f18f541088d797510e9dbd4881d42b164f..9afe7e264a960144637df609daeca80f4ed3b2ac 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -121,6 +121,7 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) { << kpf_path; } +#ifndef LITE_WITH_FPGA lite::Tensor *Predictor::GetInput(size_t offset) { CHECK(input_names_.size() > offset) << "The network has " << input_names_.size() << " inputs" @@ -130,6 +131,17 @@ lite::Tensor *Predictor::GetInput(size_t offset) { << " in exec_scope"; return in_var->GetMutable(); } +#else +lite::Tensor *Predictor::GetInput(size_t offset) { + auto *_feed_list = exec_scope_->FindVar("feed"); + CHECK(_feed_list) << "no feed variable in exec_scope"; + auto *feed_list = _feed_list->GetMutable>(); + if (offset >= feed_list->size()) { + feed_list->resize(offset + 1); + } + return &feed_list->at(offset); +} +#endif // get inputs names std::vector Predictor::GetInputNames() { return 
input_names_; }
@@ -167,6 +179,8 @@ void Predictor::PrepareFeedFetch() {
   }
 }
 
+#ifndef LITE_WITH_FPGA
+
 const lite::Tensor *Predictor::GetOutput(size_t offset) const {
   CHECK(output_names_.size() > offset)
       << "The network has " << output_names_.size() << " outputs"
@@ -186,6 +200,29 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
   }
   return outputs;
 }
+#else
+
+const lite::Tensor *Predictor::GetOutput(size_t offset) const {
+  auto *_fetch_list = exec_scope_->FindVar("fetch");
+  CHECK(_fetch_list) << "no fetch variable in exec_scope";
+  auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
+  CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
+  return &fetch_list.at(offset);
+}
+
+std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
+  auto *_fetch_list = exec_scope_->FindVar("fetch");
+  CHECK(_fetch_list) << "no fetch variable in exec_scope";
+  auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
+
+  std::vector<const lite::Tensor *> outputs;
+  // Take each element's address in place; iterating by value would collect
+  // pointers to loop-local copies.
+  for (auto &out : fetch_list) {
+    outputs.push_back(&out);
+  }
+  return outputs;
+}
+
+#endif
 
 const cpp::ProgramDesc &Predictor::program_desc() const {
   return program_desc_;
diff --git a/lite/api/inceptionv3_test_fpga.cc b/lite/api/inceptionv3_test_fpga.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c1dff7990e965465e73ed895c17a15646ef1c993
--- /dev/null
+++ b/lite/api/inceptionv3_test_fpga.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
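+// NOTE: FPGA smoke test for Inception v3 (the TEST suite name below was
+// carried over from the ResNet50 test it is based on). It builds the
+// predictor from FLAGS_model_dir with FPGA/Host/ARM as valid places, fills a
+// 1x3x224x224 input with ones, and runs FLAGS_warmup warm-up iterations plus
+// two timed ones; only the Speed Report header is logged.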
+ +#include +#include +#include +#include "lite/api/cxx_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { + +#ifdef LITE_WITH_FPGA +TEST(ResNet50, test) { + lite::Predictor predictor; + + std::vector valid_places({ + Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}, + Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + + // std::vector valid_places( + // {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}}); + + predictor.Build("", + FLAGS_model_dir + "/model", + FLAGS_model_dir + "/params", + valid_places); + + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + auto* data = input_tensor->mutable_data(); + auto item_size = input_tensor->dims().production(); + for (int i = 0; i < item_size; i++) { + data[i] = 1; + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor.Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < 2; ++i) { + predictor.Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; +} +#endif + +} // namespace lite +} // namespace paddle diff --git a/lite/api/ocr_attention_test_fpga.cc b/lite/api/ocr_attention_test_fpga.cc new file mode 100755 index 0000000000000000000000000000000000000000..326de883d1625f7196426094cc4ccec970f8a399 --- /dev/null +++ b/lite/api/ocr_attention_test_fpga.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
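+// NOTE: FPGA test for the OCR attention model. It feeds four inputs: the
+// image (1x1x100x200, values read from FLAGS_input_file), beam-search init
+// ids and init scores (both 1x1 with LoD {{0, 1}, {0, 1}}), and a 1x33x10x23
+// position encoding whose first 10 channels one-hot encode the row index and
+// whose remaining 23 channels one-hot encode the column index. The first
+// output is dumped to a file under plate_data/.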
+
+#include <gtest/gtest.h>
+#include <fstream>
+#include <vector>
+#include "lite/api/cxx_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/core/op_registry.h"
+
+DEFINE_string(input_file, "", "input_file");
+
+namespace paddle {
+namespace lite {
+
+void read_from_file(const std::string& path, float* data, int num) {
+  std::ifstream file_stream;
+  file_stream.open(path);
+  if (!file_stream) {
+    exit(-1);
+    return;
+  }
+
+  for (int i = 0; i < num; ++i) {
+    float value = 0;
+    file_stream >> value;
+    data[i] = value;
+  }
+}
+
+void chw_to_hwc(float* src, float* dst, int channel, int height, int width) {
+  int amount_per_row = width * channel;
+  int index = 0;
+  for (int c = 0; c < channel; c++) {
+    for (int h = 0; h < height; h++) {
+      int offset_height = h * amount_per_row;
+      for (int w = 0; w < width; w++) {
+        int dst_index = offset_height + w * channel + c;
+        dst[dst_index] = src[index];
+        index = index + 1;
+      }
+    }
+  }
+}
+
+void TestModel(const std::vector<Place>& valid_places,
+               const Place& preferred_place,
+               bool use_npu = false) {
+  DeviceInfo::Init();
+  DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
+  lite::Predictor predictor;
+
+  // predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places);
+  predictor.Build("", "attention/model", "attention/params", valid_places);
+
+  auto* input_tensor = predictor.GetInput(0);
+  input_tensor->Resize(DDim(std::vector<int64_t>({1, 1, 100, 200})));
+  auto* data = input_tensor->mutable_data<float>();
+  auto item_size = input_tensor->dims().production();
+  for (int i = 0; i < item_size; i++) {
+    data[i] = 1;
+  }
+
+  read_from_file(FLAGS_input_file, data, 100 * 200);
+  //=============================================
+  auto* init_ids = predictor.GetInput(1);
+  init_ids->Resize(DDim(std::vector<int64_t>({1, 1})));
+  auto* data_ids = init_ids->mutable_data<float>();
+  auto ids_size = init_ids->dims().production();
+  for (int i = 0; i < ids_size; i++) {
+    data_ids[i] = 0;
+  }
+  auto lod_ids = init_ids->mutable_lod();
+  std::vector<std::vector<uint64_t>> lod_i{{0, 1}, {0, 1}};
+  *lod_ids = lod_i;
+
+  //=============================================
+  auto* init_scores = predictor.GetInput(2);
+  init_scores->Resize(DDim(std::vector<int64_t>({1, 1})));
+  auto* data_scores = init_scores->mutable_data<float>();
+  // Size this loop from init_scores itself; sizing it from input_tensor
+  // would write far past the end of this one-element tensor.
+  auto scores_size = init_scores->dims().production();
+  for (int i = 0; i < scores_size; i++) {
+    data_scores[i] = 0;
+  }
+  auto lod_scores = init_scores->mutable_lod();
+  std::vector<std::vector<uint64_t>> lod_s{{0, 1}, {0, 1}};
+  *lod_scores = lod_s;
+
+  //=============================================
+  auto* position_encoding = predictor.GetInput(3);
+  position_encoding->Resize(DDim(std::vector<int64_t>({1, 33, 10, 23})));
+  auto* position_encoding_data = position_encoding->mutable_data<float>();
+
+  float* temp_data = position_encoding_data;
+
+  for (int i = 0; i < position_encoding->dims().production(); ++i) {
+    temp_data[i] = 0;
+  }
+  int index = 0;
+  for (int i = 0; i < 10; i++) {
+    for (int row = 0; row < 10; row++) {
+      for (int col = 0; col < 23; col++) {
+        if (i == row) {
+          temp_data[index] = 1.0f;
+        } else {
+          temp_data[index] = 0.0f;
+        }
+        index++;
+      }
+    }
+  }
+  for (int i = 0; i < 23; i++) {
+    for (int row = 0; row < 10; row++) {
+      for (int col = 0; col < 23; col++) {
+        if (i == col) {
+          temp_data[index] = 1.0f;
+        } else {
+          temp_data[index] = 0.0f;
+        }
+        index++;
+      }
+    }
+  }
+  // chw_to_hwc(temp_data, position_encoding_data, 33, 10, 23);
+  // delete[] temp_data;
+
+  // read_from_file("position_encoding.data", position_encoding_data, 33 * 10 *
+  // 23);
+
+  auto start = GetCurrentUS();
+  // Run FLAGS_repeats iterations so the average below matches its divisor.
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    predictor.Run();
+  }
+
+  std::cout << "================== Speed Report ==================="
+            << std::endl;
+  std::cout << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms on average." << std::endl;
+
+  auto* out = predictor.GetOutput(0);
+
+  std::string file = "plate_data/" + FLAGS_input_file.substr(9);
+  std::cout << "file:::" << file << std::endl;
+
+  std::ofstream ofs;
+  ofs.open(file);
+  for (int i = 0; i < out->dims().production(); i++) {
+    float value = out->data<float>()[i];
+    ofs << value << std::endl;
+  }
+  ofs.close();
+}
+
+TEST(OcrAttention, test_arm) {
+  std::vector<Place> valid_places({
+      Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+      Place{TARGET(kHost), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kFloat)},
+  });
+  TestModel(valid_places, Place{TARGET(kARM), PRECISION(kFloat)});
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/api/resnet50_test_fpga.cc b/lite/api/resnet50_test_fpga.cc
index ab647f96998f1c0e73476369611218d0a7930c57..75e6f0cbbc43c3cd7eb9bfa89bc004554ea6f85b 100644
--- a/lite/api/resnet50_test_fpga.cc
+++ b/lite/api/resnet50_test_fpga.cc
@@ -31,11 +31,7 @@ TEST(ResNet50, test) {
   std::vector<Place> valid_places(
       {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}});
 
-  predictor.Build(FLAGS_model_dir,
-                  "",
-                  "",
-                  Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
-                  valid_places);
+  predictor.Build(FLAGS_model_dir, "", "", valid_places);
 
   auto* input_tensor = predictor.GetInput(0);
   input_tensor->Resize(DDim(std::vector<int64_t>({1, 3, 224, 224})));
diff --git a/lite/api/test_ssd_fpga.cc b/lite/api/test_ssd_fpga.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bb2d75671a637c8042b39e2e90d70f1ae9e6f2fd
--- /dev/null
+++ b/lite/api/test_ssd_fpga.cc
@@ -0,0 +1,138 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
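+// NOTE: FPGA test for an SSD-style detection model. It builds the predictor
+// from separate model/params files under FLAGS_model_dir, feeds a
+// 1x3x300x300 input filled with ones (the file-based input path is left
+// commented out), runs the net twice, prints the detection output, and also
+// writes it to a file under output/.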
+
+#include <dirent.h>
+#include <gtest/gtest.h>
+#include <cstring>
+#include <fstream>
+#include "lite/api/cxx_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/core/op_registry.h"
+
+DEFINE_string(input_file, "", "input_file");
+
+namespace paddle {
+namespace lite {
+
+std::vector<std::string> GetDirectoryFiles(const std::string& dir) {
+  std::vector<std::string> files;
+  std::shared_ptr<DIR> directory_ptr(opendir(dir.c_str()),
+                                     [](DIR* dir) { dir && closedir(dir); });
+  struct dirent* dirent_ptr;
+  if (!directory_ptr) {
+    std::cout << "Error opening " << dir << ": " << std::strerror(errno)
+              << std::endl;
+    return files;
+  }
+
+  while ((dirent_ptr = readdir(directory_ptr.get())) != nullptr) {
+    files.push_back(std::string(dirent_ptr->d_name));
+  }
+  return files;
+}
+
+void readFromFile(int num, std::string path, float* data) {
+  std::ifstream file_stream(path);
+  // file_stream.open(path);
+  if (!file_stream.good()) {
+    std::cout << "file: " << path << " does not exist!\n";
+    exit(-1);
+    return;
+  }
+  // float* data = mutableData<float>();
+  for (int i = 0; i < num; ++i) {
+    float value = 0;
+    file_stream >> value;
+    data[i] = value;
+  }
+  file_stream.close();
+}
+
+// #ifdef LITE_WITH_FPGA
+TEST(SSD, test) {
+  lite::Predictor predictor;
+  std::vector<Place> valid_places({
+      Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+      Place{TARGET(kHost), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kFloat)},
+  });
+
+  // predictor.Build(FLAGS_model_dir, "", "", valid_places);
+  predictor.Build("",
+                  FLAGS_model_dir + "/model",
+                  FLAGS_model_dir + "/params",
+                  valid_places);
+
+  auto* input_tensor = predictor.GetInput(0);
+  int width = 300;
+  int height = 300;
+
+  // std::ifstream file_stream(FLAGS_input_file);
+  // if (!file_stream.good()) {
+  //   std::cout << "file: " << FLAGS_input_file << " does not exist!\n";
+  //   exit(-1);
+  //   return;
+  // }
+
+  // file_stream >> height;
+  // file_stream >> width;
+
+  input_tensor->Resize(
+      DDim(std::vector<int64_t>({1, 3, height, width})));
+  auto* data = input_tensor->mutable_data<float>();
+  auto item_size = input_tensor->dims().production();
+
+  for (int i = 0; i < item_size; i++) {
+    data[i] = 1;
+  }
+
+  // readFromFile(item_size, "car.data", data);
+
+  int num = 3 * width * height;
+
+  // for (int i = 0; i < num; ++i) {
+  //   float value = 0;
+  //   file_stream >> value;
+  //   data[i] = value;
+  // }
+  // file_stream.close();
+
+  for (int i = 0; i < 2; ++i) {
+    predictor.Run();
+  }
+
+  auto* out = predictor.GetOutput(0);
+  for (int i = 0; i < out->dims().production(); i++) {
+    std::cout << ":" << out->data<float>()[i] << std::endl;
+  }
+
+  std::string file = "output/" + FLAGS_input_file.substr(6);
+  std::cout << "file:::" << file << std::endl;
+
+  std::ofstream ofs;
+  ofs.open(file);
+  for (int i = 0; i < out->dims().production(); i++) {
+    float value = out->data<float>()[i];
+    ofs << value << std::endl;
+  }
+  ofs.close();
+
+  LOG(INFO) << "================== Speed Report ===================";
+}
+// #endif
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp
old mode 100644
new mode 100755
index 2b9b23070616baf18f347c6b2af2d87a300d428f..5aa6511cdfbcfc831c14bcf03a0c2d8096e30aa4
--- a/lite/backends/fpga/KD/debugger.hpp
+++ b/lite/backends/fpga/KD/debugger.hpp
@@ -32,7 +32,8 @@ class Debugger {
   }
 
   void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
-    if (op_type != "conv") {  // NOLINT
+    if (op_config[op_type]) {
+
tensor->saveToFile(op_type, true); } } @@ -40,8 +41,19 @@ class Debugger { std::unordered_map op_config; Debugger() { op_config["concat"] = true; + op_config["pooling"] = true; op_config["conv"] = true; op_config["crop"] = true; + op_config["feed"] = true; + op_config["mul"] = true; + op_config["fetch"] = true; + op_config["boxes"] = true; + op_config["scores"] = true; + op_config["nms"] = true; + op_config["pb_boxes"] = true; + op_config["pb_variances"] = true; + // op_config["fc"] = true; + op_config["softmax"] = true; } }; @@ -131,9 +143,7 @@ inline void save_tensor(const lite::Tensor* t, chw_to_hwc(const_cast(t), dst); data = dst; } - save_float(data, name, t->numel()); - delete[] dst; } } // namespace lite diff --git a/lite/backends/fpga/KD/fpga_cv.cpp b/lite/backends/fpga/KD/fpga_cv.cpp deleted file mode 100644 index 15a20e368b09f193e3f43b574ff3682ce96782ad..0000000000000000000000000000000000000000 --- a/lite/backends/fpga/KD/fpga_cv.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/fpga/KD/fpga_cv.hpp" - -using paddle::zynqmp::float16; - -void fpga_resize(float* input, - int input_width, - int input_height, - int input_channel, - uint8_t* output, - int output_width, - int output_height) { - paddle::zynqmp::InplaceArgs inplace_args = {0, 0, 0}; - paddle::zynqmp::config_inplace(inplace_args); - - paddle::zynqmp::ImageInputArgs input_args = {nullptr}; - input_args.address = nullptr; - input_args.scale_address = nullptr; - - float16* input_image_address = - reinterpret_cast(paddle::zynqmp::fpga_malloc( - input_width * input_height * input_channel * sizeof(float16))); - int index = 0; - - for (int i = 0; i < input_width * input_height * input_channel; i++) { - input_image_address[i] = float16(1.0 * input[i]); - } - - paddle::zynqmp::ResizeArgs resize_args = {0}; - - resize_args.input_width = input_width; - resize_args.input_height = input_height; - resize_args.image_channel = input_channel; - resize_args.output_width = output_width; - resize_args.output_height = output_height; - float height_ratio = static_cast(input_height) / - static_cast(resize_args.output_height); - float width_ratio = static_cast(input_width) / - static_cast(resize_args.output_width); - resize_args.height_ratio = *reinterpret_cast(&height_ratio); - resize_args.width_ratio = *reinterpret_cast(&width_ratio); - - int output_size = - resize_args.output_width * resize_args.output_height * input_channel; - float16* fpga_output = reinterpret_cast( - paddle::zynqmp::fpga_malloc(output_size * sizeof(float16))); - resize_args.input_image_address = input_image_address; - resize_args.output_image_address = fpga_output; - - memset(fpga_output, 0, output_size * sizeof(float16)); - paddle::zynqmp::fpga_flush( - input_image_address, - input_width * input_height * input_channel * sizeof(float16)); - paddle::zynqmp::fpga_flush(resize_args.output_image_address, - output_size * sizeof(float16)); - int ret = 
paddle::zynqmp::compute_fpga_resize(resize_args); - if (ret == 0) { - paddle::zynqmp::fpga_invalidate(resize_args.output_image_address, - output_size * sizeof(float16)); - } - - for (int i = 0; i < output_size; i++) { - output[i] = fpga_output[i]; - } -} diff --git a/lite/backends/fpga/KD/fpga_cv.hpp b/lite/backends/fpga/KD/fpga_cv.hpp deleted file mode 100644 index 6aa52edfbb704a0571fb1052aff6ecf022e49596..0000000000000000000000000000000000000000 --- a/lite/backends/fpga/KD/fpga_cv.hpp +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" -#include "lite/backends/fpga/KD/pe.hpp" - -void fpga_resize(float* input, - int input_width, - int input_height, - int input_channel, - uint8_t* output, - int output_width, - int output_height); diff --git a/lite/backends/fpga/KD/llapi/config.h b/lite/backends/fpga/KD/llapi/config.h deleted file mode 100755 index acf8c8adf4fc5593dcc4238ddc762fdb9fea6760..0000000000000000000000000000000000000000 --- a/lite/backends/fpga/KD/llapi/config.h +++ /dev/null @@ -1,19 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define PADDLE_LITE_ZU5 -#define FPGA_PRINT_MODE -#define PADDLE_LITE_PROFILE diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp old mode 100644 new mode 100755 index 30250969b6fbe6e9e5ce7e9f96f963e8bee89224..b6932bc27f0019af58cea00e4b5422396d838208 --- a/lite/backends/fpga/KD/llapi/filter.cpp +++ b/lite/backends/fpga/KD/llapi/filter.cpp @@ -31,7 +31,7 @@ void saveToFile(std::string name, void* data_in, int size) { std::ofstream ofs; ofs.open(name); - int8_t* data = static_cast data_in; + int8_t* data = static_cast(data_in); for (int i = 0; i < size; i++) { float value = data[i]; ofs << value << std::endl; @@ -221,6 +221,7 @@ int8_t* format_filter(float* data_in, align_to_x(num_per_div_before_alignment, filter_num_alignment); int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; + // int num_after_alignment = num_per_div_after_alignment * div_num; int residual = num % num_per_div_before_alignment; int num_after_alignment = num_per_div_after_alignment * ((residual == 0) ? 
div_num : (div_num - 1)) + diff --git a/lite/backends/fpga/KD/llapi/filter.h b/lite/backends/fpga/KD/llapi/filter.h old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp old mode 100755 new mode 100644 index 06488469d97c077a34b3cfdb8a049c8cd61dfc93..68d0b6c68b722f9c5cf31139ed7308516889bd8c --- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp +++ b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp @@ -62,6 +62,7 @@ void reset_device() { // memory management; void *fpga_malloc(size_t size) { #ifdef ENABLE_DEBUG + std::cout << "fpga_malloc:" << size << std::endl; #endif #ifdef PADDLE_OS_LINUX void *ptr = reinterpret_cast( diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp old mode 100644 new mode 100755 index fb15eaf77822eed076ec2001bace6871e93587ff..f274ccab0b755ebd9bf26bed4b41902d29bc1305 --- a/lite/backends/fpga/KD/pes/conv_pe.hpp +++ b/lite/backends/fpga/KD/pes/conv_pe.hpp @@ -62,6 +62,7 @@ class ConvPE : public PE { param_.filter->shape().height() == 1) { // NOLINT } if (!use_cpu_) { // NOLINT + // param_.filter->releaseData(); } } @@ -92,6 +93,7 @@ class ConvPE : public PE { int kernel_width = param_.filter->shape().width(); int kernel_step_h = param_.strides[0]; int kernel_step_w = param_.strides[1]; + int pooled_height_ = output->shape().height(); int pooled_width_ = out_width; int filter_chw = image_channels * kernel_height * kernel_width; diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp index ecee45569c8df3d3e3926b2ca78cb49da8415aa4..8751f013967ed3b44a6c6b11560a2f350bc7d6bf 100755 --- a/lite/backends/fpga/KD/pes/conv_process.hpp +++ b/lite/backends/fpga/KD/pes/conv_process.hpp @@ -266,8 +266,8 @@ inline void split_filter_num(const ConvParam& c_param) { int filter_num_alignment = filter::get_filter_num_alignment(); int aligned_num = align_to_x(num / param.groups, filter_num_alignment) * param.groups; - split_num = filter::calc_split_num(aligned_num, div_capacity); + split_num = filter::calc_split_num(aligned_num, div_capacity); Shape& out_shape = out->shape(); for (int i = 0; i < split_num; i++) { BasicConvParam* conv_param = new BasicConvParam(); @@ -364,6 +364,7 @@ inline void split_filter_num(const ConvParam& c_param) { args.image.height = input->shape().height(); args.image.pad_width = param.paddings[1]; args.image.pad_height = param.paddings[0]; + args.dilation = param.dilations[0]; args.output.address = out_address; @@ -419,6 +420,7 @@ inline void split_channel(const ConvParam& c_param) { } scale.flush(); bias.flush(); + // Shape sb_shape(N, {2 * channel}); format_scale_bias(&scale, &bias, &conv_param->filter, @@ -446,6 +448,7 @@ inline void split_channel(const ConvParam& c_param) { args.image.height = conv_param->input.shape().height(); args.image.pad_width = param.paddings[1]; args.image.pad_height = param.paddings[0]; + args.dilation = param.dilations[0]; args.output.address = conv_param->output.mutableData(); args.output.scale_address = conv_param->output.scale(); @@ -476,6 +479,7 @@ inline bool compute_conv(const ConvParam& c_conv_params) { } size_t size = params.size(); if (ret == 0 && size > 1) { + // Tensor* output = conv_params.output; Tensor& img = params[0]->output; for (int i = 0; i < 1; i++) { for (int i = 0; i < img.shape().numel(); i++) { diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp index 
0efca2ec2e60e8973d92f41463b0444722f2a73b..d610780628612f5a4ac322f06c2c6e9ca7812925 100755 --- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp +++ b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp @@ -62,6 +62,7 @@ class DepthwiseConvPE : public PE { float16* scale_data = param_.scale()->data(); float16* filter_data = param.quantizedFilter()->mutableData( FP16, param.filter->shape()); + memcpy(filter_data, scale_data, param.filter->shape().numel() * sizeof(float16)); diff --git a/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/gru_pe.hpp b/lite/backends/fpga/KD/pes/gru_pe.hpp old mode 100644 new mode 100755 index dcacab4eeef32b245d4126b72597b398a6627ba6..bbdf2f371f13ce6e4b1ecb6104dec8f35c1f9c3d --- a/lite/backends/fpga/KD/pes/gru_pe.hpp +++ b/lite/backends/fpga/KD/pes/gru_pe.hpp @@ -47,8 +47,10 @@ class GRUPE : public PE { zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}}; float16* prev_hidden_data = prev_hidden_.mutableData(zynqmp::FP16, hidden_shape); + // set previous hidden data to 0; memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16)); + // copy 2/3 weight from param.weight; zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}}; float* weight_data = weight_.mutableData(zynqmp::FP32, weight_shape); memset(weight_data, 0, weight_shape.numel() * sizeof(float)); @@ -115,11 +117,9 @@ class GRUPE : public PE { if (hidden_prev) { // TODO(chonwhite): change to pre_out; prev_hidden_.copyFrom(value.pre_output); - prev_hidden_.saveToFile("prev_.txt"); } - mul_pe_.dispatch(); - reset_hidden_.saveToFile("reset_hidden_.txt"); + // reset_hidden_.saveToFile("reset_hidden_.txt"); update_gate_data += stride_update; reset_gate_data += stride_update; @@ -170,6 +170,7 @@ class GRUPE : public PE { zynqmp::Tensor bias_; zynqmp::Tensor weight_; zynqmp::Tensor state_weight_; + zynqmp::Tensor update_gate_; zynqmp::Tensor reset_gate_; zynqmp::Tensor cell_state_; diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp index a8725b51a690e0e134785fcfdb3dd70edeffd441..84ed4f946e1a394cb0fc40d7c156faf534e1f8db 100755 --- a/lite/backends/fpga/KD/pes/pooling_pe.hpp +++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp @@ -66,7 +66,7 @@ class PoolingPE : public PE { param_.poolingArgs = args; use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 && - (k_width > 7 || k_height > 7); + (k_width > 255 || k_height > 255); use_cpu_ = param_.type == AVERAGE; } @@ -76,6 +76,7 @@ class PoolingPE : public PE { input->syncToCPU(); Tensor float_input; + // Tensor float_output; float* image_addr = float_input.mutableData(FP32, input->shape()); float_input.copyFrom(input); float16* data_out = output->data(); @@ -188,7 +189,9 @@ class PoolingPE : public PE { bool dispatch() { if (use_cpu_) { + // cpu_compute(); compute(); + // exit(-1); return true; } param_.input->syncToDevice(); diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp old mode 100644 new mode 100755 index cc89ac943f90cb20062a3d6ef9a46b705193ad04..09755c65a322da8ccab0d57dd2e877712b112361 --- a/lite/backends/fpga/KD/pes/scale_pe.hpp +++ b/lite/backends/fpga/KD/pes/scale_pe.hpp @@ -89,7 +89,6 @@ class ScalePE : public PE { } } } - float* scale_data_float = param_.scale->data(); for (int i = 0; i < repeat; i++) { for (int j = 0; j < length; j++) { diff --git a/lite/backends/fpga/KD/tensor.hpp 
b/lite/backends/fpga/KD/tensor.hpp old mode 100644 new mode 100755 index f1b07d02622fad32e99205667424a4cb3c9fb46d..f247741a02758a3eb0cfa7f6c653d21e4263601d --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -348,9 +348,19 @@ class Tensor { if (placeHolder_ == nullptr) { return; } + std::cout << scale()[0] << " , " << scale()[1] << std::endl; } - void printScale(std::string type) { printScale(); } + void printScale(std::string type) { + std::cout << type << " : " + << std::to_string(shape_->num()) + "_" + + std::to_string(shape_->channel()) + "_" + + std::to_string(shape_->height()) + "_" + + std::to_string(shape_->width()) + << std::endl; + std::cout << type << " \n"; + printScale(); + } std::string dimsFileName() { return std::to_string(shape_->num()) + "_" + @@ -378,6 +388,7 @@ class Tensor { static int counter = 0; std::string npath = std::to_string(counter) + "_" + path; counter++; + std::cout << "======== saving file:" << npath << " ============\n"; save_file_with_name(npath); } diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h old mode 100644 new mode 100755 index 311fc8a98400e5a6916ba1b9c8de1e6e0bcec4c0..49aded3d7d7db6d293e13298d98c2f3b165f411f --- a/lite/backends/fpga/lite_tensor.h +++ b/lite/backends/fpga/lite_tensor.h @@ -165,6 +165,9 @@ class TensorLite { TargetType target() const { return target_; } + // template + // TensorLite Slice(int64_t begin, int64_t end) const; + zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; } friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) { @@ -254,6 +257,7 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const { int64_t base = numel() / dims_[0]; TensorLite dst; + dst.target_ = target_; auto dst_dims = dims_; dst_dims[0] = end - begin; diff --git a/lite/core/kernel.h b/lite/core/kernel.h old mode 100644 new mode 100755 diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc old mode 100644 new mode 100755 index ae74bd8d4d5647139a13509dfda0bb2b41ecc5c7..17a327f2535a88d943dd36e8b5f4f5d2c8f629cf --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -101,7 +101,6 @@ void TypeTargetTransformPass::AddIoCopyInst( auto io_copy_output_name = string_format("%s/target_trans", in->AsArg().name.c_str()); // string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id()); - if (copied_nodes->count(in->AsArg().name)) { // Remove the old link RemoveDirectedLink(in, inst_node); diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h old mode 100644 new mode 100755 diff --git a/lite/core/program.cc b/lite/core/program.cc index b0c61bf00ed29e2fa71072b64f11f6ba30f77691..2c90a12b7709323468ed21ab244e3829b62f2ebb 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -138,11 +138,16 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { void RuntimeProgram::Run() { for (auto& inst : instructions_) { std::string op_type = inst.op()->op_info()->Type(); + +#ifndef LITE_WITH_FPGA if (op_type == "feed" || op_type == "fetch") continue; +#endif inst.Run(); #ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE - LITE_PRECISION_PROFILE(inst) +#ifndef LITE_WITH_FPGA +// LITE_PRECISION_PROFILE(inst) +#endif #endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PROFILE } diff --git a/lite/core/tensor.h b/lite/core/tensor.h old mode 100644 new mode 100755 diff --git a/lite/gen_code/paddle_infer.h b/lite/gen_code/paddle_infer.h index 
e01ffc25e29ca94166e8fe12b0643ae9e914001d..2449e1e5d3fb721a39760e78a0417bf9491d8cef 100644 --- a/lite/gen_code/paddle_infer.h +++ b/lite/gen_code/paddle_infer.h @@ -46,7 +46,7 @@ class Tensor { */ class PaddlePredictor { public: - void Init(); + void Init() {} std::unique_ptr GetTensor(const std::string &id) const; std::unique_ptr GetMutableTensor(const std::string &id); diff --git a/lite/kernels/arm/cast_compute.cc b/lite/kernels/arm/cast_compute.cc old mode 100644 new mode 100755 index 266ae1fc916af4303aca274c39b9b4923fdbb154..0b92317ac51b0af24443ec24436f6a483198dbbc --- a/lite/kernels/arm/cast_compute.cc +++ b/lite/kernels/arm/cast_compute.cc @@ -62,6 +62,10 @@ void CastCompute::Run() { int32_t* out_data = param.Out->mutable_data(); std::transform( x_data_begin, x_data_end, out_data, TransOp); + } else if (param.in_dtype == 3 && param.out_dtype == 5) { + const auto* x_data = param.X->data(); + auto* o_data = param.Out->mutable_data(); + memcpy(o_data, x_data, sizeof(float) * param.X->numel()); } else { LOG(FATAL) << "other has not been implemented"; } diff --git a/lite/kernels/arm/fill_constant_compute.cc b/lite/kernels/arm/fill_constant_compute.cc old mode 100644 new mode 100755 index ad475538576b9cc73a43bac49cba1a6cf1c73edb..badd3f90288e0885aacdef6c53fbb6cc9b73ea7d --- a/lite/kernels/arm/fill_constant_compute.cc +++ b/lite/kernels/arm/fill_constant_compute.cc @@ -60,25 +60,10 @@ class FillConstantCompute : public KernelLite { auto& param = *param_.get_mutable(); auto& context = ctx_->As(); - if (param.dtype == static_cast(lite::core::FluidType::FP32)) { - auto data = param.Out->template mutable_data(); - for (int i = 0; i < param.Out->numel(); i++) { - data[i] = param.value; - } - } else if (param.dtype == - static_cast(lite::core::FluidType::INT32)) { - auto data = param.Out->template mutable_data(); - for (int i = 0; i < param.Out->numel(); i++) { - data[i] = param.value; - } - } else if (param.dtype == - static_cast(lite::core::FluidType::INT8)) { - auto data = param.Out->template mutable_data(); - for (int i = 0; i < param.Out->numel(); i++) { - data[i] = param.value; - } - } else { - LOG(FATAL) << "not supported dtype " << param.dtype; + // auto data = param.Out->template mutable_data(); + auto data = param.Out->template mutable_data(); + for (int i = 0; i < param.Out->numel(); i++) { + data[i] = param.value; } } @@ -94,32 +79,38 @@ class FillConstantBatchLikeCompute auto& param = *param_.get_mutable(); auto& context = ctx_->As(); - if (param.input->lod().size() && param.input_dim_idx == 0) { - auto odims = param.out->dims(); - odims[param.output_dim_idx] = param.input->lod().back().size() - 1; - param.out->Resize(odims); + // auto data = param.out->template mutable_data(); + auto data = param.out->template mutable_data(); + for (int i = 0; i < param.out->numel(); i++) { + data[i] = param.value; } - if (param.dtype == static_cast(lite::core::FluidType::FP32)) { - auto data = param.out->template mutable_data(); - for (int i = 0; i < param.out->numel(); i++) { - data[i] = param.value; - } - } else if (param.dtype == - static_cast(lite::core::FluidType::INT32)) { - auto data = param.out->template mutable_data(); - for (int i = 0; i < param.out->numel(); i++) { - data[i] = param.value; - } - } else if (param.dtype == - static_cast(lite::core::FluidType::INT8)) { - auto data = param.out->template mutable_data(); - for (int i = 0; i < param.out->numel(); i++) { - data[i] = param.value; - } - } else { - LOG(FATAL) << "not supported dtype " << param.dtype; - } + // if 
(param.input->lod().size() && param.input_dim_idx == 0) { + // auto odims = param.out->dims(); + // odims[param.output_dim_idx] = param.input->lod().back().size() - 1; + // param.out->Resize(odims); + // } + + // if (param.dtype == static_cast(lite::core::FluidType::FP32)) { + // auto data = param.out->template mutable_data(); + // for (int i = 0; i < param.out->numel(); i++) { + // data[i] = param.value; + // } + // } else if (param.dtype == + // static_cast(lite::core::FluidType::INT32)) { + // auto data = param.out->template mutable_data(); + // for (int i = 0; i < param.out->numel(); i++) { + // data[i] = param.value; + // } + // } else if (param.dtype == + // static_cast(lite::core::FluidType::INT8)) { + // auto data = param.out->template mutable_data(); + // for (int i = 0; i < param.out->numel(); i++) { + // data[i] = param.value; + // } + // } else { + // LOG(FATAL) << "not supported dtype " << param.dtype; + // } } virtual ~FillConstantBatchLikeCompute() = default; @@ -144,6 +135,7 @@ REGISTER_LITE_KERNEL(fill_constant, {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); + REGISTER_LITE_KERNEL(fill_constant_batch_size_like, kARM, kAny, @@ -153,3 +145,4 @@ REGISTER_LITE_KERNEL(fill_constant_batch_size_like, .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); + diff --git a/lite/kernels/arm/layout_compute.cc b/lite/kernels/arm/layout_compute.cc index bc52c5ea3ee452033cfd3c7d559cb88b21ca48f6..221f081feb0dc9873a183d5df215342da7fef6b7 100644 --- a/lite/kernels/arm/layout_compute.cc +++ b/lite/kernels/arm/layout_compute.cc @@ -59,6 +59,8 @@ namespace arm { template <> void NCHWToNHWCCompute::Run() { NCHWTONHWC(float); + // auto& param = this->template Param(); + // param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); } template <> @@ -69,6 +71,9 @@ void NCHWToNHWCCompute::Run() { template <> void NHWCToNCHWCompute::Run() { NHWCTONCHW(float); + // auto& param = this->template Param(); + // param.y->mutable_data(); + // param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); } template <> diff --git a/lite/kernels/arm/lookup_table_compute.cc b/lite/kernels/arm/lookup_table_compute.cc old mode 100644 new mode 100755 index ba58b378f4dda22fd78ce76b80bdbca8d8f284a3..fa7e2c0c3ae4580f5d19e82f7c48c74db3058847 --- a/lite/kernels/arm/lookup_table_compute.cc +++ b/lite/kernels/arm/lookup_table_compute.cc @@ -28,6 +28,7 @@ namespace arm { void LookupTableCompute::Run() { auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); // inputs auto w = param.W; auto ids = param.Ids; @@ -36,7 +37,7 @@ void LookupTableCompute::Run() { auto table_dim = w->dims(); int64_t ids_numel = ids->numel(); - auto ids_data = ids->data(); + auto ids_data = ids->data(); int64_t row_number = table_dim[0]; int64_t row_width = table_dim[1]; @@ -75,14 +76,3 @@ REGISTER_LITE_KERNEL(lookup_table, .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); - -REGISTER_LITE_KERNEL(lookup_table_v2, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::LookupTableCompute, - def) - .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/fpga/CMakeLists.txt 
b/lite/kernels/fpga/CMakeLists.txt index 7c47e72872ecae6216288c20fa1a6ae30fac65bd..e71e5255ca6daa0c86c7f1b1c3d9174df66cac25 100755 --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -7,7 +7,9 @@ set(fpga_deps fpga_target_wrapper kernel_fpga) # add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps}) # add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps}) -# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps}) + +add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps}) + add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps}) # add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps}) add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps}) @@ -16,9 +18,11 @@ add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps}) add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps}) + # add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps}) add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps}) add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps}) + # add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps}) add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps}) add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps}) diff --git a/lite/kernels/fpga/calib_compute.h b/lite/kernels/fpga/calib_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/feed_compute.cc b/lite/kernels/fpga/feed_compute.cc index 7670bf0007def88c27c12ea54c569a7fcf263693..79329e99a3e5e812dca487c17452f3f5d1e96449 100755 --- a/lite/kernels/fpga/feed_compute.cc +++ b/lite/kernels/fpga/feed_compute.cc @@ -67,3 +67,13 @@ REGISTER_LITE_KERNEL( PRECISION(kFP16), DATALAYOUT(kNHWC))}) .Finalize(); + +REGISTER_LITE_KERNEL(feed, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::FeedCompute, + def_host) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); diff --git a/lite/kernels/fpga/fetch_compute.cc b/lite/kernels/fpga/fetch_compute.cc old mode 100644 new mode 100755 index 9b5f3f60232bb8527f823395693cf3b3851bc04e..2d296f4d4a89b1fd86e5b2330d3caf44fbad0903 --- a/lite/kernels/fpga/fetch_compute.cc +++ b/lite/kernels/fpga/fetch_compute.cc @@ -43,8 +43,14 @@ void FetchCompute::PrepareForRun() { } void FetchCompute::Run() { - pe_.dispatch(); auto& param = this->Param(); + auto fetch_list = param.fetch_list; + if (fetch_list->size() <= static_cast(param.col)) { + fetch_list->resize(param.col + 1); + } + Tensor& out = param.fetch_list->at(param.col); + out.Resize(param.input->dims()); + pe_.dispatch(); #ifdef FPGA_PRINT_TENSOR zynqmp::OutputParam& fetch_param = pe_.param(); @@ -67,10 +73,7 @@ REGISTER_LITE_KERNEL(fetch, {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kHost), - PRECISION(kAny), - DATALAYOUT(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); REGISTER_LITE_KERNEL(fetch, @@ -79,12 +82,6 @@ REGISTER_LITE_KERNEL(fetch, kNHWC, 
paddle::lite::kernels::fpga::FetchCompute, host_host) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kHost), - PRECISION(kAny), - DATALAYOUT(kAny))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kHost), - PRECISION(kAny), - DATALAYOUT(kAny))}) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); diff --git a/lite/kernels/fpga/gru_compute.cc b/lite/kernels/fpga/gru_compute.cc index 25fdcb505bcc8221da74b8cc87dc4fbec86b6190..a157382a6fb7ac39e4b102f5ac65dea337ed0f13 100755 --- a/lite/kernels/fpga/gru_compute.cc +++ b/lite/kernels/fpga/gru_compute.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include + #include #include #include @@ -83,6 +84,7 @@ void GRUCompute::PrepareForRun() { void GRUCompute::Run() { auto& param = this->Param(); param.hidden->mutable_data(); + // inputs auto input = param.input; auto h0 = param.h0; @@ -130,6 +132,7 @@ void GRUCompute::Run() { // //3. gru_value.prev_out_value = ordered_h0.mutable_data(); gru_tensors.pre_output = ordered_h0.ZynqTensor(); + } else { gru_value.prev_out_value = nullptr; gru_tensors.pre_output = nullptr; @@ -169,6 +172,7 @@ void GRUCompute::Run() { float* hidden_data = hidden_out.mutableData(zynqmp::FP32, float_input_shape); + gru_tensors.gate = &float_input; gru_tensors.output = &hidden_out; @@ -187,11 +191,6 @@ void GRUCompute::Run() { *(batch_hidden->mutable_lod()) = batch_gate->lod(); batch_hidden->mutable_data(); to_seq(*batch_hidden, hidden); - - save_tensor(const_cast(input), "_input.txt"); - save_tensor(hidden, "_gru.txt"); - - exit(-1); } } // namespace fpga diff --git a/lite/kernels/fpga/gru_compute.h b/lite/kernels/fpga/gru_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc old mode 100644 new mode 100755 index 10a0e3116b920a2f408606ef211f408ed2279f60..57a76dee97ca889cd645a2c8f81b5a2354f9b11f --- a/lite/kernels/fpga/io_copy_compute.cc +++ b/lite/kernels/fpga/io_copy_compute.cc @@ -119,7 +119,81 @@ class IoCopyFpgaToHostCompute auto out_lod = param.y->mutable_lod(); *out_lod = param.x->lod(); } + std::string doc() const override { return "Copy IO from FPGA to HOST"; } +}; + +void hwc_to_chw(float* chw_data, + float* hwc_data, + int num, + int channel, + int height, + int width) { + int chw = channel * height * width; + int wc = width * channel; + int wh = width * height; + int index = 0; + for (int n = 0; n < num; n++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + chw_data[n * chw + c * wh + h * width + w] = hwc_data[index]; + index++; + } + } + } + } +} + +class IoCopyFpgaToHostCHWCompute + : public KernelLite { + public: + void Run() override { + auto& param = Param(); + CHECK(param.x->target() == TARGET(kHost) || + param.x->target() == TARGET(kFPGA)); + + Tensor hwc; + hwc.Resize(param.y->dims()); + float* hwc_data = hwc.mutable_data(); + + float* chw_data = param.y->mutable_data(); + param.y->ZynqTensor()->setDataType(zynqmp::FP32); + param.x->ZynqTensor()->syncToDevice(); + if (param.x->ZynqTensor()->aligned() && + param.x->ZynqTensor()->shape().shouldAlign()) { + zynqmp::Tensor tempTensor; + tempTensor.mutableData(zynqmp::FP16, + param.x->ZynqTensor()->shape()); + tempTensor.copyFrom(param.x->ZynqTensor()); + tempTensor.setAligned(true); + tempTensor.unalignImage(); + hwc.ZynqTensor()->copyFrom(&tempTensor); + } 
else { + hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor()); + } + + int num = 1; + int channel = 1; + int height = 1; + int width = 1; + + auto dims = param.y->ZynqTensor()->shape(); + + hwc_to_chw(chw_data, + hwc_data, + dims.num(), + dims.channel(), + dims.height(), + dims.width()); + + param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); + param.y->ZynqTensor()->flush(); + auto out_lod = param.y->mutable_lod(); + *out_lod = param.x->lod(); + // param.x->ZynqTensor()->saveToFile("io_x", true); + // param.y->ZynqTensor()->saveToFile("io_y", true); + } std::string doc() const override { return "Copy IO from FPGA to HOST"; } }; @@ -170,7 +244,7 @@ REGISTER_LITE_KERNEL(io_copy, PRECISION(kFP16), DATALAYOUT(kNHWC))}) .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kARM), + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNHWC))}) .Finalize(); @@ -179,8 +253,8 @@ REGISTER_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, - paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute, - device_to_host_22) + paddle::lite::kernels::fpga::IoCopyFpgaToHostCHWCompute, + device_to_host_chw) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), diff --git a/lite/kernels/fpga/multiclass_nms_compute.cc b/lite/kernels/fpga/multiclass_nms_compute.cc index cee5e16205370df7faabc6f37d57fe360e8a9e67..4834054df6371a9faaa17bd17b53a29b999ddf03 100644 --- a/lite/kernels/fpga/multiclass_nms_compute.cc +++ b/lite/kernels/fpga/multiclass_nms_compute.cc @@ -384,6 +384,7 @@ void MulticlassNmsCompute::Run() { scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out); outs->ZynqTensor()->copyFrom(out.ZynqTensor()); } + outs->Resize({static_cast(e - s), out_dim}); } } LoD lod; diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc index afd14ccb4b4a9a4f1e93e1e38840035fb18186bb..c889df17cb72a6d3e8ab02efc729ecc93fb38a5f 100644 --- a/lite/kernels/fpga/prior_box_compute.cc +++ b/lite/kernels/fpga/prior_box_compute.cc @@ -78,6 +78,7 @@ void PriorBoxCompute::PrepareForRun() { param.boxes->mutable_data(); param.variances->mutable_data(); + zynqmp::PriorBoxParam& priobox_param = pe_.param(); priobox_param.input = param.input->ZynqTensor(); priobox_param.image = param.image->ZynqTensor(); diff --git a/lite/kernels/fpga/reshape_compute.cc b/lite/kernels/fpga/reshape_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt old mode 100644 new mode 100755 index 428cc213ce63b8d24193a44f23d61fea78f63d6a..c6f2721d80b6fd584ce96e817476372e37b17ed8 --- a/lite/kernels/host/CMakeLists.txt +++ b/lite/kernels/host/CMakeLists.txt @@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps}) add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op) add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(one_hot_compute_host Host basic SRCS one_hot_compute.cc DEPS ${lite_kernel_deps}) #lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any) #lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any) diff --git 
a/lite/kernels/host/multiclass_nms_compute.cc b/lite/kernels/host/multiclass_nms_compute.cc
index 6f6079ef88fd9e61dbacb35c0ca8bdac536288a9..82a694b363b4cc219c48c294fb7545b26492f973 100644
--- a/lite/kernels/host/multiclass_nms_compute.cc
+++ b/lite/kernels/host/multiclass_nms_compute.cc
@@ -392,7 +392,13 @@ REGISTER_LITE_KERNEL(multiclass_nms,
                      kNCHW,
                      paddle::lite::kernels::host::MulticlassNmsCompute,
                      def)
-    .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
-    .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("BBoxes",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
+    .BindInput("Scores",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
     .Finalize();
diff --git a/lite/kernels/host/one_hot_compute.cc b/lite/kernels/host/one_hot_compute.cc
new file mode 100755
index 0000000000000000000000000000000000000000..e0af6f5173f367bb9b2e06de10499ee36806379c
--- /dev/null
+++ b/lite/kernels/host/one_hot_compute.cc
@@ -0,0 +1,81 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "lite/backends/fpga/KD/debugger.hpp"
+#include "lite/kernels/host/one_hot_compute.h"
+#include "lite/utils/paddle_enforce.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace host {
+
+void OneHotCompute::Run() {
+  auto& param = Param<operators::OneHotParam>();
+  param.Out->mutable_data<float>();
+  int depth = param.depth;
+  if (param.depth_tensor) {
+    auto* depth_tensor = param.depth_tensor;
+    auto* depth_data = depth_tensor->data<int32_t>();
+    depth = depth_data[0];
+    auto in_dims = param.X->dims();
+    DDim out_dims(in_dims);
+    out_dims[out_dims.size() - 1] = depth;
+    param.Out->Resize(out_dims);
+  }
+
+  auto* p_in_data = param.X->data<int64_t>();
+  auto numel = param.X->numel();
+  auto* p_out_data = param.Out->mutable_data<float>();
+
+  for (int i = 0; i < param.Out->numel(); ++i) {
+    p_out_data[i] = 0;
+  }
+
+  // Index with the local depth: param.depth is stale when the depth was
+  // overridden by depth_tensor above.
+  if (param.allow_out_of_range) {
+    for (int i = 0; i < numel; ++i) {
+      if (p_in_data[i] >= 0 && p_in_data[i] < depth) {
+        *(p_out_data + i * depth + (int)(p_in_data[i])) = 1.0;  // NOLINT
+      }
+    }
+  } else {
+    for (int i = 0; i < numel; ++i) {
+      PADDLE_ENFORCE_GE(
+          p_in_data[i], 0, "Illegal index value, should be at least 0.");
+      PADDLE_ENFORCE_LT(p_in_data[i],
+                        depth,
+                        "Illegal index value, should be less than depth (%d).",
+                        depth);
+      *(p_out_data + i * depth + (int)(p_in_data[i])) = 1.0;  // NOLINT
+    }
+  }
+}
+}  // namespace host
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(one_hot,
+                     kHost,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::host::OneHotCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
diff --git a/lite/kernels/host/one_hot_compute.h b/lite/kernels/host/one_hot_compute.h
new file mode 100755
index
0000000000000000000000000000000000000000..3a6c47fee31bc28f130c3de782c0c912c9f4b769 --- /dev/null +++ b/lite/kernels/host/one_hot_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class OneHotCompute + : public KernelLite { + public: + void Run() override; + + virtual ~OneHotCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/reshape_compute.cc b/lite/kernels/host/reshape_compute.cc index 02f99787e60e73d91ca8f65cb42dcd4c56e7212b..7a826ed32b02a85860038482d8ca55c5db32a9bf 100644 --- a/lite/kernels/host/reshape_compute.cc +++ b/lite/kernels/host/reshape_compute.cc @@ -46,19 +46,43 @@ REGISTER_LITE_KERNEL(reshape, paddle::lite::kernels::host::ReshapeCompute, def) .BindInput("X", - {LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindInput("ShapeTensor", - {LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindInput("Shape", - {LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindOutput("Out", - {LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .Finalize(); +// REGISTER_LITE_KERNEL(reshape, +// kFPGA, +// kFP16, +// kNHWC, +// paddle::lite::kernels::host::ReshapeCompute, +// def) +// .BindInput("X", +// {LiteType::GetTensorTy( +// TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) +// .BindInput("ShapeTensor", +// {LiteType::GetTensorTy( +// TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))}) +// .BindInput("Shape", +// {LiteType::GetTensorTy( +// TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))}) +// .BindOutput("Out", +// {LiteType::GetTensorTy( +// TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))}) +// .Finalize(); + REGISTER_LITE_KERNEL(reshape2, kHost, kAny, diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt old mode 100644 new mode 100755 index 190cf7194c19a47f377755a9e9b61d890bc1a262..a0c631e517afc8b3cdf9e97b00e327c477b6d026 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -136,6 +136,8 @@ add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS}) add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS}) add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS}) +add_operator(one_hot basic SRCS one_hot_op.cc DEPS ${op_DEPS}) + if (NOT LITE_WITH_X86) 
     lite_cc_test(test_fc_op SRCS fc_op_test.cc DEPS fc_op memory
diff --git a/lite/operators/one_hot_op.cc b/lite/operators/one_hot_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..023cdc23aeb8329736b7438af2c52cbfa899c75c
--- /dev/null
+++ b/lite/operators/one_hot_op.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/one_hot_op.h"
+#include "lite/core/op_registry.h"
+
+#include "lite/backends/fpga/KD/debugger.hpp"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool OneHotOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.Out);
+  return true;
+}
+
+bool OneHotOp::InferShape() const {
+  CHECK_OR_FALSE(param_.Out);
+  // TODO(Superjomn) Enable data sharing.
+  auto out_dims = param_.X->dims();
+
+  out_dims[out_dims.size() - 1] = param_.depth;
+  param_.Out->Resize(out_dims);
+  return true;
+}
+
+bool OneHotOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
+  param_.X =
+      scope->FindVar(opdesc.Input("X").front())->GetMutable<lite::Tensor>();
+  param_.Out =
+      scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
+
+  if (opdesc.HasInput("depth_tensor")) {
+    auto depth_tensor = opdesc.Input("depth_tensor").front();
+    param_.depth_tensor =
+        scope->FindVar(depth_tensor)->GetMutable<lite::Tensor>();
+  }
+
+  CHECK(param_.X);
+  CHECK(param_.Out);
+  param_.depth = opdesc.GetAttr<int>("depth");
+  param_.dtype = opdesc.GetAttr<int>("dtype");
+
+  if (opdesc.HasAttr("allow_out_of_range")) {
+    param_.allow_out_of_range = opdesc.GetAttr<bool>("allow_out_of_range");
+  }
+
+  auto out_lod = param_.Out->mutable_lod();
+  *out_lod = param_.X->lod();
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(one_hot, paddle::lite::operators::OneHotOp);
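For context, AttachImpl above consumes an op descriptor carrying the X/Out variables, an optional depth_tensor input, and the depth/dtype/allow_out_of_range attributes. A hedged sketch of such a descriptor follows; the variable names, the header path, and the dtype value are illustrative assumptions, not taken from this patch:

    #include "lite/model_parser/cpp/op_desc.h"  // assumed location of cpp::OpDesc

    // Hypothetical descriptor for a one_hot op, mirroring what AttachImpl
    // reads; "label_indices"/"label_one_hot" are made-up variable names.
    paddle::lite::cpp::OpDesc MakeOneHotDesc() {
      paddle::lite::cpp::OpDesc desc;
      desc.SetType("one_hot");
      desc.SetInput("X", {"label_indices"});
      desc.SetOutput("Out", {"label_one_hot"});
      desc.SetAttr<int>("depth", 4);
      desc.SetAttr<int>("dtype", 5);  // assumption: 5 maps to FP32 in the proto enum
      desc.SetAttr<bool>("allow_out_of_range", false);
      return desc;
    }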
diff --git a/lite/operators/one_hot_op.h b/lite/operators/one_hot_op.h
new file mode 100755
index 0000000000000000000000000000000000000000..4a0613952520279699a0f4a56d002483de325241
--- /dev/null
+++ b/lite/operators/one_hot_op.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class OneHotOp : public OpLite {
+ public:
+  OneHotOp() {}
+  explicit OneHotOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShape() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "one_hot"; }
+
+ private:
+  mutable OneHotParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h
old mode 100644
new mode 100755
index cfee6a0391d81992069d70e9ac37e0e6594bd305..4f27e7a0d7b5bcfdbef5463d9cb352813f651bbf
--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
@@ -1130,7 +1130,15 @@ struct GridSamplerParam {
   lite::Tensor* out{};
   lite::Tensor* grid{};
 };
-
-}  // namespace operators
-}  // namespace lite
-}  // namespace paddle
+/// ----------------------- one_hot operator -----------------------
+struct OneHotParam {
+  lite::Tensor* X{};
+  lite::Tensor* depth_tensor{nullptr};
+  lite::Tensor* Out{};
+  int depth{-1};
+  int dtype{};
+  bool allow_out_of_range{false};
+};
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tools/build.sh b/lite/tools/build.sh
index e1610b60d3b1b104699ab175bca3bb3cf81bd40b..6121186e7c983145f2f9f450f6a23ea1957bb496 100755
--- a/lite/tools/build.sh
+++ b/lite/tools/build.sh
@@ -13,7 +13,7 @@ readonly NUM_PROC=${LITE_BUILD_THREADS:-4}
 
 # global variables
-BUILD_EXTRA=OFF
+BUILD_EXTRA=ON
 BUILD_JAVA=ON
 BUILD_PYTHON=OFF
 BUILD_DIR=$(pwd)
diff --git a/lite/tools/build_fpga.sh b/lite/tools/build_fpga.sh
index f8c186e92fc3ba23e5e09b6a139202d028e58fc6..ab10798fe7da34ddd88b2fab0bcc0e5f4b8ce233 100755
--- a/lite/tools/build_fpga.sh
+++ b/lite/tools/build_fpga.sh
@@ -2,12 +2,16 @@
 build_dir=build_fpga
 mkdir -p ${build_dir}
-cd ${build_dir}
 
-GEN_CODE_PATH_PREFIX=lite/gen_code
-mkdir -p ./${GEN_CODE_PATH_PREFIX}
-touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
+root_dir=$(pwd)
+build_dir=${build_dir}
+
+# 1. Prepare the gen_code file inside the build directory.
+GEN_CODE_PATH_PREFIX=${build_dir}/lite/gen_code
+mkdir -p ${GEN_CODE_PATH_PREFIX}
+touch ${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
 
+cd ${build_dir}
 cmake .. \
   -DWITH_GPU=OFF \
   -DWITH_MKL=OFF \
@@ -19,8 +23,9 @@ cmake .. \
   -DLITE_WITH_OPENMP=ON \
   -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
   -DWITH_TESTING=OFF \
-  -DARM_TARGET_OS=armlinux
-
-make -j8
+  -DARM_TARGET_OS=armlinux \
+  -DLITE_BUILD_EXTRA=ON \
+  -DLITE_WITH_PROFILE=OFF
+make -j42
 cd -
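Taken together, OneHotOp::InferShape and the kernel's depth_tensor override implement a simple shape rule: Out keeps every leading dimension of X and replaces the trailing one with depth. A minimal standalone sketch (plain C++; the dims are illustrative):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Mirror of the shape rule in OneHotOp::InferShape: the trailing
    // dimension of X is replaced by `depth`, all others are kept.
    std::vector<int64_t> InferOneHotShape(std::vector<int64_t> x_dims,
                                          int depth) {
      x_dims[x_dims.size() - 1] = depth;
      return x_dims;
    }

    int main() {
      // X of shape [6, 1] with depth 4 -> Out of shape [6, 4].
      for (int64_t d : InferOneHotShape({6, 1}, 4)) {
        std::cout << d << " ";  // prints: 6 4
      }
      std::cout << "\n";
    }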