diff --git a/CMakeLists.txt b/CMakeLists.txt
old mode 100644
new mode 100755
index e3f7a211d70920aa74765b976af6939d55a328ab..3616823985bffb9d53615a031759c701d4b2ff09
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,9 @@ if (WITH_PADDLE_MOBILE)
   return()
 endif(WITH_PADDLE_MOBILE)
 
+# set(CMAKE_BUILD_TYPE DEBUG)
+
+
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 set(CMAKE_CXX_STANDARD 11)
diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt
index 70f483822ac484576fe6934c0a30e85593e1e93a..8ef2257f17465be8e6ac92a842862ac68e45f765 100755
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -223,6 +223,25 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
       CL_DEPS ${opencl_kernels}
       FPGA_DEPS ${fpga_kernels})
 
+  lite_cc_test(test_ssd_fpga SRCS test_ssd_fpga.cc
+     DEPS ${lite_model_test_DEPS}
+     CL_DEPS ${opencl_kernels}
+     FPGA_DEPS ${fpga_kernels})
+
+  lite_cc_test(test_inceptionv3_fpga SRCS inceptionv3_test_fpga.cc
+     DEPS ${lite_model_test_DEPS}
+     CL_DEPS ${opencl_kernels}
+     FPGA_DEPS ${fpga_kernels})
+
+  lite_cc_test(test_inceptionv4 SRCS inceptionv4_test.cc
+     DEPS ${lite_model_test_DEPS}
+     CL_DEPS ${opencl_kernels}
+     ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
+          --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
+  add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
+  lite_cc_test(test_ocr_attention_fpga SRCS ocr_attention_test_fpga.cc
+     DEPS ${lite_model_test_DEPS})
+
   # lite_cc_test(model_run_test_image SRCS model_run_test_image.cc
   #    DEPS ${lite_model_test_DEPS}
   #    CL_DEPS ${opencl_kernels}
diff --git a/lite/api/inceptionv3_test_fpga.cc b/lite/api/inceptionv3_test_fpga.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca3807124e69197bb9c6c62385a7d7e4f30449c7
--- /dev/null
+++ b/lite/api/inceptionv3_test_fpga.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <vector>
+#include "lite/api/cxx_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+
+#ifdef LITE_WITH_FPGA
+TEST(InceptionV3, test) {
+  lite::Predictor predictor;
+
+  std::vector<Place> valid_places({
+      Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+      Place{TARGET(kHost), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kFloat)},
+  });
+
+  predictor.Build("",
+                  FLAGS_model_dir + "/model",
+                  FLAGS_model_dir + "/params",
+                  valid_places);
+
+  auto* input_tensor = predictor.GetInput(0);
+  input_tensor->Resize(DDim(std::vector<int64_t>({1, 3, 224, 224})));
+  auto* data = input_tensor->mutable_data<float>();
+  auto item_size = input_tensor->dims().production();
+  for (int i = 0; i < item_size; i++) {
+    data[i] = 1;
+  }
+
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    predictor.Run();
+  }
+
+  auto start = GetCurrentUS();
+  for (int i = 0; i < 2; ++i) {
+    predictor.Run();
+  }
+
+  LOG(INFO) << "================== Speed Report ===================";
+}
+#endif
+
+}  // namespace lite
+}  // namespace paddle
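The timing loop above records `start` but the added test never prints the measurement. A minimal sketch of the missing report line, assuming `GetCurrentUS()` from `lite/api/test_helper.h` and the two timed iterations used above (hypothetical completion, not part of the patch):

    // Hypothetical completion: report average latency over the two timed runs.
    LOG(INFO) << "Model: " << FLAGS_model_dir << ", avg latency: "
              << (GetCurrentUS() - start) / 2.0 / 1000.0 << " ms over 2 runs";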
diff --git a/lite/api/ocr_attention_test_fpga.cc b/lite/api/ocr_attention_test_fpga.cc
new file mode 100755
index 0000000000000000000000000000000000000000..326de883d1625f7196426094cc4ccec970f8a399
--- /dev/null
+++ b/lite/api/ocr_attention_test_fpga.cc
@@ -0,0 +1,179 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <fstream>
+#include <string>
+#include <vector>
+#include "lite/api/cxx_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/core/op_registry.h"
+
+DEFINE_string(input_file, "", "input_file");
+
+namespace paddle {
+namespace lite {
+
+void read_from_file(const std::string& path, float* data, int num) {
+  std::ifstream file_stream;
+  file_stream.open(path);
+  if (!file_stream) {
+    exit(-1);
+    return;
+  }
+
+  for (int i = 0; i < num; ++i) {
+    float value = 0;
+    file_stream >> value;
+    data[i] = value;
+  }
+}
+
+void chw_to_hwc(float* src, float* dst, int channel, int height, int width) {
+  int amount_per_row = width * channel;
+  int index = 0;
+  for (int c = 0; c < channel; c++) {
+    for (int h = 0; h < height; h++) {
+      int offset_height = h * amount_per_row;
+      for (int w = 0; w < width; w++) {
+        int dst_index = offset_height + w * channel + c;
+        dst[dst_index] = src[index];
+        index = index + 1;
+      }
+    }
+  }
+}
+
+void TestModel(const std::vector<Place>& valid_places,
+               const Place& preferred_place,
+               bool use_npu = false) {
+  DeviceInfo::Init();
+  DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
+  lite::Predictor predictor;
+
+  // predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places);
+  predictor.Build("", "attention/model", "attention/params", valid_places);
+
+  auto* input_tensor = predictor.GetInput(0);
+  input_tensor->Resize(DDim(std::vector<int64_t>({1, 1, 100, 200})));
+  auto* data = input_tensor->mutable_data<float>();
+  auto item_size = input_tensor->dims().production();
+  for (int i = 0; i < item_size; i++) {
+    data[i] = 1;
+  }
+
+  read_from_file(FLAGS_input_file, data, 100 * 200);
+  //=============================================
+  auto* init_ids = predictor.GetInput(1);
+  init_ids->Resize(DDim(std::vector<int64_t>({1, 1})));
+  auto* data_ids = init_ids->mutable_data<float>();
+  auto ids_size = init_ids->dims().production();
+  for (int i = 0; i < ids_size; i++) {
+    data_ids[i] = 0;
+  }
+  auto lod_ids = init_ids->mutable_lod();
+  std::vector<std::vector<uint64_t>> lod_i{{0, 1}, {0, 1}};
+  *lod_ids = lod_i;
+
+  //=============================================
+  auto* init_scores = predictor.GetInput(2);
+  init_scores->Resize(DDim(std::vector<int64_t>({1, 1})));
+  auto* data_scores = init_scores->mutable_data<float>();
+  auto scores_size = init_scores->dims().production();
+  for (int i = 0; i < scores_size; i++) {
+    data_scores[i] = 0;
+  }
+  auto lod_scores = init_scores->mutable_lod();
+  std::vector<std::vector<uint64_t>> lod_s{{0, 1}, {0, 1}};
+  *lod_scores = lod_s;
+
+  //=============================================
+  auto* position_encoding = predictor.GetInput(3);
+  position_encoding->Resize(DDim(std::vector<int64_t>({1, 33, 10, 23})));
+  auto* position_encoding_data = position_encoding->mutable_data<float>();
+
+  float* temp_data = position_encoding_data;
+
+  for (int i = 0; i < position_encoding->dims().production(); ++i) {
+    temp_data[i] = 0;
+  }
+  int index = 0;
+  for (int i = 0; i < 10; i++) {
+    for (int row = 0; row < 10; row++) {
+      for (int col = 0; col < 23; col++) {
+        if (i == row) {
+          temp_data[index] = 1.0f;
+        } else {
+          temp_data[index] = 0.0f;
+        }
+        index++;
+      }
+    }
+  }
+  for (int i = 0; i < 23; i++) {
+    for (int row = 0; row < 10; row++) {
+      for (int col = 0; col < 23; col++) {
+        if (i == col) {
+          temp_data[index] = 1.0f;
+        } else {
+          temp_data[index] = 0.0f;
+        }
+        index++;
+      }
+    }
+  }
+  // chw_to_hwc(temp_data, position_encoding_data, 33, 10, 23);
+  // delete[] temp_data;
+
+  // read_from_file("position_encoding.data", position_encoding_data, 33 * 10 *
+  // 23);
+  auto start = GetCurrentUS();
+  for (int i = 0; i < 2; ++i) {
+    predictor.Run();
+  }
+
+  std::cout << "================== Speed Report ===================";
+  std::cout << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms in average.";
+
+  auto* out = predictor.GetOutput(0);
+
+  std::string file = "plate_data/" + FLAGS_input_file.substr(9);
+  std::cout << "file:::" << file << std::endl;
+
+  std::ofstream ofs;
+  ofs.open(file);
+  for (int i = 0; i < out->dims().production(); i++) {
+    float value = out->data<float>()[i];
+    ofs << value << std::endl;
+  }
+  ofs.close();
+}
+
+TEST(OcrAttention, test_arm) {
+  std::vector<Place> valid_places({
+      Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+      Place{TARGET(kHost), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kFloat)},
+  });
+  TestModel(valid_places, Place{TARGET(kARM), PRECISION(kFloat)});
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp
index 9b1189c407d6d601bb3e5ba8172b1455f04710fd..cbc65e41e2912df10fca00169cdc64ea832e7d03 100755
--- a/lite/backends/fpga/KD/debugger.hpp
+++ b/lite/backends/fpga/KD/debugger.hpp
@@ -14,6 +14,9 @@
 
 #pragma once
 
+#include <chrono>
+#include <string>
+#include <unordered_map>
 #include <iostream>
 #include <vector>
 
@@ -22,7 +25,7 @@
 namespace paddle {
 namespace lite {
 
-#define FPGA_PRINT_TENSOR
+// #define FPGA_PRINT_TENSOR
 
 class Debugger {
  public:
@@ -37,25 +40,47 @@ class Debugger {
     }
   }
 
+  // Records the start timestamp for `key`; tock() replaces it with the
+  // elapsed time in microseconds.
+  void tick(std::string key) { tick_tock_map[key] = current_us(); }
+
+  void tock(std::string key) {
+    if (tick_tock_map.count(key) > 0) {
+      tick_tock_map[key] = current_us() - tick_tock_map[key];
+    }
+  }
+
+  void setEnable(bool en) { enabled_ = en; }
+
  private:
+  bool enabled_ = false;
+  std::unordered_map<std::string, bool> op_config;
+  std::unordered_map<std::string, float> tick_tock_map;
+
+  static float current_us() {
+    return std::chrono::duration_cast<std::chrono::microseconds>(
+               std::chrono::steady_clock::now().time_since_epoch())
+        .count();
+  }
+
   Debugger() {
     op_config["concat"] = true;
     op_config["pooling"] = true;
     op_config["conv"] = true;
+    op_config["dropout"] = true;
     op_config["dwconv"] = true;
     op_config["ew_add"] = true;
+    op_config["ew_mul"] = true;
     op_config["crop"] = true;
     op_config["feed"] = true;
-    op_config["mul"] = true;
     op_config["fetch"] = true;
+    op_config["fc"] = true;
+    op_config["mul"] = true;
     op_config["boxes"] = true;
     op_config["scores"] = true;
     op_config["nms"] = true;
    op_config["pb_boxes"] = true;
     op_config["pb_variances"] = true;
-    // op_config["fc"] = true;
     op_config["softmax"] = true;
+    op_config["split"] = true;
   }
 };
diff --git a/lite/backends/fpga/KD/dl_engine.hpp b/lite/backends/fpga/KD/dl_engine.hpp
index eddf5ca454cdc9e91f87d6e4f2c8dfc13f35fdc6..fe66c84143fbc05f0b52a11e2e315b7f3db9054c 100755
--- a/lite/backends/fpga/KD/dl_engine.hpp
+++ b/lite/backends/fpga/KD/dl_engine.hpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <stdio.h>
+
 #include "lite/backends/fpga/KD/llapi/filter.h"
 #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
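The new tick()/tock() hooks in debugger.hpp accumulate per-op timings in `tick_tock_map`. A minimal usage sketch (hypothetical call site, not part of the patch), assuming a zynqmp PE member named `pe_`:

    // Hypothetical call site: time one kernel dispatch under a named key.
    Debugger::get_instance().setEnable(true);
    Debugger::get_instance().tick("conv");
    pe_.dispatch();                          // any PE::dispatch() call
    Debugger::get_instance().tock("conv");   // tick_tock_map["conv"] now holds elapsed µs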
diff --git a/lite/backends/opencl/cl_kernel/buffer/sigmoid_kernel.cl b/lite/backends/fpga/KD/io.cpp
similarity index 63%
rename from lite/backends/opencl/cl_kernel/buffer/sigmoid_kernel.cl
rename to lite/backends/fpga/KD/io.cpp
index 615bf892b321ba67043d41f6032caa758d78c16f..868287d937bf73a6e34af736b4f94a29f34036ca 100644
--- a/lite/backends/opencl/cl_kernel/buffer/sigmoid_kernel.cl
+++ b/lite/backends/fpga/KD/io.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <cl_common.h>
+#include "io.hpp"
 
-__kernel void sigmoid(__global const CL_DTYPE* x_data, const int count, __global CL_DTYPE* out_data) {
-  const int index = get_global_id(0);
-  if (index < count) {
-    out_data[index] = 1 / (1 + exp(-x_data[index]));
-  }
-}
+namespace paddle {
+namespace zynqmp {
+
+FpgaIO::FpgaIO() {}
+
+}  // namespace zynqmp
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/io.hpp b/lite/backends/fpga/KD/io.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..874226bcfed506f10e5e092451b3fe291e3fc6c8
--- /dev/null
+++ b/lite/backends/fpga/KD/io.hpp
@@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstddef>
+
+namespace paddle {
+namespace zynqmp {
+
+class FpgaIO {
+ public:
+  static FpgaIO& get_instance() {
+    static FpgaIO s_instance;
+    return s_instance;
+  }
+
+  void allocData(size_t s) { data_ = new float[s]; }
+
+  float* getData() { return data_; }
+
+ private:
+  float* data_ = nullptr;
+
+  FpgaIO();
+};
+}  // namespace zynqmp
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/llapi/bias_scale.cpp b/lite/backends/fpga/KD/llapi/bias_scale.cpp
old mode 100644
new mode 100755
diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp
index da81565cf5ca152a54b6cc1514cb660589428439..e09b9d67d1263278abcd84d6ab9d7e392ee94b48 100755
--- a/lite/backends/fpga/KD/llapi/filter.cpp
+++ b/lite/backends/fpga/KD/llapi/filter.cpp
@@ -240,8 +240,10 @@ int8_t* format_filter(float* data_in,
   for (int n = 0; n < num; n++) {
     float* filter_start = data_in + n * chw;
     int8_t* quantized_start = quantized_data + n * chw;
-    quantize(filter_start, quantized_start, chw, max);
-    filter_max.push_back(1);
+    // float f_max = find_max(filter_start, chw);
+    float f_max = max;
+    quantize(filter_start, quantized_start, chw, f_max);
+    filter_max.push_back(f_max);
   }
 
   int8_t* hwc_data =
diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
index bcbf2b98f487aea3c6516fa6369e70d11be97ffc..f8dc1e69627dd039d130a19f224c14eb04e0be92 100644
--- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
+++ b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
@@ -204,7 +204,7 @@ int get_device_info(const struct DeviceInfo &args) {
 int perform_bypass(const struct BypassArgs &args) {
   int ret = -1;
   int size = args.image.channels * args.image.width * args.image.height;
-  int max_size = 1 << 21;
+  int max_size = 1 << 20;
 
   float times = 1.0 * size / max_size;
   int count = static_cast<int>(times);
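With the cap halved to 2^20 elements, perform_bypass processes large images in chunks. A sketch of the chunking arithmetic implied by the surrounding code (variable names from the patch; remainder handling assumed):

    // Sketch: chunked bypass, assuming the remainder is sent as a final partial pass.
    int size = args.image.channels * args.image.width * args.image.height;
    int max_size = 1 << 20;                   // new per-pass element cap
    int count = size / max_size;              // full 2^20-element passes
    int remainder = size - count * max_size;  // left over for one last pass
    // e.g. a 3 x 1080 x 1920 image (6220800 elements) needs 5 full passes + 1 partial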
diff --git a/lite/backends/fpga/KD/pe_params.hpp b/lite/backends/fpga/KD/pe_params.hpp
index 42ec32957e5884aaae3cc96f46060de114b44ead..222a788d351d9b3dd2cde7c595af898602990ea3 100644
--- a/lite/backends/fpga/KD/pe_params.hpp
+++ b/lite/backends/fpga/KD/pe_params.hpp
@@ -83,26 +83,34 @@ struct ConvParam : PEParam {
   std::vector<int> kernelSize;
   std::vector<int> dilations;
 
-  Tensor* scale() { return scale_; }
+  Tensor* scale() { return &scale_; }
 
-  Tensor* bias() { return bias_; }
+  Tensor* bias() { return &bias_; }
 
   std::vector<BasicConvParam*>& splitParams() { return splitParams_; }
 
+  ~ConvParam() {
+    for (int i = 0; i < splitParams_.size(); i++) {
+      BasicConvParam* basic_param = splitParams_[i];
+      delete basic_param;
+    }
+    splitParams_.clear();
+  }
+
  protected:
   std::vector<BasicConvParam*> splitParams_;
-  Tensor* scale_ = new Tensor();
-  Tensor* bias_ = new Tensor();
+  Tensor scale_;
+  Tensor bias_;
 };
 
 struct DepthwiseConvParam : ConvParam {
  public:
-  Tensor* quantizedFilter() { return quantizedFilter_; }
+  Tensor* quantizedFilter() { return &quantizedFilter_; }
 
   DWconvArgs args;
 
  protected:
-  Tensor* quantizedFilter_ = new Tensor();
+  Tensor quantizedFilter_;
 };
 
 enum PoolingType : int {
@@ -142,7 +150,7 @@ struct ElementwiseAddParam : PEParam {
 
 struct ElementwiseMulParam : PEParam {
  public:
-  Tensor* input_x;
+  Tensor* input_x = nullptr;
   Tensor* input_y = nullptr;
   Tensor* output = nullptr;
 };
@@ -154,13 +162,13 @@ struct FullyConnectedParam : PEParam {
   Tensor* bias = nullptr;
   Tensor* output = nullptr;
 
-  Tensor* quantizedFilter() { return quantizedFilter_; }
+  Tensor* quantizedFilter() { return &quantizedFilter_; }
 
-  Tensor* biasScale() { return biasScale_; }
+  Tensor* biasScale() { return &biasScale_; }
 
  protected:
-  Tensor* quantizedFilter_ = new Tensor();
-  Tensor* biasScale_ = new Tensor();
+  Tensor quantizedFilter_;
+  Tensor biasScale_;
 };
 
 struct SoftmaxParam : PEParam {
@@ -193,10 +201,10 @@ struct NormParam : PEParam {
 };
 
 struct PriorBoxParam : PEParam {
-  Tensor* input;
-  Tensor* image;
-  Tensor* outputBoxes;
-  Tensor* outputVariances;
+  Tensor* input = nullptr;
+  Tensor* image = nullptr;
+  Tensor* outputBoxes = nullptr;
+  Tensor* outputVariances = nullptr;
 
   std::vector<float> minSizes;
   std::vector<float> maxSizes;
@@ -212,10 +220,10 @@ struct PriorBoxParam : PEParam {
 };
 
 struct YoloBoxParam : PEParam {
-  Tensor* input;
-  Tensor* imgSize;
-  Tensor* outputBoxes;
-  Tensor* outputScores;
+  Tensor* input = nullptr;
+  Tensor* imgSize = nullptr;
+  Tensor* outputBoxes = nullptr;
+  Tensor* outputScores = nullptr;
   int downsampleRatio;
   std::vector<int> anchors;
   int classNum;
@@ -229,15 +237,15 @@ struct ScaleParam : PEParam {
   Tensor* scale = nullptr;
   Tensor* bias = nullptr;
 
-  Tensor* alignedScale() { return alignedScale_; }
+  Tensor* alignedScale() { return &alignedScale_; }
 
-  Tensor* alignedBias() { return alignedBias_; }
+  Tensor* alignedBias() { return &alignedBias_; }
 
   ScaleArgs args = {0};
 
  protected:
-  Tensor* alignedScale_ = new Tensor();
-  Tensor* alignedBias_ = new Tensor();
+  Tensor alignedScale_;
+  Tensor alignedBias_;
 };
 
 struct ResizeParam : PEParam {
diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp
index b4eac2c41e138cab19197ccb8ab89681a69ec6fe..4c5da08a4b1f5ae02965b03da8ff95c09a721f5c 100644
--- a/lite/backends/fpga/KD/pes/conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/conv_pe.hpp
@@ -212,6 +212,8 @@ class ConvPE : public PE {
 
   ConvParam& param() { return param_; }
 
+  ~ConvPE() {}
+
  private:
   bool use_cpu_ = false;
   bool split_channel = false;
diff --git a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
index a2b184e383aa600b1279197a115c58309e204a95..01e16a454af8f14c06b7d62fbefe9b29cfef2850 100644
--- a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
+++ b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
@@ -38,7 +38,7 @@ class FullyConnectedPE : public PE {
     Tensor* input = param_.input;
     convParam_.input = param_.input;
     convParam_.output = param_.output;
-    // convParam_.relu = param_.relu;
+    convParam_.activeParam.type = param_.activeParam.type;
     convParam_.groups = 1;
     convParam_.strides = {1, 1};
diff --git a/lite/backends/fpga/KD/pes/output_pe.hpp b/lite/backends/fpga/KD/pes/output_pe.hpp
index 2944691693b135a2d2df7b91ecbe0ef249b015d8..2d02d30fbae12efc372e58c2ad80348356a8f22d 100755
--- a/lite/backends/fpga/KD/pes/output_pe.hpp
+++ b/lite/backends/fpga/KD/pes/output_pe.hpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
 #include "lite/backends/fpga/KD/pe.hpp"
 #include "lite/backends/fpga/KD/pe_params.hpp"
 
@@ -52,6 +53,12 @@ class OutputPE : public PE {
     memcpy(DLEngine::get_instance().out_data,
            output->data<float>(),
            output->shape().numel() * sizeof(float));
+
+    fpga_reset();
+
+    auto max = fpga_get_memory_size_max();
+    std::cout << "PL ===== Max: ===== :: " << max << std::endl;
+
     return true;
   }
diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.cpp b/lite/backends/fpga/KD/pes/prior_box_pe.cpp
index 00dfe1830f6f44cbf6a30708fa5783563470c686..d7d58ee8b7e23de843143b643eda0272c4cfc34b 100644
--- a/lite/backends/fpga/KD/pes/prior_box_pe.cpp
+++ b/lite/backends/fpga/KD/pes/prior_box_pe.cpp
@@ -241,10 +241,13 @@ void PriorBoxPE::compute_prior_box() {
   }
 
   boxes.flush();
-  boxes.syncToCPU();
+  // boxes.syncToCPU();
   variances.flush();
   output_boxes->copyFrom(&boxes);
   output_variances->copyFrom(&variances);
+
+  output_boxes->invalidate();
+  output_variances->invalidate();
 }
 
 void PriorBoxPE::apply() {}
@@ -253,8 +256,9 @@ bool PriorBoxPE::dispatch() {
   if (cachedBoxes_ == nullptr) {
     cachedBoxes_ = new Tensor();
     cachedVariances_ = new Tensor();
-    cachedBoxes_->mutableData<float>(FP32, param_.outputBoxes->shape());
-    cachedVariances_->mutableData<float>(FP32, param_.outputVariances->shape());
+    cachedBoxes_->mutableData<float16>(FP16, param_.outputBoxes->shape());
+    cachedVariances_->mutableData<float16>(FP16,
+                                           param_.outputVariances->shape());
     cachedBoxes_->setDataLocation(CPU);
     cachedVariances_->setDataLocation(CPU);
     compute_prior_box();
diff --git a/lite/backends/fpga/KD/pes/relu_pe.hpp b/lite/backends/fpga/KD/pes/relu_pe.hpp
index 5c125010c27615c545ba274b259f18c775db3d55..dfc70867735b18f10970864888eca88c7f2dc56e 100755
--- a/lite/backends/fpga/KD/pes/relu_pe.hpp
+++ b/lite/backends/fpga/KD/pes/relu_pe.hpp
@@ -23,43 +23,27 @@ class ReluPE : public PE {
  public:
   bool init() {
     Tensor* output = param_.output;
-    output->setAligned(true);
-    output->setDataLocation(Device);
+    output->setAligned(param_.input->aligned());
+    output->setDataLocation(CPU);
     return true;
   }
 
-  void apply() {
-    Tensor* src = param_.input;
-
-    args_.input_data_type = DATA_TYPE_FP16;
-    args_.output_data_type = DATA_TYPE_FP16;
-    args_.input_layout_type = LAYOUT_HWC;
-    args_.output_layout_type = LAYOUT_HWC;
-    args_.image = {.address = src->data<void>(),
-                   .scale_address = src->scale(),
-                   .channels = (uint32_t)src->shape().channel(),
-                   .width = (uint32_t)src->shape().width(),
-                   .height = (uint32_t)src->shape().height(),
-                   .pad_width = 0u,
-                   .pad_height = 0u};
-    args_.output = {
-        .address = param_.output->data<void>(),
-        .scale_address = param_.output->scale(),
-    };
-
-    inplace_.relu_enable = false;
-    inplace_.power_enable = false;
-    inplace_.normalize_enable = false;
-  }
+  void apply() {}
 
   bool dispatch() {
-    inplace_.relu_enable = true;
-    config_inplace(inplace_);
-    param_.input->syncToDevice();
-    param_.output->copyFrom(param_.input);
-    param_.output->invalidate();
-    inplace_.relu_enable = false;
-    config_inplace(inplace_);
+    param_.input->invalidate();
+    int16_t* input_data = param_.input->data<int16_t>();
+    float16* out_data = param_.output->data<float16>();
+    for (int i = 0; i < param_.input->shape().alignedElementCount(); i++) {
+      int16_t v = param_.input->data<int16_t>()[i];
+      if (v > 0) {
+        out_data[i] = input_data[i];
+      } else {
+        out_data[i] = zero;
+      }
+    }
+    param_.output->copyScaleFrom(param_.input);
+    param_.output->flush();
     return true;
   }
 
@@ -67,8 +51,7 @@ class ReluPE : public PE {
  private:
   InputParam param_;
-  BypassArgs args_;
-  InplaceArgs inplace_;
+  float16 zero = float_to_half(0.0f);
 };
 
 }  // namespace zynqmp
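The rewritten dispatch() runs ReLU on the CPU by comparing raw FP16 bit patterns as int16_t: IEEE half-precision keeps the sign in the top bit, so any nonzero negative half reads as a negative int16_t, and `v > 0` matches `half_to_float(v) > 0.0f` for normal values (negative zero and NaN are the edge cases). A minimal sketch of that equivalence, assuming `float_to_half`/`half_to_float` from lite/backends/fpga/KD/float16.hpp:

    // Sketch: the sign-bit trick used by the new ReluPE::dispatch().
    int16_t neg = static_cast<int16_t>(float_to_half(-1.5f));  // sign bit set  -> negative as int16_t
    int16_t pos = static_cast<int16_t>(float_to_half(2.0f));   // sign bit clear -> positive as int16_t
    // (neg > 0) == false and (pos > 0) == true, matching a float-domain ReLU test.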
diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp
index 09755c65a322da8ccab0d57dd2e877712b112361..5ff94edd747fe9f01741baf1efaad288ed32b98d 100755
--- a/lite/backends/fpga/KD/pes/scale_pe.hpp
+++ b/lite/backends/fpga/KD/pes/scale_pe.hpp
@@ -36,6 +36,7 @@ class ScalePE : public PE {
   }
 
   inline int lcm(int a, int b) { return a * b / gcd(a, b); }
+
   bool init() {
     Tensor* output = param_.output;
     output->setAligned(true);
diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp
index 988bc1bb507036de8f13a6c6549c549718bd1256..19f8f3b2500002a1329f433de0474f512931e6e8 100644
--- a/lite/backends/fpga/KD/tensor.hpp
+++ b/lite/backends/fpga/KD/tensor.hpp
@@ -103,12 +103,14 @@ class Tensor {
     return reinterpret_cast<Dtype*>(ptr);
   }
 
+  void releaseData() {
+    released = true;
+    placeHolder_.reset();
+  }
+
   template <typename Dtype>
   Dtype* mutableData(DataType dataType, const Shape& shape) {
-    if (this->shape_ != nullptr) {
-      delete shape_;
-    }
-    this->shape_ = new Shape(shape);
+    this->shape_.reset(new Shape(shape));
     this->dataType_ = dataType;
     return mutableData<Dtype>();
   }
@@ -138,7 +140,7 @@ class Tensor {
 
   DataType dataType() { return this->dataType_; }
 
-  Shape& shape() { return *shape_; }
+  Shape& shape() { return *(shape_.get()); }
 
   bool aligned() { return this->aligned_; }
@@ -247,15 +249,12 @@ class Tensor {
   void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); }
 
   void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) {
-    if (shape_ != nullptr) {
-      delete shape_;
-    }
     this->placeHolder_ = src->placeHolder_;
     this->dataType_ = src->dataType_;
     this->aligned_ = src->aligned_;
     this->dateLocation_ = src->dateLocation_;
     this->offset = offset;
-    shape_ = new Shape(const_cast<Shape&>(shape));
+    shape_.reset(new Shape(shape));
   }
 
   void copyFrom(Tensor* src) {
@@ -284,7 +283,6 @@ class Tensor {
         .address = data<void>(),
         .scale_address = scale(),
     };
     args.output = output;
-    src->syncToDevice();
     size_t aligned_remainder = src->shape().numel() % 16;
     if (aligned_remainder > 0) {
       size_t dtype_size =
@@ -294,12 +292,14 @@ class Tensor {
       fpga_flush(dst, aligned_remainder * dtype_size);
     }
     src->syncToDevice();
-    this->invalidate();
     perform_bypass(args);
     this->invalidate();
   }
 
   void flush() {
+    if (released) {
+      return;
+    }
     size_t memorySize = placeHolder_->memorySize();
     fpga_flush(placeHolder_->data(), memorySize);
   }
@@ -380,7 +380,6 @@ class Tensor {
   }
 
   void save_file_with_name(std::string path) {
-    invalidate();
     std::ofstream ofs;
     ofs.open(path);
     ofs << scale()[0] << " / " << scale()[1] << std::endl;
@@ -389,11 +388,17 @@ class Tensor {
       float value = 0;
       if (dataType_ == FP32) {
         value = data<float>()[i];
-      } else if (dataType_ == FP16) {
+      }
+      if (dataType_ == FP16) {
         value = half_to_float(data<float16>()[i]);
-      } else {
+      }
+
+      if (dataType_ == INT8) {
         value = data<int8_t>()[i];
       }
+      if (dataType_ == INT32) {
+        value = data<int32_t>()[i];
+      }
       ofs << value << std::endl;
     }
     ofs.close();
@@ -451,18 +456,12 @@ class Tensor {
     return os;
   }
 
-  ~Tensor() {
-    if (shape_ != nullptr) {
-      delete shape_;
-      shape_ = nullptr;
-    }
-  }
-
  private:
+  bool released = false;
   int offset = 0;
   float mem_scale_factor_ = 1.0f;
   std::shared_ptr<PlaceHolder> placeHolder_;
-  Shape* shape_ = nullptr;
+  std::shared_ptr<Shape> shape_;
   DataType dataType_ = FP32;
   bool aligned_ = false;
   DataSyncStatus synchedStatus_ = Synched;
diff --git a/lite/backends/fpga/lite_tensor.cc b/lite/backends/fpga/lite_tensor.cc
index 7f1e8d3e17f97315e77532b77bbcfcc8331edd4f..53086404955b2b477532f2e73250005807183c87 100755
--- a/lite/backends/fpga/lite_tensor.cc
+++ b/lite/backends/fpga/lite_tensor.cc
@@ -69,7 +69,7 @@ std::string DDimLite::repr() const {
 }
 
 void TensorLite::ShareDataWith(const TensorLite &other) {
-  buffer_ = other.buffer_;
+  buffer_ = other.buffer_;  // TODO(chonwhite) delete buffer;
   dims_ = other.dims_;
   zynq_tensor_ = other.zynq_tensor_;
   target_ = other.target_;
@@ -79,10 +79,8 @@ void TensorLite::ShareDataWith(const TensorLite &other) {
 }
 
 void *TensorLite::mutable_data(size_t memory_size) {
-  memory_size_ = memory_size;
+  memory_size_ = memory_size;  // TODO(chonwhite) delete buffer;
   buffer_->ResetLazy(target_, memory_size_);
-  // throw -1;
-  std::cout << memory_size << std::endl;
   return buffer_->data();
 }
 
@@ -95,13 +93,20 @@ void TensorLite::CopyDataFrom(const TensorLite &other) {
   dims_ = other.dims_;
   target_ = other.target_;
   lod_ = other.lod_;
-  auto dt = zynq_tensor_->dataType();
-  auto shape = other.zynq_tensor_->shape();
+  if (zynq_tensor_.get() == nullptr) {
+    zynq_tensor_.reset(new zynqmp::Tensor());
+  }
+  auto dt = zynq_tensor_->dataType();
 
   Resize(other.dims());
+  auto shape = other.zynq_tensor_->shape();
   zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape);
-  this->ZynqTensor()->copyFrom(other.ZynqTensor());
+
+  // this->ZynqTensor()->copyFrom(other.ZynqTensor());
+  memcpy(this->ZynqTensor()->data<void>(),
+         other.ZynqTensor()->data<void>(),
+         other.ZynqTensor()->shape().numel() * sizeof(float));
 }
 
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h
index 266e0b5ce0ea03108978c3b0a32fbf0e3872c83c..3574d466e9ad88b1cd0de55751f8c519b967045a 100644
--- a/lite/backends/fpga/lite_tensor.h
+++ b/lite/backends/fpga/lite_tensor.h
@@ -81,6 +81,8 @@ class DDimLite {
     return !(a == b);
   }
 
+  ~DDimLite() {}
+
  private:
   std::vector<value_type> data_;
 };
@@ -142,7 +144,9 @@ class TensorLite {
   void *mutable_data(size_t memory_size);
   void *mutable_data(TargetType target, size_t memory_size);
 
-  const void *raw_data() const { return buffer_->data(); }
+  const void *raw_data() const {
+    return buffer_->data();
+  }  // TODO(chonwhite) delete buffer;
 
   size_t data_size() const { return this->dims().production(); }
 
@@ -150,17 +154,19 @@ class TensorLite {
 
   size_t offset() const { return offset_; }
 
-  bool IsInitialized() const { return buffer_->data(); }
-  void clear() {
-    buffer_->Free();
-    offset_ = 0;
-  }
+  bool IsInitialized() const {
+    return buffer_->data();
+  }  // TODO(chonwhite) delete buffer;
 
   // Other share data to this.
   void ShareDataWith(const TensorLite &other);
 
   void CopyDataFrom(const TensorLite &other);
 
+  void clear() {
+    // zynq_tensor_->releaseData();
+  }
+
   template <typename T>
   TensorLite Slice(int64_t begin, int64_t end) const;
 
@@ -169,7 +175,10 @@ class TensorLite {
 
   TargetType target() const { return target_; }
 
-  zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
+  // template <typename T>
+  // TensorLite Slice(int64_t begin, int64_t end) const;
+
+  zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_.get(); }
 
   friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
     os << "Tensor:" << '\n';
@@ -198,12 +207,34 @@ class TensorLite {
   size_t memory_size_{};
   size_t offset_{0};
 
-  zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
+  std::shared_ptr<zynqmp::Tensor> zynq_tensor_;
 
   template <typename T>
   void mutable_data_internal();
 };
 
+template <typename T>
+zynqmp::DataType get_date_type() {
+  zynqmp::DataType data_type = zynqmp::FP32;
+  if (typeid(T) == typeid(float)) {
+    data_type = zynqmp::FP32;
+  }
+  if (typeid(T) == typeid(zynqmp::float16)) {
+    data_type = zynqmp::FP16;
+  }
+  if (typeid(T) == typeid(int)) {
+    data_type = zynqmp::INT32;
+  }
+  if (typeid(T) == typeid(int32_t)) {
+    data_type = zynqmp::INT32;
+  }
+  if (typeid(T) == typeid(int8_t)) {
+    data_type = zynqmp::INT8;
+  }
+
+  return data_type;
+}
+
 template <typename T, typename R>
 R *TensorLite::mutable_data() {
   std::vector<int64_t> v;
@@ -229,14 +260,12 @@ R *TensorLite::mutable_data() {
       break;
   }
   zynqmp::Shape input_shape(layout_type, v);
+  zynqmp::DataType data_type = get_date_type<T>();
 
-  zynqmp::DataType data_type = zynqmp::FP32;
-  if (typeid(T) == typeid(float)) {
-    data_type = zynqmp::FP32;
-  }
-  if (typeid(T) == typeid(zynqmp::float16)) {
-    data_type = zynqmp::FP16;
+  if (zynq_tensor_.get() == nullptr) {
+    zynq_tensor_.reset(new zynqmp::Tensor());
   }
+
   return zynq_tensor_->mutableData<T>(data_type, input_shape);
 }
 
@@ -276,6 +305,7 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
 
 template <typename T>
 void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const {
+  // TODO(chonwhite) delete this function;
   CHECK_GE(begin, 0);
   CHECK_LE(end, dims_[0]);
   CHECK_LT(begin, end);
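The new `get_date_type<T>()` helper (name as spelled in the patch) centralizes the C++-type-to-zynqmp-precision mapping that `mutable_data<T>()` now uses when it lazily creates the backing `zynqmp::Tensor`. A minimal usage sketch, assuming a `TensorLite` named `tensor` (hypothetical call site):

    // Sketch: what mutable_data<T>() now resolves to.
    //   float           -> zynqmp::FP32      int / int32_t -> zynqmp::INT32
    //   zynqmp::float16 -> zynqmp::FP16      int8_t        -> zynqmp::INT8
    auto* fp16_ptr = tensor.mutable_data<zynqmp::float16>();  // FP16 backing store
    auto* int8_ptr = tensor.mutable_data<int8_t>();           // INT8 backing store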
diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc
index b688bbc1083a6ab0f521381c4a988a12badc3141..a3b90f7d1040b4d878db784c44d578dc37581d42 100644
--- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc
+++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc
@@ -25,7 +25,7 @@ namespace mir {
 void ConvActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   std::vector<std::string> act_types{"relu"};
   for (auto& place : graph->valid_places()) {
-    if (place.target == TARGET(kCUDA)) {
+    if (place.target == TARGET(kCUDA) || place.target == TARGET(kFPGA)) {
       act_types.push_back("leaky_relu");
       break;
     }
diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc
index 0327a50af3387588cf067c637762b625cadfc58f..754bfe142e59d066b936c9337d59c56fbf55eba5 100644
--- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc
+++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc
@@ -103,6 +103,12 @@ void DeleteDynamicQuantOpFuser::InsertNewNode(SSAGraph* graph,
   // obtain values, save values and relink node
   int bit_length = quant_node->stmt()->op_info()->GetAttr<int>("bit_length");
+  int range = ((1 << (bit_length - 1)) - 1);
+  auto* scope = quant_node->stmt()->op()->scope();
+  auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name)
+                           ->GetMutable<lite::Tensor>();
+  float scale_value = scale_tensor->data<float>()[0] / range;
+
   auto outlinks = output_act_node->outlinks;
   for (auto* quantized_node : outlinks) {
     auto* op_desc = quantized_node->stmt()->mutable_op_info();
@@ -208,9 +214,11 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
   for (int i = 0; i < weight_scale_size; i++) {
     weight_scale.push_back(whole_weight_scale);
   }
+
 #ifndef LITE_WITH_FPGA
   op_desc.SetAttr("enable_int8", true);
 #endif
+
   if (quantized_op->stmt()->op_info()->HasAttr("input_scale")) {
     op_desc.SetAttr("input_scale", input_scale);
   }
@@ -689,13 +697,16 @@ void DynamicQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
   float* temp_data = temp_tensor.mutable_data<float>();
   size_t weight_num = quantized_weight_t->data_size();
   quantized_weight_t->set_persistable(true);
+
   std::cout << "DynamicQuantDequantOpFuser::InsertNewNode===================="
                "========================================"
             << std::endl;
+
 #ifdef LITE_WITH_FPGA
   float* quantized_weight_data = quantized_weight_t->mutable_data<float>();
   for (size_t i = 0; i < weight_num; i++) {
     quantized_weight_data[i] = temp_data[i] * whole_weight_scale;
+
     std::cout << whole_weight_scale << "," << temp_data[i] << ","
               << quantized_weight_data[i] << std::endl;
   }
diff --git a/lite/core/mir/kernel_place_correct_pass.h b/lite/core/mir/kernel_place_correct_pass.h
index 35cd2e6ef7e4a82335765e89bb6f80df07e4f903..71c6ea92737986f1ecb860c31fb21150697f00e9 100644
--- a/lite/core/mir/kernel_place_correct_pass.h
+++ b/lite/core/mir/kernel_place_correct_pass.h
@@ -86,6 +86,8 @@ class KernelPlaceCorrectPass : public DebugPass {
             << node_name;
         VLOG(4) << "-- input arg_name:" << arg_name << " "
                 << "-- node name:" << node_name;
+
+        auto type = inst.picked_kernel().GetInputDeclType(arg_name);
         if (!x_in->AsArg().type) {
           need_correct_place &= false;
         } else {
@@ -107,6 +109,8 @@ class KernelPlaceCorrectPass : public DebugPass {
               << node_name << " in Inst " << inst.op_type();
 
         VLOG(4) << "-- output arg_name " << arg_name;
+
+        auto type = inst.picked_kernel().GetOutputDeclType(arg_name);
         if (!x_out->AsArg().type) {
           need_correct_place &= false;
         } else {
diff --git a/lite/core/mir/static_kernel_pick_pass.cc b/lite/core/mir/static_kernel_pick_pass.cc
old mode 100644
new mode 100755
diff --git a/lite/core/program.cc b/lite/core/program.cc
index 4f6ea2ce470724c0b00993478c47eb0315b5a1e5..ce6bd3a36cd1d852f2d50f69c4be9e31b84b3f60 100755
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -139,7 +139,14 @@ void RuntimeProgram::Run() {
   for (auto& inst : instructions_) {
 #ifndef LITE_WITH_FPGA
     if (inst.is_feed_fetch_op()) continue;
+#endif
     std::string op_type = inst.op()->op_info()->Type();
+
+    VLOG(4) << ">> Running kernel: " << inst.op()->op_info()->Repr()
+            << " on Target " << TargetToStr(inst.kernel()->target());
+
+#ifndef LITE_WITH_FPGA
+    if (op_type == "feed" || op_type == "fetch") continue;
 #endif
     inst.Run();
 #ifdef LITE_WITH_PROFILE
diff --git a/lite/gen_code/paddle_infer.h b/lite/gen_code/paddle_infer.h
index e01ffc25e29ca94166e8fe12b0643ae9e914001d..2449e1e5d3fb721a39760e78a0417bf9491d8cef 100644
--- a/lite/gen_code/paddle_infer.h
+++ b/lite/gen_code/paddle_infer.h
@@ -46,7 +46,7 @@ class Tensor {
  */
 class PaddlePredictor {
  public:
-  void Init();
+  void Init() {}
 
   std::unique_ptr<Tensor> GetTensor(const std::string &id) const;
   std::unique_ptr<Tensor> GetMutableTensor(const std::string &id);
diff --git a/lite/kernels/arm/sequence_pool_compute.cc b/lite/kernels/arm/sequence_pool_compute.cc
index 8fcbb8cffe72935e4df503c3c1748ddb68247fb7..93072fe499eed296d6e31d87ee9b74494de07aa1 100644
--- a/lite/kernels/arm/sequence_pool_compute.cc
+++ b/lite/kernels/arm/sequence_pool_compute.cc
@@ -59,6 +59,7 @@ void SequencePoolCompute::Run() {
     for (int i = 0; i <= batch_size; i++) {
       offset_new[i] = i;
     }
+    (output->mutable_lod())->clear();
     (output->mutable_lod())->push_back(offset_new);
   }
diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt
index f6c3a399490a86e2ac2fcd9cbeb76fca8c8ac479..7251d875276b7ce7492c14742aea332a4f5e22d2 100755
--- a/lite/kernels/fpga/CMakeLists.txt
+++ b/lite/kernels/fpga/CMakeLists.txt
@@ -5,28 +5,32 @@ endif()
 
 set(fpga_deps fpga_target_wrapper kernel_fpga)
 
-# add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
+add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
 # add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
-# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
+
+add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
+
 add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
 # add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
 add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps})
 add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
-# add_kernel(feed_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
 add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
 add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps})
+
 # add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps})
 add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps})
 add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps})
+
 # add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps})
 add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
 add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps})
-# add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op)
+add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op)
 # add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps})
 add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps})
-# add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
-# add_kernel(transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS ${fpga_deps})
+add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
+add_kernel(split_compute_fpga FPGA basic SRCS split_compute.cc DEPS ${fpga_deps})
+add_kernel(transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS ${fpga_deps})
 add_kernel(io_copy_compute_fpga FPGA basic SRCS io_copy_compute.cc DEPS ${fpga_deps})
 add_kernel(calib_compute_fpga FPGA basic SRCS calib_compute.cc DEPS ${fpga_deps})
diff --git a/lite/kernels/fpga/activation_compute.cc b/lite/kernels/fpga/activation_compute.cc
index ecd9af0f8da5df62a15637e88dc4564efb187f6c..f6704204d34c309835c1de0ef61afed97c0b29e3 100644
--- a/lite/kernels/fpga/activation_compute.cc
+++ b/lite/kernels/fpga/activation_compute.cc
@@ -25,10 +25,10 @@ using float16 = zynqmp::float16;
 
 void ReluCompute::PrepareForRun() {
   auto& param = this->Param<param_t>();
   auto output_data = param.Out->mutable_data<float16>();
-  zynqmp::InputParam& input_param = pe_.param();
+  zynqmp::InputParam& relu_param = pe_.param();
 
-  input_param.input = param.X->ZynqTensor();
-  input_param.output = param.Out->ZynqTensor();
+  relu_param.input = param.X->ZynqTensor();
+  relu_param.output = param.Out->ZynqTensor();
   pe_.init();
   pe_.apply();
 }
diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc
index 69e600a043389fcc36bda2906f38432f2771aaf8..bd6adf60934eda84a0f95c1d3bca6f801b566484 100644
--- a/lite/kernels/fpga/conv_compute.cc
+++ b/lite/kernels/fpga/conv_compute.cc
@@ -72,12 +72,6 @@ void ConvCompute::PrepareForRun() {
     conv_param.activeParam.type = zynqmp::TYPE_RELU;
   }
 
-  // conv_param.filter->saveToFile("conv_filter_", true);
-  // if (param.bias != nullptr) {
-  //   std::cout << "param.bias != nullptr" << std::endl;
-  //   conv_param.bias()->saveToFile("conv_bias_", true);
-  // }
-
   conv_pe_.init();
   conv_pe_.apply();
 }
diff --git a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc
index d22cc7abacc2ecd80e54aa5c62a7e57671b920c9..39780d82276188b141e31d89466fbe09434393aa 100755
--- a/lite/kernels/fpga/elementwise_compute.cc
+++ b/lite/kernels/fpga/elementwise_compute.cc
@@ -80,21 +80,21 @@ void ElementwiseMulCompute::PrepareForRun() {
   scale_param.activeParam.type = zynqmp::TYPE_NONE;
 
   int channel = scale_param.input->shape().channel();
-  zynqmp::Tensor* scale = new zynqmp::Tensor();
-  zynqmp::Tensor* bias = new zynqmp::Tensor();
-  scale_param.scale = scale;
-  scale_param.bias = bias;
+  scale_param.scale = &scale_;
+  scale_param.bias = &bias_;
   zynqmp::Shape shape(zynqmp::N, {channel});
-  float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape);
-  float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape);
+  zynqmp::float16* scale_data =
+      scale_.mutableData<zynqmp::float16>(zynqmp::FP16, shape);
+  zynqmp::float16* bias_data =
+      bias_.mutableData<zynqmp::float16>(zynqmp::FP16, shape);
 
   float scale_value = param.Y->data<float>()[0];
-  for (int i = 0; i < channel; ++i) {
+  for (int i = 0; i < channel; i++) {
     if (param.Y->dims().production() != 1) {
       scale_value = param.Y->ZynqTensor()->data<float>()[i];
     }
-    scale_data[i] = scale_value;
-    bias_data[i] = 0;
+    scale_data[i] = zynqmp::float_to_half(scale_value);
+    bias_data[i] = zero_;
   }
 
   pe_.init();
@@ -102,6 +102,10 @@ void ElementwiseMulCompute::PrepareForRun() {
 }
 
 void ElementwiseMulCompute::Run() {
+  auto& param = Param<operators::ElementwiseParam>();
+  param.Y->ZynqTensor()->flush();
+  scale_.copyFrom(param.Y->ZynqTensor());
+  scale_.invalidate();
   pe_.dispatch();
 #ifdef FPGA_PRINT_TENSOR
   zynqmp::ScaleParam& scale_param = pe_.param();
diff --git a/lite/kernels/fpga/elementwise_compute.h b/lite/kernels/fpga/elementwise_compute.h
index e3e9c52c4c660e9ae6852f2ec8cdd815829ad524..9fa4991161dff6bba6c860838863b1cb38393877 100644
--- a/lite/kernels/fpga/elementwise_compute.h
+++ b/lite/kernels/fpga/elementwise_compute.h
@@ -61,6 +61,9 @@ class ElementwiseMulCompute
 
  private:
   zynqmp::ScalePE pe_;
+  zynqmp::Tensor scale_;
+  zynqmp::Tensor bias_;
+  zynqmp::float16 zero_ = zynqmp::float_to_half(0.0f);
 };
 
 }  // namespace fpga
diff --git a/lite/kernels/fpga/elementwise_compute_test.cc b/lite/kernels/fpga/elementwise_compute_test.cc
index 51c9d54cad9054e4767860096d42e9c991d4f936..de7d23f09b89ac9308af773d91ed8f444a45a6ad 100644
--- a/lite/kernels/fpga/elementwise_compute_test.cc
+++ b/lite/kernels/fpga/elementwise_compute_test.cc
@@ -93,21 +93,6 @@ void elementwise_compute_ref(const operators::ElementwiseParam& param,
   }
 
   // do elementwise add/sub/max...
   if (elt_type == "add") {
-    // for (int i = 0; i < batch; ++i) {
-    //   for (int j = 0; j < channels; ++j) {
-    //     int offset = (i * channels + j) * num;
-    //     const dtype* din_ptr = x_data + offset;
-    //     const dtype diny_data = y_data[j];
-    //     dtype* dout_ptr = out_data + offset;
-    //     for (int k = 0; k < num; ++k) {
-    //       *dout_ptr =
-    //           zynqmp::float_to_half(sum(zynqmp::half_to_float(*din_ptr),
-    //                                     zynqmp::half_to_float(diny_data)));
-    //       dout_ptr++;
-    //       din_ptr++;
-    //     }
-    //   }
-    // }
     int count = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
     for (int i = 0; i < count; ++i) {
       out_data[i] = zynqmp::float_to_half(sum(
@@ -229,75 +214,6 @@ TEST(fusion_elementwise_add_activation_fpga, retrive_op) {
   ASSERT_TRUE(fusion_elementwise_add_activation.front());
 }
 
-// TEST(fusion_elementwise_add_activation_fpga, init) {
-//   ElementwiseAddActivationCompute fusion_elementwise_add_activation;
-//   ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
-//   ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
-// }
-
-// TEST(fusion_elementwise_add_activation_fpga, compute) {
-//   ElementwiseAddActivationCompute fusion_elementwise_add_activation;
-//   operators::FusionElementwiseActivationParam param;
-//   lite::Tensor x, y, output, output_ref;
-
-//   for (auto act_type : {"relu"}) {
-//     for (auto n : {1}) {
-//       for (auto c : {8}) {
-//         for (auto h : {8}) {
-//           for (auto w : {8}) {
-//             for (auto axis : {0}) {
-//               for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
-//                 auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
-//                 auto y_dim = DDim(yd);
-//                 int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
-
-//                 if (axis_t + y_dim.size() > 4) continue;
-//                 bool flag = false;
-//                 for (int i = 0; i < y_dim.size(); i++) {
-//                   if (x_dim[i + axis_t] != y_dim[i]) flag = true;
-//                 }
-//                 if (flag) continue;
-
-//                 x.Resize(x_dim);
-//                 y.Resize(y_dim);
-//                 output.Resize(x_dim);
-//                 output_ref.Resize(x_dim);
-//                 auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
-//                 auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
-//                 auto* output_data =
-//                     output.mutable_data<float16>(TARGET(kFPGA));
-//                 auto* output_ref_data =
-//                     output_ref.mutable_data<float16>(TARGET(kFPGA));
-//                 for (int i = 0; i < x_dim.production(); i++) {
-//                   float sign = i % 3 == 0 ? -1.0f : 1.0f;
-//                   x_data[i] = zynqmp::float_to_half(i * sign);
-//                 }
-//                 for (int i = 0; i < y_dim.production(); i++) {
-//                   float sign = i % 2 == 0 ? 0.5f : -0.5f;
-//                   y_data[i] = zynqmp::float_to_half(i * sign);
-//                 }
-//                 param.X = &x;
-//                 param.Y = &y;
-//                 param.axis = axis;
-//                 param.Out = &output;
-//                 param.act_type = act_type;
-//                 fusion_elementwise_add_activation.SetParam(param);
-//                 fusion_elementwise_add_activation.PrepareForRun();
-//                 fusion_elementwise_add_activation.Run();
-//                 param.Out = &output_ref;
-//                 elementwise_compute_ref<float16>(param, "add", act_type);
-//                 for (int i = 0; i < output.dims().production(); i++) {
-//                   EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
-//                 }
-//               }
-//             }
-//           }
-//         }
-//       }
-//     }
-//   }
-// }
-
 }  // namespace fpga
 }  // namespace kernels
 }  // namespace lite
diff --git a/lite/kernels/fpga/feed_compute.cc b/lite/kernels/fpga/feed_compute.cc
index 79329e99a3e5e812dca487c17452f3f5d1e96449..9ca2424bc2f8a748c348cac4aafd219e538c7a17 100755
--- a/lite/kernels/fpga/feed_compute.cc
+++ b/lite/kernels/fpga/feed_compute.cc
@@ -40,8 +40,8 @@ void FeedCompute::PrepareForRun() {
 void FeedCompute::Run() {
   auto& param = this->Param<param_t>();
   Tensor& x = param.feed_list->at(param.col);
+  pe_.param().input = x.ZynqTensor();
   pe_.dispatch();
-
   auto out_lod = param.out->mutable_lod();
   *out_lod = x.lod();
diff --git a/lite/kernels/fpga/fetch_compute.cc b/lite/kernels/fpga/fetch_compute.cc
index 2d296f4d4a89b1fd86e5b2330d3caf44fbad0903..d5c8585aaefd6dd54e3f3f603173b1790023ede8 100755
--- a/lite/kernels/fpga/fetch_compute.cc
+++ b/lite/kernels/fpga/fetch_compute.cc
@@ -55,6 +55,7 @@ void FetchCompute::Run() {
 #ifdef FPGA_PRINT_TENSOR
   zynqmp::OutputParam& fetch_param = pe_.param();
   Debugger::get_instance().registerOutput("fetch", fetch_param.output);
+  Debugger::get_instance().setEnable(true);
 #endif
 }
diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc
index 4554c24e07de656b948826c2fa6f9526f61daaa6..8b515532453d41eb504fabb228e491f0d5a3c00e 100755
--- a/lite/kernels/fpga/io_copy_compute.cc
+++ b/lite/kernels/fpga/io_copy_compute.cc
@@ -45,21 +45,32 @@ class IoCopyHostToFpgaCompute
     auto& param = Param<operators::IoCopyParam>();
     CHECK(param.x->target() == TARGET(kHost) ||
           param.x->target() == TARGET(kFPGA));
-    param.y->mutable_data<zynqmp::float16>();
-    if (param.x->ZynqTensor()->aligned() &&
-        param.x->ZynqTensor()->shape().shouldAlign()) {
-      zynqmp::Tensor tempTensor;
-      tempTensor.mutableData<zynqmp::float16>(zynqmp::FP16,
-                                              param.x->ZynqTensor()->shape());
-      tempTensor.copyFrom(param.x->ZynqTensor());
-      tempTensor.setAligned(true);
-      tempTensor.unalignImage();
-      param.y->ZynqTensor()->copyFrom(&tempTensor);
-    } else {
+    param.x->ZynqTensor()->flush();
+
+    if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) {
+      param.y->mutable_data<int>();
       param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
+      return;
     }
-    param.y->ZynqTensor()->invalidate();
-    param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
+
+    if (param.x->ZynqTensor()->dataType() == zynqmp::FP32) {
+      param.y->mutable_data<zynqmp::float16>();
+      if (param.x->ZynqTensor()->aligned() &&
+          param.x->ZynqTensor()->shape().shouldAlign()) {
+        zynqmp::Tensor tempTensor;
+        tempTensor.mutableData<zynqmp::float16>(zynqmp::FP16,
+                                                param.x->ZynqTensor()->shape());
+        tempTensor.copyFrom(param.x->ZynqTensor());
+        tempTensor.setAligned(true);
+        tempTensor.unalignImage();
+        param.y->ZynqTensor()->copyFrom(&tempTensor);
+      } else {
+        param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
+      }
+      param.y->ZynqTensor()->invalidate();
+      param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
+    }
+
     auto out_lod = param.y->mutable_lod();
     *out_lod = param.x->lod();
   }
diff --git a/lite/kernels/fpga/mul_compute.cc b/lite/kernels/fpga/mul_compute.cc
index c27600d9f773ff0aae04a2ee519905bc0e58785c..659c8dfb653c1707105a7337493ee4f3b3357b76 100755
--- a/lite/kernels/fpga/mul_compute.cc
+++ b/lite/kernels/fpga/mul_compute.cc
@@ -80,7 +80,8 @@ void mul(MulCompute* k) {
 }
 
 void MulCompute::Run() {
-  pe_.dispatch();
+  // pe_.dispatch();
+  mul(this);
 #ifdef FPGA_PRINT_TENSOR
   zynqmp::FullyConnectedParam& fc_param = pe_.param();
   Debugger::get_instance().registerOutput("mul", fc_param.output);
diff --git a/lite/kernels/fpga/multiclass_nms_compute.cc b/lite/kernels/fpga/multiclass_nms_compute.cc
index 4834054df6371a9faaa17bd17b53a29b999ddf03..23a5aad8e694d33cc30adec114e520620685178e 100644
--- a/lite/kernels/fpga/multiclass_nms_compute.cc
+++ b/lite/kernels/fpga/multiclass_nms_compute.cc
@@ -318,14 +318,29 @@ void MultiClassOutput(const Tensor& scores,
 
 void MulticlassNmsCompute::Run() {
   auto& param = Param<operators::MulticlassNmsParam>();
-  auto* boxes = param.bboxes;
-  auto* scores = param.scores;
+  auto* boxes_in = param.bboxes;
+  auto* scores_in = param.scores;
   auto* outs = param.out;
   outs->mutable_data<float>();
 
-  auto score_dims = scores->dims();
+  auto score_dims = boxes_in->dims();
   auto score_size = score_dims.size();
 
+  Tensor boxes_float;
+  Tensor scores_float;
+
+  boxes_float.Resize(boxes_in->dims());
+  scores_float.Resize(scores_in->dims());
+
+  boxes_float.mutable_data<float>();
+  scores_float.mutable_data<float>();
+
+  boxes_float.ZynqTensor()->copyFrom(boxes_in->ZynqTensor());
+  scores_float.ZynqTensor()->copyFrom(scores_in->ZynqTensor());
+
+  Tensor* boxes = &boxes_float;
+  Tensor* scores = &scores_float;
+
   auto box_dims = boxes->dims();
   int64_t box_dim = boxes->dims()[2];
 
@@ -383,6 +398,7 @@ void MulticlassNmsCompute::Run() {
       MultiClassOutput(
           scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
       outs->ZynqTensor()->copyFrom(out.ZynqTensor());
+      out.ZynqTensor()->saveToFile("nms_oo", true);
     }
     outs->Resize({static_cast<int64_t>(e - s), out_dim});
   }
@@ -402,16 +418,16 @@ void MulticlassNmsCompute::Run() {
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_LITE_KERNEL(multiclass_nms,
-                     kFPGA,
-                     kFP16,
-                     kNHWC,
-                     paddle::lite::kernels::fpga::MulticlassNmsCompute,
-                     def)
-    .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
-    .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
-    .Finalize();
+// REGISTER_LITE_KERNEL(multiclass_nms,
+//                      kFPGA,
+//                      kFP16,
+//                      kNHWC,
+//                      paddle::lite::kernels::fpga::MulticlassNmsCompute,
+//                      def)
+//     .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
+//     .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
+//     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+//     .Finalize();
 
 REGISTER_LITE_KERNEL(multiclass_nms,
                      kFPGA,
@@ -427,5 +443,8 @@ REGISTER_LITE_KERNEL(multiclass_nms,
                 {LiteType::GetTensorTy(TARGET(kFPGA),
                                        PRECISION(kFP16),
                                        DATALAYOUT(kNHWC))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kFPGA),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kNHWC))})
    .Finalize();
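The NMS kernel now stages its FP16 `BBoxes`/`Scores` inputs into float-backed temporaries before the CPU-side NMS loops, relying on `zynqmp::Tensor::copyFrom` to convert precision during the copy. A minimal sketch of the same staging pattern for any FP16 FPGA tensor (hypothetical `src` TensorLite; a sketch of assumed copyFrom semantics, not part of the patch):

    // Hypothetical staging of an FP16 tensor into plain floats for CPU code.
    Tensor staged;
    staged.Resize(src->dims());
    staged.mutable_data<float>();                      // FP32 backing store
    staged.ZynqTensor()->copyFrom(src->ZynqTensor());  // bypass converts FP16 -> FP32
    const float* host_view = staged.data<float>();     // safe for CPU-side loops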
diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc
index afd14ccb4b4a9a4f1e93e1e38840035fb18186bb..a11e67d837b81b03a8cca753bc409509ca5833b6 100644
--- a/lite/kernels/fpga/prior_box_compute.cc
+++ b/lite/kernels/fpga/prior_box_compute.cc
@@ -131,3 +131,27 @@ REGISTER_LITE_KERNEL(prior_box,
     .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
+
+// REGISTER_LITE_KERNEL(prior_box,
+//                      kFPGA,
+//                      kFP16,
+//                      kNHWC,
+//                      paddle::lite::kernels::fpga::PriorBoxCompute,
+//                      def)
+//     .BindInput("Input",
+//                {LiteType::GetTensorTy(TARGET(kFPGA),
+//                                       PRECISION(kFP16),
+//                                       DATALAYOUT(kNHWC))})
+//     .BindInput("Image",
+//                {LiteType::GetTensorTy(TARGET(kFPGA),
+//                                       PRECISION(kFP16),
+//                                       DATALAYOUT(kNHWC))})
+//     .BindOutput("Boxes",
+//                 {LiteType::GetTensorTy(TARGET(kFPGA),
+//                                        PRECISION(kFP16),
+//                                        DATALAYOUT(kNHWC))})
+//     .BindOutput("Variances",
+//                 {LiteType::GetTensorTy(TARGET(kFPGA),
+//                                        PRECISION(kFP16),
+//                                        DATALAYOUT(kNHWC))})
+//     .Finalize();
diff --git a/lite/kernels/fpga/reshape_compute.cc b/lite/kernels/fpga/reshape_compute.cc
index f72f18892c987e48fb3467372352f6ded98444ff..b79051f5b168abd857b5ec09cb048984da3dd4e1 100644
--- a/lite/kernels/fpga/reshape_compute.cc
+++ b/lite/kernels/fpga/reshape_compute.cc
@@ -38,18 +38,15 @@ void ReshapeCompute::Run() {
     auto* actual_shape_data = actual_shape->data<int>();
     auto shape = std::vector<int>(
         actual_shape_data, actual_shape_data + actual_shape_dims.production());
-    output_dims = lite::operators::ValidateShape(shape, x_dims);
+    // output_dims = lite::operators::ValidateShape(shape, x_dims);  // TODO
     output->Resize(output_dims);
   }
-  if (inplace) {
-    output->ShareDataWith(*x);
-  } else {
-    output->CopyDataFrom(*x);
-  }
-
-  param.x->ZynqTensor()->saveToFile("reshape_in", true);
-  output->ZynqTensor()->saveToFile("reshape_out", true);
-
+  // if (inplace) {
+  //   output->ShareDataWith(*x);
+  // } else {
+  //   output->CopyDataFrom(*x);
+  // }
+  output->ZynqTensor()->copyFrom(x->ZynqTensor());
   output->Resize(output_dims);
 }
+ +#include "lite/kernels/fpga/split_compute.h" +#include +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +void SplitCompute::PrepareForRun() { + auto& param = Param(); + zynqmp::SplitParam& split_param = pe_.param(); + split_param.input = param.x->ZynqTensor(); + auto& dout = param.output; + for (int i = 0; i < dout.size(); i++) { + dout[i]->mutable_data(); + split_param.outputs.push_back(dout[i]->ZynqTensor()); + } + + pe_.init(); + pe_.apply(); +} + +void SplitCompute::Run() { + zynqmp::SplitParam& split_param = pe_.param(); + pe_.dispatch(); + +#ifdef FPGA_PRINT_TENSOR + auto& dout = param.output; + for (int i = 0; i < dout.size(); i++) { + Debugger::get_instance().registerOutput("split", split_param.outputs[0]); + } + +#endif +} + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + split, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::SplitCompute, def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("SectionsTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/fpga/split_compute.h b/lite/kernels/fpga/split_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..d7680a66495c4e31591ecf6bdcdc73e3a71d802e --- /dev/null +++ b/lite/kernels/fpga/split_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+#include <algorithm>
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+
+#include "lite/backends/fpga/KD/float16.hpp"
+#include "lite/backends/fpga/KD/pes/split_pe.hpp"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace fpga {
+
+class SplitCompute
+    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
+ public:
+  void PrepareForRun() override;
+  void Run() override;
+
+  virtual ~SplitCompute() = default;
+
+ private:
+  zynqmp::SplitPE pe_;
+};
+
+}  // namespace fpga
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc
index e3bb813873d69d8f9d9939f06869e2640f416915..4ffeb4c82b10cee4094fbee53c7f39014e7fab84 100644
--- a/lite/kernels/fpga/transpose_compute.cc
+++ b/lite/kernels/fpga/transpose_compute.cc
@@ -81,7 +81,17 @@ void transposeCompute(operators::TransposeParam param) {
 }
 
 // Transpose
-void TransposeCompute::Run() { auto& param = this->Param<param_t>(); }
+void TransposeCompute::Run() {
+  auto& param = this->Param<param_t>();
+  param.output->mutable_data<float16>();
+  param.x->ZynqTensor()->invalidate();
+  param.x->ZynqTensor()->unalignImage();
+  if (param.x->dims().size() != 4) {
+    transposeCompute(param);
+  } else {
+    param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
+  }
+}
 
 // Transpose2
 void Transpose2Compute::Run() {
diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt
index c212fb9b0465824b7a87eef2e87033bf967736e5..3ef584ab37280ad59b7670c7f414d1fcbc522316 100755
--- a/lite/kernels/host/CMakeLists.txt
+++ b/lite/kernels/host/CMakeLists.txt
@@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_
 add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
 add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
+
 add_kernel(one_hot_compute_host Host extra SRCS one_hot_compute.cc DEPS ${lite_kernel_deps})
 
 #lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
diff --git a/lite/operators/concat_op.cc b/lite/operators/concat_op.cc
old mode 100644
new mode 100755
diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc
old mode 100644
new mode 100755