diff --git a/CMakeLists.txt b/CMakeLists.txt index 786b1322b346631d1570a6ebd9c572302531db4e..288be06b78ac6d6439ea48b39a2e86ee0c364a97 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,11 @@ endif(WITH_PADDLE_MOBILE) # set(CMAKE_BUILD_TYPE DEBUG) + +SET(CMAKE_BUILD_TYPE "Release") +SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall") + + set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(CMAKE_CXX_STANDARD 11) diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp index b6932bc27f0019af58cea00e4b5422396d838208..54c10f50af1d2f3cd4244d11f0d8dbcba95b35e2 100755 --- a/lite/backends/fpga/KD/llapi/filter.cpp +++ b/lite/backends/fpga/KD/llapi/filter.cpp @@ -234,8 +234,8 @@ int8_t* format_filter(float* data_in, float* filter_start = data_in + n * chw; float f_max = find_max(filter_start, chw); int8_t* quantized_start = quantized_data + n * chw; - quantize(filter_start, quantized_start, chw, max); - filter_max.push_back(max); + quantize(filter_start, quantized_start, chw, f_max); + filter_max.push_back(f_max); } int8_t* hwc_data = diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp index 68d0b6c68b722f9c5cf31139ed7308516889bd8c..b8b7dfb8876ea5381b0100b3bcfb701f6c5c2ca6 100644 --- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp +++ b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp @@ -187,7 +187,7 @@ int get_device_info(const struct DeviceInfo &args) { int perform_bypass(const struct BypassArgs &args) { int ret = -1; int size = args.image.channels * args.image.width * args.image.height; - int max_size = 1 << 21; + int max_size = 1 << 22; float times = 1.0 * size / max_size; int count = static_cast(times); diff --git a/lite/backends/fpga/KD/pes/input_pe.hpp b/lite/backends/fpga/KD/pes/input_pe.hpp index abae094d97e66f1ecff78f1e41199893456e6722..d8f9a15c6a94f6869a588f758982800b35eecc40 100755 --- a/lite/backends/fpga/KD/pes/input_pe.hpp +++ b/lite/backends/fpga/KD/pes/input_pe.hpp @@ -29,28 +29,20 @@ class InputPE : public PE { } bool dispatch() { - std::cout << "input_dispatch()\n"; + // std::cout << "input_dispatch()\n"; Tensor* input = param_.input; Tensor* output = param_.output; Tensor* src = input; - // std::cout << "input:" << input << std::endl; input->flush(); - // std::cout << "input_flush()\n"; Tensor half_tensor; if (input->dataType() == DataType::FP32) { - // std::cout << "2()\n"; half_tensor.mutableData(DataType::FP16, input->shape()); - // std::cout << "3()\n"; half_tensor.copyFrom(input); - // std::cout << "4()\n"; src = &half_tensor; } - // std::cout << "5()\n"; output->mutableData(); - // std::cout << "6()\n"; src->alignImage(output, true); - // std::cout << "7()\n"; return true; } diff --git a/lite/backends/fpga/KD/pes/output_pe.hpp b/lite/backends/fpga/KD/pes/output_pe.hpp index 53da0c5be768d05d1ca58abf26a4a4ab0425cc9f..2d02d30fbae12efc372e58c2ad80348356a8f22d 100755 --- a/lite/backends/fpga/KD/pes/output_pe.hpp +++ b/lite/backends/fpga/KD/pes/output_pe.hpp @@ -54,8 +54,10 @@ class OutputPE : public PE { output->data(), output->shape().numel() * sizeof(float)); - // auto max = fpga_get_memory_size_max(); - // std::cout << "===== Max: ===== :: " << max << std::endl; + fpga_reset(); + + auto max = fpga_get_memory_size_max(); + std::cout << "PL ===== Max: ===== :: " << max << std::endl; return true; } diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp index 84ed4f946e1a394cb0fc40d7c156faf534e1f8db..bec99a539490513d79ad86a4d5e3b2f4168c24e4 100755 --- a/lite/backends/fpga/KD/pes/pooling_pe.hpp +++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp @@ -67,7 +67,7 @@ class PoolingPE : public PE { use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 && (k_width > 255 || k_height > 255); - use_cpu_ = param_.type == AVERAGE; + // use_cpu_ = param_.type == AVERAGE; } void compute() { diff --git a/lite/kernels/arm/sequence_pool_compute.cc b/lite/kernels/arm/sequence_pool_compute.cc index 8fcbb8cffe72935e4df503c3c1748ddb68247fb7..93072fe499eed296d6e31d87ee9b74494de07aa1 100644 --- a/lite/kernels/arm/sequence_pool_compute.cc +++ b/lite/kernels/arm/sequence_pool_compute.cc @@ -59,6 +59,7 @@ void SequencePoolCompute::Run() { for (int i = 0; i <= batch_size; i++) { offset_new[i] = i; } + (output->mutable_lod())->clear(); (output->mutable_lod())->push_back(offset_new); } diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt index e71e5255ca6daa0c86c7f1b1c3d9174df66cac25..90cd2bfd3653a876f6568ecc351464b9d37e3af3 100755 --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -14,7 +14,6 @@ add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps}) # add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps}) add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps}) add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps}) -# add_kernel(feed_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps}) add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps}) add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps}) diff --git a/lite/kernels/fpga/fetch_compute.cc b/lite/kernels/fpga/fetch_compute.cc index 2d296f4d4a89b1fd86e5b2330d3caf44fbad0903..71ec37a64d94bcbef00d7e3c2a187bdb28c47935 100755 --- a/lite/kernels/fpga/fetch_compute.cc +++ b/lite/kernels/fpga/fetch_compute.cc @@ -82,6 +82,6 @@ REGISTER_LITE_KERNEL(fetch, kNHWC, paddle::lite::kernels::fpga::FetchCompute, host_host) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); diff --git a/lite/kernels/host/one_hot_compute.cc b/lite/kernels/host/one_hot_compute.cc index e0af6f5173f367bb9b2e06de10499ee36806379c..e1bf4c103bcb59c59617d8c5d1ce10ae8780e403 100755 --- a/lite/kernels/host/one_hot_compute.cc +++ b/lite/kernels/host/one_hot_compute.cc @@ -16,7 +16,7 @@ #include #include -#include "lite/backends/fpga/KD/debugger.hpp" +// #include "lite/backends/fpga/KD/debugger.hpp" #include "lite/kernels/host/one_hot_compute.h" #include "lite/utils/paddle_enforce.h" diff --git a/lite/operators/one_hot_op.cc b/lite/operators/one_hot_op.cc index 023cdc23aeb8329736b7438af2c52cbfa899c75c..ebab9e20679038546611b8dc3221f4ecba1bbe21 100644 --- a/lite/operators/one_hot_op.cc +++ b/lite/operators/one_hot_op.cc @@ -15,7 +15,7 @@ #include "lite/operators/one_hot_op.h" #include "lite/core/op_registry.h" -#include "lite/backends/fpga/KD/debugger.hpp" +// #include "lite/backends/fpga/KD/debugger.hpp" namespace paddle { namespace lite {