diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp
index 477d241e93fc05a7197f84b495f0faf0b3badbef..0cb872366938ee37de1c6ec4c362152949710151 100644
--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -56,11 +56,17 @@ void *fpga_malloc(size_t size) {
   return reinterpret_cast<void *>(
       mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
 #else
-  return NULL;
+  return malloc(size);
 #endif
 }
 
-void fpga_free(void *ptr) { munmap(ptr, 0); }
+void fpga_free(void *ptr) {
+#ifdef PADDLE_MOBILE_OS_LINUX
+  munmap(ptr, 0);
+#else
+  free(ptr);
+#endif
+}
 
 void fpga_copy(void *dest, const void *src, size_t num) {
   memcpy(dest, src, num);
diff --git a/src/fpga/quantization.cpp b/src/fpga/quantization.cpp
index 560a1aa0059cbaffe36dd570e3f2f38ab8943379..44994d4c353490b533110d0965fb63b4fb5c7aa2 100644
--- a/src/fpga/quantization.cpp
+++ b/src/fpga/quantization.cpp
@@ -48,16 +48,11 @@ static Dtype find_max(Dtype* data, int64_t num) {
 // template <typename Dtype>
 void quantize_filter(framework::Tensor* filter) {
-  DLOG << "quantilize_filter........";
+  DLOG << "quantize_filter........" << filter->dims();
   float scale = 0;
   auto fix_range = static_cast<float>(std::pow(2, 8 - 1) - 1);
 
-  const auto batch_size = filter->dims()[0];
-  const auto channel = filter->dims()[1];
-  const auto height = filter->dims()[2];
-  const auto width = filter->dims()[3];
-
   auto* tmp_data = new int8_t[filter->numel()];
 
   // 32bit filter -> 8bit filter;
@@ -76,9 +71,19 @@ void quantize_filter(framework::Tensor* filter) {
     scale = (fix_range / max);
     std::memcpy(tmp_data, filter->data<int8_t>(), (size_t)filter->numel());
   }
-  // NCHW -> NHWC;
-  chw_to_hwc<int8_t>(tmp_data, filter->mutable_data<int8_t>(), batch_size,
-                     channel, height, width);
+
+  if (filter->dims().size() == 4) {
+    const auto batch_size = filter->dims()[0];
+    const auto channel = filter->dims()[1];
+    const auto height = filter->dims()[2];
+    const auto width = filter->dims()[3];
+    chw_to_hwc<int8_t>(tmp_data, filter->mutable_data<int8_t>(), batch_size,
+                       channel, height, width);
+  } else if (filter->dims().size() == 2) {
+    std::memcpy(filter->mutable_data<int8_t>(), tmp_data,
+                (size_t)filter->numel());
+  }
+
   delete tmp_data;
   filter->SetFpgaScale(scale);
 }
diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp
index 09bff80af723161dfaf31d58f3ec24528ef1ccc4..2bd4c0ac6ba3c7b066cc7ad2439ab6bebb7c3cd9 100644
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -26,7 +26,7 @@ namespace paddle_mobile {
 namespace memory {
 const int MALLOC_ALIGN = 64;
 
-#ifdef PADDLE_MOBILE_FPGA__VV
+#ifdef PADDLE_MOBILE_FPGA
 namespace fpga = paddle_mobile::fpga;
 
 void Copy(void *dst, const void *src, size_t num) {
diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp
index 6424de8afe705e13534a3452fc04890c0f750b9f..d58ab0f751eeb584f286a0920d08e9473be38402 100644
--- a/src/operators/kernel/fpga/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -32,9 +32,9 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam *param) {
     args.convert_type = fpga::DATA_FP16_TO_FP32;
     args.layout_type = fpga::LAYOUT_HWC_TO_CHW;
     args.image.address = (void *)(input_ptr);
-    args.image.height = input->dims()[1];
-    args.image.width = input->dims()[2];
-    args.image.channels = input->dims()[3];
+    args.image.height = input->dims()[0];
+    args.image.width = input->dims()[1];
+    args.image.channels = 1;
     args.output.address = output_ptr;
     param->SetFpgaArgs(args);
   }
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a144e553236e300ad24501420183df01dd15aad5..dab8bcc977054f90a2ec82899b9ab64c426d1fb6 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -31,6 +31,9 @@ elseif("FPGAnets" IN_LIST NET)
     # target_link_libraries(test-resnet paddle-mobile)
     ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-tensor-quant paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-concat-op fpga/test_concat_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-fpga-concat-op paddle-mobile)
 elseif("mobilenetssd" IN_LIST NET)
     # gen test
     ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h)
diff --git a/test/fpga/test_concat_op.cpp b/test/fpga/test_concat_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5d1a5828b36b3d9ed371a271af6db82657ff1596
--- /dev/null
+++ b/test/fpga/test_concat_op.cpp
@@ -0,0 +1,87 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+#include "operators/concat_op.h"
+
+int main() {
+  paddle_mobile::Loader<paddle_mobile::FPGA> loader;
+  auto program = loader.Load(g_googlenet);
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");
+
+  Executor4Test<paddle_mobile::FPGA,
+                paddle_mobile::operators::ConcatOp<paddle_mobile::FPGA, float>>
+      executor(program, "concat");
+
+  // 1. input_tensors;
+  vector<Tensor> input_tensors;
+
+  Tensor input1;
+  auto input1_data = CreateInput<float>(&input1, {4, 10, 2, 2}, 0, 1);
+  input_tensors.push_back(input1);
+  Tensor input2;
+  auto input2_data = CreateInput<float>(&input2, {4, 20, 2, 2}, 0, 1);
+  input_tensors.push_back(input2);
+  Tensor input3;
+  auto input3_data = CreateInput<float>(&input3, {4, 30, 2, 2}, 0, 1);
+  input_tensors.push_back(input3);
+  Tensor input4;
+  auto input4_data = CreateInput<float>(&input4, {4, 40, 2, 2}, 0, 1);
+  input_tensors.push_back(input4);
+  // 2. input_names
+  vector<string> input_names({
+      "conv2d_3.tmp_1",
+      "conv2d_5.tmp_1",
+      "conv2d_7.tmp_1",
+      "conv2d_8.tmp_1",
+  });
+
+  // 3. output_names
+  vector<string> output_names({"concat_0.tmp_0"});
+
+  // 4. out_dims;
+  vector<DDim> out_ddims;
+  auto out_ddim = paddle_mobile::framework::make_ddim({4, 100, 2, 2});
+  out_ddims.push_back(out_ddim);
+
+  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
+                                            output_names, out_ddims);
+
+  auto output0_data = output[0]->data<float>();
+
+  // 5. test one example.
+  int input_n = 1;
+  int input_c = 2;
+  int input_h = 0;
+  int input_w = 1;
+  int stride0 = input3.numel() / input3.dims()[0];
+  int stride1 = input3.numel() / input3.dims()[0] / input3.dims()[1];
+  int stride2 = input3.dims()[3];
+  /// inputx1 (4,10,2,2),
+  /// inputx2 (4,20,2,2),
+  /// inputx3 (4,30,2,2),
+  /// inputx4 (4,40,2,2),
+  /// axis = 1
+  /// output (4,100,2,2)
+  int input_index =
+      input_n * stride0 + input_c * stride1 + input_h * stride2 + input_w;
+  int output_index = input_n * 100 * 2 * 2 +
+                     (input_c + input1.dims()[1] + input2.dims()[1]) * 2 * 2 +
+                     input_h * 2 + input_w;
+
+  DLOG << " input3 [1,2,0,1] = " << input3_data[input_index];
+  DLOG << " output [1,32,0,1] = " << output0_data[output_index];
+  return 0;
+}
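Two standalone sketches follow for reference; neither is part of the patch above. First, the scale that quantize_filter records through SetFpgaScale is ordinary symmetric int8 quantization: fix_range = 2^(8-1) - 1 = 127 and scale = fix_range / max|w|, where max|w| is what find_max returns; the per-element float-to-int8 conversion itself sits in the elided middle of the hunk. A minimal self-contained model of that arithmetic (plain C++, no paddle-mobile dependencies; the round-to-nearest step is an assumption, not read from the patch):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Toy 32-bit filter weights standing in for filter->data<float>().
  const float weights[] = {0.5f, -1.25f, 2.0f, -0.75f};

  // find_max equivalent: largest absolute value in the filter.
  float max = 0;
  for (float w : weights) max = std::fmax(max, std::fabs(w));

  // Same expression as the patch: 2^(8-1) - 1 = 127.
  const auto fix_range = static_cast<float>(std::pow(2, 8 - 1) - 1);
  const float scale = fix_range / max;  // what SetFpgaScale(scale) records

  for (float w : weights) {
    // Quantize (rounding assumed here) and show the dequantized value.
    auto q = static_cast<int8_t>(std::round(w * scale));
    std::printf("%+.2f -> %+4d (dequantized %+.4f)\n", w, q, q / scale);
  }
  return 0;
}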
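Second, the index arithmetic in step 5 of the new test: element (n, c, h, w) of a contiguous NCHW tensor with shape (N, C, H, W) sits at flat offset ((n*C + c)*H + h)*W + w, and concatenation along axis 1 shifts input3's channel index by the channel counts of input1 and input2, i.e. 10 + 20. The sketch below (plain C++; nchw_offset is an illustrative helper, not a paddle-mobile API) reproduces the test's input_index and output_index values:

#include <cassert>
#include <cstdio>

// Flat offset of element (n, c, h, w) in a contiguous NCHW tensor
// with shape (N, C, H, W); N itself does not enter the formula.
static int nchw_offset(int C, int H, int W, int n, int c, int h, int w) {
  return ((n * C + c) * H + h) * W + w;
}

int main() {
  // Element (1, 2, 0, 1) of input3, shape (4, 30, 2, 2).
  int input_index = nchw_offset(30, 2, 2, 1, 2, 0, 1);  // 129

  // Output shape (4, 100, 2, 2); input3's channel 2 lands at 10 + 20 + 2 = 32.
  int output_index = nchw_offset(100, 2, 2, 1, 32, 0, 1);  // 529

  // Matches the test's stride formulation:
  // stride0 = 30*2*2 = 120, stride1 = 2*2 = 4, stride2 = 2.
  assert(input_index == 1 * 120 + 2 * 4 + 0 * 2 + 1);
  assert(output_index == 1 * 100 * 2 * 2 + 32 * 2 * 2 + 0 * 2 + 1);

  std::printf("input_index=%d output_index=%d\n", input_index, output_index);
  return 0;
}

Both offsets address the same logical element, which is why the two DLOG lines at the end of the test are expected to print the same value.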