Commit 4dddc907 authored by chonwhite

VGG-SSD works

Parent 945aa36f
@@ -3,4 +3,3 @@
 --arm_abi=armv8 \
 --arm_lang=gcc \
 test
@@ -3,4 +3,3 @@
 --arm_abi=armv8 \
 --arm_lang=gcc \
 test
@@ -143,8 +143,6 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
 }
 #endif
 // get inputs names
 std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
......
#pragma once
#include "paddle_lite_factory_helper.h"
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(leaky_relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu_clipped, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(prelu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(sigmoid, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(tanh, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(swish, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu6, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(log, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(exp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(floor, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(rsqrt, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(prior_box_fpga, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(logical_xor, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(logical_and, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(logical_or, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(logical_not, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(roi_align, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(generate_proposals, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(crop, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(range, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(axpy, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(beam_search, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(power, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8);
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32);
USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8);
USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32);
USE_LITE_KERNEL(negative, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(less_than, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(equal, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(not_equal, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(less_equal, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(greater_than, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(greater_equal, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(multiclass_nms, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(lod_reset, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(box_clip, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(density_prior_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(decode_bboxes, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(squeeze, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(squeeze2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(gru, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(increment, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, hwc_to_chw_fpga_fp16);
USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, hwc_to_chw_arm_float);
USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, chw_to_hwc_fpga_fp16);
USE_LITE_KERNEL(layout_once, kFPGA, kAny, kNHWC, hwc_to_chw_fpga_fp16);
USE_LITE_KERNEL(layout_once, kFPGA, kAny, kNHWC, chw_to_hwc_fpga_fp16);
USE_LITE_KERNEL(elementwise_add, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(elementwise_mul, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(is_empty, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(shape, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, fp32_to_fp16_fpga);
USE_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, fp16_to_fp32_fpga);
USE_LITE_KERNEL(calib_once, kFPGA, kFP16, kNHWC, fp32_to_fp16_fpga);
USE_LITE_KERNEL(calib_once, kFPGA, kFP16, kNHWC, fp16_to_fp32_fpga);
USE_LITE_KERNEL(sequence_expand, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(expand, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(gather, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(im2sequence, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(reduce_max, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(density_prior_box, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(feed, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(lookup_table_v2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def);
USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def);
USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(unsqueeze, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(unsqueeze2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(layer_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(assign, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(sequence_softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(norm, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nchw2nhwc);
USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nhwc2nchw);
USE_LITE_KERNEL(layout, kARM, kInt8, kNCHW, int8_nchw2nhwc);
USE_LITE_KERNEL(layout, kARM, kInt8, kNCHW, int8_nhwc2nchw);
USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nchw2nhwc);
USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nhwc2nchw);
USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nchw2nhwc);
USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nhwc2nchw);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_sub, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_sub_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_div, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_div_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fetch, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(fill_constant, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fill_constant_batch_size_like, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(while, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, host_to_device);
USE_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, device_to_host);
USE_LITE_KERNEL(io_copy_once, kFPGA, kAny, kAny, host_to_device_once);
USE_LITE_KERNEL(io_copy_once, kFPGA, kAny, kAny, device_to_host_once);
USE_LITE_KERNEL(arg_max, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(beam_search_decode, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(assign_value, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(gru, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(cast, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out);
USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, int8_out);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, fp32_out);
USE_LITE_KERNEL(write_to_array, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(anchor_generator, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out);
USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out);
USE_LITE_KERNEL(lrn, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(read_from_array, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(shuffle_channel, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(top_k, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(sequence_pool, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(gru_unit, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kFPGA, kFP16, kNHWC, def);
\ No newline at end of file
#pragma once
#include "paddle_lite_factory_helper.h"
USE_LITE_OP(lookup_table_v2);
USE_LITE_OP(feed);
USE_LITE_OP(fake_channel_wise_dequantize_max_abs);
USE_LITE_OP(assign);
USE_LITE_OP(layout);
USE_LITE_OP(transpose);
USE_LITE_OP(transpose2);
USE_LITE_OP(pool2d);
USE_LITE_OP(batch_norm);
USE_LITE_OP(reshape);
USE_LITE_OP(reshape2);
USE_LITE_OP(search_fc);
USE_LITE_OP(sequence_reverse);
USE_LITE_OP(matmul);
USE_LITE_OP(slice);
USE_LITE_OP(lod_reset);
USE_LITE_OP(graph_op);
USE_LITE_OP(expand);
USE_LITE_OP(top_k);
USE_LITE_OP(fake_quantize_range_abs_max);
USE_LITE_OP(arg_max);
USE_LITE_OP(beam_search);
USE_LITE_OP(box_clip);
USE_LITE_OP(fake_quantize_dequantize_moving_average_abs_max);
USE_LITE_OP(box_coder);
USE_LITE_OP(search_seq_depadding);
USE_LITE_OP(write_to_array);
USE_LITE_OP(is_empty);
USE_LITE_OP(prior_box);
USE_LITE_OP(sequence_concat);
USE_LITE_OP(affine_channel);
USE_LITE_OP(shape);
USE_LITE_OP(axpy);
USE_LITE_OP(anchor_generator);
USE_LITE_OP(reduce_max);
USE_LITE_OP(gru);
USE_LITE_OP(uniform_random);
USE_LITE_OP(unsqueeze);
USE_LITE_OP(unsqueeze2);
USE_LITE_OP(layout_once);
USE_LITE_OP(search_group_padding);
USE_LITE_OP(assign_value);
USE_LITE_OP(norm);
USE_LITE_OP(concat);
USE_LITE_OP(fill_constant);
USE_LITE_OP(fill_constant_batch_size_like);
USE_LITE_OP(calib_once);
USE_LITE_OP(decode_bboxes);
USE_LITE_OP(negative);
USE_LITE_OP(layer_norm);
USE_LITE_OP(mean);
USE_LITE_OP(lrn);
USE_LITE_OP(fetch);
USE_LITE_OP(power);
USE_LITE_OP(stack);
USE_LITE_OP(gather);
USE_LITE_OP(fc);
USE_LITE_OP(gru_unit);
USE_LITE_OP(increment);
USE_LITE_OP(mul);
USE_LITE_OP(search_seq_softmax);
USE_LITE_OP(var_conv_2d);
USE_LITE_OP(dropout);
USE_LITE_OP(fusion_elementwise_sub_activation);
USE_LITE_OP(fusion_elementwise_add_activation);
USE_LITE_OP(fusion_elementwise_mul_activation);
USE_LITE_OP(fusion_elementwise_max_activation);
USE_LITE_OP(fusion_elementwise_div_activation);
USE_LITE_OP(elementwise_sub);
USE_LITE_OP(elementwise_add);
USE_LITE_OP(elementwise_mul);
USE_LITE_OP(elementwise_max);
USE_LITE_OP(elementwise_div);
USE_LITE_OP(pad2d);
USE_LITE_OP(crop);
USE_LITE_OP(sequence_expand);
USE_LITE_OP(search_aligned_mat_mul);
USE_LITE_OP(io_copy);
USE_LITE_OP(squeeze);
USE_LITE_OP(squeeze2);
USE_LITE_OP(reduce_sum);
USE_LITE_OP(square);
USE_LITE_OP(relu);
USE_LITE_OP(leaky_relu);
USE_LITE_OP(relu_clipped);
USE_LITE_OP(prelu);
USE_LITE_OP(sigmoid);
USE_LITE_OP(tanh);
USE_LITE_OP(swish);
USE_LITE_OP(relu6);
USE_LITE_OP(log);
USE_LITE_OP(exp);
USE_LITE_OP(floor);
USE_LITE_OP(hard_sigmoid);
USE_LITE_OP(sqrt);
USE_LITE_OP(rsqrt);
USE_LITE_OP(softsign);
USE_LITE_OP(sequence_expand_as);
USE_LITE_OP(match_matrix_tensor);
USE_LITE_OP(range);
USE_LITE_OP(lookup_table);
USE_LITE_OP(fake_quantize_moving_average_abs_max);
USE_LITE_OP(search_grnn);
USE_LITE_OP(conv2d_transpose);
USE_LITE_OP(flatten);
USE_LITE_OP(flatten2);
USE_LITE_OP(nearest_interp);
USE_LITE_OP(bilinear_interp);
USE_LITE_OP(softmax);
USE_LITE_OP(reduce_mean);
USE_LITE_OP(cast);
USE_LITE_OP(fake_dequantize_max_abs);
USE_LITE_OP(read_from_array);
USE_LITE_OP(yolo_box);
USE_LITE_OP(multiclass_nms);
USE_LITE_OP(while);
USE_LITE_OP(conv2d);
USE_LITE_OP(depthwise_conv2d);
USE_LITE_OP(split);
USE_LITE_OP(scale);
USE_LITE_OP(beam_search_decode);
USE_LITE_OP(im2sequence);
USE_LITE_OP(sequence_topk_avg_pooling);
USE_LITE_OP(io_copy_once);
USE_LITE_OP(roi_align);
USE_LITE_OP(sequence_reshape);
USE_LITE_OP(equal);
USE_LITE_OP(notequal);
USE_LITE_OP(less_than);
USE_LITE_OP(less_equal);
USE_LITE_OP(greater_than);
USE_LITE_OP(greater_equal);
USE_LITE_OP(calib);
USE_LITE_OP(sequence_pool);
USE_LITE_OP(attention_padding_mask);
USE_LITE_OP(search_attention_padding_mask);
USE_LITE_OP(density_prior_box);
USE_LITE_OP(search_seq_fc);
USE_LITE_OP(generate_proposals);
USE_LITE_OP(sequence_arithmetic);
USE_LITE_OP(search_seq_arithmetic);
USE_LITE_OP(shuffle_channel);
USE_LITE_OP(sequence_softmax);
USE_LITE_OP(logical_xor);
USE_LITE_OP(logical_and);
USE_LITE_OP(logical_or);
USE_LITE_OP(logical_not);
\ No newline at end of file
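The two generated headers above (paddle_use_kernels.h and paddle_use_ops.h) are registration manifests: each USE_LITE_KERNEL / USE_LITE_OP line expands, via paddle_lite_factory_helper.h, into a symbol reference that forces the corresponding kernel or op to stay linked into the final binary. A minimal sketch of how an application would typically pull them in (hypothetical file; only the two include names and the macros come from this commit):

// app.cc - sketch of a consumer of the generated registration headers.
#include "paddle_api.h"          // public Paddle-Lite C++ API (assumed path)
#include "paddle_use_kernels.h"  // force-links every USE_LITE_KERNEL above
#include "paddle_use_ops.h"      // force-links every USE_LITE_OP above
// With both headers included in one translation unit, a predictor built from
// this binary can resolve kernels such as (conv2d, kFPGA, kFP16, kNHWC) and
// (softmax, kARM, kFloat, kNCHW) at runtime instead of failing kernel lookup.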
@@ -23,7 +23,6 @@
 #include "lite/api/test_helper.h"
 #include "lite/core/op_registry.h"
 DEFINE_string(input_file, "", "input_file");
 namespace paddle {
@@ -71,11 +70,7 @@ TEST(ResNet50, test) {
     Place{TARGET(kARM), PRECISION(kFloat)},
 });
-  predictor.Build(FLAGS_model_dir,
-                  "",
-                  "",
-                  valid_places);
+  predictor.Build(FLAGS_model_dir, "", "", valid_places);
 // predictor.Build(FLAGS_model_dir,
 //                 FLAGS_model_dir + "/model",
@@ -83,13 +78,11 @@ TEST(ResNet50, test) {
 //                 Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
 //                 valid_places);
 auto* input_tensor = predictor.GetInput(0);
 int width = 416;
 int height = 416;
 std::ifstream file_stream(FLAGS_input_file);
 // file_stream.open(path);
 if (!file_stream.good()) {
@@ -101,7 +94,8 @@ TEST(ResNet50, test) {
 file_stream >> height;
 file_stream >> width;
-  input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, height, width})));
+  input_tensor->Resize(
+      DDim(std::vector<DDim::value_type>({1, 3, height, width})));
 auto* data = input_tensor->mutable_data<float>();
 auto item_size = input_tensor->dims().production();
@@ -125,17 +119,17 @@ TEST(ResNet50, test) {
 }
 auto* out = predictor.GetOutput(0);
-  for (int i = 0;i < out->dims().production();i++) {
+  for (int i = 0; i < out->dims().production(); i++) {
   std::cout << ":" << out->data<float>()[i] << std::endl;
 }
 // std::cout << "-------\n";
 // auto* out1 = predictor.GetOutput(1);
 // for (int i = 0;i < out1->dims().production();i++) {
 //   std::cout << ":" << out1->data<float>()[i] << std::endl;
 // }
-  std::string file = "output/" + FLAGS_input_file.substr (6);
+  std::string file = "output/" + FLAGS_input_file.substr(6);
 std::cout << "file:::" << file << std::endl;
 std::ofstream ofs;
......
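For orientation, the test above drives the whole flow this commit touches: build a predictor over the valid places, read H and W from an input file, resize the input tensor, run, and dump the first output. A condensed sketch of that flow (predictor.Run() sits in an elided hunk and is assumed; the exact contents of valid_places are abridged in the diff; everything else mirrors the test):

lite::Predictor predictor;
std::vector<Place> valid_places({
    Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},  // assumed first choice
    Place{TARGET(kARM), PRECISION(kFloat)},
});
predictor.Build(FLAGS_model_dir, "", "", valid_places);

int height = 416, width = 416;  // defaults; overwritten from the input file
lite::Tensor* input_tensor = predictor.GetInput(0);
input_tensor->Resize(
    DDim(std::vector<DDim::value_type>({1, 3, height, width})));
float* data = input_tensor->mutable_data<float>();  // filled from file_stream
predictor.Run();  // assumed, not shown in the visible hunks

auto* out = predictor.GetOutput(0);
for (int i = 0; i < out->dims().production(); i++) {
  std::cout << ":" << out->data<float>()[i] << std::endl;
}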
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
 // #include "lite/backends/fpga/lite_tensor.h"
 #include "lite/core/tensor.h"
@@ -16,19 +34,16 @@ class Debugger {
   void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
     // tensor->printScale();
-    // tensor->saveToFile(op_type, true);
+    if (op_type != "conv") {
+      // tensor->saveToFile(op_type, true);
+    }
   }
  private:
   std::unordered_map<std::string, bool> op_config;
   Debugger() {
     op_config["concat"] = true;
-    op_config["conv_add_bn"] = true;
-    op_config["conv_add_bn_relu"] = true;
-    op_config["conv_add"] = true;
-    op_config["conv_add_relu"] = true;
-    op_config["conv_bn"] = true;
-    op_config["conv_bn_relu"] = true;
+    op_config["conv"] = true;
     op_config["crop"] = true;
   }
 };
@@ -39,16 +54,16 @@ inline void chw_to_hwc(Tensor* t, float* dst) {
   int height = 1;
   int width = 1;
-  if (t->dims().size() > 2){
+  if (t->dims().size() > 2) {
     height = t->dims()[2];
   }
-  if (t->dims().size() > 3){
+  if (t->dims().size() > 3) {
     width = t->dims()[3];
   }
   // int width = t->dims()[3];
   const float* chw_data = t->data<float>();
   float* hwc_data = dst;
   int chw = channel * height * width;
   int wc = width * channel;
   int index = 0;
@@ -64,7 +79,7 @@ inline void chw_to_hwc(Tensor* t, float* dst) {
   }
 }
-inline void read_from_file(lite::Tensor* t,const std::string& path) {
+inline void read_from_file(lite::Tensor* t, const std::string& path) {
   std::ifstream file_stream;
   file_stream.open(path);
   if (!file_stream) {
@@ -81,17 +96,18 @@ inline void read_from_file(lite::Tensor* t, const std::string& path) {
 }
 inline void save_float(float* data, const std::string& name, int len) {
   // return;
   static int counter = 0;
   std::string old_string = std::to_string(counter);
-  std::string new_string = std::string(3 - old_string.length(), '0') + old_string;
+  std::string new_string =
+      std::string(3 - old_string.length(), '0') + old_string;
   std::string file = "arm_" + new_string + name;
   counter++;
-  std::cout << "-------------------------- saving file: --------------------------" << file << std::endl;
+  std::cout
+      << "-------------------------- saving file: --------------------------"
+      << file << std::endl;
   std::ofstream ofs;
   ofs.open(file);
   // float* data = dst;
@@ -102,9 +118,11 @@ inline void save_float(float* data, const std::string& name, int len) {
   ofs.close();
 }
-inline void save_tensor(lite::Tensor* t,const std::string& name, bool convert = true) {
+inline void save_tensor(lite::Tensor* t,
+                        const std::string& name,
+                        bool convert = true) {
   float* data = const_cast<float*>(t->data<float>());
   float* dst = new float[t->numel()];
   if (convert) {
     chw_to_hwc(t, dst);
     data = dst;
@@ -114,8 +132,9 @@ inline void save_tensor(lite::Tensor* t, const std::string& name, bool convert =
   delete[] dst;
 }
-inline void save_tensor(const lite::Tensor* t,const std::string& name, bool convert = true) {
+inline void save_tensor(const lite::Tensor* t,
+                        const std::string& name,
+                        bool convert = true) {
   // return;
   float* data = const_cast<float*>(t->data<float>());
   float* dst = new float[t->numel()];
@@ -128,6 +147,5 @@ inline void save_tensor(const lite::Tensor* t, const std::string& name, bool conv
   delete[] dst;
 }
-}
-}
+}  // namespace lite
+}  // namespace paddle
\ No newline at end of file
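The chw_to_hwc helper above re-strides data rather than changing logical values. A self-contained check of the index mapping it performs, independent of lite::Tensor (a minimal sketch; dimension names only):

#include <cstdio>

// NCHW source index: n*C*H*W + c*H*W + h*W + w
// NHWC target index:  n*H*W*C + h*W*C + w*C + c
int main() {
  const int N = 1, C = 2, H = 2, W = 2;
  float chw[N * C * H * W], hwc[N * C * H * W];
  for (int i = 0; i < N * C * H * W; i++) chw[i] = static_cast<float>(i);
  int index = 0;  // walks the CHW buffer sequentially, like the helper does
  for (int n = 0; n < N; n++)
    for (int c = 0; c < C; c++)
      for (int h = 0; h < H; h++)
        for (int w = 0; w < W; w++)
          hwc[n * H * W * C + h * W * C + w * C + c] = chw[index++];
  // Prints 0 4 1 5 2 6 3 7: channel becomes the fastest-varying axis.
  for (int i = 0; i < N * C * H * W; i++) printf("%g ", hwc[i]);
  printf("\n");
}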
@@ -45,12 +45,8 @@ struct None : Layout {
   int channelIndex() { return -1; }
   int heightIndex() { return -1; }
   int widthIndex() { return -1; }
-  int alignedElementCount(const std::vector<int>& dims) {
-    return 16;
-  }
-  virtual int elementCount(const std::vector<int>& dims) {
-    return 1;
-  }
+  int alignedElementCount(const std::vector<int>& dims) { return 16; }
+  virtual int elementCount(const std::vector<int>& dims) { return 1; }
 };
 struct NCHW : Layout {
......
@@ -25,28 +25,31 @@ namespace bias_scale {
 void align_element(float **data_in, int num_per_div_before_alignment, int num) {
   int copynum = 0;
   float *ptr_unaligned = *data_in;
-  int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
-  int num_per_div_after_alignment = align_to_x(num_per_div_before_alignment,
-                                               BS_NUM_ALIGNMENT);
-  int num_element = 2 * div_num * num_per_div_after_alignment; // including bias & scale
-  float *ptr_aligned = (float *)fpga_malloc(num_element * sizeof(float)); // NOLINT
+  int div_num =
+      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
+  int num_per_div_after_alignment =
+      align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
+  int num_element =
+      2 * div_num * num_per_div_after_alignment;  // including bias & scale
+  float *ptr_aligned =
+      (float *)fpga_malloc(num_element * sizeof(float));  // NOLINT
   memset(ptr_aligned, 0, num_element * sizeof(float));
   for (int i = 0; i < div_num; i++) {
     if (i == div_num - 1) {
       copynum = (num_per_div_after_alignment * div_num > num)
                     ? (num % num_per_div_after_alignment)
                     : (num_per_div_before_alignment);
     } else {
       copynum = num_per_div_before_alignment;
     }
     memcpy(ptr_aligned + i * num_per_div_after_alignment,
            ptr_unaligned + num_per_div_before_alignment * i,
            copynum * sizeof(float));
     memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment,
            ptr_unaligned + num_per_div_before_alignment * i + num,
            copynum * sizeof(float));
   }
   fpga_free(ptr_unaligned);
   *data_in = ptr_aligned;
@@ -55,13 +58,14 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
 size_t interleave(float **data_in, int num_after_alignment) {
   float *ptr_uninterleaved = *data_in;
   float *ptr_interleaved =
       (float *)fpga_malloc(2 * num_after_alignment * sizeof(float));  // NOLINT
   int num = num_after_alignment / 4;
   for (int i = 0; i < num; i++) {
-    memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i,
-           4 * sizeof(float));
+    memcpy(
+        ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i, 4 * sizeof(float));
     memcpy(ptr_interleaved + 8 * i + 4,
-           ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float));
+           ptr_uninterleaved + num_after_alignment + 4 * i,
+           4 * sizeof(float));
   }
   fpga_free(ptr_uninterleaved);
@@ -70,12 +74,14 @@ size_t interleave(float **data_in, int num_after_alignment) {
 }
 void format_bias_scale_array(float **bias_scale_array,
-                             int element_num_per_division, int num) {
+                             int element_num_per_division,
+                             int num) {
   align_element(bias_scale_array, element_num_per_division, num);
   int div_num = (num + element_num_per_division - 1) / element_num_per_division;
   int element_num_after_division =
       align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
-  size_t mem = interleave(bias_scale_array, div_num * element_num_after_division);
+  size_t mem =
+      interleave(bias_scale_array, div_num * element_num_after_division);
   fpga_flush(*bias_scale_array, mem);
 }
 void format_bias_array(float **bias_array, int num) {
@@ -83,12 +89,12 @@ void format_bias_array(float **bias_array, int num) {
   int num_before_align = num;
   int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT);
   int16_t *ptr_aligned =
       (int16_t *)fpga_malloc(num_after_align * sizeof(int16_t));  // NOLINT
   memset(ptr_aligned, 0, num_after_align * sizeof(int16_t));
   for (int i = 0; i < num_before_align; i++) {
     float value = ptr_aligned[i];
     ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]);
   }
   *bias_array = (float *)ptr_aligned;  // NOLINT
   fpga_free(ptr_unaligned);
......
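To make the alignment arithmetic in align_element concrete, a standalone sketch with worked numbers (it assumes BS_NUM_ALIGNMENT == 8; the real constant is defined elsewhere in the FPGA backend, and align_to_x is re-implemented here for the example):

#include <cstdio>

static int align_to_x(int num, int x) { return (num + x - 1) / x * x; }

int main() {
  const int kAlign = 8;                   // assumed BS_NUM_ALIGNMENT
  int num = 20;                           // bias/scale pairs (channels)
  int num_per_div_before_alignment = 20;  // everything in one division
  int div_num =
      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
  int num_per_div_after_alignment =
      align_to_x(num_per_div_before_alignment, kAlign);
  int num_element = 2 * div_num * num_per_div_after_alignment;
  // Prints: div_num=1 aligned=24 floats=48 -- biases fill the first 24 slots
  // (20 real + 4 zero padding), scales the next 24; interleave() then regroups
  // the buffer four at a time as [bias x4][scale x4][bias x4][scale x4]...
  printf("div_num=%d aligned=%d floats=%d\n",
         div_num, num_per_div_after_alignment, num_element);
}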
@@ -21,7 +21,8 @@ namespace bias_scale {
 void align_element(float** data_in, int num_per_div_before_alignment, int num);
 size_t interleave(float** data_in, int num_after_alignment);
 void format_bias_scale_array(float** bias_scale_array,
-                             int element_num_per_division, int num);
+                             int element_num_per_division,
+                             int num);
 void format_bias_array(float** bias_array, int num);
 }  // namespace bias_scale
......
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include "lite/backends/fpga/KD/llapi/filter.h"
 #include <memory.h>
 #include <algorithm>
 #include <fstream>
 #include <string>
-#include "lite/backends/fpga/KD/llapi/filter.h"
 #include "lite/backends/fpga/KD/float16.hpp"
 #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
@@ -27,7 +27,7 @@ namespace filter {
 static int FILTER_SIZE = 2048;
 static int COLUMN = 4;
-void saveToFile(std::string name,void* data_in, int size) {
+void saveToFile(std::string name, void* data_in, int size) {
   // std::ofstream ofs;
   // ofs.open(name);
@@ -39,7 +39,7 @@ void saveToFile(std::string name, void* data_in, int size) {
   // ofs.close();
 }
-void saveFloatToFile(std::string name,float* data_in, int size) {
+void saveFloatToFile(std::string name, float* data_in, int size) {
   // std::ofstream ofs;
   // ofs.open(name);
@@ -50,22 +50,16 @@ void saveFloatToFile(std::string name, float* data_in, int size) {
   // ofs.close();
 }
-void set_filter_capacity(uint32_t cap) {
-  FILTER_SIZE = cap;
-}
+void set_filter_capacity(uint32_t cap) { FILTER_SIZE = cap; }
-void set_colunm(uint32_t column) {
-  COLUMN = column;
-}
+void set_colunm(uint32_t column) { COLUMN = column; }
 // replace zynqmp_api.h #define FILTER_NUM_ALIGNMENT
-int get_filter_num_alignment() {
-  return COLUMN * 4;
-}
+int get_filter_num_alignment() { return COLUMN * 4; }
 int calc_division_capacity(int chw) {
   // int n = FILTER_SIZE / ((chw + 15) / 16) * 32;
   int filter_num_alignment = get_filter_num_alignment();
   int n = FILTER_SIZE / ((chw + 15) / 16) * filter_num_alignment;
   return n < FILTER_SIZE ? n : FILTER_SIZE;
 }
@@ -91,8 +85,12 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
   }
 }
-void convert_to_hwc(int8_t* chw_data, int8_t* hwc_data, int num, int channel,
-                    int height, int width) {
+void convert_to_hwc(int8_t* chw_data,
+                    int8_t* hwc_data,
+                    int num,
+                    int channel,
+                    int height,
+                    int width) {
   int chw = channel * height * width;
   int wc = width * channel;
   int index = 0;
@@ -128,7 +126,6 @@ int8_t float_to_int8(float fdata) {
 }
 void quantize(float* src, int8_t* dst, int len, float max) {
   float fix_range = 127;
   float scale = fix_range / max;
   for (size_t i = 0; i < len; i++) {
@@ -149,9 +146,12 @@ void align_chw(int8_t* src, int8_t* dst, int num, int chw) {
   }
 }
-void align_num(int8_t* src, int8_t* dst, int num_per_div_before_alignment,
-               int num, int align_chw) {
+void align_num(int8_t* src,
+               int8_t* dst,
+               int num_per_div_before_alignment,
+               int num,
+               int align_chw) {
   int filter_num_alignment = get_filter_num_alignment();
   int num_per_div_after_alignment =
       align_to_x(num_per_div_before_alignment, filter_num_alignment);
@@ -178,8 +178,10 @@ void reorder(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
   int filter_num_alignment = get_filter_num_alignment();
   int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
   for (index = 0; index < num_after_alignment; index++) {
-    new_index = index / filter_num_alignment * filter_num_alignment + (index % (filter_num_alignment/2) / 4 * 8) + (index % (filter_num_alignment/2) % 4) +
-                (index / (filter_num_alignment/2) % 2 * 4);
+    new_index = index / filter_num_alignment * filter_num_alignment +
+                (index % (filter_num_alignment / 2) / 4 * 8) +
+                (index % (filter_num_alignment / 2) % 4) +
+                (index / (filter_num_alignment / 2) % 2 * 4);
     memcpy((dst + index * chw_align), (src + new_index * chw_align), chw_align);
   }
 }
@@ -191,7 +193,8 @@ void interleave(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
   for (int i = 0; i < num_after_alignment; i += 2) {
     for (int j = 0, k = 0; j < interleave_num; j += 2, k++) {
       memcpy(dst + i * chw_align + interleave_per_num * j,
-             src + i * chw_align + interleave_per_num * k, interleave_per_num);
+             src + i * chw_align + interleave_per_num * k,
+             interleave_per_num);
       memcpy(dst + i * chw_align + interleave_per_num * (j + 1),
              src + (i + 1) * chw_align + interleave_per_num * k,
              interleave_per_num);
@@ -199,14 +202,20 @@ void interleave(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
   }
 }
-int8_t* format_filter(float* data_in, int& mem_size_a, int num, int channel,
-                      int height, int width, int group_num, float max,
-                      std::vector<float>& filter_max) {
+int8_t* format_filter(float* data_in,
+                      int& mem_size_a,  // NOLINT
+                      int num,
+                      int channel,
+                      int height,
+                      int width,
+                      int group_num,
+                      float max,
+                      std::vector<float>& filter_max) {  // NOLINT
   int data_size = channel * height * width * num;
   int chw = channel * height * width;
   int division_capacity = calc_division_capacity(chw);
   int filter_num_alignment = get_filter_num_alignment();
   int num_per_div_before_alignment =
       calc_num_per_div(num, group_num, division_capacity);
   int num_per_div_after_alignment =
@@ -215,8 +224,8 @@ int8_t* format_filter(float* data_in, int& mem_size_a, int num, int channel,
   (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
   // int num_after_alignment = num_per_div_after_alignment * div_num;
   int residual = num % num_per_div_before_alignment;
   int num_after_alignment = num_per_div_after_alignment *
                                 ((residual == 0) ? div_num : (div_num - 1)) +
                             align_to_x(residual, filter_num_alignment);
   // saveFloatToFile("quantize_before", data_in, data_size);
@@ -255,15 +264,19 @@ int8_t* format_filter(float* data_in, int& mem_size_a, int num, int channel,
     fpga_free(hwc_data);
   }
   if (num_after_alignment != num) {
     int filter_num_alignment = get_filter_num_alignment();
     int num_per_div_after_alignment =
         align_to_x(num_per_div_before_alignment, filter_num_alignment);
     // int div_num =
-    //     (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
+    //     (num + num_per_div_before_alignment - 1) /
+    //     num_per_div_before_alignment;
     int num_element = div_num * num_per_div_after_alignment * chw_aligned;
     int8_t* num_aligned_data =
         reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
-    align_num(temp_data, num_aligned_data, num_per_div_before_alignment, num,
-              chw_aligned);
+    align_num(temp_data,
              num_aligned_data,
              num_per_div_before_alignment,
              num,
              chw_aligned);
   // saveToFile("align_num.txt", num_aligned_data, data_size * 8);
@@ -274,14 +287,15 @@ int8_t* format_filter(float* data_in, int& mem_size_a, int num, int channel,
       reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
   reorder(temp_data, aligned_data, num_after_alignment, chw);
   // saveToFile("reorder.txt", aligned_data, data_size * 8);
-  fpga_free(temp_data);  // TODO change name of qdata;
+  fpga_free(temp_data);
   int8_t* interleaved_data =
       reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
   interleave(aligned_data, interleaved_data, num_after_alignment, chw);
   // saveToFile("interleave.txt", interleaved_data, data_size * 8);
   fpga_free(aligned_data);
-  fpga_flush(interleaved_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
-             num_after_alignment * sizeof(char));
+  fpga_flush(interleaved_data,
+             align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment *
+                 sizeof(char));
   mem_size_a = num_after_alignment * chw_aligned;
   return interleaved_data;
 }
@@ -328,7 +342,11 @@ size_t align_element_n(int16_t** data_in, int num, int height, int width) {
   return num_element * sizeof(int16_t);
 }
-void to_fp16(float* src, float16* dst, int num, int height, int width,
-             float* scale_ptr) {
+void to_fp16(float* src,
+             float16* dst,
+             int num,
+             int height,
+             int width,
+             float* scale_ptr) {
   int size = num * height * width;
   for (int n = 0; n < num; n++) {
@@ -344,8 +362,8 @@ void to_fp16(float* src, float16* dst, int num, int height, int width,
   fpga_flush(dst, size * sizeof(int16_t));
 }
-void quantize_to_fp16(float** data_in, int num, int height, int width,
-                      float* scale_ptr) {
+void quantize_to_fp16(
+    float** data_in, int num, int height, int width, float* scale_ptr) {
   float* tmp = *data_in;
   int size = num * height * width;
@@ -364,20 +382,16 @@ void quantize_to_fp16(float** data_in, int num, int height, int width,
   *data_in = (float*)tmp_data;  // NOLINT
   fpga_free(tmp);
 }
-size_t format_dwconv_filter(float** data_in, int num, int height, int width,
-                            float* scale_ptr) {
-  // float16* fp16_data = reinterpret_cast<float16*>(
-  //     fpga_malloc(num * height * width * sizeof(float16)));
-  // to_fp16(*data_in, fp16_data, num, height, width, scale_ptr);
-  // int16_t** quantize_data = (int16_t**)&fp16_data; // NOLINT
+size_t format_dwconv_filter(
+    float** data_in, int num, int height, int width, float* scale_ptr) {
   quantize_to_fp16(data_in, num, height, width, scale_ptr);
-  int16_t **quantize_data = (int16_t **)data_in;
+  int16_t** quantize_data = reinterpret_cast<int16_t**>(data_in);
   convert_to_hwn(quantize_data, num, height, width);
   size_t size = align_element_n(quantize_data, num, height, width);
-  fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
-             height * width * sizeof(int16_t));
+  fpga_flush(*quantize_data,
             align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * height * width *
                 sizeof(int16_t));
   return size;
 }
 }  // namespace filter
......
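The new_index expression in reorder() is easier to see with numbers. A quick standalone check of the permutation it encodes, assuming COLUMN == 4 so that get_filter_num_alignment() returns 16 (both values appear above; the program itself is just an illustration):

#include <cstdio>

int main() {
  const int fna = 16;  // filter_num_alignment = COLUMN * 4
  for (int index = 0; index < fna; index++) {
    int new_index = index / fna * fna + (index % (fna / 2) / 4 * 8) +
                    (index % (fna / 2) % 4) + (index / (fna / 2) % 2 * 4);
    printf("dst %2d <- src %2d\n", index, new_index);
  }
  // Output shows quads 4..7 and 8..11 trading places inside each group of 16
  // filters, while quads 0..3 and 12..15 stay put.
}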
@@ -33,16 +33,22 @@ int calc_division_number(int num, int group_num, int division_capacity);
 int calc_num_per_div(int num, int group_num, int division_capacity);
 float find_max(float* data_in, int data_size);
-int8_t* format_filter(float* data_in, int& mem_size, int num, int channel,
-                      int height, int width, int group_num, float max,
-                      std::vector<float>& filter_max);
+int8_t* format_filter(float* data_in,
+                      int& mem_size,  // NOLINT
+                      int num,
+                      int channel,
+                      int height,
+                      int width,
+                      int group_num,
+                      float max,
+                      std::vector<float>& filter_max);  // NOLINT
 void convert_to_hwn(int16_t** data_in, int num, int height, int width);
 size_t align_element_n(int16_t** data_in, int num, int height, int width);
 // void quantize_to_fp16(float** data_in, int num, int height, int width,
 //                       float* scale_ptr);
-size_t format_dwconv_filter(float** data_in, int num, int height, int width,
-                            float* scale_ptr);
+size_t format_dwconv_filter(
+    float** data_in, int num, int height, int width, float* scale_ptr);
 }  // namespace filter
 }  // namespace zynqmp
......
@@ -158,7 +158,7 @@ void fpga_copy(void *dest, const void *src, size_t num) {
 int fpga_reset() {
   struct FpgaResetArgs args;
   return do_ioctl(IOCTL_FPGA_RESET, &args);
 }
 int ioctl_conv(const struct ConvArgs &args) {
@@ -283,7 +283,6 @@ int perform_bypass(const struct BypassArgs &args) {
   bypassArgs.image.height = 1;
   bypassArgs.output.scale_address = scales;
   float scale = 0;
   for (int i = 0; i < count; ++i) {
     bypassArgs.image.channels = max_size;
@@ -309,9 +308,8 @@ int perform_bypass(const struct BypassArgs &args) {
                output_address + count * max_size * out_type_size);
     ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
     scale = std::max(scale, scales[0]);
   }
   args.output.scale_address[0] = scale;
   args.output.scale_address[1] = 1.0f / scale;
   return ret;
@@ -362,15 +360,15 @@ int compute_fpga_dwconv(const struct DWconvArgs &args) {
   std::cout << " out_address:" << args.output.address
             << " out_scale_address:" << args.output.scale_address;
   // float *in_scale = (float *)args.image.scale_address;
   // std::cout << "inv_scale:" << in_scale[0] << "," << in_scale[1] <<
   // std::endl;
 #endif
   return do_ioctl(IOCTL_CONFIG_DWCONV, &args);
 }
-int config_activation(const struct ActiveParamterArgs& args) {
+int config_activation(const struct ActiveParamterArgs &args) {
   return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args);
 }
 // int config_power(const struct PowerArgs& args) {
......
@@ -32,6 +32,5 @@ class PE {
   virtual ~PE() {}
 };
 }  // namespace zynqmp
 }  // namespace paddle
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <stdio.h>
+#include <string>
 #include <vector>
 #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
@@ -100,7 +101,7 @@ struct DepthwiseConvParam : ConvParam {
 };
 struct GRUParam : PEParam {
  public:
   Tensor* input = nullptr;
   Tensor* h0 = nullptr;
   Tensor* weight = nullptr;
@@ -112,7 +113,7 @@ public:
   Tensor* hidden = nullptr;
   std::string gate_activation = "sigmoid";
-  std::string activation= "tanh";
+  std::string activation = "tanh";
   bool is_reverse = false;
   bool origin_mode = false;
 };
......
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <arm_neon.h>
+#include <algorithm>
 #include <vector>
 #include "lite/backends/fpga/KD/pe.hpp"
@@ -59,16 +60,14 @@ class ConvPE : public PE {
     if (param_.filter->shape().width() == 1 &&
         param_.filter->shape().height() == 1) {
       // use_cpu_ = true;
     }
     if (!use_cpu_) {
       // param_.filter->releaseData();
     }
   }
   void cpu_conv_hwc() {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
     input->syncToCPU();
@@ -106,10 +105,12 @@ class ConvPE : public PE {
       for (int pw = 0; pw < pooled_width_; pw++) {
         int hstart = ph * kernel_step_h - image_pad_h;
         int wstart = pw * kernel_step_w - image_pad_w;
-        int hend = std::min(hstart + kernel_height, (int) image_height);
-        int wend = std::min(wstart + kernel_width, (int) image_width);
-        hstart = std::max(hstart, (int) 0);
-        wstart = std::max(wstart, (int) 0);
+        int hend =
+            std::min(hstart + kernel_height, static_cast<int>(image_height));
+        int wend =
+            std::min(wstart + kernel_width, static_cast<int>(image_width));
+        hstart = std::max(hstart, static_cast<int>(0));
+        wstart = std::max(wstart, static_cast<int>(0));
         for (int oc = 0; oc < out_channel; oc++) {
           float sum = 0.0f;
           const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
@@ -117,28 +118,32 @@ class ConvPE : public PE {
           for (int h = hstart; h < hend; h++) {
             int hi = 0;
             if (ph == 0) {
               hi = h - hstart + image_pad_h;
-            } else{
+            } else {
               hi = h - hstart;
             }
             for (int w = wstart; w < wend; w++) {
               int wi = 0;
               if (pw == 0) {
                 wi = w - wstart + image_pad_w;
-              }else {
+              } else {
                 wi = w - wstart;
               }
               const int index = (h * image_width + w) * image_channels + c;
-              // int weight_index = (hi * kernel_width + wi) * image_channels + c;//TODO
-              int weight_index = oc * filter_chw + kernel_width *
-                                 kernel_height * c + kernel_width * hi + wi;
+              // int weight_index = (hi *
+              // kernel_width + wi) * image_channels
+              // + c;//TODO
+              int weight_index = oc * filter_chw +
+                                 kernel_width * kernel_height * c +
+                                 kernel_width * hi + wi;
               float value = image_addr[index] * filter_data[weight_index];
               sum += value;
             }
           }
         }
-        // std::cout << " ============================= pool_index:" << pool_index << " sum:" << sum << std::endl;
+        // std::cout << " ============================= pool_index:" <<
+        // pool_index << " sum:" << sum << std::endl;
        if (param_.relu.enabled && sum < 0) {
          sum = -sum;
        }
@@ -155,9 +160,6 @@ class ConvPE : public PE {
     output->scale()[1] = 127 / max;
   }
   void cpu_compute() {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
@@ -196,7 +198,6 @@ class ConvPE : public PE {
     for (int h = 0; h < output->shape().height(); h++) {
       for (int w = 0; w < output->shape().width(); w++) {
         float sum = 0;
         // #pragma omp parallel for
@@ -204,11 +205,10 @@ class ConvPE : public PE {
           int image_index = h * out_width * in_channel + w * in_channel + j;
           float value = image_addr[image_index] * filter_ptr[j];
           sum += value;
           // mi[j] = value;
         }
         // for (int j = 0; j < in_channel; j++) {
         //   sum += mi[j];
         // }
@@ -305,7 +305,8 @@ class ConvPE : public PE {
       // }
     }
-    if (param_.input->shape().channel() == 64 && param_.output->shape().channel() == 128) {
+    if (param_.input->shape().channel() == 64 &&
+        param_.output->shape().channel() == 128) {
       // exit(-1);
     }
......
...@@ -85,7 +85,8 @@ inline void combine_bn_params(BatchnormParam* bn, ConvParam* param_) { ...@@ -85,7 +85,8 @@ inline void combine_bn_params(BatchnormParam* bn, ConvParam* param_) {
} }
} }
inline void combine_add_bn_params(BatchnormParam* bn, Tensor* bias, inline void combine_add_bn_params(BatchnormParam* bn,
Tensor* bias,
ConvParam* param_) { ConvParam* param_) {
int channel = param_->output->shape().channel(); int channel = param_->output->shape().channel();
Shape sb_shape(N, {channel}); Shape sb_shape(N, {channel});
...@@ -117,74 +118,88 @@ inline void combine_add_bn_params(BatchnormParam* bn, Tensor* bias,
  param_->bias()->setDataLocation(CPU);
}

inline void format_scale_bias(Tensor* scale,
                              Tensor* bias,
                              Tensor* filter,
                              Tensor* scale_bias,
                              int group) {
  float* scale_data = nullptr;
  float* bias_data = nullptr;
  if (scale != nullptr) {
    scale_data = scale->data<float>();
  }
  if (bias != nullptr) {
    bias_data = bias->data<float>();
  }
  int channel = filter->shape().num();
  int scale_bias_len = align_to_x(channel / group, BS_NUM_ALIGNMENT) * group;

  int c_per_group = channel / group;
  int aligned_c_per_group = align_to_x(channel / group, BS_NUM_ALIGNMENT);

  Shape bias_scale_shape(N, {2 * scale_bias_len});
  float* bs_data = scale_bias->mutableData<float>(FP32, bias_scale_shape);
  float* temp_data =
      reinterpret_cast<float*>(fpga_malloc(2 * scale_bias_len * sizeof(float)));
  memset(temp_data, 0, 2 * scale_bias_len * sizeof(float));

  std::vector<float> scales;
  if (scale_data != nullptr) {
    for (int i = 0; i < channel; ++i) {
      scales.push_back(scale_data[i]);
    }
    for (int i = 0; i < scale_bias_len - channel; i++) {
      scales.push_back(1);
    }
  } else {
    for (int i = 0; i < scale_bias_len; i++) {
      scales.push_back(1);
    }
  }

  for (int i = 0; i < scale_bias_len; ++i) {
    temp_data[i + scale_bias_len] = 1;
    temp_data[i] = 0;
  }

  for (int g = 0; g < group; g++) {
    for (int c = 0; c < c_per_group; c++) {
      int src_index = g * c_per_group + c;
      int dst_index = g * aligned_c_per_group + c;
      float scale_value = scales[src_index];
      float bias_value = bias_data == nullptr ? 0 : bias_data[src_index];
      temp_data[dst_index + scale_bias_len] = scale_value;
      temp_data[dst_index] = bias_value;
    }
  }

  // int element_num_per_div = get_filter_num_per_div(filter, group);
  // int scale_bias_len = align_to_x(channel / group, 8) * group;
  bias_scale::format_bias_scale_array(
      &temp_data, scale_bias_len / group, scale_bias_len);
  memcpy(bs_data, temp_data, 2 * scale_bias_len * sizeof(float));
}
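For orientation: format_scale_bias packs the buffer as [bias | scale] halves, and within each half pads every group's channel block up to BS_NUM_ALIGNMENT, which is why padded slots are pre-seeded with bias 0 and scale 1. A minimal standalone sketch of that index mapping (toy sizes; align_up and the alignment value 8 are stand-ins for illustration, not the library's constants):

#include <cstdio>
#include <vector>

// Round n up to the next multiple of x (mirrors align_to_x above).
static int align_up(int n, int x) { return (n + x - 1) / x * x; }

int main() {
  const int channel = 6, group = 2, alignment = 8;         // toy sizes
  const int c_per_group = channel / group;                 // 3
  const int aligned_c = align_up(c_per_group, alignment);  // 8
  const int len = aligned_c * group;                       // slots per half
  std::vector<float> buf(2 * len, 0.0f);                   // [bias | scale]
  for (int g = 0; g < group; g++) {
    for (int c = 0; c < c_per_group; c++) {
      int src = g * c_per_group + c;  // dense source index
      int dst = g * aligned_c + c;    // padded destination index
      buf[dst] = 0.5f * src;          // bias half
      buf[dst + len] = 1.0f + src;    // scale half
    }
  }
  // slot 8 is group-0 padding: never written by the loop above
  printf("pad slot: bias=%f scale=%f\n", buf[8], buf[8 + len]);
  return 0;
}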
inline void format_filter(Tensor* filter,
                          Tensor* quantized_filter,
                          int group,
                          std::vector<float>& scales) {  // NOLINT
  float max_value = find_max(*filter);
  Shape& filter_shape = filter->shape();
  int mem_size;
  std::vector<float> max_values;
  int8_t* quantized_data = filter::format_filter(filter->data<float>(),
                                                 mem_size,
                                                 filter_shape.num(),
                                                 filter_shape.channel(),
                                                 filter_shape.height(),
                                                 filter_shape.width(),
                                                 group,
                                                 max_value,
                                                 max_values);
  float mem_factor = mem_size * 1.0f / filter->shape().numel();
  quantized_filter->setMemScale(mem_factor);
  quantized_filter->setAligned(true);
...@@ -209,10 +224,10 @@ inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group,
  // }
  // ofs.close();
  // exit(-1);
}
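format_filter quantizes the FP32 weights to int8 against the tensor-wide maximum (max_value), collecting per-split maxima in max_values. The exact filter::format_filter internals also realign memory; the arithmetic core is presumably the usual symmetric mapping, sketched here under that assumption rather than as the library routine:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Symmetric int8 quantization against a known absolute max.
// Illustrative sketch only; not filter::format_filter itself.
static int8_t quantize(float x, float max_abs) {
  float q = std::round(x * 127.0f / max_abs);
  if (q > 127.0f) q = 127.0f;
  if (q < -127.0f) q = -127.0f;
  return static_cast<int8_t>(q);
}

int main() {
  const float max_abs = 0.5f;
  printf("%d %d %d\n",
         quantize(0.5f, max_abs),    // 127
         quantize(-0.25f, max_abs),  // -64
         quantize(0.0f, max_abs));   // 0
  return 0;
}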
inline void format_dw_filter(Tensor* filter,
                             Tensor* quantized_filter,
                             float* scale) {
  int num = filter->shape().num();
  int height = filter->shape().height();
...@@ -259,12 +274,14 @@ inline void split_filter_num(const ConvParam& c_param) {
  int filter_num_per_div = get_filter_num_per_div(filter, param.groups);
  auto chw = filter->shape().channel() * filter->shape().height() *
             filter->shape().width();
  auto num = filter->shape().num();
  int div_capacity = filter::calc_division_capacity(chw);
  int filter_num_alignment = filter::get_filter_num_alignment();
  int aligned_num =
      align_to_x(num / param.groups, filter_num_alignment) * param.groups;
  // int aligned_num = align_to_x(num / param.groups ,FILTER_NUM_ALIGNMENT) *
  // param.groups;
  split_num = filter::calc_split_num(aligned_num, div_capacity);
  Shape& out_shape = out->shape();
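The split count is plain ceiling arithmetic: pad the per-group filter count up to the hardware alignment, then divide by how many filters a single pass can hold. A back-of-the-envelope sketch (the alignment and capacity constants below are invented for illustration, and calc_split_num is assumed to behave as a ceiling division):

#include <cstdio>

static int align_to_x(int n, int x) { return (n + x - 1) / x * x; }
static int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
  // Assumed example values; the real ones come from
  // filter::get_filter_num_alignment() and filter::calc_division_capacity().
  const int num = 100, groups = 2, alignment = 32, div_capacity = 64;
  int aligned_num = align_to_x(num / groups, alignment) * groups;  // 128
  int split_num = ceil_div(aligned_num, div_capacity);             // 2
  printf("aligned_num=%d split_num=%d\n", aligned_num, split_num);
  return 0;
}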
...@@ -280,20 +297,23 @@ inline void split_filter_num(const ConvParam& c_param) {
    ConvArgs& args = conv_param->args;

    if (split_num == 1) {
      out_address = out->data<float16>();
      out_scale_address = out->scale();
    }
    filter_num = i == split_num - 1
                     ? channel - (split_num - 1) * filter_num_per_div  // NOLINT
                     : filter_num_per_div;

    if (split_num != 1) {
      Shape shape(NHWC, {1, out_shape.height(), out_shape.width(), filter_num});
      out_address = conv_param->output.mutableData<float16>(FP16, shape);
      out_scale_address = conv_param->output.scale();
    }
    Shape f_shape(NCHW,
                  {filter_num,
                   filter->shape().channel(),
                   filter->shape().height(),
                   filter->shape().width()});

    Tensor new_filter;
    float* new_filter_data = new_filter.mutableData<float>(FP32, f_shape);
...@@ -307,11 +327,12 @@ inline void split_filter_num(const ConvParam& c_param) {
    conv_param->filter.mutableData<float>(FP32, f_shape);

    if (param.groups != 1) {
      int mem_factor =
          32 / filter_num_per_div;  // TODO(chonwhite): change 32 to param;
      conv_param->filter.setMemScale(mem_factor);
    }

    std::vector<float> v;  // TODO(chonwhite): change local variable name
    format_filter(&new_filter, &(conv_param->filter), param.groups, v);
    conv_param->filter.setDataType(INT8);
...@@ -326,15 +347,18 @@ inline void split_filter_num(const ConvParam& c_param) {
    float* bias_data = bias.mutableData<float>(FP32, s_shape);
    // std::cout << "v size: " << v.size() << std::endl;
    for (int n = 0; n < filter_num; n++) {
      scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
      // scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
    }
    for (int n = 0; n < filter_num; n++) {
      bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
    }
    Shape sb_shape(N, {sb_num});
    format_scale_bias(&scale,
                      &bias,
                      &conv_param->filter,
                      &conv_param->scaleBias,
                      param.groups);
    // conv_param->scaleBias.saveToFile("sb.txt");
    conv_param->scaleBias.flush();
    float* bs_data = conv_param->scaleBias.data<float>();
...@@ -363,7 +387,7 @@ inline void split_filter_num(const ConvParam& c_param) {
    args.image.height = input->shape().height();
    args.image.pad_width = param.paddings[1];
    args.image.pad_height = param.paddings[0];
    // dilations[0] = dilations[1];
    args.dilation = param.dilations[0];
    args.output.address = out_address;
...@@ -420,8 +444,11 @@ inline void split_channel(const ConvParam& c_param) {
    scale.flush();
    bias.flush();
    // Shape sb_shape(N, {2 * channel});
    format_scale_bias(&scale,
                      &bias,
                      &conv_param->filter,
                      &conv_param->scaleBias,
                      param.groups);
    conv_param->scaleBias.flush();
    // conv_param->scaleBias.saveToFile("sb.txt");
...@@ -445,7 +472,7 @@ inline void split_channel(const ConvParam& c_param) {
    args.image.height = conv_param->input.shape().height();
    args.image.pad_width = param.paddings[1];
    args.image.pad_height = param.paddings[0];
    // dilations[0] = dilations[1]
    args.dilation = param.dilations[0];
    args.output.address = conv_param->output.mutableData<void>();
    args.output.scale_address = conv_param->output.scale();
...
...@@ -14,8 +14,6 @@ limitations under the License. */
#include "lite/backends/fpga/KD/pes/crop_pe.hpp"

namespace paddle {
namespace zynqmp {
...
...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once

#include <algorithm>
#include <cstring>
#include <vector>
...
...@@ -36,11 +36,11 @@ class DepthwiseConvPE : public PE {
    Tensor* input = param.input;
    Tensor* output = param.output;
    int channel = output->shape().channel();

    float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
    if (param_.bias()->dataType() == FP32) {
      float* new_bias_data = param_.bias()->data<float>();
      // convert the bias from float to float16
      for (int i = 0; i < channel; i++) {
        b_data[i] = float_to_half(new_bias_data[i]);
      }
...@@ -56,16 +56,18 @@ class DepthwiseConvPE : public PE {
      Tensor* quantized_filter = param.quantizedFilter();
      quantized_filter->mutableData<float16>(FP16, param.filter->shape());
      format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
    } else {
      // case: the filter is all ones and the channel count is already aligned
      float16* scale_data = param_.scale()->data<float16>();
      float16* filter_data = param.quantizedFilter()->mutableData<float16>(
          FP16, param.filter->shape());
      // memcpy(filter_data, scale_data, channel * sizeof(float16));
      memcpy(filter_data,
             scale_data,
             param.filter->shape().numel() * sizeof(float16));
      param.quantizedFilter()->flush();
    }

    DWconvArgs args = {0};
    args.bias_address = b_data;
...@@ -116,4 +118,4 @@ class DepthwiseConvPE : public PE {
};

}  // namespace zynqmp
}  // namespace paddle
...@@ -57,7 +57,7 @@ class ElementwiseMulPE : public PE {
  void updateInput(Tensor* t, int index) {
    if (index == 0) {
      args_.scale_address = t->data<void>();  // replace inputs?
    }
  }
...
...@@ -40,13 +40,13 @@ class FullyConnectedPE : public PE {
    num_ = param_.input->shape().num();

    // if (num_ == 1) {
    // } else {
    //   tempOut_.mutableData<void>(FP16, param_.out->shape());
    //   convParam_.output = &tempOut_;
    // }

    convParam_.output = param_.output;
    convParam_.groups = 1;
    convParam_.strides = {1, 1};
    convParam_.paddings = {0, 0};
...@@ -93,9 +93,7 @@ class FullyConnectedPE : public PE {
    convPE_.apply();
  }

  bool dispatch() { return convPE_.dispatch(); }

  FullyConnectedParam& param() { return param_; }
...
...@@ -14,16 +14,16 @@ limitations under the License. */
#pragma once

#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp"
#include "lite/backends/fpga/KD/pes/fully_connected_pe.hpp"
#include "lite/backends/fpga/KD/pes/relu_pe.hpp"

#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"

#include "lite/backends/arm/math/sgemm.h"

#include "lite/api/paddle_place.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/type_system.h"

namespace paddle {
...@@ -38,8 +38,6 @@ struct GRUTensors {
class GRUPE : public PE {
 public:
  bool init() {
    // Tensor* output = param_.output;
    // output->setAligned(true);
...@@ -51,9 +49,10 @@ class GRUPE : public PE {
    auto hidden = param_.hidden;
    // auto hidden_dims = hidden->dims();
    int frame_size = hidden->shape().channel();

    zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
    float16* prev_hidden_data =
        prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
    // set previous hidden data to 0;
    memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
...@@ -62,7 +61,9 @@ class GRUPE : public PE {
    float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
    memset(weight_data, 0, weight_shape.numel() * sizeof(float));
    weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
    memcpy(weight_data,
           param_.weight->data<float>(),
           weight_shape.numel() * sizeof(float));

    Shape gate_shape(zynqmp::NC, {1, frame_size * 2});
    gate_ping_.mutableData<void>(FP32, gate_shape);
...@@ -85,8 +86,9 @@ class GRUPE : public PE {
    // // ====================
    // Shape state_weight_shape(NC,{frame_size, frame_size});
    // float* state_weight_data = state_weight_.mutableData<float>(FP32,
    // state_weight_shape);
    // memcpy(state_weight_data, weight_data + 2 * frame_size * frame_size,
    //     state_weight_shape.numel() * sizeof(float));
    // FullyConnectedParam& reset_out_param = reset_out_pe_.param();
    // reset_out_param.input = &prev_hidden;
...@@ -95,13 +97,12 @@ class GRUPE : public PE {
    // // ============== unit reset;
    // update_gate_.mutableData<void>(FP16, pre_input_shape);
    // InputParam& relu_param = update_relu_pe_.param();
    // relu_param.input = &tempTensor;
    // relu_param.output = &update_gate_;
    // update_relu_pe_.init();
    // update_relu_pe_.apply();

    reset_gate_.mutableData<void>(FP16, hidden_shape);
    prev_hidden_.mutableData<void>(FP16, hidden_shape);
    reset_hidden_.mutableData<void>(FP16, hidden_shape);
...@@ -112,7 +113,8 @@ class GRUPE : public PE {
    // reset_relu_pe_.apply();

    // float16* prev_data = prev_.mutableData<float16>(FP16, pre_input_shape);
    // memset(prev_data, 0, (pre_input_shape.numel() + 32) * sizeof(float16));
    // // TODO
    // reset_hidden_prev_.mutableData<float16>(FP16, pre_input_shape);

    ElementwiseMulParam& mul_param = mul_pe_.param();
...@@ -120,17 +122,15 @@ class GRUPE : public PE {
    mul_param.output = &reset_hidden_;
    mul_pe_.init();
    mul_pe_.apply();
    // ==============
  }

  bool dispatch() { return true; }

  void gru_unit_reset_act(const lite_api::ActivationType active_gate,
                          GRUTensors& value,  // NOLINT
                          int frame_size,
                          int batch_size) {
    int stride_update = 3 * frame_size;
    int stride_cell_state = 3 * frame_size;
    int stride_hidden_prev = frame_size;
...@@ -143,17 +143,22 @@ class GRUPE : public PE {
    float* reset_gate_data = update_gate_data + frame_size;

    for (int b = 0; b < batch_size; b++) {
      // memcpy(tempTensor.data<void>(), reset_gate_data, gate->shape().numel()
      // * sizeof(float));
      // tempTensor.flush();
      Tensor tmp;
      Shape s(NC, {1, frame_size});
      float* tmp_data = tmp.mutableData<float>(FP32, s);

      for (int i = 0; i < frame_size; i++) {
        // f(x) = x / (1 + abs(x))?
        update_gate_data[i] =
            lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(
                update_gate_data[i]);
        reset_gate_data[i] =
            lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(
                reset_gate_data[i]);
      }
      memcpy(tmp_data, reset_gate_data, frame_size * sizeof(float));
      tmp.flush();
...@@ -163,7 +168,7 @@ class GRUPE : public PE {
      Tensor* hidden_prev = value.pre_output;
      if (hidden_prev) {
        // memcpy(prev_data, )
        // TODO(chonwhite): change to pre_out;
        prev_hidden_.copyFrom(value.pre_output);
        prev_hidden_.saveToFile("prev_.txt");
      }
...@@ -179,9 +184,11 @@ class GRUPE : public PE {
    }
  }
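For readers tracking the math: the loop above is the standard GRU gating, u_t = sigmoid(update), r_t = sigmoid(reset), with the reset hidden state r_t * h_{t-1} produced elementwise by mul_pe_. A plain-C++ sketch of one frame of that update, independent of the FPGA tensor machinery:

#include <cmath>
#include <cstdio>
#include <vector>

static float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

int main() {
  const int frame_size = 4;
  // Gate buffer layout per batch row: [update | reset | cell], 3*frame_size.
  std::vector<float> gate = {0.1f, -0.2f, 0.3f, 0.0f,   // update
                             0.5f, -0.5f, 1.0f, -1.0f,  // reset
                             0.0f, 0.0f,  0.0f, 0.0f};  // cell (untouched)
  std::vector<float> hidden_prev = {1.f, 2.f, 3.f, 4.f};
  std::vector<float> reset_hidden(frame_size);
  float* update_gate = gate.data();
  float* reset_gate = gate.data() + frame_size;
  for (int i = 0; i < frame_size; i++) {
    update_gate[i] = sigmoid(update_gate[i]);
    reset_gate[i] = sigmoid(reset_gate[i]);
    reset_hidden[i] = reset_gate[i] * hidden_prev[i];  // r_t * h_{t-1}
  }
  printf("reset_hidden[0]=%f\n", reset_hidden[0]);
  return 0;
}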
  void gru_unit_out_act(const lite_api::ActivationType active_node,
                        bool origin_mode,
                        GRUTensors& value,  // NOLINT
                        int frame_size,
                        int batch_size) {
    // int stride_update = 3 * frame_size;
    // int stride_cell_state = 3 * frame_size;
    // int stride_hidden_prev = frame_size;
...@@ -206,13 +213,16 @@ class GRUPE : public PE {
    // // if (hidden_prev) {
    // //   prev = hidden_prev[i];
    // // }
    // //   cell_state[i] =
    // lite::arm::math::active_f32<kSigmoid>(cell_state[i]);
    // //   hidden[i] =
    // //       cell_state[i] * (1.f - updata_gate[i]) + updata_gate[i] *
    // prev;
    // // }
    // } else {
    //   for (int i = 0; i < frame_size; ++i) {
    //     cell_state[i] =
    //     lite::arm::math::active_f32<lite_api::ActivationType::kRelu>(cell_state[i]);
    //     if (hidden_prev) {
    //       prev = hidden_prev[i];
    //     }
...@@ -228,12 +238,11 @@ class GRUPE : public PE {
    // }
  }

  void copy_input(GRUTensors& value) {  // NOLINT
    float max = find_max(*(value.gate));
    gate_ping_.mutableData<void>(FP32, value.gate->shape());
    gate_ping_.copyFrom(value.gate);
    // update input pointer?

    // gate_.readFromFile("input/in.txt");
    // // pre_input_.saveToFile("pppp_in.txt");
...@@ -241,7 +250,6 @@ class GRUPE : public PE {
    // gate_.scale()[1] = 127 / max;
    // gate_.printScale("pre_input_");
    // gate_.saveToFile("pre_input_.txt");
    // pre_out_pe_.dispatch();
...@@ -249,12 +257,12 @@ class GRUPE : public PE {
    // pre_output_.saveToFile("pp_out.txt");
  }

  void GRUCOmpute(GRUTensors& value,  // NOLINT
                  int frame_size,
                  int batch_size,
                  const lite_api::ActivationType active_node,
                  const lite_api::ActivationType active_gate,
                  bool origin_mode) {
    copy_input(value);

    if (value.pre_output) {
...@@ -269,7 +277,8 @@ class GRUPE : public PE {
    // // state weight;
    // reset_out_pe_.dispatch();
    // }
    // gru_unit_out_act(active_node, origin_mode, value, frame_size,
    // batch_size);
  }

  GRUParam& param() { return param_; }
...@@ -282,13 +291,9 @@ class GRUPE : public PE {
  // return &gate_;
  // }

  Tensor* updateGate() { return &update_gate_; }

  Tensor* resetGate() { return &reset_gate_; }

 private:
  GRUParam param_;
...
...@@ -18,121 +18,6 @@
namespace paddle {
namespace lite {
namespace fpga {}
}
}
\ No newline at end of file
...@@ -38,7 +38,6 @@ class PoolingPE : public PE {
    uint32_t k_height = param_.kernelSize[0];
    uint32_t k_width = param_.kernelSize[1];

    if (param_.globalPooling) {
      k_width = input->shape().width();
      k_height = input->shape().height();
...@@ -68,8 +67,9 @@ class PoolingPE : public PE {
    use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
               (k_width > 7 || k_height > 7);
    // use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
    // &&
    //            (k_width > 255 || k_height > 255);
    use_cpu_ = param_.type == AVERAGE;
  }
...@@ -172,7 +172,8 @@ class PoolingPE : public PE {
    input->syncToCPU();

    Tensor float_input;
    float* float_input_data =
        float_input.mutableData<float>(FP32, input->shape());
    float_input.copyFrom(input);

    float16* data_out = output->data<float16>();
...
...@@ -253,9 +253,8 @@ bool PriorBoxPE::dispatch() {
  if (cachedBoxes_ == nullptr) {
    cachedBoxes_ = new Tensor();
    cachedVariances_ = new Tensor();
    cachedBoxes_->mutableData<float>(FP32, param_.outputBoxes->shape());
    cachedVariances_->mutableData<float>(FP32, param_.outputVariances->shape());
    cachedBoxes_->setDataLocation(CPU);
    cachedVariances_->setDataLocation(CPU);
    compute_prior_box();
...
...@@ -14,19 +14,16 @@ limitations under the License. */
#pragma once

#include <algorithm>

#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp"
#include "lite/backends/fpga/KD/tensor.hpp"

namespace paddle {
namespace zynqmp {

class ScalePE : public PE {
 public:
  inline int gcd(int a, int b) {
...@@ -124,7 +121,7 @@ class ScalePE : public PE {
  void apply() {
    Tensor* input = param_.input;
    Tensor* output = param_.output;
    Shape& input_shape = input->shape();
    DepthwiseConvParam& dw_param = dw_pe_.param();
    int channel = input_shape.channel();
...@@ -136,19 +133,19 @@ class ScalePE : public PE {
      int c_lcm = lcm(channel, alignment);
      repeat = c_lcm / (channel);
    }
    // FPGA constraints: H > 2047, W > 1023, or W*C > 65536 must fall back to
    // the CPU implementation.
    Shape shape(N, {channel * repeat});
    float* filter_data = filter.mutableData<float>(FP32, shape);
    std::fill_n(filter_data, input->shape().channel(), 1.0f);

    Tensor* scale = dw_param.scale();
    float16* scale_data = scale->mutableData<float16>(FP16, shape);
    // memcpy(scale_data, param_.scale->data<float>(), input->shape().channel()
    // * sizeof(float));

    Tensor* bias = dw_param.bias();
    float16* bias_data = bias->mutableData<float16>(FP16, shape);
    std::fill_n(bias_data, input->shape().channel(), 0);
...@@ -206,13 +203,14 @@ class ScalePE : public PE {
    }

    // if (param_.bias != nullptr) {
    //   memcpy(bias_data, param_.bias->data<float>(), input->shape().channel()
    //   * sizeof(float));
    // }

    dw_param.input = param_.input;
    dw_param.output = param_.output;
    dw_param.filter = &filter;

    dw_param.strides = {1, 1};
    dw_param.paddings = {0, 0};
    dw_param.kernelSize = {1, 1};
...@@ -220,7 +218,6 @@ class ScalePE : public PE {
    dw_pe_.init();
    dw_pe_.apply();
  }

  void cpu_compute() {
...@@ -244,7 +241,8 @@ class ScalePE : public PE {
      for (int c = 0; c < input->shape().channel(); c++) {
        int index = i * input->shape().channel() + c;
        float value = half_to_float(in_data[index]) * scale_data[c];
        std::cout << "value:" << value << " = " << half_to_float(in_data[index])
                  << " x " << scale_data[c] << std::endl;
        data_out[index] = float_to_half(value);

        if (value < 0) {
...@@ -261,18 +259,19 @@ class ScalePE : public PE {
  }

  bool dispatch() {
    // cpu_compute();
    // return true;

    if (param_.scale->dataType() == FP16) {
      DepthwiseConvParam& dw_param = dw_pe_.param();
      memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
             param_.scale->data<float16>(),
             param_.scale->shape().numel() * sizeof(float16));
      dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
      dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];

      dw_param.quantizedFilter()->flush();
      // apply();
    }
    // param_.scale->saveToFile("scale.txt");
    // cpu_compute();
...@@ -293,5 +292,3 @@ class ScalePE : public PE {
};

}  // namespace zynqmp
}  // namespace paddle
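ScalePE maps the per-channel affine y = x * scale (+ bias) onto a 1x1 depthwise convolution whose filter weights are all ones, so the hardware's per-channel scale/bias stage does the real work; cpu_compute above is the reference path. A standalone sketch of those reference semantics (plain float instead of float16, NHWC layout):

#include <cstdio>
#include <vector>

// Reference per-channel scale, matching the ScalePE::cpu_compute semantics
// above (simplified to float; the PE itself works on float16 NHWC data).
void scale_nhwc(const std::vector<float>& in, int hw, int channel,
                const std::vector<float>& scale, std::vector<float>* out) {
  for (int i = 0; i < hw; i++) {
    for (int c = 0; c < channel; c++) {
      int index = i * channel + c;
      (*out)[index] = in[index] * scale[c];
    }
  }
}

int main() {
  std::vector<float> in = {1, 2, 3, 4};  // hw = 2, channel = 2
  std::vector<float> scale = {10, 100};
  std::vector<float> out(4);
  scale_nhwc(in, 2, 2, scale, &out);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 10 200 30 400
  return 0;
}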
...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once

#include <stdio.h>
#include <unistd.h>
#include <algorithm>
#include <cmath>
#include <cstring>
...@@ -24,8 +25,6 @@ limitations under the License. */
#include <string>
#include <vector>

// #include "lite/core/tensor.h"
#include "lite/backends/fpga/KD/dl_engine.hpp"
...@@ -303,13 +302,15 @@ class Tensor {
    this->invalidate();
  }

  void flush() {
    size_t memorySize =
        shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
    fpga_flush(placeHolder_->data(), memorySize);
  }

  void invalidate() {
    size_t memorySize =
        shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
    fpga_invalidate(placeHolder_->data(), memorySize);
  }
...@@ -357,7 +358,8 @@ class Tensor {
    std::cout << type << " : "
              << std::to_string(shape_->num()) + "_" +
                     std::to_string(shape_->channel()) + "_" +
                     std::to_string(shape_->height()) + "_" +
                     std::to_string(shape_->width())
              << std::endl;
    std::cout << type << " \n";
    printScale();
...@@ -383,7 +385,6 @@ class Tensor {
  }

  void saveToFile(std::string path) {
    syncToCPU();
    invalidate();
    std::ofstream ofs;
...@@ -395,7 +396,7 @@ class Tensor {
  }

  void save_file_with_name(std::string path) {
    // return;
    invalidate();
    // usleep(20000);
    // return;
...@@ -406,7 +407,6 @@ class Tensor {
    ofs << "dataType: " << dataType_ << std::endl;
    ofs << "scale: " << scale()[0] << " , " << scale()[1] << std::endl;
    for (int i = 0; i < shape_->numel(); i++) {
      float value = 0;
      if (dataType_ == FP32) {
...
...@@ -161,7 +161,7 @@ class TensorLite {
  TensorLite Slice(int64_t begin, int64_t end) const;

  template <typename T>
  void Slice(TensorLite &dst, int64_t begin, int64_t end) const;  // NOLINT

  TargetType target() const { return target_; }
...@@ -247,9 +247,7 @@ R *TensorLite::mutable_data(TargetType target) {
template <typename T>
TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
  throw -1;
  CHECK_GE(begin, 0);
  CHECK_LE(end, dims_[0]);
  CHECK_LT(begin, end);
...@@ -265,20 +263,21 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
    auto dst_dims = dims_;
    dst_dims[0] = end - begin;
    dst.Resize(dst_dims);
    void *dst_data = dst.mutable_data<T>();
    T *src_data = const_cast<T *>(data<T>());
    memcpy(dst_data,
           src_data + static_cast<size_t>(begin * base) * sizeof(T),
           dst_dims.production() * sizeof(T));
    dst.ZynqTensor()->saveToFile("_slice", true);
    // dst.offset_ = offset_ + static_cast<size_t>(begin * base) * sizeof(T);
    return dst;
  }
}

template <typename T>
void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const {
  CHECK_GE(begin, 0);
  CHECK_LE(end, dims_[0]);
  CHECK_LT(begin, end);
...@@ -287,13 +286,14 @@ void TensorLite::Slice(TensorLite& dst, int64_t begin, int64_t end) const {
  auto dst_dims = dims_;
  dst_dims[0] = end - begin;
  dst.Resize(dst_dims);
  void *dst_data = dst.mutable_data<T>();
  int64_t base = numel() / dims_[0];

  T *src_data = const_cast<T *>(data<T>());
  memcpy(dst_data,
         src_data + static_cast<size_t>(begin * dst_dims.production()),
         dst_dims.production() * sizeof(T));
}
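One thing to watch in both Slice overloads: the source pointer is a typed T*, so pointer arithmetic already scales by sizeof(T); multiplying the offset by sizeof(T) again (first overload), or offsetting by begin * dst_dims.production() instead of begin * base (second overload), lands on the wrong element whenever begin > 0. A minimal sketch of the element-offset arithmetic the code appears to intend (my reading, not a committed fix):

#include <cstdio>
#include <cstring>
#include <vector>

// Copy rows [begin, end) of a row-major tensor whose leading dimension has
// rows of `base` elements each.
template <typename T>
void slice_rows(const T* src, T* dst, int64_t begin, int64_t end,
                int64_t base) {
  // Offset in *elements*; pointer arithmetic on T* scales by sizeof(T).
  std::memcpy(dst, src + begin * base, (end - begin) * base * sizeof(T));
}

int main() {
  std::vector<float> t = {0, 1, 2, 3, 4, 5};  // dims {3, 2}, base = 2
  std::vector<float> out(2);
  slice_rows(t.data(), out.data(), 1, 2, 2);  // take row 1
  printf("%g %g\n", out[0], out[1]);          // 2 3
  return 0;
}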
template <typename TensorT>
...
...@@ -90,13 +90,13 @@ void TypeTargetTransformPass::AddIoCopyInst(
  // string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id());
  // TODO(MyPandaShaoxiang) should set same place with input?
  auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name);
  // Set the place for io_copy_output_arg node, the target should be equal to
  // to.target()
  // The precision and layout should be equal to from.precision(), from.layout()
#ifndef LITE_WITH_FPGA
  io_copy_output_arg->AsArg().type =
      LiteType::GetTensorTy(to.target(), from.precision(), from.layout());
#endif
  auto* io_copy_inst = graph->NewInstructNode();

  bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
...
...@@ -60,57 +60,57 @@ class Optimizer {
    InitTargetTypeTransformPass();

    if (passes.empty()) {
      std::vector<std::string> passes_local{{
          "lite_quant_dequant_fuse_pass",     //
          "lite_conv_elementwise_fuse_pass",  // conv-elemwise-bn
          "lite_conv_bn_fuse_pass",           //
          "lite_conv_elementwise_fuse_pass",  // conv-bn-elemwise
          // TODO(Superjomn) Refine the fusion related design to select fusion
          // kernels for devices automatically.
          "lite_conv_activation_fuse_pass",              //
          "lite_fc_fuse_pass",                           //
          "lite_shuffle_channel_fuse_pass",              //
          "lite_transpose_softmax_transpose_fuse_pass",  //
          "lite_interpolate_fuse_pass",                  //
          "identity_scale_eliminate_pass",               //
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
          "lite_elementwise_add_activation_fuse_pass",  //
#endif
          "static_kernel_pick_pass",        // pick original kernel from graph
          "variable_place_inference_pass",  // inference arg/var's
          // info(target/precision/layout/device)
          // using kernel info
          "argument_type_display_pass",  // debug pass: show arg-type-node's
          // info
          // (target/precision/layout/device)
          "type_target_cast_pass",  // add io_copy/io_copy_once if meet
          // different targets when last and next
          // node
          "variable_place_inference_pass",  //
          "argument_type_display_pass",     //
          "io_copy_kernel_pick_pass",       //
          "argument_type_display_pass",     //
          "variable_place_inference_pass",  //
          "argument_type_display_pass",     //
          "type_precision_cast_pass",       //
          "variable_place_inference_pass",  //
          "argument_type_display_pass",     //
          "type_layout_cast_pass",  // add layout/layout_once op if meet
          // different layout when last and next node
          "argument_type_display_pass",     //
          "variable_place_inference_pass",  //
          "argument_type_display_pass",
          "runtime_context_assign_pass",
          "argument_type_display_pass",
          // "memory_optimize_pass"
      }};
      RunPasses(passes_local);
    } else {
      RunPasses(passes);
...
...@@ -121,7 +121,7 @@ void RuntimeProgram::Run() {
    inst.Run();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
    // LITE_PRECISION_PROFILE(inst)
#endif  // LITE_WITH_PRECISION_PROFILE
#endif  // LITE_WITH_PROFILE
  }
...
...@@ -221,6 +221,10 @@ void BoxCoderCompute::Run() {
      }
    }
  }
  // prior_box->ZynqTensor()->saveToFile("prior_box", true);
  // prior_box_var->ZynqTensor()->saveToFile("prior_box_var", true);
  // output_box->ZynqTensor()->saveToFile("box_coder", true);
}

}  // namespace arm
...
...@@ -85,6 +85,9 @@ void PriorBoxCompute::Run() {
                        is_clip,
                        order,
                        min_max_aspect_ratios_order);

  param.boxes->ZynqTensor()->saveToFile("pb_boxes", true);
  param.variances->ZynqTensor()->saveToFile("pb_variance", true);
}

}  // namespace arm
...@@ -98,22 +101,22 @@ REGISTER_LITE_KERNEL(prior_box,
                     kNCHW,
                     paddle::lite::kernels::arm::PriorBoxCompute,
                     def)
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("Image", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();

// REGISTER_LITE_KERNEL(prior_box,
//                      kFPGA,
//                      kFP16,
//                      kNHWC,
//                      paddle::lite::kernels::arm::PriorBoxCompute,
//                      def)
//     .BindInput("Input",{LiteType::GetTensorTy(
//         TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
//     .BindInput("Image", {LiteType::GetTensorTy(
//         TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
//     .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
//     .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
//     .Finalize();
...@@ -21,7 +21,7 @@ add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc
add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps})
# add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps})
add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps})
# add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op)
# add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps})
add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps})
...
...@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/fpga/concat_compute.h"
#include <string>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
...
...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/fpga/conv_compute.h"
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
...@@ -79,8 +79,8 @@ void ConvCompute::Run() {
  } else {
    conv_pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
    zynqmp::ConvParam& conv_param = conv_pe_.param();
    Debugger::get_instance().registerOutput("conv", conv_param.output);
#endif
  }
}
...
...@@ -12,11 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/fpga/dropout_compute.h"
#include <string>

#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/backends/fpga/KD/float16.hpp"

namespace paddle {
namespace lite {
...
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <string>
#include "lite/kernels/fpga/elementwise_compute.h" #include "lite/kernels/fpga/elementwise_compute.h"
#include <string>
#include "lite/backends/arm/math/funcs.h" #include "lite/backends/arm/math/funcs.h"
#include "lite/backends/fpga/KD/debugger.hpp" #include "lite/backends/fpga/KD/debugger.hpp"
...@@ -38,7 +38,7 @@ void ElementwiseAddCompute::PrepareForRun() { ...@@ -38,7 +38,7 @@ void ElementwiseAddCompute::PrepareForRun() {
pe_.init(); pe_.init();
pe_.apply(); pe_.apply();
} }
void ElementwiseAddCompute::Run() { void ElementwiseAddCompute::Run() {
pe_.dispatch(); pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR #ifdef FPGA_PRINT_TENSOR
zynqmp::ElementwiseAddParam& ew_param = pe_.param(); zynqmp::ElementwiseAddParam& ew_param = pe_.param();
...@@ -60,8 +60,8 @@ void ElementwiseAddActivationCompute::PrepareForRun() { ...@@ -60,8 +60,8 @@ void ElementwiseAddActivationCompute::PrepareForRun() {
pe_.init(); pe_.init();
pe_.apply(); pe_.apply();
} }
void ElementwiseAddActivationCompute::Run() { void ElementwiseAddActivationCompute::Run() {
pe_.dispatch(); pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR #ifdef FPGA_PRINT_TENSOR
zynqmp::ElementwiseAddParam& ew_param = pe_.param(); zynqmp::ElementwiseAddParam& ew_param = pe_.param();
Debugger::get_instance().registerOutput("ew_add", ew_param.output); Debugger::get_instance().registerOutput("ew_add", ew_param.output);
...@@ -86,12 +86,12 @@ void ElementwiseMulCompute::PrepareForRun() { ...@@ -86,12 +86,12 @@ void ElementwiseMulCompute::PrepareForRun() {
zynqmp::Shape shape(zynqmp::N, {channel}); zynqmp::Shape shape(zynqmp::N, {channel});
float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape); float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape);
float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape); float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape);
float scale_value = param.Y->data<float>()[0];; float scale_value = param.Y->data<float>()[0];
for (int i = 0; i < channel; ++i) { for (int i = 0; i < channel; ++i) {
if (param.Y->dims().production() != 1) { if (param.Y->dims().production() != 1) {
scale_value = param.Y->ZynqTensor()->data<float>()[i]; scale_value = param.Y->ZynqTensor()->data<float>()[i];
} }
scale_data[i] = scale_value; scale_data[i] = scale_value;
bias_data[i] = 0; bias_data[i] = 0;
} }
...@@ -100,10 +100,11 @@ void ElementwiseMulCompute::PrepareForRun() { ...@@ -100,10 +100,11 @@ void ElementwiseMulCompute::PrepareForRun() {
pe_.apply(); pe_.apply();
} }
void ElementwiseMulCompute::Run() { void ElementwiseMulCompute::Run() {
pe_.dispatch(); pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR #ifdef FPGA_PRINT_TENSOR
zynqmp::ScaleParam& scale_param = pe_.param(); zynqmp::ScaleParam& scale_param = pe_.param();
Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input);
Debugger::get_instance().registerOutput("ew_mul", scale_param.output); Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
#endif #endif
} }
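For reference, the broadcast done in PrepareForRun() above can be read in isolation: elementwise_mul is mapped onto the FPGA ScalePE by expanding Y, either a single scalar or a per-channel vector, into a per-channel scale array with a zero bias. A minimal CPU-side sketch of that logic (plain std::vector stand-ins, not the zynqmp API):

#include <cassert>
#include <vector>

// Mirror of the scale/bias setup in ElementwiseMulCompute::PrepareForRun():
// y holds either one value (broadcast to all channels) or one per channel.
void BroadcastScale(const std::vector<float>& y,
                    int channel,
                    std::vector<float>* scale,
                    std::vector<float>* bias) {
  assert(y.size() == 1 || y.size() == static_cast<size_t>(channel));
  scale->resize(channel);
  bias->assign(channel, 0.0f);  // pure multiply: additive term stays zero
  for (int i = 0; i < channel; ++i) {
    (*scale)[i] = (y.size() == 1) ? y[0] : y[i];
  }
}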
@@ -123,8 +124,7 @@ REGISTER_LITE_KERNEL(elementwise_add,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
@@ -162,12 +162,9 @@ REGISTER_LITE_KERNEL(elementwise_mul,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
@@ -58,6 +58,7 @@ class ElementwiseMulCompute
void Run() override;
virtual ~ElementwiseMulCompute() = default;
private:
zynqmp::ScalePE pe_;
};
...
@@ -13,9 +13,9 @@
// limitations under the License.
#include "lite/kernels/fpga/fc_compute.h"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
...
@@ -13,9 +13,9 @@
// limitations under the License.
#include "lite/kernels/fpga/feed_compute.h"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
...
@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/fetch_compute.h"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
@@ -45,7 +45,7 @@ void FetchCompute::PrepareForRun() {
void FetchCompute::Run() {
pe_.dispatch();
auto& param = this->Param<param_t>();
#ifdef FPGA_PRINT_TENSOR
zynqmp::OutputParam& fetch_param = pe_.param();
Debugger::get_instance().registerOutput("fetch", fetch_param.output);
...
@@ -11,10 +11,12 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <unistd.h>
// #include <chrono>
#include <iostream>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/arm/math/gru_utils.h"
@@ -23,20 +25,16 @@
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
#include "lite/kernels/fpga/gru_compute.h"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/backends/fpga/KD/pes/gru_util.hpp"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
using float16 = zynqmp::float16;
inline lite_api::ActivationType get_gru_act_type(const std::string& type) {
@@ -71,7 +69,8 @@ void GRUCompute::PrepareForRun() {
gru_param.bias = bias->ZynqTensor();
gru_param.batch_gate = param.batch_gate->ZynqTensor();
gru_param.batch_reset_hidden_prev =
param.batch_reset_hidden_prev->ZynqTensor();
gru_param.batch_hidden = param.batch_hidden->ZynqTensor();
gru_param.hidden = param.hidden->ZynqTensor();
@@ -105,12 +104,12 @@ void GRUCompute::Run() {
float* batch_gate_data = batch_gate->mutable_data<float>();
lite::arm::math::LoDTensor2BatchFunctor<float> to_batch;
to_batch(*input, batch_gate, true, param.is_reverse);  // 1.
save_tensor(batch_gate, "_batch_gate.txt");
if (bias) {
auto bias_data = bias->data<float>();  // 2.
lite::arm::math::gru_add_with_bias(batch_gate_data,
bias_data,
batch_gate_data,
@@ -130,13 +129,12 @@ void GRUCompute::Run() {
Tensor ordered_h0;
std::vector<uint64_t> order(batch_gate->lod()[2]);
if (h0) {
// Since the batch computing for GRU reorders the input sequences
// according to their length. The initialized cell state also needs
// to reorder.
// lite::arm::math::ReorderInitState<float>(*h0, order, &ordered_h0, true);
// //3.
gru_value.prev_out_value = ordered_h0.mutable_data<float>();
gru_tensors.pre_output = ordered_h0.ZynqTensor();
std::cout << "================= h0 =================\n";
@@ -171,31 +169,36 @@ void GRUCompute::Run() {
batch_reset_hidden_prev->mutable_data<float>() +
bstart * batch_reset_hidden_prev->dims()[1];
zynqmp::Shape float_input_shape(zynqmp::NC,
{cur_batch_size, batch_gate->dims()[1]});
float* float_data =
float_input.mutableData<float>(zynqmp::FP32, float_input_shape);
memcpy(float_data,
gru_value.gate_value,
batch_gate->dims()[1] * sizeof(float));
float_input.flush();
float* hidden_data =
hidden_out.mutableData<float>(zynqmp::FP32, float_input_shape);
// memcpy(hidden_prev_data, )
// zynqmp::Tensor* gate = pe_.gate();
gru_tensors.gate = &float_input;
gru_tensors.output = &hidden_out;
pe_.GRUCOmpute(gru_tensors,
frame_size,
cur_batch_size,
active_node,
active_gate,
param.origin_mode);
// TODO(chonwhite): copy data back to original tensor;
gru_tensors.pre_output = gru_tensors.output;
// gru_value.prev_out_value = gru_value.output_value;
}
lite::arm::math::Batch2LoDTensorFunctor<float> to_seq;  // 5.
*(batch_hidden->mutable_lod()) = batch_gate->lod();
batch_hidden->mutable_data<float>();
to_seq(*batch_hidden, hidden);
@@ -204,7 +207,6 @@ void GRUCompute::Run() {
save_tensor(hidden, "_gru.txt");
exit(-1);
}
} // namespace fpga
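Run() above follows the usual Paddle-Lite GRU recipe, matching the numbered comments in the code: (1) reorder the LoD sequences into per-time-step batches, (2) add the gate bias, then walk the batches and hand each gate slice to the FPGA GRU PE, and finally (5) scatter the batched hidden states back into sequence order. A schematic of the batch walk under those assumptions (plain buffers; StepGRU is a hypothetical stand-in for pe_.GRUCOmpute):

#include <cstddef>
#include <vector>

// Hypothetical per-step hook; the real kernel dispatches to the FPGA PE here.
inline void StepGRU(const float* gate, float* hidden,
                    int cur_batch_size, int frame_size) {
  (void)gate; (void)hidden; (void)cur_batch_size; (void)frame_size;
}

// batch_starts[t] marks where time step t begins in the batched gate buffer;
// sequences are sorted by length, so each step covers the still-active ones.
void RunBatches(const std::vector<size_t>& batch_starts,
                const std::vector<float>& gate,   // rows of 3 * frame_size
                std::vector<float>* hidden,       // rows of frame_size
                int frame_size) {
  for (size_t t = 0; t + 1 < batch_starts.size(); ++t) {
    size_t bstart = batch_starts[t];
    int cur_batch_size = static_cast<int>(batch_starts[t + 1] - bstart);
    StepGRU(&gate[bstart * 3 * frame_size],
            &(*hidden)[bstart * frame_size],
            cur_batch_size,
            frame_size);
  }
}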
...
@@ -25,8 +25,8 @@ namespace lite {
namespace kernels {
namespace fpga {
class GRUCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::GRUParam;
@@ -37,6 +37,7 @@ class GRUCompute
void Run() override;
virtual ~GRUCompute() = default;
private:
zynqmp::Tensor pre_output_;
zynqmp::Tensor pre_bias_;
@@ -47,7 +48,7 @@ class GRUCompute
zynqmp::FullyConnectedPE reset_out_pe_;
// zynqmp::Tensor input_;
zynqmp::GRUPE pe_;
};
} // namespace fpga
...
@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/im2sequence_compute.h"
#include <vector>
#include "lite/api/paddle_place.h"
// #include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/backends/fpga/KD/float16.hpp"
@@ -28,7 +28,6 @@ namespace fpga {
using float16 = zynqmp::float16;
void im2sequence(const float16* input,
const int input_c,
const int input_h,
@@ -76,17 +75,13 @@ void im2sequence(const float16* input,
}
}
template <typename T>
void hwc_to_chw(T* chw_data,
const T* hwc_data,
int num,
int channel,
int height,
int width) {
int chw = channel * height * width;
int wc = width * channel;
int wh = width * height;
@@ -103,65 +98,6 @@ void hwc_to_chw(T* chw_data,
}
}
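The interior of hwc_to_chw is collapsed in this view; its index arithmetic is just a relabeling of the same elements. A self-contained single-image sketch with the same math (output element (c, h, w) reads input offset h * wc + w * channel + c):

template <typename T>
void HwcToChwRef(T* chw, const T* hwc, int channel, int height, int width) {
  // chw[c][h][w] = hwc[h][w][c]
  for (int c = 0; c < channel; ++c) {
    for (int h = 0; h < height; ++h) {
      for (int w = 0; w < width; ++w) {
        chw[(c * height + h) * width + w] =
            hwc[(h * width + w) * channel + c];
      }
    }
  }
}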
void Im2SequenceCompute::PrepareForRun() {}
void Im2SequenceCompute::Run() {
@@ -171,7 +107,8 @@ void Im2SequenceCompute::Run() {
auto paddings = param.paddings;
const auto* x_data = param.X->data<float16>();
float16* o_data =
reinterpret_cast<float16*>(param.Out->mutable_data<float16>());
float16* o2 = o_data;
@@ -181,18 +118,17 @@ void Im2SequenceCompute::Run() {
param.X->ZynqTensor()->syncToCPU();
float16* chw_data = new float16[param.X->numel()];
hwc_to_chw<float16>(chw_data,
x_data,
param.X->dims()[0],
param.X->dims()[1],
param.X->dims()[2],
param.X->dims()[3]);
const float16* in = chw_data;
int out_cols = input_dims[1] * kernels[0] * kernels[1];
int total_rows = 0;
std::vector<uint64_t> im_offset;
im_offset.push_back(total_rows);
@@ -228,20 +164,20 @@ void Im2SequenceCompute::Run() {
int out_offset = 0;
for (int im_id = 0; im_id < im_num; im_id++) {
im2sequence(in + im_id * im_size,
input_dims[1],
input_dims[2],
input_dims[3],
param.kernels[0],
param.kernels[1],
param.paddings[0],
param.paddings[1],
param.paddings[2],
param.paddings[3],
param.strides[0],
param.strides[1],
out_h_vec[im_id],
out_w_vec[im_id],
o2 + im_offset[im_id] * out_cols);
}
} else {
int out_h =
@@ -253,20 +189,20 @@ void Im2SequenceCompute::Run() {
for (int im_id = 0; im_id < im_num; im_id++) {
int out_size_per_im = out_h * out_w * out_cols;
im2sequence(in + im_id * im_size,
input_dims[1],
input_dims[2],
input_dims[3],
param.kernels[0],
param.kernels[1],
param.paddings[0],
param.paddings[1],
param.paddings[2],
param.paddings[3],
param.strides[0],
param.strides[1],
out_h,
out_w,
o2 + im_id * out_size_per_im);
im_offset.push_back(uint64_t((im_id + 1) * out_h * out_w));
}
auto lod = param.Out->mutable_lod();
@@ -277,8 +213,6 @@ void Im2SequenceCompute::Run() {
delete[] chw_data;
param.Out->ZynqTensor()->flush();
param.Out->ZynqTensor()->copyScaleFrom(param.X->ZynqTensor());
}
} // namespace fpga
@@ -292,11 +226,13 @@ REGISTER_LITE_KERNEL(im2sequence,
kNHWC,
paddle::lite::kernels::fpga::Im2SequenceCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
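The size bookkeeping in Run() is collapsed above, but im2sequence turns each kernel-sized window into one output row, so each image contributes out_h * out_w rows of input_c * kernel_h * kernel_w columns (out_cols in the code). A hedged sketch of that arithmetic, assuming the standard sliding-window output formula:

struct Im2SeqDims {
  int out_h, out_w, rows, cols;
};

// Standard convolution output-size arithmetic; one output row per window.
Im2SeqDims Im2SeqSizes(int input_c, int input_h, int input_w,
                       int kernel_h, int kernel_w,
                       int pad_top, int pad_bottom,
                       int pad_left, int pad_right,
                       int stride_h, int stride_w) {
  Im2SeqDims d;
  d.out_h = (input_h + pad_top + pad_bottom - kernel_h) / stride_h + 1;
  d.out_w = (input_w + pad_left + pad_right - kernel_w) / stride_w + 1;
  d.rows = d.out_h * d.out_w;
  d.cols = input_c * kernel_h * kernel_w;
  return d;
}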
@@ -23,8 +23,8 @@ namespace lite {
namespace kernels {
namespace fpga {
class Im2SequenceCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::Im2SequenceParam;
...
@@ -47,9 +47,11 @@ class IoCopyHostToFpgaCompute
param.x->target() == TARGET(kFPGA));
// param.y->CopyDataFrom(*param.x);
param.y->mutable_data<float16>();
if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) {
zynqmp::Tensor tempTensor;
tempTensor.mutableData<float16>(zynqmp::FP16,
param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor());
// tempTensor.saveToFile("tempTensor", true);
tempTensor.setAligned(true);
@@ -63,7 +65,6 @@ class IoCopyHostToFpgaCompute
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
}
std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
@@ -106,9 +107,11 @@ class IoCopyFpgaToHostCompute
param.y->ZynqTensor()->setDataType(zynqmp::FP32);
param.x->ZynqTensor()->syncToDevice();
if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) {
zynqmp::Tensor tempTensor;
tempTensor.mutableData<float16>(zynqmp::FP16,
param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor());
// tempTensor.saveToFile("tempTensor", true);
tempTensor.setAligned(true);
@@ -124,7 +127,6 @@ class IoCopyFpgaToHostCompute
*out_lod = param.x->lod();
}
std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
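Both io_copy paths above stage the data through a temporary tensor when the source is hardware-aligned: FPGA buffers pad each row of channels out to an alignment boundary, so a dense copy has to strip that padding first. A generic sketch of the unalign step (row/stride layout here is an assumption for illustration, not the zynqmp implementation):

#include <cstring>

// Drop per-row padding from a hardware-aligned buffer. Each of the `rows`
// rows holds `valid` elements followed by (stride - valid) padding elements.
template <typename T>
void Unalign(const T* aligned, T* dense, int rows, int valid, int stride) {
  for (int r = 0; r < rows; ++r) {
    std::memcpy(dense + r * valid, aligned + r * stride, valid * sizeof(T));
  }
}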
...
@@ -18,7 +18,6 @@
#include "lite/core/op_registry.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
@@ -27,13 +26,9 @@ namespace fpga {
using float16 = zynqmp::float16;
template <typename T>
void convert_to_hwc(
T* chw_data, T* hwc_data, int num, int channel, int height, int width) {
std::cout << " -------------- chw -> HWC ---------------\n";
std::cout << "channel: " << channel << std::endl;
std::cout << "height: " << height << std::endl;
@@ -54,13 +49,9 @@ void convert_to_hwc(T* chw_data,
}
}
template <typename T>
void hwc_to_chw(
T* chw_data, T* hwc_data, int num, int channel, int height, int width) {
std::cout << " ============= HWC -> CHW =============\n";
std::cout << "channel: " << channel << std::endl;
std::cout << "height: " << height << std::endl;
@@ -98,13 +89,13 @@ void TransHwcToChw(Tensor* dest, const Tensor* src) {
if (dest->dims().size() > 3) {
width = dest->dims()[3];
}
hwc_to_chw<float>(chw, hwc, num, channel, height, width);
}
if (src->ZynqTensor()->dataType() == zynqmp::FP16) {
std::cout << "float16\n";
float16* chw = dest->mutable_data<float16>();
float16* hwc = const_cast<float16*>(src->data<float16>());
int num = dest->dims()[0];
int channel = dest->dims()[1];
@@ -116,10 +107,9 @@ void TransHwcToChw(Tensor* dest, const Tensor* src) {
if (dest->dims().size() > 3) {
width = dest->dims()[3];
}
hwc_to_chw<float16>(chw, hwc, num, channel, height, width);
}
}
void TransChwToHwc(Tensor* dest, const Tensor* src) {
std::cout << "chw to hwc \n";
...
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/mul_compute.h"
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
@@ -27,7 +27,7 @@ namespace fpga {
using float16 = zynqmp::float16;
void MulCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
// ====================================================
zynqmp::FullyConnectedParam& fc_param = pe_.param();
@@ -44,7 +44,8 @@ void MulCompute::PrepareForRun() {
zynqmp::Shape bias_shape(zynqmp::N, {channel});
float* bias_data =
fc_param.bias->mutableData<float>(zynqmp::FP32, bias_shape);
memset(bias_data, 0, channel * sizeof(float));
bias_.flush();
@@ -62,7 +63,6 @@ void mul(MulCompute* k) {
float16* out_data = param.output->mutable_data<float16>();
int g_index = 0;
for (int n = 0; n < 1; n++) {
for (int on = 0; on < fn; on++) {
float sum = 0;
int si = 0;
@@ -79,10 +79,9 @@ void mul(MulCompute* k) {
}
}
void MulCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::FullyConnectedParam& fc_param = pe_.param();
Debugger::get_instance().registerOutput("mul", fc_param.output);
@@ -96,11 +95,13 @@ void MulCompute::Run() {
REGISTER_LITE_KERNEL(
mul, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::MulCompute, def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
@@ -25,8 +25,8 @@ namespace lite {
namespace kernels {
namespace fpga {
class MulCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::MulParam;
...
@@ -209,7 +209,7 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
SliceOneClass<T>(scores, c, &score_slice);
SliceOneClass<T>(bboxes, c, &bbox_slice);
}
NMSFast(bboxes,
score_slice,
score_threshold,
nms_threshold,
@@ -226,7 +226,6 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
*num_nmsed_out = num_det;
const T* scores_data = scores.data<T>();
if (keep_top_k > -1 && num_det > keep_top_k) {
Tensor score_slice;
const T* sdata;
@@ -333,7 +332,7 @@ void MulticlassNmsCompute::Run() {
std::vector<std::map<int, std::vector<int>>> all_indices;
std::vector<uint64_t> batch_starts = {0};
int64_t batch_size = score_dims[0];
int64_t out_dim = box_dim + 2;
int num_nmsed_out = 0;
Tensor boxes_slice, scores_slice;
@@ -356,8 +355,6 @@ void MulticlassNmsCompute::Run() {
batch_starts.push_back(batch_starts.back() + num_nmsed_out);
}
uint64_t num_kept = batch_starts.back();
if (num_kept == 0) {
outs->Resize({1, 1});
@@ -394,10 +391,12 @@ void MulticlassNmsCompute::Run() {
outs->set_lod(lod);
#ifdef FPGA_PRINT_TENSOR
Debugger::get_instance().registerOutput("boxes", boxes->ZynqTensor());
Debugger::get_instance().registerOutput("scores", scores->ZynqTensor());
Debugger::get_instance().registerOutput("nms", outs->ZynqTensor());
#endif
}
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
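MultiClassNMS above runs NMSFast once per class and then trims the union to keep_top_k. The greedy core is the textbook IoU suppression loop; a compact reference sketch (not the kernel's exact NMSFast signature):

#include <algorithm>
#include <vector>

struct Box { float x1, y1, x2, y2; };

float IoU(const Box& a, const Box& b) {
  float ix = std::max(0.0f, std::min(a.x2, b.x2) - std::max(a.x1, b.x1));
  float iy = std::max(0.0f, std::min(a.y2, b.y2) - std::max(a.y1, b.y1));
  float inter = ix * iy;
  float uni = (a.x2 - a.x1) * (a.y2 - a.y1) +
              (b.x2 - b.x1) * (b.y2 - b.y1) - inter;
  return uni > 0.0f ? inter / uni : 0.0f;
}

// Greedy NMS: visit boxes in descending score order; keep a box only if it
// overlaps every already-kept box by less than nms_threshold.
std::vector<int> NmsRef(const std::vector<Box>& boxes,
                        const std::vector<float>& scores,
                        float score_threshold,
                        float nms_threshold) {
  std::vector<int> order(boxes.size()), keep;
  for (size_t i = 0; i < order.size(); ++i) order[i] = static_cast<int>(i);
  std::sort(order.begin(), order.end(),
            [&](int a, int b) { return scores[a] > scores[b]; });
  for (int idx : order) {
    if (scores[idx] < score_threshold) break;  // sorted, so we can stop early
    bool suppressed = false;
    for (int k : keep) {
      if (IoU(boxes[idx], boxes[k]) >= nms_threshold) {
        suppressed = true;
        break;
      }
    }
    if (!suppressed) keep.push_back(idx);
  }
  return keep;
}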
...
@@ -50,13 +50,16 @@ void NormCompute::Run() {
REGISTER_LITE_KERNEL(
norm, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::NormCompute, def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Norm",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
@@ -26,8 +26,8 @@ namespace lite {
namespace kernels {
namespace fpga {
class NormCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::NormParam;
...
@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/pooling_compute.h"
#include <string>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
...
@@ -12,10 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/prior_box_compute.h"
#include <string>
#include <vector>
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
namespace kernels {
@@ -23,11 +25,59 @@ namespace fpga {
using float16 = zynqmp::float16;
inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
bool flip,
std::vector<float>* output_aspect_ratior) {
constexpr float epsilon = 1e-6;
output_aspect_ratior->clear();
output_aspect_ratior->push_back(1.0f);
for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
float ar = input_aspect_ratior[i];
bool already_exist = false;
for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
already_exist = true;
break;
}
}
if (!already_exist) {
output_aspect_ratior->push_back(ar);
if (flip) {
output_aspect_ratior->push_back(1.0f / ar);
}
}
}
}
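A quick check of ExpandAspectRatios above: 1.0 always comes first, ratios within 1e-6 of one already in the list are skipped, and flip appends the reciprocal of each new ratio. For instance:

std::vector<float> out;
ExpandAspectRatios({2.0f, 2.0f}, /*flip=*/true, &out);
// out == {1.0f, 2.0f, 0.5f}; the duplicate 2.0 is dropped, 1/2 added by flip.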
void PriorBoxCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
bool is_flip = param.flip;
bool is_clip = param.clip;
std::vector<float> min_size = param.min_sizes;
std::vector<float> max_size = param.max_sizes;
std::vector<float> aspect_ratio = param.aspect_ratios;
std::vector<float> variance = param.variances_;
int img_w = param.img_w;
int img_h = param.img_h;
float step_w = param.step_w;
float step_h = param.step_h;
float offset = param.offset;
std::vector<float> aspect_ratios_vec;
ExpandAspectRatios(aspect_ratio, is_flip, &aspect_ratios_vec);
size_t prior_num = aspect_ratios_vec.size() * min_size.size();
prior_num += max_size.size();
std::vector<std::string> order = param.order;
bool min_max_aspect_ratios_order = param.min_max_aspect_ratios_order;
int win1 = param.input->dims()[3];
int hin1 = param.input->dims()[2];
DDim shape_out({hin1, win1, prior_num, 4});
param.boxes->Resize(shape_out);
param.variances->Resize(shape_out);
param.boxes->mutable_data<float>();
param.variances->mutable_data<float>();
// ====================================================
zynqmp::PriorBoxParam& priobox_param = pe_.param();
priobox_param.input = param.input->ZynqTensor();
@@ -38,7 +88,7 @@ void PriorBoxCompute::PrepareForRun() {
priobox_param.maxSizes = param.max_sizes;
priobox_param.aspectRatios = param.aspect_ratios;
priobox_param.variances = param.variances_;
priobox_param.minMaxAspectRatiosOrder = min_max_aspect_ratios_order;
priobox_param.flip = param.flip;
priobox_param.clip = param.clip;
priobox_param.stepW = param.step_w;
@@ -49,7 +99,16 @@ void PriorBoxCompute::PrepareForRun() {
pe_.apply();
}
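A worked instance of the shape logic above (illustrative numbers, not from the commit): with aspect_ratios = {2.0} and flip = true, aspect_ratios_vec = {1.0, 2.0, 0.5}; with one min size and one max size, prior_num = 3 * 1 + 1 = 4, so a 19x19 input makes the boxes and variances tensors {19, 19, 4, 4}.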
void PriorBoxCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::PriorBoxParam& priobox_param = pe_.param();
Debugger::get_instance().registerOutput("pb_boxes",
priobox_param.outputBoxes);
Debugger::get_instance().registerOutput("pb_variances",
priobox_param.outputVariances);
#endif
}
} // namespace fpga
} // namespace kernels
@@ -62,8 +121,30 @@ REGISTER_LITE_KERNEL(prior_box,
kNHWC,
paddle::lite::kernels::fpga::PriorBoxCompute,
def)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Image",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
// REGISTER_LITE_KERNEL(prior_box,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::fpga::PriorBoxCompute,
// def)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindInput("Image", {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
// .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
// .Finalize();
@@ -23,7 +23,8 @@ namespace lite {
namespace kernels {
namespace fpga {
class PriorBoxCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::PriorBoxParam;
...
@@ -73,11 +73,13 @@ REGISTER_LITE_KERNEL(sequence_pool,
kNHWC,
paddle::lite::kernels::fpga::SequencePoolCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("MaxIndex", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
@@ -23,8 +23,8 @@ namespace lite {
namespace kernels {
namespace fpga {
class SequencePoolCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
...
@@ -29,7 +29,6 @@ using float16 = zynqmp::float16;
class SoftmaxCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
...
@@ -27,7 +27,6 @@ namespace fpga {
using float16 = zynqmp::float16;
// Transpose
void TransposeCompute::Run() {
auto& param = this->Param<param_t>();
@@ -45,7 +44,6 @@ void Transpose2Compute::Run() {
// auto out = param.Out();
// auto out_data = out->data<half>();
// int num = input_x_dims[1];
// int channel = input_x_dims[2];
@@ -58,7 +56,7 @@ void Transpose2Compute::Run() {
// index++;
// }
// }
} else {
param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
@@ -97,8 +95,6 @@ REGISTER_LITE_KERNEL(transpose2,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
@@ -3,7 +3,7 @@ message(STATUS "compile with lite host kernels")
add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_deps})
add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
#lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
#lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any)
@@ -392,9 +392,13 @@ REGISTER_LITE_KERNEL(multiclass_nms,
kNCHW,
paddle::lite::kernels::host::MulticlassNmsCompute,
def)
.BindInput("BBoxes",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindInput("Scores",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
@@ -13,7 +13,7 @@ readonly NUM_PROC=${LITE_BUILD_THREADS:-4}
# global variables
BUILD_EXTRA=ON
BUILD_JAVA=ON
BUILD_PYTHON=OFF
BUILD_DIR=$(pwd)
...
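Note the build-script change at the end: BUILD_EXTRA flips from OFF to ON. In Paddle-Lite this switch compiles the full operator set beyond the basic CV kernels, which the sequence-style ops touched in this commit (gru, im2sequence, sequence_pool) presumably require.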