Commit 4dddc907 authored by chonwhite

VGG-SSD works

Parent 945aa36f
......@@ -3,4 +3,3 @@
--arm_abi=armv8 \
--arm_lang=gcc \
test
......@@ -3,4 +3,3 @@
--arm_abi=armv8 \
--arm_lang=gcc \
test
......@@ -143,8 +143,6 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
}
#endif
// get input names
std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
......
#pragma once
#include "paddle_lite_factory_helper.h"
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(leaky_relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu_clipped, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(prelu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(sigmoid, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(tanh, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(swish, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu6, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(log, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(exp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(floor, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(rsqrt, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(prior_box_fpga, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(logical_xor, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(logical_and, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(logical_or, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(logical_not, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(roi_align, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(generate_proposals, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(crop, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(range, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(axpy, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(beam_search, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(power, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8);
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32);
USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8);
USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32);
USE_LITE_KERNEL(negative, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(less_than, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(equal, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(not_equal, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(less_equal, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(greater_than, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(greater_equal, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(multiclass_nms, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(lod_reset, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(box_clip, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(density_prior_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(decode_bboxes, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(squeeze, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(squeeze2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(gru, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(increment, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, hwc_to_chw_fpga_fp16);
USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, hwc_to_chw_arm_float);
USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, chw_to_hwc_fpga_fp16);
USE_LITE_KERNEL(layout_once, kFPGA, kAny, kNHWC, hwc_to_chw_fpga_fp16);
USE_LITE_KERNEL(layout_once, kFPGA, kAny, kNHWC, chw_to_hwc_fpga_fp16);
USE_LITE_KERNEL(elementwise_add, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(elementwise_mul, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(is_empty, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(shape, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, fp32_to_fp16_fpga);
USE_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, fp16_to_fp32_fpga);
USE_LITE_KERNEL(calib_once, kFPGA, kFP16, kNHWC, fp32_to_fp16_fpga);
USE_LITE_KERNEL(calib_once, kFPGA, kFP16, kNHWC, fp16_to_fp32_fpga);
USE_LITE_KERNEL(sequence_expand, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(expand, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(gather, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(im2sequence, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(reduce_max, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(density_prior_box, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(feed, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(lookup_table_v2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def);
USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def);
USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(unsqueeze, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(unsqueeze2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(layer_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(assign, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(sequence_softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(norm, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nchw2nhwc);
USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nhwc2nchw);
USE_LITE_KERNEL(layout, kARM, kInt8, kNCHW, int8_nchw2nhwc);
USE_LITE_KERNEL(layout, kARM, kInt8, kNCHW, int8_nhwc2nchw);
USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nchw2nhwc);
USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nhwc2nchw);
USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nchw2nhwc);
USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nhwc2nchw);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_sub, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_sub_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_div, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_div_activation, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fetch, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(fill_constant, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fill_constant_batch_size_like, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(while, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, host_to_device);
USE_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, device_to_host);
USE_LITE_KERNEL(io_copy_once, kFPGA, kAny, kAny, host_to_device_once);
USE_LITE_KERNEL(io_copy_once, kFPGA, kAny, kAny, device_to_host_once);
USE_LITE_KERNEL(arg_max, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(beam_search_decode, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(assign_value, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(gru, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(cast, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out);
USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, int8_out);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, fp32_out);
USE_LITE_KERNEL(write_to_array, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(anchor_generator, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out);
USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out);
USE_LITE_KERNEL(lrn, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(read_from_array, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(shuffle_channel, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(top_k, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(sequence_pool, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(gru_unit, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kFPGA, kFP16, kNHWC, def);
\ No newline at end of file
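These generated headers exist to pin registrations into the final binary: merely including them forces the linker to keep every listed kernel. A minimal sketch of the pattern such a macro typically expands to — the touch_* names below are illustrative assumptions, not the actual paddle_lite_factory_helper.h definition:

// Hypothetical sketch: each registered kernel exports a "touch" function,
// and referencing it from the using binary stops the linker from discarding
// the object file that contains the otherwise-unused registration code.
#define SKETCH_USE_LITE_KERNEL(op, target, precision, layout, alias)  \
  extern int touch_##op##target##precision##layout##alias();          \
  static int use_##op##target##precision##layout##alias =             \
      touch_##op##target##precision##layout##alias();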
#pragma once
#include "paddle_lite_factory_helper.h"
USE_LITE_OP(lookup_table_v2);
USE_LITE_OP(feed);
USE_LITE_OP(fake_channel_wise_dequantize_max_abs);
USE_LITE_OP(assign);
USE_LITE_OP(layout);
USE_LITE_OP(transpose);
USE_LITE_OP(transpose2);
USE_LITE_OP(pool2d);
USE_LITE_OP(batch_norm);
USE_LITE_OP(reshape);
USE_LITE_OP(reshape2);
USE_LITE_OP(search_fc);
USE_LITE_OP(sequence_reverse);
USE_LITE_OP(matmul);
USE_LITE_OP(slice);
USE_LITE_OP(lod_reset);
USE_LITE_OP(graph_op);
USE_LITE_OP(expand);
USE_LITE_OP(top_k);
USE_LITE_OP(fake_quantize_range_abs_max);
USE_LITE_OP(arg_max);
USE_LITE_OP(beam_search);
USE_LITE_OP(box_clip);
USE_LITE_OP(fake_quantize_dequantize_moving_average_abs_max);
USE_LITE_OP(box_coder);
USE_LITE_OP(search_seq_depadding);
USE_LITE_OP(write_to_array);
USE_LITE_OP(is_empty);
USE_LITE_OP(prior_box);
USE_LITE_OP(sequence_concat);
USE_LITE_OP(affine_channel);
USE_LITE_OP(shape);
USE_LITE_OP(axpy);
USE_LITE_OP(anchor_generator);
USE_LITE_OP(reduce_max);
USE_LITE_OP(gru);
USE_LITE_OP(uniform_random);
USE_LITE_OP(unsqueeze);
USE_LITE_OP(unsqueeze2);
USE_LITE_OP(layout_once);
USE_LITE_OP(search_group_padding);
USE_LITE_OP(assign_value);
USE_LITE_OP(norm);
USE_LITE_OP(concat);
USE_LITE_OP(fill_constant);
USE_LITE_OP(fill_constant_batch_size_like);
USE_LITE_OP(calib_once);
USE_LITE_OP(decode_bboxes);
USE_LITE_OP(negative);
USE_LITE_OP(layer_norm);
USE_LITE_OP(mean);
USE_LITE_OP(lrn);
USE_LITE_OP(fetch);
USE_LITE_OP(power);
USE_LITE_OP(stack);
USE_LITE_OP(gather);
USE_LITE_OP(fc);
USE_LITE_OP(gru_unit);
USE_LITE_OP(increment);
USE_LITE_OP(mul);
USE_LITE_OP(search_seq_softmax);
USE_LITE_OP(var_conv_2d);
USE_LITE_OP(dropout);
USE_LITE_OP(fusion_elementwise_sub_activation);
USE_LITE_OP(fusion_elementwise_add_activation);
USE_LITE_OP(fusion_elementwise_mul_activation);
USE_LITE_OP(fusion_elementwise_max_activation);
USE_LITE_OP(fusion_elementwise_div_activation);
USE_LITE_OP(elementwise_sub);
USE_LITE_OP(elementwise_add);
USE_LITE_OP(elementwise_mul);
USE_LITE_OP(elementwise_max);
USE_LITE_OP(elementwise_div);
USE_LITE_OP(pad2d);
USE_LITE_OP(crop);
USE_LITE_OP(sequence_expand);
USE_LITE_OP(search_aligned_mat_mul);
USE_LITE_OP(io_copy);
USE_LITE_OP(squeeze);
USE_LITE_OP(squeeze2);
USE_LITE_OP(reduce_sum);
USE_LITE_OP(square);
USE_LITE_OP(relu);
USE_LITE_OP(leaky_relu);
USE_LITE_OP(relu_clipped);
USE_LITE_OP(prelu);
USE_LITE_OP(sigmoid);
USE_LITE_OP(tanh);
USE_LITE_OP(swish);
USE_LITE_OP(relu6);
USE_LITE_OP(log);
USE_LITE_OP(exp);
USE_LITE_OP(floor);
USE_LITE_OP(hard_sigmoid);
USE_LITE_OP(sqrt);
USE_LITE_OP(rsqrt);
USE_LITE_OP(softsign);
USE_LITE_OP(sequence_expand_as);
USE_LITE_OP(match_matrix_tensor);
USE_LITE_OP(range);
USE_LITE_OP(lookup_table);
USE_LITE_OP(fake_quantize_moving_average_abs_max);
USE_LITE_OP(search_grnn);
USE_LITE_OP(conv2d_transpose);
USE_LITE_OP(flatten);
USE_LITE_OP(flatten2);
USE_LITE_OP(nearest_interp);
USE_LITE_OP(bilinear_interp);
USE_LITE_OP(softmax);
USE_LITE_OP(reduce_mean);
USE_LITE_OP(cast);
USE_LITE_OP(fake_dequantize_max_abs);
USE_LITE_OP(read_from_array);
USE_LITE_OP(yolo_box);
USE_LITE_OP(multiclass_nms);
USE_LITE_OP(while);
USE_LITE_OP(conv2d);
USE_LITE_OP(depthwise_conv2d);
USE_LITE_OP(split);
USE_LITE_OP(scale);
USE_LITE_OP(beam_search_decode);
USE_LITE_OP(im2sequence);
USE_LITE_OP(sequence_topk_avg_pooling);
USE_LITE_OP(io_copy_once);
USE_LITE_OP(roi_align);
USE_LITE_OP(sequence_reshape);
USE_LITE_OP(equal);
USE_LITE_OP(notequal);
USE_LITE_OP(less_than);
USE_LITE_OP(less_equal);
USE_LITE_OP(greater_than);
USE_LITE_OP(greater_equal);
USE_LITE_OP(calib);
USE_LITE_OP(sequence_pool);
USE_LITE_OP(attention_padding_mask);
USE_LITE_OP(search_attention_padding_mask);
USE_LITE_OP(density_prior_box);
USE_LITE_OP(search_seq_fc);
USE_LITE_OP(generate_proposals);
USE_LITE_OP(sequence_arithmetic);
USE_LITE_OP(search_seq_arithmetic);
USE_LITE_OP(shuffle_channel);
USE_LITE_OP(sequence_softmax);
USE_LITE_OP(logical_xor);
USE_LITE_OP(logical_and);
USE_LITE_OP(logical_or);
USE_LITE_OP(logical_not);
\ No newline at end of file
......@@ -23,7 +23,6 @@
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_file, "", "input_file");
namespace paddle {
......@@ -71,11 +70,7 @@ TEST(ResNet50, test) {
Place{TARGET(kARM), PRECISION(kFloat)},
});
predictor.Build(FLAGS_model_dir,
"",
"",
valid_places);
predictor.Build(FLAGS_model_dir, "", "", valid_places);
// predictor.Build(FLAGS_model_dir,
// FLAGS_model_dir + "/model",
......@@ -83,13 +78,11 @@ TEST(ResNet50, test) {
// Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
// valid_places);
auto* input_tensor = predictor.GetInput(0);
int width = 416;
int height = 416;
std::ifstream file_stream(FLAGS_input_file);
// file_stream.open(path);
if (!file_stream.good()) {
......@@ -101,7 +94,8 @@ TEST(ResNet50, test) {
file_stream >> height;
file_stream >> width;
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, height, width})));
input_tensor->Resize(
DDim(std::vector<DDim::value_type>({1, 3, height, width})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
......@@ -125,17 +119,17 @@ TEST(ResNet50, test) {
}
auto* out = predictor.GetOutput(0);
for (int i = 0;i < out->dims().production();i++) {
std::cout << ":" << out->data<float>()[i] << std::endl;
for (int i = 0; i < out->dims().production(); i++) {
std::cout << ":" << out->data<float>()[i] << std::endl;
}
// std::cout << "-------\n";
// auto* out1 = predictor.GetOutput(1);
// for (int i = 0;i < out1->dims().production();i++) {
// std::cout << ":" << out1->data<float>()[i] << std::endl;
// std::cout << ":" << out1->data<float>()[i] << std::endl;
// }
std::string file = "output/" + FLAGS_input_file.substr (6);
std::string file = "output/" + FLAGS_input_file.substr(6);
std::cout << "file:::" << file << std::endl;
std::ofstream ofs;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
// #include "lite/backends/fpga/lite_tensor.h"
#include "lite/core/tensor.h"
......@@ -16,19 +34,16 @@ class Debugger {
void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
// tensor->printScale();
// tensor->saveToFile(op_type, true);
if (op_type != "conv") {
// tensor->saveToFile(op_type, true);
}
}
private:
std::unordered_map<std::string, bool> op_config;
Debugger() {
op_config["concat"] = true;
op_config["conv_add_bn"] = true;
op_config["conv_add_bn_relu"] = true;
op_config["conv_add"] = true;
op_config["conv_add_relu"] = true;
op_config["conv_bn"] = true;
op_config["conv_bn_relu"] = true;
op_config["conv"] = true;
op_config["crop"] = true;
}
};
......@@ -39,16 +54,16 @@ inline void chw_to_hwc(Tensor* t, float* dst) {
int height = 1;
int width = 1;
if (t->dims().size() > 2){
height = t->dims()[2];
if (t->dims().size() > 2) {
height = t->dims()[2];
}
if (t->dims().size() > 3){
width = t->dims()[3];
if (t->dims().size() > 3) {
width = t->dims()[3];
}
// int width = t->dims()[3];
const float* chw_data = t->data<float>();
float* hwc_data = dst;
int chw = channel * height * width;
int wc = width * channel;
int index = 0;
......@@ -64,7 +79,7 @@ inline void chw_to_hwc(Tensor* t, float* dst) {
}
}
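For reference, the CHW-to-HWC index mapping this helper implements, as a self-contained sketch over plain float buffers:

// src is laid out [C][H][W]; dst is laid out [H][W][C].
inline void chw_to_hwc_sketch(const float* src, float* dst,
                              int C, int H, int W) {
  for (int c = 0; c < C; ++c) {
    for (int h = 0; h < H; ++h) {
      for (int w = 0; w < W; ++w) {
        dst[(h * W + w) * C + c] = src[(c * H + h) * W + w];
      }
    }
  }
}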
inline void read_from_file(lite::Tensor* t,const std::string& path) {
inline void read_from_file(lite::Tensor* t, const std::string& path) {
std::ifstream file_stream;
file_stream.open(path);
if (!file_stream) {
......@@ -81,17 +96,18 @@ inline void read_from_file(lite::Tensor* t,const std::string& path) {
}
inline void save_float(float* data, const std::string& name, int len) {
// return;
// return;
static int counter = 0;
std::string old_string = std::to_string(counter);
std::string new_string = std::string(3 - old_string.length(), '0') + old_string;
std::string new_string =
std::string(3 - old_string.length(), '0') + old_string;
std::string file = "arm_" + new_string + name;
counter++;
std::cout << "-------------------------- saving file: --------------------------" << file << std::endl;
std::cout
<< "-------------------------- saving file: --------------------------"
<< file << std::endl;
std::ofstream ofs;
ofs.open(file);
// float* data = dst;
......@@ -102,9 +118,11 @@ inline void save_float(float* data, const std::string& name, int len) {
ofs.close();
}
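The zero-padded file counter uses the std::string fill constructor; isolated below, and only valid while the counter stays under 1000 (otherwise 3 - length underflows, a limitation the original shares):

#include <string>

// 7 -> "007", 42 -> "042", 123 -> "123"
inline std::string pad3(int counter) {
  std::string s = std::to_string(counter);
  return std::string(3 - s.length(), '0') + s;
}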
inline void save_tensor(lite::Tensor* t,const std::string& name, bool convert = true) {
inline void save_tensor(lite::Tensor* t,
const std::string& name,
bool convert = true) {
float* data = const_cast<float*>(t->data<float>());
float* dst = new float[t->numel()];
float* dst = new float[t->numel()];
if (convert) {
chw_to_hwc(t, dst);
data = dst;
......@@ -114,8 +132,9 @@ inline void save_tensor(lite::Tensor* t,const std::string& name, bool convert =
delete[] dst;
}
inline void save_tensor(const lite::Tensor* t,const std::string& name, bool convert = true) {
inline void save_tensor(const lite::Tensor* t,
const std::string& name,
bool convert = true) {
// return;
float* data = const_cast<float*>(t->data<float>());
float* dst = new float[t->numel()];
......@@ -128,6 +147,5 @@ inline void save_tensor(const lite::Tensor* t,const std::string& name, bool conv
delete[] dst;
}
}
}
\ No newline at end of file
} // namespace lite
} // namespace paddle
......@@ -45,12 +45,8 @@ struct None : Layout {
int channelIndex() { return -1; }
int heightIndex() { return -1; }
int widthIndex() { return -1; }
int alignedElementCount(const std::vector<int>& dims) {
return 16;
}
virtual int elementCount(const std::vector<int>& dims) {
return 1;
}
int alignedElementCount(const std::vector<int>& dims) { return 16; }
virtual int elementCount(const std::vector<int>& dims) { return 1; }
};
struct NCHW : Layout {
......
......@@ -25,28 +25,31 @@ namespace bias_scale {
void align_element(float **data_in, int num_per_div_before_alignment, int num) {
int copynum = 0;
float *ptr_unaligned = *data_in;
int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_per_div_after_alignment = align_to_x(num_per_div_before_alignment,
BS_NUM_ALIGNMENT);
int num_element = 2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned = (float *)fpga_malloc(num_element * sizeof(float)); // NOLINT
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
int num_element =
2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned =
(float *)fpga_malloc(num_element * sizeof(float)); // NOLINT
memset(ptr_aligned, 0, num_element * sizeof(float));
for (int i = 0; i < div_num; i++) {
if (i == div_num - 1) {
copynum = (num_per_div_after_alignment * div_num > num)
? (num % num_per_div_after_alignment)
: (num_per_div_before_alignment);
} else {
copynum = num_per_div_before_alignment;
}
if (i == div_num - 1) {
copynum = (num_per_div_after_alignment * div_num > num)
? (num % num_per_div_after_alignment)
: (num_per_div_before_alignment);
} else {
copynum = num_per_div_before_alignment;
}
memcpy(ptr_aligned + i * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i,
copynum * sizeof(float));
memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i + num,
copynum * sizeof(float));
memcpy(ptr_aligned + i * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i,
copynum * sizeof(float));
memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i + num,
copynum * sizeof(float));
}
fpga_free(ptr_unaligned);
*data_in = ptr_aligned;
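A worked example of the sizing arithmetic, assuming BS_NUM_ALIGNMENT is 8 (the real constant lives in the FPGA headers): with num = 70 and num_per_div_before_alignment = 32,

// div_num                     = (70 + 32 - 1) / 32 = 3
// num_per_div_after_alignment = align_to_x(32, 8)  = 32
// num_element                 = 2 * 3 * 32         = 192 floats
// Biases fill the first div_num blocks and scales the next div_num blocks;
// the last division carries only 70 % 32 = 6 valid entries, and the rest
// stay zero from the memset.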
......@@ -55,13 +58,14 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
size_t interleave(float **data_in, int num_after_alignment) {
float *ptr_uninterleaved = *data_in;
float *ptr_interleaved =
(float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT
(float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT
int num = num_after_alignment / 4;
for (int i = 0; i < num; i++) {
memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i,
4 * sizeof(float));
memcpy(
ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i, 4 * sizeof(float));
memcpy(ptr_interleaved + 8 * i + 4,
ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float));
ptr_uninterleaved + num_after_alignment + 4 * i,
4 * sizeof(float));
}
fpga_free(ptr_uninterleaved);
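The effect is to alternate four-float quads from the bias half (B) and the scale half (S) of the buffer:

// before: B0 B1 B2 B3 B4 B5 B6 B7 ... | S0 S1 S2 S3 S4 S5 S6 S7 ...
// after : B0 B1 B2 B3 S0 S1 S2 S3 | B4 B5 B6 B7 S4 S5 S6 S7 | ...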
......@@ -70,12 +74,14 @@ size_t interleave(float **data_in, int num_after_alignment) {
}
void format_bias_scale_array(float **bias_scale_array,
int element_num_per_division, int num) {
int element_num_per_division,
int num) {
align_element(bias_scale_array, element_num_per_division, num);
int div_num = (num + element_num_per_division - 1) / element_num_per_division;
int element_num_after_division =
align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
size_t mem = interleave(bias_scale_array, div_num * element_num_after_division);
align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
size_t mem =
interleave(bias_scale_array, div_num * element_num_after_division);
fpga_flush(*bias_scale_array, mem);
}
void format_bias_array(float **bias_array, int num) {
......@@ -83,12 +89,12 @@ void format_bias_array(float **bias_array, int num) {
int num_before_align = num;
int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT);
int16_t *ptr_aligned =
(int16_t *)fpga_malloc(num_after_align * sizeof(int16_t)); // NOLINT
(int16_t *)fpga_malloc(num_after_align * sizeof(int16_t)); // NOLINT
memset(ptr_aligned, 0, num_after_align * sizeof(int16_t));
for (int i = 0; i < num_before_align; i++) {
float value = ptr_aligned[i];
ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]);
float value = ptr_aligned[i];
ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]);
}
*bias_array = (float *)ptr_aligned; // NOLINT
fpga_free(ptr_unaligned);
......
......@@ -21,7 +21,8 @@ namespace bias_scale {
void align_element(float** data_in, int num_per_div_before_alignment, int num);
size_t interleave(float** data_in, int num_after_alignment);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
int element_num_per_division,
int num);
void format_bias_array(float** bias_array, int num);
} // namespace bias_scale
......
......@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/llapi/filter.h"
#include <memory.h>
#include <algorithm>
#include <fstream>
#include <string>
#include "lite/backends/fpga/KD/llapi/filter.h"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -27,7 +27,7 @@ namespace filter {
static int FILTER_SIZE = 2048;
static int COLUMN = 4;
void saveToFile(std::string name,void* data_in, int size) {
void saveToFile(std::string name, void* data_in, int size) {
// std::ofstream ofs;
// ofs.open(name);
......@@ -39,7 +39,7 @@ void saveToFile(std::string name,void* data_in, int size) {
// ofs.close();
}
void saveFloatToFile(std::string name,float* data_in, int size) {
void saveFloatToFile(std::string name, float* data_in, int size) {
// std::ofstream ofs;
// ofs.open(name);
......@@ -50,22 +50,16 @@ void saveFloatToFile(std::string name,float* data_in, int size) {
// ofs.close();
}
void set_filter_capacity(uint32_t cap) {
FILTER_SIZE = cap;
}
void set_filter_capacity(uint32_t cap) { FILTER_SIZE = cap; }
void set_colunm(uint32_t column) {
COLUMN = column;
}
void set_colunm(uint32_t column) { COLUMN = column; }
// replace zynqmp_api.h #define FILTER_NUM_ALIGNMENT
int get_filter_num_alignment() {
return COLUMN * 4;
}
int get_filter_num_alignment() { return COLUMN * 4; }
int calc_division_capacity(int chw) {
// int n = FILTER_SIZE / ((chw + 15) / 16) * 32;
int filter_num_alignment = get_filter_num_alignment();
int filter_num_alignment = get_filter_num_alignment();
int n = FILTER_SIZE / ((chw + 15) / 16) * filter_num_alignment;
return n < FILTER_SIZE ? n : FILTER_SIZE;
}
......@@ -91,8 +85,12 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
}
}
void convert_to_hwc(int8_t* chw_data, int8_t* hwc_data, int num, int channel,
int height, int width) {
void convert_to_hwc(int8_t* chw_data,
int8_t* hwc_data,
int num,
int channel,
int height,
int width) {
int chw = channel * height * width;
int wc = width * channel;
int index = 0;
......@@ -128,7 +126,6 @@ int8_t float_to_int8(float fdata) {
}
void quantize(float* src, int8_t* dst, int len, float max) {
float fix_range = 127;
float scale = fix_range / max;
for (size_t i = 0; i < len; i++) {
......@@ -149,9 +146,12 @@ void align_chw(int8_t* src, int8_t* dst, int num, int chw) {
}
}
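The quantize step above maps [-max, max] symmetrically onto the int8 range; a standalone sketch, assuming float_to_int8 rounds to nearest and saturates:

#include <cstdint>

inline int8_t quantize_one(float v, float max) {
  float scaled = v * (127.0f / max);      // scale = fix_range / max
  if (scaled > 127.0f) scaled = 127.0f;   // saturate
  if (scaled < -127.0f) scaled = -127.0f;
  return static_cast<int8_t>(scaled >= 0.0f ? scaled + 0.5f   // round half
                                            : scaled - 0.5f); // away from 0
}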
void align_num(int8_t* src, int8_t* dst, int num_per_div_before_alignment,
int num, int align_chw) {
int filter_num_alignment = get_filter_num_alignment();
void align_num(int8_t* src,
int8_t* dst,
int num_per_div_before_alignment,
int num,
int align_chw) {
int filter_num_alignment = get_filter_num_alignment();
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, filter_num_alignment);
......@@ -178,8 +178,10 @@ void reorder(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
int filter_num_alignment = get_filter_num_alignment();
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
for (index = 0; index < num_after_alignment; index++) {
new_index = index / filter_num_alignment * filter_num_alignment + (index % (filter_num_alignment/2) / 4 * 8) + (index % (filter_num_alignment/2) % 4) +
(index / (filter_num_alignment/2) % 2 * 4);
new_index = index / filter_num_alignment * filter_num_alignment +
(index % (filter_num_alignment / 2) / 4 * 8) +
(index % (filter_num_alignment / 2) % 4) +
(index / (filter_num_alignment / 2) % 2 * 4);
memcpy((dst + index * chw_align), (src + new_index * chw_align), chw_align);
}
}
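With the default COLUMN = 4 (so filter_num_alignment = 16), the new_index expression reduces to this permutation inside every 16-filter block; destination slot index copies from source filter new_index:

// index     : 0 1 2 3  4  5  6  7  8 9 10 11 12 13 14 15
// new_index : 0 1 2 3  8  9 10 11  4 5  6  7 12 13 14 15
// i.e. the second and third groups of four filters swap places.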
......@@ -191,7 +193,8 @@ void interleave(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
for (int i = 0; i < num_after_alignment; i += 2) {
for (int j = 0, k = 0; j < interleave_num; j += 2, k++) {
memcpy(dst + i * chw_align + interleave_per_num * j,
src + i * chw_align + interleave_per_num * k, interleave_per_num);
src + i * chw_align + interleave_per_num * k,
interleave_per_num);
memcpy(dst + i * chw_align + interleave_per_num * (j + 1),
src + (i + 1) * chw_align + interleave_per_num * k,
interleave_per_num);
......@@ -199,14 +202,20 @@ void interleave(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
}
}
int8_t* format_filter(float* data_in, int& mem_size_a, int num, int channel,
int height, int width, int group_num, float max,
std::vector<float>& filter_max) {
int8_t* format_filter(float* data_in,
int& mem_size_a, // NOLINT
int num,
int channel,
int height,
int width,
int group_num,
float max,
std::vector<float>& filter_max) { // NOLINT
int data_size = channel * height * width * num;
int chw = channel * height * width;
int division_capacity = calc_division_capacity(chw);
int filter_num_alignment = get_filter_num_alignment();
int filter_num_alignment = get_filter_num_alignment();
int num_per_div_before_alignment =
calc_num_per_div(num, group_num, division_capacity);
int num_per_div_after_alignment =
......@@ -215,8 +224,8 @@ int8_t* format_filter(float* data_in, int& mem_size_a, int num, int channel,
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
// int num_after_alignment = num_per_div_after_alignment * div_num;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, filter_num_alignment);
// saveFloatToFile("quantize_before", data_in, data_size);
......@@ -255,15 +264,19 @@ int8_t* format_filter(float* data_in, int& mem_size_a, int num, int channel,
fpga_free(hwc_data);
}
if (num_after_alignment != num) {
int filter_num_alignment = get_filter_num_alignment();
int filter_num_alignment = get_filter_num_alignment();
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, filter_num_alignment);
// int div_num =
// (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
// (num + num_per_div_before_alignment - 1) /
// num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * chw_aligned;
int8_t* num_aligned_data =
reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
align_num(temp_data, num_aligned_data, num_per_div_before_alignment, num,
align_num(temp_data,
num_aligned_data,
num_per_div_before_alignment,
num,
chw_aligned);
// saveToFile("align_num.txt", num_aligned_data, data_size * 8);
......@@ -274,14 +287,15 @@ int8_t* format_filter(float* data_in, int& mem_size_a, int num, int channel,
reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
reorder(temp_data, aligned_data, num_after_alignment, chw);
// saveToFile("reorder.txt", aligned_data, data_size * 8);
fpga_free(temp_data); // TODO change name of qdata;
fpga_free(temp_data);
int8_t* interleaved_data =
reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
interleave(aligned_data, interleaved_data, num_after_alignment, chw);
// saveToFile("interleave.txt", interleaved_data, data_size * 8);
fpga_free(aligned_data);
fpga_flush(interleaved_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
num_after_alignment * sizeof(char));
fpga_flush(interleaved_data,
align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment *
sizeof(char));
mem_size_a = num_after_alignment * chw_aligned;
return interleaved_data;
}
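Condensed, and with buffer frees plus the group-alignment branch elided, format_filter pushes the weights through the helpers defined in this file (the exact interleaving granularity is hardware-defined):

// per division: quantize fp32 -> int8 and convert each filter to HWC, then:
//   align_chw  : pad each filter's chw to FILTER_ELEMENT_ALIGNMENT
//   align_num  : pad the filter count up to filter_num_alignment (if short)
//   reorder    : permute filters into the hardware bank order
//   interleave : interleave adjacent filter pairs chunk by chunk
//   fpga_flush : flush the formatted weights out to device memory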
......@@ -328,7 +342,11 @@ size_t align_element_n(int16_t** data_in, int num, int height, int width) {
return num_element * sizeof(int16_t);
}
void to_fp16(float* src, float16* dst, int num, int height, int width,
void to_fp16(float* src,
float16* dst,
int num,
int height,
int width,
float* scale_ptr) {
int size = num * height * width;
for (int n = 0; n < num; n++) {
......@@ -344,8 +362,8 @@ void to_fp16(float* src, float16* dst, int num, int height, int width,
fpga_flush(dst, size * sizeof(int16_t));
}
void quantize_to_fp16(float** data_in, int num, int height, int width,
float* scale_ptr) {
void quantize_to_fp16(
float** data_in, int num, int height, int width, float* scale_ptr) {
float* tmp = *data_in;
int size = num * height * width;
......@@ -364,20 +382,16 @@ void quantize_to_fp16(float** data_in, int num, int height, int width,
*data_in = (float*)tmp_data; // NOLINT
fpga_free(tmp);
}
size_t format_dwconv_filter(float** data_in, int num, int height, int width,
float* scale_ptr) {
// float16* fp16_data = reinterpret_cast<float16*>(
// fpga_malloc(num * height * width * sizeof(float16)));
// to_fp16(*data_in, fp16_data, num, height, width, scale_ptr);
// int16_t** quantize_data = (int16_t**)&fp16_data; // NOLINT
size_t format_dwconv_filter(
float** data_in, int num, int height, int width, float* scale_ptr) {
quantize_to_fp16(data_in, num, height, width, scale_ptr);
int16_t **quantize_data = (int16_t **)data_in;
int16_t** quantize_data = reinterpret_cast<int16_t**>(data_in);
convert_to_hwn(quantize_data, num, height, width);
size_t size = align_element_n(quantize_data, num, height, width);
fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
height * width * sizeof(int16_t));
fpga_flush(*quantize_data,
align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * height * width *
sizeof(int16_t));
return size;
}
} // namespace filter
......
......@@ -33,16 +33,22 @@ int calc_division_number(int num, int group_num, int division_capacity);
int calc_num_per_div(int num, int group_num, int division_capacity);
float find_max(float* data_in, int data_size);
int8_t* format_filter(float* data_in, int& mem_size, int num, int channel,
int height, int width, int group_num, float max,
std::vector<float>& filter_max);
int8_t* format_filter(float* data_in,
int& mem_size, // NOLINT
int num,
int channel,
int height,
int width,
int group_num,
float max, // NOLINT
std::vector<float>& filter_max); // NOLINT
void convert_to_hwn(int16_t** data_in, int num, int height, int width);
size_t align_element_n(int16_t** data_in, int num, int height, int width);
// void quantize_to_fp16(float** data_in, int num, int height, int width,
// float* scale_ptr);
size_t format_dwconv_filter(float** data_in, int num, int height, int width,
float* scale_ptr);
size_t format_dwconv_filter(
float** data_in, int num, int height, int width, float* scale_ptr);
} // namespace filter
} // namespace zynqmp
......
......@@ -158,7 +158,7 @@ void fpga_copy(void *dest, const void *src, size_t num) {
int fpga_reset() {
struct FpgaResetArgs args;
return do_ioctl(IOCTL_FPGA_RESET, &args);
return do_ioctl(IOCTL_FPGA_RESET, &args);
}
int ioctl_conv(const struct ConvArgs &args) {
......@@ -283,7 +283,6 @@ int perform_bypass(const struct BypassArgs &args) {
bypassArgs.image.height = 1;
bypassArgs.output.scale_address = scales;
float scale = 0;
for (int i = 0; i < count; ++i) {
bypassArgs.image.channels = max_size;
......@@ -309,9 +308,8 @@ int perform_bypass(const struct BypassArgs &args) {
output_address + count * max_size * out_type_size);
ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
scale = std::max(scale, scales[0]);
}
args.output.scale_address[0] = scale;
args.output.scale_address[1] = 1.0f / scale;
return ret;
......@@ -362,15 +360,15 @@ int compute_fpga_dwconv(const struct DWconvArgs &args) {
std::cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
// float *in_scale = (float *)args.image.scale_address;
// std::cout << "inv_scale:" << in_scale[0] << "," << in_scale[1] <<
// std::endl;
// float *in_scale = (float *)args.image.scale_address;
// std::cout << "inv_scale:" << in_scale[0] << "," << in_scale[1] <<
// std::endl;
#endif
return do_ioctl(IOCTL_CONFIG_DWCONV, &args);
}
int config_activation(const struct ActiveParamterArgs& args) {
return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args);
int config_activation(const struct ActiveParamterArgs &args) {
return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args);
}
// int config_power(const struct PowerArgs& args) {
......
......@@ -32,6 +32,5 @@ class PE {
virtual ~PE() {}
};
} // namespace zynqmp
} // namespace paddle
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <stdio.h>
#include <string>
#include <vector>
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
......@@ -100,7 +101,7 @@ struct DepthwiseConvParam : ConvParam {
};
struct GRUParam : PEParam {
public:
public:
Tensor* input = nullptr;
Tensor* h0 = nullptr;
Tensor* weight = nullptr;
......@@ -112,7 +113,7 @@ public:
Tensor* hidden = nullptr;
std::string gate_activation = "sigmoid";
std::string activation= "tanh";
std::string activation = "tanh";
bool is_reverse = false;
bool origin_mode = false;
};
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <arm_neon.h>
#include <algorithm>
#include <vector>
#include "lite/backends/fpga/KD/pe.hpp"
......@@ -59,16 +60,14 @@ class ConvPE : public PE {
if (param_.filter->shape().width() == 1 &&
param_.filter->shape().height() == 1) {
// use_cpu_ = true;
// use_cpu_ = true;
}
if (!use_cpu_) {
// param_.filter->releaseData();
}
}
void cpu_conv_hwc() {
Tensor* input = param_.input;
Tensor* output = param_.output;
input->syncToCPU();
......@@ -106,10 +105,12 @@ class ConvPE : public PE {
for (int pw = 0; pw < pooled_width_; pw++) {
int hstart = ph * kernel_step_h - image_pad_h;
int wstart = pw * kernel_step_w - image_pad_w;
int hend = std::min(hstart + kernel_height, (int) image_height);
int wend = std::min(wstart + kernel_width, (int) image_width);
hstart = std::max(hstart, (int) 0);
wstart = std::max(wstart, (int) 0);
int hend =
std::min(hstart + kernel_height, static_cast<int>(image_height));
int wend =
std::min(wstart + kernel_width, static_cast<int>(image_width));
hstart = std::max(hstart, static_cast<int>(0));
wstart = std::max(wstart, static_cast<int>(0));
for (int oc = 0; oc < out_channel; oc++) {
float sum = 0.0f;
const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
......@@ -117,28 +118,32 @@ class ConvPE : public PE {
for (int h = hstart; h < hend; h++) {
int hi = 0;
if (ph == 0) {
hi = h - hstart + image_pad_h;
} else{
hi = h - hstart;
hi = h - hstart + image_pad_h;
} else {
hi = h - hstart;
}
for (int w = wstart; w < wend; w++) {
int wi = 0;
if (pw == 0) {
wi = w - wstart + image_pad_w;
}else {
wi = w - wstart;
wi = w - wstart + image_pad_w;
} else {
wi = w - wstart;
}
const int index = (h * image_width + w) * image_channels + c;
// int weight_index = (hi * kernel_width + wi) * image_channels + c;//TODO
int weight_index = oc * filter_chw + kernel_width *
kernel_height * c + kernel_width * hi + wi;
// int weight_index = (hi *
// kernel_width + wi) * image_channels
// + c;//TODO
int weight_index = oc * filter_chw +
kernel_width * kernel_height * c +
kernel_width * hi + wi;
float value = image_addr[index] * filter_data[weight_index];
sum += value;
}
}
}
// std::cout << " ============================= pool_index:" << pool_index << " sum:" << sum << std::endl;
// std::cout << " ============================= pool_index:" <<
// pool_index << " sum:" << sum << std::endl;
if (param_.relu.enabled && sum < 0) {
sum = -sum;
}
......@@ -155,9 +160,6 @@ class ConvPE : public PE {
output->scale()[1] = 127 / max;
}
void cpu_compute() {
Tensor* input = param_.input;
Tensor* output = param_.output;
......@@ -196,7 +198,6 @@ class ConvPE : public PE {
for (int h = 0; h < output->shape().height(); h++) {
for (int w = 0; w < output->shape().width(); w++) {
float sum = 0;
// #pragma omp parallel for
......@@ -204,11 +205,10 @@ class ConvPE : public PE {
int image_index = h * out_width * in_channel + w * in_channel + j;
float value = image_addr[image_index] * filter_ptr[j];
sum += value;
// mi[j] = value;
}
// for (int j = 0; j < in_channel; j++) {
// sum += mi[j];
// }
......@@ -305,7 +305,8 @@ class ConvPE : public PE {
// }
}
if (param_.input->shape().channel() == 64 && param_.output->shape().channel() == 128) {
if (param_.input->shape().channel() == 64 &&
param_.output->shape().channel() == 128) {
// exit(-1);
}
......
......@@ -85,7 +85,8 @@ inline void combine_bn_params(BatchnormParam* bn, ConvParam* param_) {
}
}
inline void combine_add_bn_params(BatchnormParam* bn, Tensor* bias,
inline void combine_add_bn_params(BatchnormParam* bn,
Tensor* bias,
ConvParam* param_) {
int channel = param_->output->shape().channel();
Shape sb_shape(N, {channel});
......@@ -117,74 +118,88 @@ inline void combine_add_bn_params(BatchnormParam* bn, Tensor* bias,
param_->bias()->setDataLocation(CPU);
}
inline void format_scale_bias(Tensor* scale, Tensor* bias, Tensor* filter,
Tensor* scale_bias, int group) {
inline void format_scale_bias(Tensor* scale,
Tensor* bias,
Tensor* filter,
Tensor* scale_bias,
int group) {
float* scale_data = nullptr;
float* bias_data = nullptr;
if (scale != nullptr) {
scale_data = scale->data<float>();
float* bias_data = nullptr;
if (scale != nullptr) {
scale_data = scale->data<float>();
}
if (bias != nullptr) {
bias_data = bias->data<float>();
}
int channel = filter->shape().num();
int scale_bias_len = align_to_x(channel / group, BS_NUM_ALIGNMENT) * group;
int c_per_group = channel / group;
int aligned_c_per_group = align_to_x(channel / group, BS_NUM_ALIGNMENT);
Shape bias_scale_shape(N, {2 * scale_bias_len});
float* bs_data = scale_bias->mutableData<float>(FP32, bias_scale_shape);
float* temp_data =
reinterpret_cast<float*>(fpga_malloc(2 * scale_bias_len * sizeof(float)));
memset(temp_data, 0, 2 * scale_bias_len * sizeof(float));
std::vector<float> scales;
if (scale_data != nullptr) {
for (int i = 0; i < channel; ++i) {
scales.push_back(scale_data[i]);
}
if (bias != nullptr) {
bias_data = bias->data<float>();
for (int i = 0; i < scale_bias_len - channel; i++) {
scales.push_back(1);
}
int channel = filter->shape().num();
int scale_bias_len = align_to_x(channel / group, BS_NUM_ALIGNMENT) * group;
int c_per_group = channel / group;
int aligned_c_per_group = align_to_x(channel / group, BS_NUM_ALIGNMENT);
Shape bias_scale_shape(N, {2 * scale_bias_len});
float* bs_data = scale_bias->mutableData<float>(FP32, bias_scale_shape);
float* temp_data = (float*)fpga_malloc(2 * scale_bias_len * sizeof(float)) ;
memset(temp_data, 0, 2 * scale_bias_len * sizeof(float));
std::vector<float> scales;
if (scale_data != nullptr) {
for (int i = 0; i < channel; ++i) {
scales.push_back(scale_data[i]);
}
for (int i = 0;i < scale_bias_len - channel; i++) {
scales.push_back(1);
}
} else {
for (int i = 0;i < scale_bias_len; i++) {
scales.push_back(1);
}
} else {
for (int i = 0; i < scale_bias_len; i++) {
scales.push_back(1);
}
}
for (int i = 0; i < scale_bias_len; ++i) {
temp_data[i + scale_bias_len] = 1;
temp_data[i] = 0;
}
for (int i = 0; i < scale_bias_len; ++i) {
temp_data[i + scale_bias_len] = 1;
temp_data[i] = 0;
}
for (int g = 0; g < group; g++) {
for (int c = 0; c < c_per_group; c++) {
int src_index = g * c_per_group + c;
int dst_index = g * aligned_c_per_group + c;
float scale_value = scales[src_index];
float bias_value = bias_data == nullptr ? 0 : bias_data[src_index];
temp_data[dst_index + scale_bias_len] = scale_value;
temp_data[dst_index] = bias_value;
}
for (int g = 0; g < group; g++) {
for (int c = 0; c < c_per_group; c++) {
int src_index = g * c_per_group + c;
int dst_index = g * aligned_c_per_group + c;
float scale_value = scales[src_index];
float bias_value = bias_data == nullptr ? 0 : bias_data[src_index];
temp_data[dst_index + scale_bias_len] = scale_value;
temp_data[dst_index] = bias_value;
}
}
// int element_num_per_div = get_filter_num_per_div(filter, group);
// int scale_bias_len = align_to_x(channel / group, 8) * group;
bias_scale::format_bias_scale_array(&temp_data, scale_bias_len / group, scale_bias_len);
memcpy(bs_data, temp_data, 2 * scale_bias_len * sizeof(float));
// int element_num_per_div = get_filter_num_per_div(filter, group);
// int scale_bias_len = align_to_x(channel / group, 8) * group;
bias_scale::format_bias_scale_array(
&temp_data, scale_bias_len / group, scale_bias_len);
memcpy(bs_data, temp_data, 2 * scale_bias_len * sizeof(float));
}
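A small worked layout, assuming BS_NUM_ALIGNMENT is 8: channel = 6 and group = 2 give c_per_group = 3, aligned_c_per_group = 8, and scale_bias_len = 16, so before the final re-blocking temp_data holds:

// temp_data[0 .. 15]  : biases; slot g*8 + c is valid for c < 3, rest are 0
// temp_data[16 .. 31] : scales; slot g*8 + c is valid for c < 3, rest are 1
// format_bias_scale_array then aligns and interleaves this pair of blocks
// into the bias/scale quads the FPGA expects (see bias_scale.cpp above).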
inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group,
std::vector<float>& scales) {
inline void format_filter(Tensor* filter,
Tensor* quantized_filter,
int group,
std::vector<float>& scales) { // NOLINT
float max_value = find_max(*filter);
Shape& filter_shape = filter->shape();
int mem_size;
std::vector<float> max_values;
int8_t* quantized_data = filter::format_filter(filter->data<float>(), mem_size ,filter_shape.num(),
filter_shape.channel(), filter_shape.height(), filter_shape.width(), group, max_value, max_values);
float mem_factor = mem_size * 1.0f / filter->shape().numel();
int8_t* quantized_data = filter::format_filter(filter->data<float>(),
mem_size,
filter_shape.num(),
filter_shape.channel(),
filter_shape.height(),
filter_shape.width(),
group,
max_value,
max_values);
float mem_factor = mem_size * 1.0f / filter->shape().numel();
quantized_filter->setMemScale(mem_factor);
quantized_filter->setAligned(true);
......@@ -209,10 +224,10 @@ inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group,
// }
// ofs.close();
// exit(-1);
}
inline void format_dw_filter(Tensor* filter, Tensor* quantized_filter,
inline void format_dw_filter(Tensor* filter,
Tensor* quantized_filter,
float* scale) {
int num = filter->shape().num();
int height = filter->shape().height();
......@@ -259,12 +274,14 @@ inline void split_filter_num(const ConvParam& c_param) {
int filter_num_per_div = get_filter_num_per_div(filter, param.groups);
auto chw = filter->shape().channel() * filter->shape().height() *
filter->shape().width();
filter->shape().width();
auto num = filter->shape().num();
int div_capacity = filter::calc_division_capacity(chw);
int filter_num_alignment = filter::get_filter_num_alignment();
int aligned_num = align_to_x(num / param.groups, filter_num_alignment) * param.groups;
// int aligned_num = align_to_x(num / param.groups ,FILTER_NUM_ALIGNMENT) * param.groups;
int aligned_num =
align_to_x(num / param.groups, filter_num_alignment) * param.groups;
// int aligned_num = align_to_x(num / param.groups ,FILTER_NUM_ALIGNMENT) *
// param.groups;
split_num = filter::calc_split_num(aligned_num, div_capacity);
Shape& out_shape = out->shape();
......@@ -280,20 +297,23 @@ inline void split_filter_num(const ConvParam& c_param) {
ConvArgs& args = conv_param->args;
if (split_num == 1) {
out_address = out->data<float16>();
out_scale_address = out->scale();
out_address = out->data<float16>();
out_scale_address = out->scale();
}
filter_num = i == split_num - 1
? channel - (split_num - 1) * filter_num_per_div // NOLINT
: filter_num_per_div;
? channel - (split_num - 1) * filter_num_per_div // NOLINT
: filter_num_per_div;
if (split_num != 1) {
Shape shape(NHWC, {1, out_shape.height(), out_shape.width(), filter_num});
out_address = conv_param->output.mutableData<float16>(FP16, shape);
out_scale_address = conv_param->output.scale();
Shape shape(NHWC, {1, out_shape.height(), out_shape.width(), filter_num});
out_address = conv_param->output.mutableData<float16>(FP16, shape);
out_scale_address = conv_param->output.scale();
}
Shape f_shape(NCHW, {filter_num, filter->shape().channel(),
filter->shape().height(), filter->shape().width()});
Shape f_shape(NCHW,
{filter_num,
filter->shape().channel(),
filter->shape().height(),
filter->shape().width()});
Tensor new_filter;
float* new_filter_data = new_filter.mutableData<float>(FP32, f_shape);
......@@ -307,11 +327,12 @@ inline void split_filter_num(const ConvParam& c_param) {
conv_param->filter.mutableData<float>(FP32, f_shape);
if (param.groups != 1) {
int mem_factor = 32 / filter_num_per_div; // TODO
int mem_factor =
32 / filter_num_per_div; // TODO(chonwhite): change 32 to param;
conv_param->filter.setMemScale(mem_factor);
}
std::vector<float> v; // TODO
std::vector<float> v; // TODO(chonwhite): change local variable name
format_filter(&new_filter, &(conv_param->filter), param.groups, v);
conv_param->filter.setDataType(INT8);
......@@ -326,15 +347,18 @@ inline void split_filter_num(const ConvParam& c_param) {
float* bias_data = bias.mutableData<float>(FP32, s_shape);
// std::cout << "v size: " << v.size() << std::endl;
for (int n = 0; n < filter_num; n++) {
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
// scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
}
for (int n = 0; n < filter_num; n++) {
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
}
Shape sb_shape(N, {sb_num});
format_scale_bias(&scale, &bias, &conv_param->filter,
&conv_param->scaleBias, param.groups);
format_scale_bias(&scale,
&bias,
&conv_param->filter,
&conv_param->scaleBias,
param.groups);
// conv_param->scaleBias.saveToFile("sb.txt");
conv_param->scaleBias.flush();
float* bs_data = conv_param->scaleBias.data<float>();
......@@ -363,7 +387,7 @@ inline void split_filter_num(const ConvParam& c_param) {
args.image.height = input->shape().height();
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
// TODO dilations[0] = dilations[1]
// dilations[0] = dilations[1] ;
args.dilation = param.dilations[0];
args.output.address = out_address;
......@@ -420,8 +444,11 @@ inline void split_channel(const ConvParam& c_param) {
scale.flush();
bias.flush();
// Shape sb_shape(N, {2 * channel});
format_scale_bias(&scale, &bias, &conv_param->filter,
&conv_param->scaleBias, param.groups);
format_scale_bias(&scale,
&bias,
&conv_param->filter,
&conv_param->scaleBias,
param.groups);
conv_param->scaleBias.flush();
// conv_param->scaleBias.saveToFile("sb.txt");
......@@ -445,7 +472,7 @@ inline void split_channel(const ConvParam& c_param) {
args.image.height = conv_param->input.shape().height();
args.image.pad_width = param.paddings[1];
args.image.pad_height = param.paddings[0];
// TODO dilations[0] = dilations[1]
// dilations[0] = dilations[1]
args.dilation = param.dilations[0];
args.output.address = conv_param->output.mutableData<void>();
args.output.scale_address = conv_param->output.scale();
......
......@@ -14,8 +14,6 @@ limitations under the License. */
#include "lite/backends/fpga/KD/pes/crop_pe.hpp"
#include <vector>
namespace paddle {
namespace zynqmp {
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <cstring>
#include <vector>
......
......@@ -36,11 +36,11 @@ class DepthwiseConvPE : public PE {
Tensor* input = param.input;
Tensor* output = param.output;
int channel = output->shape().channel();
float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
if (param_.bias()->dataType() == FP32) {
float* new_bias_data = param_.bias()->data<float>();
// convert the bias from float to float16
// convert the bias from float to float16
for (int i = 0; i < channel; i++) {
b_data[i] = float_to_half(new_bias_data[i]);
}
......@@ -56,16 +56,18 @@ class DepthwiseConvPE : public PE {
Tensor* quantized_filter = param.quantizedFilter();
quantized_filter->mutableData<float16>(FP16, param.filter->shape());
format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
} else {
// TODO: when the filter is all ones and the channel count is aligned
// when the filter is all ones and the channel count is aligned
float16* scale_data = param_.scale()->data<float16>();
float16* filter_data = param.quantizedFilter()->mutableData<float16>(FP16, param.filter->shape());
float16* filter_data = param.quantizedFilter()->mutableData<float16>(
FP16, param.filter->shape());
// memcpy(filter_data, scale_data, channel * sizeof(float16));
memcpy(filter_data, scale_data, param.filter->shape().numel() * sizeof(float16));
memcpy(filter_data,
scale_data,
param.filter->shape().numel() * sizeof(float16));
param.quantizedFilter()->flush();
}
DWconvArgs args = {0};
args.bias_address = b_data;
......@@ -116,4 +118,4 @@ class DepthwiseConvPE : public PE {
};
} // namespace zynqmp
} // namespace paddle_mobile
} // namespace paddle
......@@ -57,7 +57,7 @@ class ElementwiseMulPE : public PE {
void updateInput(Tensor* t, int index) {
if (index == 0) {
args_.scale_address = t->data<void>();//replace inputs?
args_.scale_address = t->data<void>(); // replace inputs?
}
}
......
......@@ -40,13 +40,13 @@ class FullyConnectedPE : public PE {
num_ = param_.input->shape().num();
// if (num_ == 1) {
// } else {
// tempOut_.mutableData<void>(FP16, param_.out->shape());
// convParam_.output = &tempOut_;
// }
convParam_.output = param_.output;
convParam_.groups = 1;
convParam_.strides = {1, 1};
convParam_.paddings = {0, 0};
......@@ -93,9 +93,7 @@ class FullyConnectedPE : public PE {
convPE_.apply();
}
bool dispatch() {
return convPE_.dispatch();
}
bool dispatch() { return convPE_.dispatch(); }
FullyConnectedParam& param() { return param_; }
......
......@@ -14,16 +14,16 @@ limitations under the License. */
#pragma once
#include "lite/backends/arm/math/sgemm.h"
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp"
#include "lite/backends/fpga/KD/pes/fully_connected_pe.hpp"
#include "lite/backends/fpga/KD/pes/relu_pe.hpp"
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/arm/math/sgemm.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/api/paddle_place.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/type_system.h"
namespace paddle {
......@@ -38,8 +38,6 @@ struct GRUTensors {
class GRUPE : public PE {
public:
bool init() {
// Tensor* output = param_.output;
// output->setAligned(true);
......@@ -51,9 +49,10 @@ class GRUPE : public PE {
auto hidden = param_.hidden;
// auto hidden_dims = hidden->dims();
int frame_size = hidden->shape().channel();
zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
float16* prev_hidden_data = prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
float16* prev_hidden_data =
prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
// set previous hidden data to 0;
memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
......@@ -62,7 +61,9 @@ class GRUPE : public PE {
float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memset(weight_data, 0, weight_shape.numel() * sizeof(float));
weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
memcpy(weight_data, param_.weight->data<float>(), weight_shape.numel() * sizeof(float));
memcpy(weight_data,
param_.weight->data<float>(),
weight_shape.numel() * sizeof(float));
Shape gate_shape(zynqmp::NC, {1, frame_size * 2});
gate_ping_.mutableData<void>(FP32, gate_shape);
......@@ -85,8 +86,9 @@ class GRUPE : public PE {
// // ====================
// Shape state_weight_shape(NC,{frame_size, frame_size});
// float* state_weight_data = state_weight_.mutableData<float>(FP32, state_weight_shape);
// memcpy(state_weight_data, weight_data + 2 * frame_size * frame_size,
// float* state_weight_data = state_weight_.mutableData<float>(FP32,
// state_weight_shape);
// memcpy(state_weight_data, weight_data + 2 * frame_size * frame_size,
// state_weight_shape.numel() * sizeof(float));
// FullyConnectedParam& reset_out_param = reset_out_pe_.param();
// reset_out_param.input = &prev_hidden;
......@@ -95,13 +97,12 @@ class GRUPE : public PE {
// // ============== unit reset;
// update_gate_.mutableData<void>(FP16, pre_input_shape);
// InputParam& relu_param = update_relu_pe_.param();
// InputParam& relu_param = update_relu_pe_.param();
// relu_param.input = &tempTensor;
// relu_param.output = &update_gate_;
// update_relu_pe_.init();
// update_relu_pe_.apply();
reset_gate_.mutableData<void>(FP16, hidden_shape);
prev_hidden_.mutableData<void>(FP16, hidden_shape);
reset_hidden_.mutableData<void>(FP16, hidden_shape);
......@@ -112,7 +113,8 @@ class GRUPE : public PE {
// reset_relu_pe_.apply();
// float16* prev_data = prev_.mutableData<float16>(FP16, pre_input_shape);
// memset(prev_data, 0, (pre_input_shape.numel() + 32) * sizeof(float16)); // TODO
// memset(prev_data, 0, (pre_input_shape.numel() + 32) * sizeof(float16));
// // TODO
// reset_hidden_prev_.mutableData<float16>(FP16, pre_input_shape);
ElementwiseMulParam& mul_param = mul_pe_.param();
......@@ -120,17 +122,15 @@ class GRUPE : public PE {
mul_param.output = &reset_hidden_;
mul_pe_.init();
mul_pe_.apply();
// ==============
}
bool dispatch() {
return true;
// ==============
}
void gru_unit_reset_act(const lite_api::ActivationType active_gate, GRUTensors& value,
int frame_size, int batch_size) {
bool dispatch() { return true; }
void gru_unit_reset_act(const lite_api::ActivationType active_gate,
GRUTensors& value, // NOLINT
int frame_size,
int batch_size) {
int stride_update = 3 * frame_size;
int stride_cell_state = 3 * frame_size;
int stride_hidden_prev = frame_size;
......@@ -143,17 +143,22 @@ class GRUPE : public PE {
float* reset_gate_data = update_gate_data + frame_size;
for (int b = 0; b < batch_size; b++) {
// memcpy(tempTensor.data<void>(), reset_gate_data, gate->shape().numel() * sizeof(float));
// memcpy(tempTensor.data<void>(), reset_gate_data, gate->shape().numel()
// * sizeof(float));
// tempTensor.flush();
Tensor tmp;
Shape s(NC, {1, frame_size}); //TODO
Shape s(NC, {1, frame_size});
float* tmp_data = tmp.mutableData<float>(FP32, s);
for (int i = 0; i < frame_size; i++) {
// apply the sigmoid activation to both gates element-wise
update_gate_data[i] = lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(update_gate_data[i]);
reset_gate_data[i] = lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(reset_gate_data[i]);
update_gate_data[i] =
lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(
update_gate_data[i]);
reset_gate_data[i] =
lite::arm::math::active_f32<lite_api::ActivationType::kSigmoid>(
reset_gate_data[i]);
}
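// For reference, the standard GRU gate equations, computed element-wise
// over frame_size values per batch row, are:
//   u_t = sigmoid(x_u + U_u * h_{t-1})   (update gate)
//   r_t = sigmoid(x_r + U_r * h_{t-1})   (reset gate)
// update_gate_data / reset_gate_data already hold the pre-activation sums,
// so only the sigmoid is applied in the loop above.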
memcpy(tmp_data, reset_gate_data, frame_size * sizeof(float));
tmp.flush();
......@@ -163,7 +168,7 @@ class GRUPE : public PE {
Tensor* hidden_prev = value.pre_output;
if (hidden_prev) {
// memcpy(prev_data, )
// TODO change to pre_out;
// TODO(chonwhite): change to pre_out;
prev_hidden_.copyFrom(value.pre_output);
prev_hidden_.saveToFile("prev_.txt");
}
......@@ -179,9 +184,11 @@ class GRUPE : public PE {
}
}
void gru_unit_out_act(const lite_api::ActivationType active_node, bool origin_mode,
GRUTensors& value, int frame_size, int batch_size) {
void gru_unit_out_act(const lite_api::ActivationType active_node,
bool origin_mode,
GRUTensors& value, // NOLINT
int frame_size,
int batch_size) {
// int stride_update = 3 * frame_size;
// int stride_cell_state = 3 * frame_size;
// int stride_hidden_prev = frame_size;
......@@ -206,13 +213,16 @@ class GRUPE : public PE {
// // if (hidden_prev) {
// // prev = hidden_prev[i];
// // }
// // cell_state[i] = lite::arm::math::active_f32<kSigmoid>(cell_state[i]);
// // cell_state[i] =
// lite::arm::math::active_f32<kSigmoid>(cell_state[i]);
// // hidden[i] =
// // cell_state[i] * (1.f - updata_gate[i]) + updata_gate[i] * prev;
// // cell_state[i] * (1.f - updata_gate[i]) + updata_gate[i] *
// prev;
// // }
// } else {
// for (int i = 0; i < frame_size; ++i) {
// cell_state[i] = lite::arm::math::active_f32<lite_api::ActivationType::kRelu>(cell_state[i]);
// cell_state[i] =
// lite::arm::math::active_f32<lite_api::ActivationType::kRelu>(cell_state[i]);
// if (hidden_prev) {
// prev = hidden_prev[i];
// }
......@@ -228,12 +238,11 @@ class GRUPE : public PE {
// }
}
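// The commented-out path above computes the GRU output as
//   h_t = c_t * (1 - u_t) + u_t * h_{t-1},
// where c_t is the candidate state after the node activation and u_t is the
// update gate; origin_mode selects between the two gate-weighting
// conventions.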
void copy_input(GRUTensors& value) {
void copy_input(GRUTensors& value) { // NOLINT
float max = find_max(*(value.gate));
gate_ping_.mutableData<void>(FP32, value.gate->shape());
gate_ping_.copyFrom(value.gate);
// TODO update input pointer?
// update input pointer?
// gate_.readFromFile("input/in.txt");
// // pre_input_.saveToFile("pppp_in.txt");
......@@ -241,7 +250,6 @@ class GRUPE : public PE {
// gate_.scale()[1] = 127 / max;
// gate_.printScale("pre_input_");
// gate_.saveToFile("pre_input_.txt");
// pre_out_pe_.dispatch();
......@@ -249,12 +257,12 @@ class GRUPE : public PE {
// pre_output_.saveToFile("pp_out.txt");
}
void GRUCOmpute(GRUTensors& value,
int frame_size,
int batch_size,
const lite_api::ActivationType active_node,
const lite_api::ActivationType active_gate,
bool origin_mode) {
void GRUCOmpute(GRUTensors& value, // NOLINT
int frame_size,
int batch_size,
const lite_api::ActivationType active_node,
const lite_api::ActivationType active_gate,
bool origin_mode) {
copy_input(value);
if (value.pre_output) {
......@@ -269,7 +277,8 @@ class GRUPE : public PE {
// // state weight;
// reset_out_pe_.dispatch();
// }
// gru_unit_out_act(active_node, origin_mode, value, frame_size, batch_size);
// gru_unit_out_act(active_node, origin_mode, value, frame_size,
// batch_size);
}
GRUParam& param() { return param_; }
......@@ -282,13 +291,9 @@ class GRUPE : public PE {
// return &gate_;
// }
Tensor* updateGate() {
return &update_gate_;
}
Tensor* updateGate() { return &update_gate_; }
Tensor* resetGate() {
return &reset_gate_;
}
Tensor* resetGate() { return &reset_gate_; }
private:
GRUParam param_;
......
......@@ -18,121 +18,6 @@
namespace paddle {
namespace lite {
namespace fpga {
// inline void gru_unit_reset_act(lite_api::ActivationType act_type,
// GRUMetaValue<float> value,
// int frame_size,
// int batch_size) {
// auto updata_gate = value.gate_value;
// auto reset_gate = value.gate_value + frame_size;
// auto hidden_prev = value.prev_out_value;
// auto reset_hidden_prev = value.reset_output_value;
// int stride_update = 3 * frame_size;
// int stride_reset = 3 * frame_size;
// int stride_hidden_prev = frame_size;
// int stride_reset_hidden_prev = frame_size;
// if (act_type == kRelu) {
// }
// }
// void gru_compute(arm::math::GRUMetaValue<float> value,
// int frame_size,
// int batch_size,
// const lite_api::ActivationType active_node,
// const lite_api::ActivationType active_gate,
// bool origin_mode) {
// std::cout << " =================== gru gru_compute =================== \n";
// // exit(-1);
// // sgemm(bool is_transA,
// // bool is_transB,
// // int M,
// // int N,
// // int K,
// // float alpha,
// // const float* A,
// // int lda,
// // const float* B,
// // int ldb,
// // float beta,
// // float* C,
// // int ldc,
// // const float* bias,
// // bool is_bias,
// // bool is_relu,
// // ARMContext* ctx);
// // sgemm for fc;
// // lite::arm::math::sgemm(false,
// // false,
// // m_,// batch;
// // n_,// filter num;
// // k_,// input_channel;
// // 1.f,
// // i_data,// input data;
// // k_,
// // w_data,// weight data;
// // n_,
// // 0.f,//beta;
// // o_data,// out data;
// // n_,
// // b_data,// bias;
// // false,
// // false,
// // &ctx);
// // C := alpha*op( A )*op( B ) + beta*C,
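// In the call sketched below: A = prev_out_value (batch_size x frame_size),
// B = gate_weight (frame_size x 2*frame_size) and beta = 1, so the product
// is accumulated into the update/reset columns of gate_value, whose row
// stride is ldc = frame_size * 3.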
// if (value.prev_out_value) {
// // sgemm(false, // is_transA
// // false, // is_transB
// // batch_size, // M specifies the number of rows of the matrix
// // frame_size * 2, // N specifies the number of columns of the matrix
// // frame_size, // K
// // 1.f, // alpha
// // value.prev_out_value, // float* A,
// // frame_size, // lda
// // value.gate_weight, // float* B,
// // frame_size * 2, // ldb
// // 1.f, // beta
// // value.gate_value, // C*
// // frame_size * 3, // ldc
// // nullptr, // bias
// // false, // is_bias
// // false, // is_relu
// // ctx); // context
// }
// // gru_unit_reset_act(active_gate, value, frame_size, batch_size);
// if (value.prev_out_value) {
// // sgemm(false,
// // false,
// // batch_size,
// // frame_size,
// // frame_size,
// // 1.f,
// // value.reset_output_value,
// // frame_size,
// // value.state_weight,
// // frame_size,
// // 1.f,
// // value.gate_value + frame_size * 2,
// // frame_size * 3,
// // nullptr,
// // false,
// // false,
// // ctx);
// }
// // gru_unit_out_act(active_node, origin_mode, value, frame_size, batch_size);
// }
namespace fpga {}
}
}
}
\ No newline at end of file
......@@ -38,7 +38,6 @@ class PoolingPE : public PE {
uint32_t k_height = param_.kernelSize[0];
uint32_t k_width = param_.kernelSize[1];
if (param_.globalPooling) {
k_width = input->shape().width();
k_height = input->shape().height();
......@@ -68,8 +67,9 @@ class PoolingPE : public PE {
use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
(k_width > 7 || k_height > 7);
// use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
// (k_width > 255 || k_height > 255);
// use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
// &&
// (k_width > 255 || k_height > 255);
use_cpu_ = param_.type == AVERAGE;
}
......@@ -172,7 +172,8 @@ class PoolingPE : public PE {
input->syncToCPU();
Tensor float_input;
float* float_input_data = float_input.mutableData<float>(FP32, input->shape());
float* float_input_data =
float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
......
......@@ -253,9 +253,8 @@ bool PriorBoxPE::dispatch() {
if (cachedBoxes_ == nullptr) {
cachedBoxes_ = new Tensor();
cachedVariances_ = new Tensor();
cachedBoxes_->mutableData<float16>(FP16, param_.outputBoxes->shape());
cachedVariances_->mutableData<float16>(FP16,
param_.outputVariances->shape());
cachedBoxes_->mutableData<float>(FP32, param_.outputBoxes->shape());
cachedVariances_->mutableData<float>(FP32, param_.outputVariances->shape());
cachedBoxes_->setDataLocation(CPU);
cachedVariances_->setDataLocation(CPU);
compute_prior_box();
......
......@@ -14,19 +14,16 @@ limitations under the License. */
#pragma once
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include <algorithm>
#include <algorithm>
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/tensor.hpp"
#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp"
#include "lite/backends/fpga/KD/tensor.hpp"
namespace paddle {
namespace zynqmp {
class ScalePE : public PE {
public:
inline int gcd(int a, int b) {
......@@ -124,7 +121,7 @@ class ScalePE : public PE {
void apply() {
Tensor* input = param_.input;
Tensor* output = param_.output;
Shape& input_shape = input->shape();
Shape& input_shape = input->shape();
DepthwiseConvParam& dw_param = dw_pe_.param();
int channel = input_shape.channel();
......@@ -136,19 +133,19 @@ class ScalePE : public PE {
int c_lcm = lcm(channel, alignment);
repeat = c_lcm / (channel);
}
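// Illustrative example (the alignment value is hardware-dependent; 16 is
// assumed here): with channel = 24, lcm(24, 16) = 48, so repeat = 48 / 24 = 2
// and the per-channel filter is tiled twice to meet the FPGA
// channel-alignment constraint.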
// TODO: FPGA limits: H > 2047, W > 1023 or W*C > 65536 requires the CPU implementation
// FPGA limits: H > 2047, W > 1023 or W*C > 65536 requires the CPU implementation
Shape shape(N, {channel * repeat});
float* filter_data = filter.mutableData<float>(FP32, shape);
std::fill_n(filter_data, input->shape().channel(), 1.0f);
Tensor *scale = dw_param.scale();
float16* scale_data = scale->mutableData<float16>(FP16, shape);
// memcpy(scale_data, param_.scale->data<float>(), input->shape().channel() * sizeof(float));
Tensor* scale = dw_param.scale();
float16* scale_data = scale->mutableData<float16>(FP16, shape);
// memcpy(scale_data, param_.scale->data<float>(), input->shape().channel()
// * sizeof(float));
Tensor *bias = dw_param.bias();
Tensor* bias = dw_param.bias();
float16* bias_data = bias->mutableData<float16>(FP16, shape);
std::fill_n(bias_data, input->shape().channel(), 0);
......@@ -206,13 +203,14 @@ class ScalePE : public PE {
}
// if (param_.bias != nullptr) {
// memcpy(bias_data, param_.bias->data<float>(), input->shape().channel() * sizeof(float));
// memcpy(bias_data, param_.bias->data<float>(), input->shape().channel()
// * sizeof(float));
// }
dw_param.input = param_.input;
dw_param.output = param_.output;
dw_param.filter = &filter;
dw_param.strides = {1, 1};
dw_param.paddings = {0, 0};
dw_param.kernelSize = {1, 1};
......@@ -220,7 +218,6 @@ class ScalePE : public PE {
dw_pe_.init();
dw_pe_.apply();
}
void cpu_compute() {
......@@ -244,7 +241,8 @@ class ScalePE : public PE {
for (int c = 0; c < input->shape().channel(); c++) {
int index = i * input->shape().channel() + c;
float value = half_to_float(in_data[index]) * scale_data[c];
std::cout << "value:" << value << " = " << half_to_float(in_data[index]) << " x " << scale_data[c] << std::endl;
std::cout << "value:" << value << " = " << half_to_float(in_data[index])
<< " x " << scale_data[c] << std::endl;
data_out[index] = float_to_half(value);
if (value < 0) {
......@@ -261,18 +259,19 @@ class ScalePE : public PE {
}
bool dispatch() {
// cpu_compute();
// return true;
if (param_.scale->dataType() == FP16) {
DepthwiseConvParam& dw_param = dw_pe_.param();
memcpy(dw_param.quantizedFilter()->mutableData<float16>(), param_.scale->data<float16>(), param_.scale->shape().numel() * sizeof(float16));
dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
dw_param.quantizedFilter()->flush();
// apply();
DepthwiseConvParam& dw_param = dw_pe_.param();
memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
param_.scale->data<float16>(),
param_.scale->shape().numel() * sizeof(float16));
dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
dw_param.quantizedFilter()->flush();
// apply();
}
// param_.scale->saveToFile("scale.txt");
// cpu_compute();
......@@ -293,5 +292,3 @@ class ScalePE : public PE {
};
} // namespace zynqmp
} // namespace paddle
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <stdio.h>
#include <unistd.h>
#include <algorithm>
#include <cmath>
#include <cstring>
......@@ -24,8 +25,6 @@ limitations under the License. */
#include <string>
#include <vector>
#include <unistd.h>
// #include "lite/core/tensor.h"
#include "lite/backends/fpga/KD/dl_engine.hpp"
......@@ -303,13 +302,15 @@ class Tensor {
this->invalidate();
}
void flush() {
size_t memorySize = shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
fpga_flush(placeHolder_->data(), memorySize);
void flush() {
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
fpga_flush(placeHolder_->data(), memorySize);
}
void invalidate() {
size_t memorySize = shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
size_t memorySize =
shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
fpga_invalidate(placeHolder_->data(), memorySize);
}
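// flush() writes dirty CPU cache lines back to memory before the FPGA reads
// this tensor; invalidate() discards stale CPU cache lines after the FPGA
// has written, so the next CPU read fetches fresh data.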
......@@ -357,7 +358,8 @@ class Tensor {
std::cout << type << " : "
<< std::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" + std::to_string(shape_->width())
std::to_string(shape_->height()) + "_" +
std::to_string(shape_->width())
<< std::endl;
std::cout << type << " \n";
printScale();
......@@ -383,7 +385,6 @@ class Tensor {
}
void saveToFile(std::string path) {
syncToCPU();
invalidate();
std::ofstream ofs;
......@@ -395,7 +396,7 @@ class Tensor {
}
void save_file_with_name(std::string path) {
return;
// return;
invalidate();
// usleep(20000);
// return;
......@@ -406,7 +407,6 @@ class Tensor {
ofs << "dataType: " << dataType_ << std::endl;
ofs << "scale: " << scale()[0] << " , " << scale()[1] << std::endl;
for (int i = 0; i < shape_->numel(); i++) {
float value = 0;
if (dataType_ == FP32) {
......
......@@ -161,7 +161,7 @@ class TensorLite {
TensorLite Slice(int64_t begin, int64_t end) const;
template <typename T>
void Slice(TensorLite& dst, int64_t begin, int64_t end) const;
void Slice(TensorLite &dst, int64_t begin, int64_t end) const; // NOLINT
TargetType target() const { return target_; }
......@@ -247,9 +247,7 @@ R *TensorLite::mutable_data(TargetType target) {
template <typename T>
TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
throw -1;
throw -1;
CHECK_GE(begin, 0);
CHECK_LE(end, dims_[0]);
CHECK_LT(begin, end);
......@@ -265,20 +263,21 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
auto dst_dims = dims_;
dst_dims[0] = end - begin;
dst.Resize(dst_dims);
void* dst_data = dst.mutable_data<T>();
void *dst_data = dst.mutable_data<T>();
T* src_data = const_cast<T*>(data<T>());
memcpy(dst_data, src_data + static_cast<size_t>(begin * base) * sizeof(T), dst_dims.production() * sizeof(T));
T *src_data = const_cast<T *>(data<T>());
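// src_data is a typed pointer (T*), so the offset below must be counted in
// elements rather than bytes; base = numel() / dims_[0] is the element count
// of one slice along the outermost dimension.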
memcpy(dst_data,
src_data + static_cast<size_t>(begin * base),
dst_dims.production() * sizeof(T));
dst.ZynqTensor()->saveToFile("_slice", true);
// dst.offset_ = offset_ + static_cast<size_t>(begin * base) * sizeof(T);
return dst;
}
}
}
template <typename T>
void TensorLite::Slice(TensorLite& dst, int64_t begin, int64_t end) const {
void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const {
CHECK_GE(begin, 0);
CHECK_LE(end, dims_[0]);
CHECK_LT(begin, end);
......@@ -287,13 +286,14 @@ void TensorLite::Slice(TensorLite& dst, int64_t begin, int64_t end) const {
auto dst_dims = dims_;
dst_dims[0] = end - begin;
dst.Resize(dst_dims);
void* dst_data = dst.mutable_data<T>();
void *dst_data = dst.mutable_data<T>();
int64_t base = numel() / dims_[0];
T* src_data = const_cast<T*>(data<T>());
memcpy(dst_data, src_data + static_cast<size_t>(begin * dst_dims.production()), dst_dims.production() * sizeof(T));
T *src_data = const_cast<T *>(data<T>());
memcpy(dst_data,
src_data + static_cast<size_t>(begin * dst_dims.production()),
dst_dims.production() * sizeof(T));
}
template <typename TensorT>
......
......@@ -90,13 +90,13 @@ void TypeTargetTransformPass::AddIoCopyInst(
// string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id());
// TODO(MyPandaShaoxiang) should set same place with input?
auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name);
// Set the place for io_copy_output_arg node, the target should be equal to
// to.target()
// The precision and layout should be equal to from.precision(), from.layout()
#ifndef LITE_WITH_FPGA
// Set the place for io_copy_output_arg node, the target should be equal to
// to.target()
// The precision and layout should be equal to from.precision(), from.layout()
#ifndef LITE_WITH_FPGA
io_copy_output_arg->AsArg().type =
LiteType::GetTensorTy(to.target(), from.precision(), from.layout());
#endif
#endif
auto* io_copy_inst = graph->NewInstructNode();
bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
......
......@@ -60,57 +60,57 @@ class Optimizer {
InitTargetTypeTransformPass();
if (passes.empty()) {
std::vector<std::string> passes_local{
{"lite_quant_dequant_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
"lite_conv_activation_fuse_pass", //
"lite_fc_fuse_pass", //
"lite_shuffle_channel_fuse_pass", //
"lite_transpose_softmax_transpose_fuse_pass", //
"lite_interpolate_fuse_pass", //
"identity_scale_eliminate_pass", //
std::vector<std::string> passes_local{{
"lite_quant_dequant_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
"lite_conv_activation_fuse_pass", //
"lite_fc_fuse_pass", //
"lite_shuffle_channel_fuse_pass", //
"lite_transpose_softmax_transpose_fuse_pass", //
"lite_interpolate_fuse_pass", //
"identity_scale_eliminate_pass", //
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
"lite_elementwise_add_activation_fuse_pass", //
"lite_elementwise_add_activation_fuse_pass", //
#endif
"static_kernel_pick_pass", // pick original kernel from graph
"variable_place_inference_pass", // inference arg/var's
// info(target/precision/layout/device)
// using kernel info
"argument_type_display_pass", // debug pass: show arg-type-node's
// info
// (target/precision/layout/device)
"type_target_cast_pass", // add io_copy/io_copy_once if meet
// different targets when last and next
// node
"variable_place_inference_pass", //
"argument_type_display_pass", //
"io_copy_kernel_pick_pass", //
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_precision_cast_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_layout_cast_pass", // add layout/layout_once op if meet
// different layout when last and next node
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass",
"runtime_context_assign_pass",
"argument_type_display_pass",
// "memory_optimize_pass"
}};
"static_kernel_pick_pass", // pick original kernel from graph
"variable_place_inference_pass", // inference arg/var's
// info(target/precision/layout/device)
// using kernel info
"argument_type_display_pass", // debug pass: show arg-type-node's
// info
// (target/precision/layout/device)
"type_target_cast_pass", // add io_copy/io_copy_once if meet
// different targets when last and next
// node
"variable_place_inference_pass", //
"argument_type_display_pass", //
"io_copy_kernel_pick_pass", //
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_precision_cast_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"type_layout_cast_pass", // add layout/layout_once op if meet
// different layout when last and next node
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass",
"runtime_context_assign_pass",
"argument_type_display_pass",
// "memory_optimize_pass"
}};
RunPasses(passes_local);
} else {
RunPasses(passes);
......
......@@ -121,7 +121,7 @@ void RuntimeProgram::Run() {
inst.Run();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
// LITE_PRECISION_PROFILE(inst)
// LITE_PRECISION_PROFILE(inst)
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
......
......@@ -221,6 +221,10 @@ void BoxCoderCompute::Run() {
}
}
}
// prior_box->ZynqTensor()->saveToFile("prior_box", true);
// prior_box_var->ZynqTensor()->saveToFile("prior_box_var", true);
// output_box->ZynqTensor()->saveToFile("box_coder", true);
}
} // namespace arm
......
......@@ -85,6 +85,9 @@ void PriorBoxCompute::Run() {
is_clip,
order,
min_max_aspect_ratios_order);
param.boxes->ZynqTensor()->saveToFile("pb_boxes", true);
param.variances->ZynqTensor()->saveToFile("pb_variance", true);
}
} // namespace arm
......@@ -98,22 +101,22 @@ REGISTER_LITE_KERNEL(prior_box,
kNCHW,
paddle::lite::kernels::arm::PriorBoxCompute,
def)
.BindInput("Input",{LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Image", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(prior_box_fpga,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::PriorBoxCompute,
def)
.BindInput("Input",{LiteType::GetTensorTy(
TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny))})
.BindInput("Image", {LiteType::GetTensorTy(
TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny))})
.BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// REGISTER_LITE_KERNEL(prior_box,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::arm::PriorBoxCompute,
// def)
// .BindInput("Input",{LiteType::GetTensorTy(
// TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
// .BindInput("Image", {LiteType::GetTensorTy(
// TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
// .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
// .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
// .Finalize();
......@@ -21,7 +21,7 @@ add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc
add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps})
# add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps})
add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
# add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps})
add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps})
# add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op)
# add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps})
add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps})
......
......@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/concat_compute.h"
#include <string>
#include <vector>
#include "lite/kernels/fpga/concat_compute.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/kernels/fpga/conv_compute.h"
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
......@@ -79,8 +79,8 @@ void ConvCompute::Run() {
} else {
conv_pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ConvParam& conv_param = conv_pe_.param();
Debugger::get_instance().registerOutput("conv", conv_param.output);
zynqmp::ConvParam& conv_param = conv_pe_.param();
Debugger::get_instance().registerOutput("conv", conv_param.output);
#endif
}
}
......
......@@ -12,11 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "lite/kernels/fpga/dropout_compute.h"
#include <string>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/backends/fpga/KD/float16.hpp"
namespace paddle {
namespace lite {
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "lite/kernels/fpga/elementwise_compute.h"
#include <string>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/fpga/KD/debugger.hpp"
......@@ -38,7 +38,7 @@ void ElementwiseAddCompute::PrepareForRun() {
pe_.init();
pe_.apply();
}
void ElementwiseAddCompute::Run() {
void ElementwiseAddCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ElementwiseAddParam& ew_param = pe_.param();
......@@ -60,8 +60,8 @@ void ElementwiseAddActivationCompute::PrepareForRun() {
pe_.init();
pe_.apply();
}
void ElementwiseAddActivationCompute::Run() {
pe_.dispatch();
void ElementwiseAddActivationCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ElementwiseAddParam& ew_param = pe_.param();
Debugger::get_instance().registerOutput("ew_add", ew_param.output);
......@@ -86,12 +86,12 @@ void ElementwiseMulCompute::PrepareForRun() {
zynqmp::Shape shape(zynqmp::N, {channel});
float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape);
float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape);
float scale_value = param.Y->data<float>()[0];;
float scale_value = param.Y->data<float>()[0];
for (int i = 0; i < channel; ++i) {
if (param.Y->dims().production() != 1) {
scale_value = param.Y->ZynqTensor()->data<float>()[i];
}
}
scale_data[i] = scale_value;
bias_data[i] = 0;
}
......@@ -100,10 +100,11 @@ void ElementwiseMulCompute::PrepareForRun() {
pe_.apply();
}
void ElementwiseMulCompute::Run() {
void ElementwiseMulCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::ScaleParam& scale_param = pe_.param();
Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input);
Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
#endif
}
......@@ -123,8 +124,7 @@ REGISTER_LITE_KERNEL(elementwise_add,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
......@@ -162,12 +162,9 @@ REGISTER_LITE_KERNEL(elementwise_mul,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
......@@ -58,6 +58,7 @@ class ElementwiseMulCompute
void Run() override;
virtual ~ElementwiseMulCompute() = default;
private:
zynqmp::ScalePE pe_;
};
......
......@@ -13,9 +13,9 @@
// limitations under the License.
#include "lite/kernels/fpga/fc_compute.h"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
......
......@@ -13,9 +13,9 @@
// limitations under the License.
#include "lite/kernels/fpga/feed_compute.h"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
......
......@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/fetch_compute.h"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/backends/fpga/KD/debugger.hpp"
namespace paddle {
namespace lite {
......@@ -45,7 +45,7 @@ void FetchCompute::PrepareForRun() {
void FetchCompute::Run() {
pe_.dispatch();
auto& param = this->Param<param_t>();
#ifdef FPGA_PRINT_TENSOR
zynqmp::OutputParam& fetch_param = pe_.param();
Debugger::get_instance().registerOutput("fetch", fetch_param.output);
......
......@@ -11,10 +11,12 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/gru_compute.h"
#include <unistd.h>
// #include <chrono>
#include <iostream>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/arm/math/gru_utils.h"
......@@ -23,20 +25,16 @@
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
#include "lite/kernels/fpga/gru_compute.h"
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/backends/fpga/KD/pes/gru_util.hpp"
#include <iostream>
#include <chrono>
#include <unistd.h>
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {
using namespace std;
using float16 = zynqmp::float16;
inline lite_api::ActivationType get_gru_act_type(const std::string& type) {
......@@ -71,7 +69,8 @@ void GRUCompute::PrepareForRun() {
gru_param.bias = bias->ZynqTensor();
gru_param.batch_gate = param.batch_gate->ZynqTensor();
gru_param.batch_reset_hidden_prev = param.batch_reset_hidden_prev->ZynqTensor();
gru_param.batch_reset_hidden_prev =
param.batch_reset_hidden_prev->ZynqTensor();
gru_param.batch_hidden = param.batch_hidden->ZynqTensor();
gru_param.hidden = param.hidden->ZynqTensor();
......@@ -105,12 +104,12 @@ void GRUCompute::Run() {
float* batch_gate_data = batch_gate->mutable_data<float>();
lite::arm::math::LoDTensor2BatchFunctor<float> to_batch;
to_batch(*input, batch_gate, true, param.is_reverse); //1.
to_batch(*input, batch_gate, true, param.is_reverse); // 1.
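// LoDTensor2BatchFunctor reorders the variable-length sequences by
// descending length so that each time step can be processed as one dense
// batch of cur_batch_size rows.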
save_tensor(batch_gate, "_batch_gate.txt");
if (bias) {
auto bias_data = bias->data<float>(); //2.
auto bias_data = bias->data<float>(); // 2.
lite::arm::math::gru_add_with_bias(batch_gate_data,
bias_data,
batch_gate_data,
......@@ -130,13 +129,12 @@ void GRUCompute::Run() {
Tensor ordered_h0;
std::vector<uint64_t> order(batch_gate->lod()[2]);
if (h0) {
// Since the batch computing for GRU reorders the input sequences
// according to their length. The initialized cell state also needs
// to reorder.
// lite::arm::math::ReorderInitState<float>(*h0, order, &ordered_h0, true); //3.
// lite::arm::math::ReorderInitState<float>(*h0, order, &ordered_h0, true);
// //3.
gru_value.prev_out_value = ordered_h0.mutable_data<float>();
gru_tensors.pre_output = ordered_h0.ZynqTensor();
std::cout << "================= h0 =================\n";
......@@ -171,31 +169,36 @@ void GRUCompute::Run() {
batch_reset_hidden_prev->mutable_data<float>() +
bstart * batch_reset_hidden_prev->dims()[1];
zynqmp::Shape float_input_shape(zynqmp::NC, {cur_batch_size, batch_gate->dims()[1]});
float* float_data = float_input.mutableData<float>(zynqmp::FP32, float_input_shape);
memcpy(float_data, gru_value.gate_value, batch_gate->dims()[1] * sizeof(float));
zynqmp::Shape float_input_shape(zynqmp::NC,
{cur_batch_size, batch_gate->dims()[1]});
float* float_data =
float_input.mutableData<float>(zynqmp::FP32, float_input_shape);
memcpy(float_data,
gru_value.gate_value,
batch_gate->dims()[1] * sizeof(float));
float_input.flush();
float* hidden_data = hidden_out.mutableData<float>(zynqmp::FP32, float_input_shape);
float* hidden_data =
hidden_out.mutableData<float>(zynqmp::FP32, float_input_shape);
// memcpy(hidden_prev_data, )
// zynqmp::Tensor* gate = pe_.gate();
gru_tensors.gate = &float_input;
gru_tensors.output = &hidden_out;
pe_.GRUCOmpute(gru_tensors, frame_size,
cur_batch_size,
active_node,
active_gate,
param.origin_mode);
//TODO copy data back to original tensor;
pe_.GRUCOmpute(gru_tensors,
frame_size,
cur_batch_size,
active_node,
active_gate,
param.origin_mode);
// TODO(chonwhite): copy data back to original tensor;
gru_tensors.pre_output = gru_tensors.output;
// exit(-1);
// gru_value.prev_out_value = gru_value.output_value;
}
lite::arm::math::Batch2LoDTensorFunctor<float> to_seq; //5.
lite::arm::math::Batch2LoDTensorFunctor<float> to_seq; // 5.
*(batch_hidden->mutable_lod()) = batch_gate->lod();
batch_hidden->mutable_data<float>();
to_seq(*batch_hidden, hidden);
......@@ -204,7 +207,6 @@ void GRUCompute::Run() {
save_tensor(hidden, "_gru.txt");
exit(-1);
}
} // namespace fpga
......
......@@ -25,8 +25,8 @@ namespace lite {
namespace kernels {
namespace fpga {
class GRUCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
class GRUCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::GRUParam;
......@@ -37,6 +37,7 @@ class GRUCompute
void Run() override;
virtual ~GRUCompute() = default;
private:
zynqmp::Tensor pre_output_;
zynqmp::Tensor pre_bias_;
......@@ -47,7 +48,7 @@ class GRUCompute
zynqmp::FullyConnectedPE reset_out_pe_;
// zynqmp::Tensor input_;
zynqmp::GRUPE pe_;
zynqmp::GRUPE pe_;
};
} // namespace fpga
......
......@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/im2sequence_compute.h"
#include <vector>
#include "lite/api/paddle_place.h"
// #include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/kernels/fpga/im2sequence_compute.h"
#include "lite/backends/fpga/KD/float16.hpp"
......@@ -28,7 +28,6 @@ namespace fpga {
using float16 = zynqmp::float16;
void im2sequence(const float16* input,
const int input_c,
const int input_h,
......@@ -76,17 +75,13 @@ void im2sequence(const float16* input,
}
}
template<typename T>
template <typename T>
void hwc_to_chw(T* chw_data,
const T* hwc_data,
int num,
int channel,
int height,
int width) {
std::cout << " ============= HWC -> CHW =============\n";
std::cout << "channel: " << channel << std::endl;
std::cout << "height: " << height << std::endl;
std::cout << "width: " << width << std::endl;
int chw = channel * height * width;
int wc = width * channel;
int wh = width * height;
......@@ -103,65 +98,6 @@ void hwc_to_chw(T* chw_data,
}
}
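// The conversion maps element (n, h, w, c) from HWC offset
// n * chw + h * wc + w * channel + c to CHW offset
// n * chw + c * wh + h * width + w, using the strides computed above.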
// void im2sequence(const float16* input,
// const int input_c,
// const int input_h,
// const int input_w,
// const int kernel_h,
// const int kernel_w,
// const int pad_top,
// const int pad_bottom,
// const int pad_left,
// const int pad_right,
// const int stride_h,
// const int stride_w,
// const int out_h,
// const int out_w,
// float16* out) {
// int window_size = kernel_h * kernel_w;
// int out_rows = out_h * out_w;
// int out_cols = input_c * window_size;
// int H_pad = input_h + pad_top + pad_bottom;
// int W_pad = input_w + pad_left + pad_right;
// size_t channel_mem_size = input_c * sizeof(float16);
// int index = 0;
// float16 zero = zynqmp::float_to_half(0.0f);
// for (int h_id = 0; h_id < out_h; h_id++) {
// for (int w_id = 0; w_id < out_w; w_id++) {
// // consider dilation.
// int start_h = h_id * stride_h - pad_top;
// int start_w = w_id * stride_w - pad_left;
// for (int c_id = 0; c_id < input_c; c_id++) {
// for (int k_h_id = 0; k_h_id < kernel_h; k_h_id++) {
// int in_h_id = start_h + k_h_id;
// bool exceed_flag = (in_h_id < 0) || (in_h_id >= H_pad);
// int out_start_id =
// (h_id * out_w + w_id) * out_cols + c_id * window_size;
// for (int k_w_id = 0; k_w_id < kernel_w; k_w_id++) {
// int in_w_id = start_w + k_w_id;
// void* dst = out + index;//TODO
// exceed_flag = exceed_flag || (in_w_id < 0) || (in_w_id >= W_pad);
// if (exceed_flag) {
// memset( dst, 0, channel_mem_size);
// } else {
// void* src = const_cast<float16*>(input) + (in_h_id * input_w + in_w_id);
// memcpy(dst, src, channel_mem_size);
// }
// index++;
// int input_id = (c_id * input_h + in_h_id) * input_w + in_w_id;
// int out_id = out_start_id + k_h_id * kernel_w + k_w_id;
// out[out_id] = exceed_flag ? zero : input[input_id];
// }
// }
// }
// }
// }
// }
void Im2SequenceCompute::PrepareForRun() {}
void Im2SequenceCompute::Run() {
......@@ -171,7 +107,8 @@ void Im2SequenceCompute::Run() {
auto paddings = param.paddings;
const auto* x_data = param.X->data<float16>();
float16* o_data = (float16*)param.Out->mutable_data<float16>();
float16* o_data =
reinterpret_cast<float16*>(param.Out->mutable_data<float16>());
float16* o2 = o_data;
......@@ -181,18 +118,17 @@ void Im2SequenceCompute::Run() {
param.X->ZynqTensor()->syncToCPU();
float16* chw_data = new float16[param.X->numel()];
hwc_to_chw<float16>(chw_data, x_data, param.X->dims()[0], param.X->dims()[1], param.X->dims()[2], param.X->dims()[3]);
hwc_to_chw<float16>(chw_data,
x_data,
param.X->dims()[0],
param.X->dims()[1],
param.X->dims()[2],
param.X->dims()[3]);
const float16* in = chw_data;
const float16* in = chw_data;
int out_cols = input_dims[1] * kernels[0] * kernels[1];
std::cout << "im_num:" << im_num << " im_size:" << im_size << std::endl;
std::cout << "out_cols:" << out_cols << std::endl;
// exit(-1);
int total_rows = 0;
std::vector<uint64_t> im_offset;
im_offset.push_back(total_rows);
......@@ -228,20 +164,20 @@ void Im2SequenceCompute::Run() {
int out_offset = 0;
for (int im_id = 0; im_id < im_num; im_id++) {
im2sequence(in + im_id * im_size,
input_dims[1],
input_dims[2],
input_dims[3],
param.kernels[0],
param.kernels[1],
param.paddings[0],
param.paddings[1],
param.paddings[2],
param.paddings[3],
param.strides[0],
param.strides[1],
out_h_vec[im_id],
out_w_vec[im_id],
o2 + im_offset[im_id] * out_cols);
input_dims[1],
input_dims[2],
input_dims[3],
param.kernels[0],
param.kernels[1],
param.paddings[0],
param.paddings[1],
param.paddings[2],
param.paddings[3],
param.strides[0],
param.strides[1],
out_h_vec[im_id],
out_w_vec[im_id],
o2 + im_offset[im_id] * out_cols);
}
} else {
int out_h =
......@@ -253,20 +189,20 @@ void Im2SequenceCompute::Run() {
for (int im_id = 0; im_id < im_num; im_id++) {
int out_size_per_im = out_h * out_w * out_cols;
im2sequence(in + im_id * im_size,
input_dims[1],
input_dims[2],
input_dims[3],
param.kernels[0],
param.kernels[1],
param.paddings[0],
param.paddings[1],
param.paddings[2],
param.paddings[3],
param.strides[0],
param.strides[1],
out_h,
out_w,
o2 + im_id * out_size_per_im);
input_dims[1],
input_dims[2],
input_dims[3],
param.kernels[0],
param.kernels[1],
param.paddings[0],
param.paddings[1],
param.paddings[2],
param.paddings[3],
param.strides[0],
param.strides[1],
out_h,
out_w,
o2 + im_id * out_size_per_im);
im_offset.push_back(uint64_t((im_id + 1) * out_h * out_w));
}
auto lod = param.Out->mutable_lod();
......@@ -277,8 +213,6 @@ void Im2SequenceCompute::Run() {
delete[] chw_data;
param.Out->ZynqTensor()->flush();
param.Out->ZynqTensor()->copyScaleFrom(param.X->ZynqTensor());
param.X->ZynqTensor()->saveToFile("im_in.txt");
param.Out->ZynqTensor()->saveToFile("im2sequence.txt");
}
} // namespace fpga
......@@ -292,11 +226,13 @@ REGISTER_LITE_KERNEL(im2sequence,
kNHWC,
paddle::lite::kernels::fpga::Im2SequenceCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
......@@ -23,8 +23,8 @@ namespace lite {
namespace kernels {
namespace fpga {
class Im2SequenceCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
class Im2SequenceCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::Im2SequenceParam;
......
......@@ -47,9 +47,11 @@ class IoCopyHostToFpgaCompute
param.x->target() == TARGET(kFPGA));
// param.y->CopyDataFrom(*param.x);
param.y->mutable_data<float16>();
if (param.x->ZynqTensor()->aligned() && param.x->ZynqTensor()->shape().shouldAlign()) {
if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) {
zynqmp::Tensor tempTensor;
tempTensor.mutableData<float16>(zynqmp::FP16, param.x->ZynqTensor()->shape());
tempTensor.mutableData<float16>(zynqmp::FP16,
param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor());
// tempTensor.saveToFile("tempTensor", true);
tempTensor.setAligned(true);
......@@ -63,7 +65,6 @@ class IoCopyHostToFpgaCompute
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
auto out_lod = param.y->mutable_lod();
*out_lod = param.x->lod();
}
std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
......@@ -106,9 +107,11 @@ class IoCopyFpgaToHostCompute
param.y->ZynqTensor()->setDataType(zynqmp::FP32);
param.x->ZynqTensor()->syncToDevice();
if (param.x->ZynqTensor()->aligned() && param.x->ZynqTensor()->shape().shouldAlign()) {
if (param.x->ZynqTensor()->aligned() &&
param.x->ZynqTensor()->shape().shouldAlign()) {
zynqmp::Tensor tempTensor;
tempTensor.mutableData<float16>(zynqmp::FP16, param.x->ZynqTensor()->shape());
tempTensor.mutableData<float16>(zynqmp::FP16,
param.x->ZynqTensor()->shape());
tempTensor.copyFrom(param.x->ZynqTensor());
// tempTensor.saveToFile("tempTensor", true);
tempTensor.setAligned(true);
......@@ -124,7 +127,6 @@ class IoCopyFpgaToHostCompute
*out_lod = param.x->lod();
}
std::string doc() const override { return "Copy IO from FPGA to HOST"; }
};
......
......@@ -18,7 +18,6 @@
#include "lite/core/op_registry.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/type_system.h"
#include "lite/api/paddle_place.h"
namespace paddle {
namespace lite {
......@@ -27,13 +26,9 @@ namespace fpga {
using float16 = zynqmp::float16;
template<typename T>
void convert_to_hwc(T* chw_data,
T* hwc_data,
int num,
int channel,
int height,
int width) {
template <typename T>
void convert_to_hwc(
T* chw_data, T* hwc_data, int num, int channel, int height, int width) {
std::cout << " -------------- chw -> HWC ---------------\n";
std::cout << "channel: " << channel << std::endl;
std::cout << "height: " << height << std::endl;
......@@ -54,13 +49,9 @@ void convert_to_hwc(T* chw_data,
}
}
template<typename T>
void hwc_to_chw(T* chw_data,
T* hwc_data,
int num,
int channel,
int height,
int width) {
template <typename T>
void hwc_to_chw(
T* chw_data, T* hwc_data, int num, int channel, int height, int width) {
std::cout << " ============= HWC -> CHW =============\n";
std::cout << "channel: " << channel << std::endl;
std::cout << "height: " << height << std::endl;
......@@ -98,13 +89,13 @@ void TransHwcToChw(Tensor* dest, const Tensor* src) {
if (dest->dims().size() > 3) {
width = dest->dims()[3];
}
hwc_to_chw<float>(chw, hwc, num, channel, height, width);
}
if (src->ZynqTensor()->dataType() == zynqmp::FP16) {
std::cout << "float16\n";
float16* chw = dest->mutable_data<float16>();
float16* chw = dest->mutable_data<float16>();
float16* hwc = const_cast<float16*>(src->data<float16>());
int num = dest->dims()[0];
int channel = dest->dims()[1];
......@@ -116,10 +107,9 @@ void TransHwcToChw(Tensor* dest, const Tensor* src) {
if (dest->dims().size() > 3) {
width = dest->dims()[3];
}
hwc_to_chw<float16>(chw, hwc, num, channel, height, width);
}
}
void TransChwToHwc(Tensor* dest, const Tensor* src) {
std::cout << "chw to hwc \n";
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/kernels/fpga/mul_compute.h"
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
......@@ -27,7 +27,7 @@ namespace fpga {
using float16 = zynqmp::float16;
void MulCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
auto& param = this->Param<param_t>();
// ====================================================
zynqmp::FullyConnectedParam& fc_param = pe_.param();
......@@ -44,7 +44,8 @@ void MulCompute::PrepareForRun() {
zynqmp::Shape bias_shape(zynqmp::N, {channel});
float* bias_data = fc_param.bias->mutableData<float>(zynqmp::FP32, bias_shape);
float* bias_data =
fc_param.bias->mutableData<float>(zynqmp::FP32, bias_shape);
memset(bias_data, 0, channel * sizeof(float));
bias_.flush();
......@@ -62,7 +63,6 @@ void mul(MulCompute* k) {
float16* out_data = param.output->mutable_data<float16>();
int g_index = 0;
for (int n = 0; n < 1; n++) {
for (int on = 0; on < fn; on++) {
float sum = 0;
int si = 0;
......@@ -79,10 +79,9 @@ void mul(MulCompute* k) {
}
}
void MulCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::FullyConnectedParam& fc_param = pe_.param();
Debugger::get_instance().registerOutput("mul", fc_param.output);
......@@ -96,11 +95,13 @@ void MulCompute::Run() {
REGISTER_LITE_KERNEL(
mul, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::MulCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
......@@ -25,8 +25,8 @@ namespace lite {
namespace kernels {
namespace fpga {
class MulCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
class MulCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::MulParam;
......
......@@ -209,7 +209,7 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
SliceOneClass<T>(scores, c, &score_slice);
SliceOneClass<T>(bboxes, c, &bbox_slice);
}
NMSFast(bboxes,// TODO bbox_slice
NMSFast(bboxes,
score_slice,
score_threshold,
nms_threshold,
......@@ -226,7 +226,6 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
*num_nmsed_out = num_det;
const T* scores_data = scores.data<T>();
if (keep_top_k > -1 && num_det > keep_top_k) {
Tensor score_slice;
const T* sdata;
......@@ -333,7 +332,7 @@ void MulticlassNmsCompute::Run() {
std::vector<std::map<int, std::vector<int>>> all_indices;
std::vector<uint64_t> batch_starts = {0};
int64_t batch_size = score_dims[0];
int64_t out_dim = box_dim + 2;
int num_nmsed_out = 0;
Tensor boxes_slice, scores_slice;
......@@ -356,8 +355,6 @@ void MulticlassNmsCompute::Run() {
batch_starts.push_back(batch_starts.back() + num_nmsed_out);
}
uint64_t num_kept = batch_starts.back();
if (num_kept == 0) {
outs->Resize({1, 1});
......@@ -394,10 +391,12 @@ void MulticlassNmsCompute::Run() {
outs->set_lod(lod);
#ifdef FPGA_PRINT_TENSOR
Debugger::get_instance().registerOutput("boxes", boxes->ZynqTensor());
Debugger::get_instance().registerOutput("scores", scores->ZynqTensor());
Debugger::get_instance().registerOutput("nms", outs->ZynqTensor());
#endif
}
} // namespace host
} // namespace fpga
} // namespace kernels
} // namespace lite
} // namespace paddle
......
......@@ -50,13 +50,16 @@ void NormCompute::Run() {
REGISTER_LITE_KERNEL(
norm, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::NormCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Norm", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Norm",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
......@@ -26,8 +26,8 @@ namespace lite {
namespace kernels {
namespace fpga {
class NormCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
class NormCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::NormParam;
......
......@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/pooling_compute.h"
#include <string>
#include <vector>
#include "lite/kernels/fpga/pooling_compute.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
......
......@@ -12,10 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/prior_box_compute.h"
#include <string>
#include <vector>
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/kernels/fpga/prior_box_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
......@@ -23,11 +25,59 @@ namespace fpga {
using float16 = zynqmp::float16;
inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
bool flip,
std::vector<float>* output_aspect_ratior) {
constexpr float epsilon = 1e-6;
output_aspect_ratior->clear();
output_aspect_ratior->push_back(1.0f);
for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
float ar = input_aspect_ratior[i];
bool already_exist = false;
for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
already_exist = true;
break;
}
}
if (!already_exist) {
output_aspect_ratior->push_back(ar);
if (flip) {
output_aspect_ratior->push_back(1.0f / ar);
}
}
}
}
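// Usage sketch:
//   std::vector<float> ratios;
//   ExpandAspectRatios({2.0f, 3.0f}, /*flip=*/true, &ratios);
//   // ratios == {1.0f, 2.0f, 0.5f, 3.0f, 1.0f / 3.0f}
// Ratios that match an existing entry within epsilon are skipped.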
void PriorBoxCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
bool is_flip = param.flip;
bool is_clip = param.clip;
std::vector<float> min_size = param.min_sizes;
std::vector<float> max_size = param.max_sizes;
std::vector<float> aspect_ratio = param.aspect_ratios;
std::vector<float> variance = param.variances_;
int img_w = param.img_w;
int img_h = param.img_h;
float step_w = param.step_w;
float step_h = param.step_h;
float offset = param.offset;
std::vector<float> aspect_ratios_vec;
ExpandAspectRatios(aspect_ratio, is_flip, &aspect_ratios_vec);
size_t prior_num = aspect_ratios_vec.size() * min_size.size();
prior_num += max_size.size();
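// Example: 3 expanded aspect ratios with 2 min sizes and 1 max size give
// prior_num = 3 * 2 + 1 = 7 prior boxes per feature-map cell.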
std::vector<std::string> order = param.order;
bool min_max_aspect_ratios_order = param.min_max_aspect_ratios_order;
param.boxes->mutable_data<float16>();
param.variances->mutable_data<float16>();
int win1 = param.input->dims()[3];
int hin1 = param.input->dims()[2];
DDim shape_out({hin1, win1, prior_num, 4});
param.boxes->Resize(shape_out);
param.variances->Resize(shape_out);
param.boxes->mutable_data<float>();
param.variances->mutable_data<float>();
// ====================================================
zynqmp::PriorBoxParam& priobox_param = pe_.param();
priobox_param.input = param.input->ZynqTensor();
......@@ -38,7 +88,7 @@ void PriorBoxCompute::PrepareForRun() {
priobox_param.maxSizes = param.max_sizes;
priobox_param.aspectRatios = param.aspect_ratios;
priobox_param.variances = param.variances_;
// priobox_param.minMaxAspectRatiosOrder = param->MinMaxAspectRatiosOrder();
priobox_param.minMaxAspectRatiosOrder = min_max_aspect_ratios_order;
priobox_param.flip = param.flip;
priobox_param.clip = param.clip;
priobox_param.stepW = param.step_w;
......@@ -49,7 +99,16 @@ void PriorBoxCompute::PrepareForRun() {
pe_.apply();
}
void PriorBoxCompute::Run() { pe_.dispatch(); }
void PriorBoxCompute::Run() {
pe_.dispatch();
#ifdef FPGA_PRINT_TENSOR
zynqmp::PriorBoxParam& priobox_param = pe_.param();
Debugger::get_instance().registerOutput("pb_boxes",
priobox_param.outputBoxes);
Debugger::get_instance().registerOutput("pb_variances",
priobox_param.outputVariances);
#endif
}
} // namespace fpga
} // namespace kernels
......@@ -62,8 +121,30 @@ REGISTER_LITE_KERNEL(prior_box,
kNHWC,
paddle::lite::kernels::fpga::PriorBoxCompute,
def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Image", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindInput("Image",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// REGISTER_LITE_KERNEL(prior_box,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::fpga::PriorBoxCompute,
// def)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindInput("Image", {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
// .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
// .Finalize();
......@@ -23,7 +23,8 @@ namespace lite {
namespace kernels {
namespace fpga {
class PriorBoxCompute : public KernelLite<TARGET(kFPGA), PRECISION(kFP16)> {
class PriorBoxCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::PriorBoxParam;
......
......@@ -73,11 +73,13 @@ REGISTER_LITE_KERNEL(sequence_pool,
kNHWC,
paddle::lite::kernels::fpga::SequencePoolCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA),
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("MaxIndex", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
......@@ -23,8 +23,8 @@ namespace lite {
namespace kernels {
namespace fpga {
class SequencePoolCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
class SequencePoolCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
......
......@@ -29,7 +29,6 @@ using float16 = zynqmp::float16;
class SoftmaxCompute
: public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
public:
void PrepareForRun() override;
void Run() override;
......
......@@ -27,7 +27,6 @@ namespace fpga {
using float16 = zynqmp::float16;
// Transpose
void TransposeCompute::Run() {
auto& param = this->Param<param_t>();
......@@ -45,7 +44,6 @@ void Transpose2Compute::Run() {
// auto out = param.Out();
// auto out_data = out->data<half>();
// int num = input_x_dims[1];
// int channel = input_x_dims[2];
......@@ -58,7 +56,7 @@ void Transpose2Compute::Run() {
// index++;
// }
// }
} else {
param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
}
......@@ -97,8 +95,6 @@ REGISTER_LITE_KERNEL(transpose2,
{LiteType::GetTensorTy(TARGET(kFPGA),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("XShape",
{LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
......@@ -3,7 +3,7 @@ message(STATUS "compile with lite host kernels")
add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_deps})
add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
# add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
#lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
#lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any)
......@@ -392,9 +392,13 @@ REGISTER_LITE_KERNEL(multiclass_nms,
kNCHW,
paddle::lite::kernels::host::MulticlassNmsCompute,
def)
.BindInput("BBoxes", {LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
.BindInput("Scores", {LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
.BindInput("BBoxes",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindInput("Scores",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
......@@ -13,7 +13,7 @@ readonly NUM_PROC=${LITE_BUILD_THREADS:-4}
# global variables
BUILD_EXTRA=OFF
BUILD_EXTRA=ON
BUILD_JAVA=ON
BUILD_PYTHON=OFF
BUILD_DIR=$(pwd)
......