diff --git a/.gitignore b/.gitignore
index ce40fea2be877c09bb299781d8937c081843b50c..9db2912c07bc2d6abb01c322a25519ac0ff158fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -104,10 +104,3 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources
 metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
 metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
 metal/MobileNetDemo/MobileNetDemo/Resources
-
-# generated files
-lite/api/paddle_use_kernels.h
-lite/api/paddle_use_ops.h
-lite/backends/arm/math/dotprod/gemm_sdot.h
-lite/tools/cmake_tools/ast.pyc
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 786b1322b346631d1570a6ebd9c572302531db4e..77a94bea1efcdafaa67b4c078bfb0a756f7b1cec 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,8 +22,6 @@ if (WITH_PADDLE_MOBILE)
   return()
 endif(WITH_PADDLE_MOBILE)
 
-# set(CMAKE_BUILD_TYPE DEBUG)
-
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 set(CMAKE_CXX_STANDARD 11)
diff --git a/fpga.sh b/fpga.sh
deleted file mode 100644
index e0501ac14b5269139688169017c057bd2458ab7c..0000000000000000000000000000000000000000
--- a/fpga.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-./lite/tools/build.sh \
-  --arm_os=armlinux \
-  --arm_abi=armv8 \
-  --arm_lang=gcc \
-  test
diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h
index f5b7ea4d9f43b2a8802cd86da98bb8e95197d896..986796b4fbd1f6100eef030e46d3cf981fe717d4 100644
--- a/lite/api/paddle_use_passes.h
+++ b/lite/api/paddle_use_passes.h
@@ -41,6 +41,7 @@ USE_MIR_PASS(lite_quant_dequant_fuse_pass);
 USE_MIR_PASS(type_precision_cast_pass);
 USE_MIR_PASS(type_layout_cast_pass);
 USE_MIR_PASS(memory_optimize_pass);
+USE_MIR_PASS(kernel_place_correct_pass)
 USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
 USE_MIR_PASS(npu_subgraph_pass);
 USE_MIR_PASS(xpu_subgraph_pass);
diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp
index 81ae57fc88b25dd907c21efab7f79dfe7e524d98..9b1189c407d6d601bb3e5ba8172b1455f04710fd 100755
--- a/lite/backends/fpga/KD/debugger.hpp
+++ b/lite/backends/fpga/KD/debugger.hpp
@@ -33,7 +33,7 @@ class Debugger {
   void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
     if (op_config[op_type]) {
-      // tensor->saveToFile(op_type, true);
+      tensor->saveToFile(op_type, true);
     }
   }
 
@@ -43,6 +43,8 @@ class Debugger {
     op_config["concat"] = true;
     op_config["pooling"] = true;
     op_config["conv"] = true;
+    op_config["dwconv"] = true;
+    op_config["ew_add"] = true;
     op_config["crop"] = true;
     op_config["feed"] = true;
     op_config["mul"] = true;
diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
old mode 100644
new mode 100755
index bf5ab6212b852fdf1cb2a1b9856a1be5fccb7cf9..1408a034cb6a975e32d92da0406f98df7f2409c1
--- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
+++ b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
@@ -61,7 +61,9 @@ void reset_device() {
 // memory management;
 void *fpga_malloc(size_t size) {
+
 #ifdef PADDLE_MOBILE_OS_LINUX
+
   void *ptr = reinterpret_cast<void *>(
       mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
   if (ptr == MAP_FAILED) {
diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp
index 37c300bd8658a9794263add630a055e27366797b..cea22e0edc647b3bf4f0ac15e43121b5d8926154 100755
--- a/lite/backends/fpga/KD/pes/conv_process.hpp
+++ b/lite/backends/fpga/KD/pes/conv_process.hpp
@@ -656,7 +656,6 @@ inline bool compute_conv(const ConvParam& c_conv_params) {
   }
   size_t size = params.size();
   if (ret == 0 && size > 1) {
-    // Tensor* output = conv_params.output;
     Tensor& img = params[0]->output;
     for (int i = 0; i < 1; i++) {
       for (int i = 0; i < img.shape().numel(); i++) {
diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
index 4bb5dc28009376307fea442093f3d9df55ecb894..9958990af6eb237d2122a63e1b7ed947ca329d31 100755
--- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
@@ -96,6 +96,7 @@ class DepthwiseConvPE : public PE {
     float16* scale_data = param_.scale()->data<float16>();
     float16* filter_data = param.quantizedFilter()->mutableData<float16>(
         FP16, param.filter->shape());
+    // memcpy(filter_data, scale_data, channel * sizeof(float16));
     memcpy(filter_data,
            scale_data,
diff --git a/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp
old mode 100644
new mode 100755
diff --git a/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp
old mode 100755
new mode 100644
diff --git a/lite/backends/fpga/KD/pes/gru_pe.hpp b/lite/backends/fpga/KD/pes/gru_pe.hpp
index 0e49a006abfc45203008bc6da99270b847889207..299ffb872b4620fc409eb8e66760a6308a814efb 100755
--- a/lite/backends/fpga/KD/pes/gru_pe.hpp
+++ b/lite/backends/fpga/KD/pes/gru_pe.hpp
@@ -121,7 +121,6 @@ class GRUPE : public PE {
       prev_hidden_.copyFrom(value.pre_output);
     }
     mul_pe_.dispatch();
-    // reset_hidden_.saveToFile("reset_hidden_.txt");
 
     update_gate_data += stride_update;
     reset_gate_data += stride_update;
@@ -172,7 +171,6 @@ class GRUPE : public PE {
   zynqmp::Tensor bias_;
   zynqmp::Tensor weight_;
   zynqmp::Tensor state_weight_;
-
   zynqmp::Tensor update_gate_;
   zynqmp::Tensor reset_gate_;
   zynqmp::Tensor cell_state_;
diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp
old mode 100755
new mode 100644
index 8cc3188018105f2ae93bf9b434820d24cb18a751..988bc1bb507036de8f13a6c6549c549718bd1256
--- a/lite/backends/fpga/KD/tensor.hpp
+++ b/lite/backends/fpga/KD/tensor.hpp
@@ -346,19 +346,9 @@ class Tensor {
     if (placeHolder_ == nullptr) {
       return;
     }
-    std::cout << scale()[0] << " , " << scale()[1] << std::endl;
   }
 
-  void printScale(std::string type) {
-    std::cout << type << " : "
-              << std::to_string(shape_->num()) + "_" +
-                     std::to_string(shape_->channel()) + "_" +
-                     std::to_string(shape_->height()) + "_" +
-                     std::to_string(shape_->width())
-              << std::endl;
-    std::cout << type << " \n";
-    printScale();
-  }
+  void printScale(std::string type) { printScale(); }
 
   std::string dimsFileName() {
     return std::to_string(shape_->num()) + "_" +
@@ -386,7 +376,6 @@ class Tensor {
     static int counter = 0;
     std::string npath = std::to_string(counter) + "_" + path;
     counter++;
-    std::cout << "======== saving file:" << npath << " ============\n";
     save_file_with_name(npath);
   }
diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h
old mode 100755
new mode 100644
index 49aded3d7d7db6d293e13298d98c2f3b165f411f..311fc8a98400e5a6916ba1b9c8de1e6e0bcec4c0
--- a/lite/backends/fpga/lite_tensor.h
+++ b/lite/backends/fpga/lite_tensor.h
@@ -165,9 +165,6 @@ class TensorLite {
 
   TargetType target() const { return target_; }
 
-  // template <typename T>
-  // TensorLite Slice(int64_t begin, int64_t end) const;
-
   zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
 
   friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
@@ -257,7 +254,6 @@ TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
   int64_t base = numel() / dims_[0];
 
   TensorLite dst;
-  dst.target_ = target_;
   auto dst_dims = dims_;
   dst_dims[0] = end - begin;
diff --git a/lite/core/kernel.h b/lite/core/kernel.h
old mode 100755
new mode 100644
diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt
index a32e0295dbfc2b3e635472649b437b64f1e93145..fe03492a78ed8573182ed1c874b07a14bd7fa912 100644
--- a/lite/core/mir/CMakeLists.txt
+++ b/lite/core/mir/CMakeLists.txt
@@ -25,6 +25,7 @@ lite_cc_library(mir_passes
     elimination/elementwise_mul_constant_eliminate_pass.cc
     static_kernel_pick_pass.cc
     variable_place_inference_pass.cc
+    kernel_place_correct_pass.cc
     type_target_cast_pass.cc
    type_layout_cast_pass.cc
     type_precision_cast_pass.cc
diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc
index ff5a7a1f25239d9dbfc79491bd137804b16b6cfa..2720404fb03cddaf00c9a25d8287b14d69ca86e8 100644
--- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc
+++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc
@@ -27,10 +27,24 @@ namespace mir {
 void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   // delete quant node
   std::vector<std::string> quant_op_types = {
-      "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
+      "fake_quantize_abs_max",
+      "fake_quantize_range_abs_max",
+      "fake_quantize_moving_average_abs_max"};
+  /*
+  for (auto& op_type : {"conv2d", "mul", "depthwise_conv2d"}) {
+    for (int i = 5; i >= 1; --i){
+      fusion::DynamicQuantDequantOpFuser fuser("fake_quantize_abs_max", op_type,
+                                               i);
+      fuser(graph.get());
+    }
+  }
+  */
+
   for (auto& op_type : quant_op_types) {
     fusion::DeleteQuantOpFuser fuser(op_type);
     fuser(graph.get());
+    fusion::DeleteDynamicQuantOpFuser dfuser(op_type);
+    dfuser(graph.get());
   }
 
   // fuse quantized node and dequant node
diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc
old mode 100644
new mode 100755
index da611e4490f4ba7268d9011b3dbb391a63a88305..578fac7eea151e2df95d777ffaeb20250f543b92
--- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc
+++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc
@@ -77,6 +77,55 @@ cpp::OpDesc DeleteQuantOpFuser::GenOpDesc(const key2nodes_t& matched) {
   return op_desc;
 }
 
+void DeleteDynamicQuantOpFuser::BuildPattern() {
+  auto* input_act_node =
+      VarNode("input_act_node")->assert_is_op_input(quant_op_type_, "X");
+  auto* quant_node =
+      OpNode("quant_node", quant_op_type_)->assert_is_op(quant_op_type_);
+  auto* output_scale_node =
+      VarNode("output_scale_node")
+          ->assert_is_op_output(quant_op_type_, "OutScale");
+  auto* output_act_node =
+      VarNode("output_act_node")->assert_is_op_output(quant_op_type_, "Out");
+
+  quant_node->LinksFrom({input_act_node});
+  output_scale_node->LinksFrom({quant_node});
+  output_act_node->LinksFrom({quant_node});
+  VLOG(4) << "DeleteQuantOpFuser BuildPattern quant_op_type:" << quant_op_type_;
+}
+
+void DeleteDynamicQuantOpFuser::InsertNewNode(SSAGraph* graph,
+                                              const key2nodes_t& matched) {
+  auto* input_act_node = matched.at("input_act_node");
+  auto* quant_node = matched.at("quant_node");
+  auto* output_scale_node = matched.at("output_scale_node");
+  auto* output_act_node = matched.at("output_act_node");
+
+  // obtain values, save values and relink node
+  int bit_length = quant_node->stmt()->op_info()->GetAttr<int>("bit_length");
+  int range = ((1 << (bit_length - 1)) - 1);
+  auto* scope = quant_node->stmt()->op()->scope();
+  auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name)
+                           ->GetMutable<lite::Tensor>();
+  float scale_value = scale_tensor->data<float>()[0] / range;
+
+  auto outlinks = output_act_node->outlinks;
+  for (auto* quantized_node : outlinks) {
+    auto* op_desc = quantized_node->stmt()->mutable_op_info();
+    op_desc->SetAttr("bit_length", bit_length);
+    IR_NODE_LINK_TO(input_act_node, quantized_node)
+  }
+
+  // delete nodes and edges
+  std::unordered_set<const Node*> nodes2rm = {
+      quant_node, output_scale_node, output_act_node};
+  GraphSafeRemoveNodes(graph, nodes2rm);
+}
+
+cpp::OpDesc DeleteDynamicQuantOpFuser::GenOpDesc(const key2nodes_t& matched) {
+  cpp::OpDesc op_desc;
+  return op_desc;
+}
 void DequantOpFuser::BuildPattern() {
   std::string weight_name = "";
   if (quantized_op_type_ == "conv2d" ||
@@ -130,8 +179,11 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
   auto& valid_places = quantized_op->stmt()->op()->valid_places();
   int bit_length = quantized_op->stmt()->op_info()->GetAttr<int>("bit_length");
   int range = ((1 << (bit_length - 1)) - 1);
-  float input_scale =
-      quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
+  float input_scale = 0;
+  if (quantized_op->stmt()->op_info()->HasAttr("input_scale")) {
+    input_scale =
+        quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
+  }
   float max_range = dequant_op->stmt()->op_info()->GetAttr<float>("max_range");
   float whole_weight_scale =
       static_cast<float>(range * range) / max_range / range;
@@ -162,8 +214,12 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
   for (int i = 0; i < weight_scale_size; i++) {
     weight_scale.push_back(whole_weight_scale);
   }
+#ifndef LITE_WITH_FPGA
   op_desc.SetAttr("enable_int8", true);
-  op_desc.SetAttr("input_scale", input_scale);
+#endif
+  if (quantized_op->stmt()->op_info()->HasAttr("input_scale")) {
+    op_desc.SetAttr("input_scale", input_scale);
+  }
   op_desc.SetAttr("weight_scale", weight_scale);
 
   // change the weight from the float type to int8 type.
@@ -171,12 +227,30 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
   temp_tensor.CopyDataFrom(*quantized_weight_t);
   float* temp_data = temp_tensor.mutable_data<float>();
   size_t weight_num = quantized_weight_t->data_size();
+
+#ifdef LITE_WITH_FPGA
+  float* quantized_weight_data = quantized_weight_t->mutable_data<float>();
+  for (size_t i = 0; i < weight_num; i++) {
+    quantized_weight_data[i] = temp_data[i] * whole_weight_scale;
+  }
+  quantized_weight_t->set_persistable(true);
+  quantized_weight_t->set_precision(PRECISION(kFloat));
+#else
   int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
   for (size_t i = 0; i < weight_num; i++) {
     quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
   }
   quantized_weight_t->set_persistable(true);
   quantized_weight_t->set_precision(PRECISION(kInt8));
+#endif
+
+
+  // int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
+  // for (size_t i = 0; i < weight_num; i++) {
+  //   quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
+  // }
+  // quantized_weight_t->set_persistable(true);
+  // quantized_weight_t->set_precision(PRECISION(kInt8));
 
   // new op and relink nodes
   auto new_quantized_op = LiteOpRegistry::Global().Create(quantized_op_type_);
@@ -464,6 +538,194 @@ cpp::OpDesc DeleteQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
   cpp::OpDesc op_desc;
   return op_desc;
 }
+// ================dynamic quant fuse==============
+// #define DYNAMIC_RANGE
+void DynamicQuantDequantOpFuser::BuildPattern() {
+  const int kNumFields = 5;
+  const int kQuantizedWeightOffset = 0;
+  const int kQuantizedOpOffset = 1;
+  const int kQuantizedOpOutOffset = 2;
+  const int kDequantOpOffset = 3;
+  const int kDequantOpOutOffset = 4;
+
+  std::string weight_name = "";
+  if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") {
+    weight_name = "Filter";
+  } else {
+    weight_name = "Y";
+  }
+  auto* quant_op_input = VarNode("quant_op_input")
+                             ->assert_is_op_input(quant_type_, "X")
+                             ->AsInput();
+#ifdef DYNAMIC_RANGE
+  auto* quant_op_in_scale = VarNode("quant_op_in_scale")
+                                ->assert_is_op_input(quant_type_, "InScale")
+                                ->AsIntermediate();
+#endif
+  auto* quant_op = OpNode("quant_op", quant_type_)
+                       ->assert_is_op(quant_type_)
+                       ->AsIntermediate();
+
+  auto* quant_op_out_scale =
+      VarNode("quant_op_out_scale")
+          ->assert_is_op_output(quant_type_, "OutScale")
+          ->assert_is_op_input("fake_dequantize_max_abs", "Scale")
+          ->AsIntermediate();
+
+  auto* quant_op_out = VarNode("quant_op_out")
+                           ->assert_is_op_output(quant_type_, "Out")
+                           ->assert_is_op_input(op_type_)
+                           ->AsIntermediate();
+  std::vector<PMNode*> nodes;
+  for (int i = 0; i < times_; i++) {
+    nodes.push_back(VarNode(string_format("quantized_op_weight%d", i))
+                        ->assert_is_op_input(op_type_, weight_name)
+                        ->AsInput());
+
+    nodes.push_back(OpNode(string_format("quantized_op%d", i), op_type_)
+                        ->assert_is_op(op_type_)
+                        ->AsIntermediate());
+
+    nodes.push_back(VarNode(string_format("quantized_op_out%d", i))
+                        ->assert_is_op_output(op_type_)
+                        ->assert_is_op_input("fake_dequantize_max_abs", "X")
+                        ->AsIntermediate());
+
+    nodes.push_back(
+        OpNode(string_format("dequant_op%d", i), "fake_dequantize_max_abs")
+            ->assert_is_op("fake_dequantize_max_abs")
+            ->AsIntermediate());
+    nodes.push_back(VarNode(string_format("dequant_op_out%d", i))
+                        ->assert_is_op_output("fake_dequantize_max_abs", "Out")
+                        ->AsOutput());
+  }
+
+#ifdef DYNAMIC_RANGE
+  quant_op->LinksFrom({quant_op_input, quant_op_in_scale});
+#endif
+  quant_op->LinksFrom({quant_op_input});
+  quant_op_out->LinksFrom({quant_op});
+  quant_op_out_scale->LinksFrom({quant_op});
+  for (int i = 0; i < times_; i++) {
+    nodes[i * kNumFields + kQuantizedOpOffset]->LinksFrom(
+        {quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]});
+    nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom(
+        {nodes[i * kNumFields + kQuantizedOpOffset]});
+    nodes[i * kNumFields + kDequantOpOffset]->LinksFrom(
+        {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale});
+    nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom(
+        {nodes[i * kNumFields + kDequantOpOffset]});
+  }
+}
+
+void DynamicQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
+                                               const key2nodes_t& matched) {
+  const int kNumFields = 5;
+  const int kQuantizedWeightOffset = 0;
+  const int kQuantizedOpOffset = 1;
+  const int kDequantOpOffset = 3;
+  const int kDequantOpOutOffset = 4;
+
+  auto* quant_op_input = matched.at("quant_op_input");
+#ifdef DYNAMIC_RANGE
+  auto* quant_op_in_scale = matched.at("quant_op_in_scale");
+#endif
+  auto* quant_op = matched.at("quant_op");
+
+  std::vector<Node*> nodes;
+  for (int i = 0; i < times_; i++) {
+    nodes.push_back(matched.at(string_format("quantized_op_weight%d", i)));
+    nodes.push_back(matched.at(string_format("quantized_op%d", i)));
+    nodes.push_back(matched.at(string_format("quantized_op_out%d", i)));
+    nodes.push_back(matched.at(string_format("dequant_op%d", i)));
+    nodes.push_back(matched.at(string_format("dequant_op_out%d", i)));
+  }
+  int bit_length = quant_op->stmt()->op_info()->GetAttr<int>("bit_length");
+  auto* scope = quant_op->stmt()->op()->scope();
+  auto& valid_places = quant_op->stmt()->op()->valid_places();
+  int range = ((1 << (bit_length - 1)) - 1);
+
+#ifdef DYNAMIC_RANGE
+  auto input_scale_t = scope->FindVar(quant_op_in_scale->arg()->name)
+                           ->GetMutable<lite::Tensor>();
+  float input_scale = input_scale_t->data<float>()[0] / range;
+  VLOG(4) << "range: " << range << " input_scale: " << input_scale;
+#endif
+  for (int i = 0; i < times_; i++) {
+    float max_range = nodes[i * kNumFields + kDequantOpOffset]
+                          ->stmt()
+                          ->op_info()
+                          ->GetAttr<float>("max_range");
+    // weight_scale = max(abs(weight))
+    float whole_weight_scale =
+        static_cast<float>(range * range) / max_range / range;
+
+    cpp::OpDesc op_desc =
+        *nodes[i * kNumFields + kQuantizedOpOffset]->stmt()->op_info();
+
+    auto quantized_weight_var_name =
+        nodes[i * kNumFields + kQuantizedWeightOffset]->arg()->name;
+    auto quantized_weight_t =
+        scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>();
+    std::vector<float> weight_scale;
+    int weight_scale_size;
+
+    if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") {
+      op_desc.SetInput("Input", {matched.at("quant_op_input")->arg()->name});
+      op_desc.SetOutput(
+          "Output", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
+      // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should
+      // be Cout.
+      weight_scale_size = quantized_weight_t->dims()[0];
+    } else if (op_type_ == "mul") {
+      op_desc.SetInput("X", {matched.at("quant_op_input")->arg()->name});
+      op_desc.SetOutput(
+          "Out", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
+      // Fc weight: Cin * Cout, the weight_scale_size should be Cout.
+      weight_scale_size = quantized_weight_t->dims()[1];
+    }
+    for (int i = 0; i < weight_scale_size; i++) {
+      weight_scale.push_back(whole_weight_scale);
+    }
+    // op_desc.SetAttr("enable_int8", true);
+    // op_desc.SetAttr("input_scale", input_scale);
+    op_desc.SetAttr("weight_scale", weight_scale);
+
+    Tensor temp_tensor;
+    temp_tensor.CopyDataFrom(*quantized_weight_t);
+    float* temp_data = temp_tensor.mutable_data<float>();
+    size_t weight_num = quantized_weight_t->data_size();
+    quantized_weight_t->set_persistable(true);
+    std::cout << "DynamicQuantDequantOpFuser::InsertNewNode============================================================" << std::endl;
+#ifdef LITE_WITH_FPGA
+    float* quantized_weight_data = quantized_weight_t->mutable_data<float>();
+    for (size_t i = 0; i < weight_num; i++) {
+      quantized_weight_data[i] = temp_data[i] * whole_weight_scale;
+      std::cout << whole_weight_scale << "," << temp_data[i] << "," << quantized_weight_data[i] << std::endl;
+    }
+    quantized_weight_t->set_precision(PRECISION(kFloat));
+#else
+    int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
+    for (size_t i = 0; i < weight_num; i++) {
+      quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
+    }
+    quantized_weight_t->set_precision(PRECISION(kInt8));
+#endif
+    auto quantized_op = LiteOpRegistry::Global().Create(op_type_);
+    quantized_op->Attach(op_desc, scope);
+    auto* new_op_node =
+        graph->GraphCreateInstructNode(quantized_op, valid_places);
+    IR_NODE_LINK_TO(quant_op_input, new_op_node);
+    IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset],
+                    new_op_node);
+    IR_NODE_LINK_TO(new_op_node, nodes[i * kNumFields + kDequantOpOutOffset]);
+  }
+}
+
+cpp::OpDesc DynamicQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
+  cpp::OpDesc op_desc;
+  return op_desc;
+}
 }  // namespace fusion
 }  // namespace mir
diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.h b/lite/core/mir/fusion/quant_dequant_op_fuser.h
index bef9f4d9573d049700736c166cd0d31b668f7eff..c21df350f96143a09b3229776bf5c013b1988559 100644
--- a/lite/core/mir/fusion/quant_dequant_op_fuser.h
+++ b/lite/core/mir/fusion/quant_dequant_op_fuser.h
@@ -52,6 +52,19 @@ class DeleteQuantOpFuser : public FuseBase {
  private:
   std::string quant_op_type_{};
 };
+class DeleteDynamicQuantOpFuser : public FuseBase {
+ public:
+  explicit DeleteDynamicQuantOpFuser(const std::string& quant_op_type)
+      : quant_op_type_(quant_op_type) {}
+  void BuildPattern() override;
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
+
+ private:
+  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
+
+ private:
+  std::string quant_op_type_{};
+};
 
 /* DequantOpFuser process conv2d/depthwise_conv2d/mul + fake_dequantize_max_abs.
  */
@@ -106,6 +119,24 @@ class DeleteQuantDequantOpFuser : public FuseBase {
  private:
   std::string quantized_op_type_{};
 };
+// dynamic quantdequant op fuser
+class DynamicQuantDequantOpFuser : public FuseBase {
+ public:
+  explicit DynamicQuantDequantOpFuser(const std::string& quantized_op_type,
+                                      const std::string& op_type,
+                                      int i)
+      : op_type_(op_type), quant_type_(quantized_op_type), times_(i) {}
+  void BuildPattern() override;
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
+
+ private:
+  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
+
+ private:
+  std::string op_type_{};
+  std::string quant_type_{};
+  int times_{1};
+};
 
 }  // namespace fusion
 }  // namespace mir
diff --git a/lite/core/mir/kernel_place_correct_pass.cc b/lite/core/mir/kernel_place_correct_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dad7687bbec1ddbd7c8c787338005955de964f17
--- /dev/null
+++ b/lite/core/mir/kernel_place_correct_pass.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/kernel_place_correct_pass.h"
+#include <memory>
+#include "lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+void KernelPlaceCorrectPass::Apply(const std::unique_ptr<SSAGraph> &graph) {
+  CorrectArgumentPlace(graph.get());
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(kernel_place_correct_pass,
+                  paddle::lite::mir::KernelPlaceCorrectPass)
+    .BindTargets({TARGET(kFPGA)});
diff --git a/lite/core/mir/kernel_place_correct_pass.h b/lite/core/mir/kernel_place_correct_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..5fab5000862378976c16448f5a82f052ffbc20a5
--- /dev/null
+++ b/lite/core/mir/kernel_place_correct_pass.h
@@ -0,0 +1,147 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/core/mir/pass.h"
+#include "lite/core/target_wrapper.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+/*
+ * Correct the place of the variables in the SSAGraph; it infers each
+ * variable's place from the kernel that outputs it.
+ */
+class KernelPlaceCorrectPass : public DebugPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+
+ private:
+  void CorrectArgumentPlace(SSAGraph* graph) {
+    auto& valid_places = graph->valid_places();
+    auto valid_places_has_target = [&](TargetType t) -> bool {
+      for (auto& p : valid_places) {
+        if (p.target == t) {
+          return true;
+        }
+      }
+      return false;
+    };
+    std::map<std::string, bool> lite_with_targets{
+        {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
+        {"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
+    VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
+    VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
+
+    VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
+    for (auto& x : graph->StmtTopologicalOrder()) {
+      auto& inst = x->AsStmt();
+      // The IoCopyOp is a tool operator, it won't support the type inference.
+      // On FPGA we have io_copy + calib + layout tool ops, so we need type
+      // inference for tool operators.
+      if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
+        VLOG(3) << "inst.op_type() == 'io_copy', continue";
+        if (inst.op_type() == "io_copy") continue;
+      }
+      // deal with inputs
+      VLOG(4) << "checking op " << inst.op_info()->Repr();
+
+      auto get_argname = [&](
+          const std::string& node_name,
+          const std::map<std::string, std::vector<std::string>>& argname_map)
+          -> std::string {
+        for (auto& ele : argname_map) {
+          auto it =
+              std::find(ele.second.begin(), ele.second.end(), node_name);
+          if (it != ele.second.end()) return ele.first;
+        }
+        return "";
+      };
+
+      bool need_correct_place = true;
+
+      std::vector<TargetType> in_types;
+      std::vector<TargetType> out_types;
+      for (auto* x_in : x->inlinks) {
+        std::string node_name = x_in->AsArg().name;
+        std::string arg_name = get_argname(node_name, inst.op_info()->inputs());
+        CHECK(arg_name.size() > 0) << "cannot find op arguments for node "
+                                   << node_name;
+        VLOG(4) << "-- input arg_name:" << arg_name << " "
+                << "-- node name:" << node_name;
+        auto type = inst.picked_kernel().GetInputDeclType(arg_name);
+        if (!x_in->AsArg().type) {
+          need_correct_place &= false;
+        } else {
+          if (in_types.empty()) {
+            in_types.push_back(x_in->AsArg().type->target());
+          } else {
+            if (in_types[0] != x_in->AsArg().type->target()) {
+              need_correct_place &= false;
+            }
+          }
+        }
+      }
+
+      for (auto* x_out : x->outlinks) {
+        std::string node_name = x_out->AsArg().name;
+        std::string arg_name =
+            get_argname(node_name, inst.op_info()->outputs());
+        CHECK(arg_name.size() > 0) << "cannot find op arguments for node "
+                                   << node_name << " in Inst "
+                                   << inst.op_type();
+        VLOG(4) << "-- output arg_name " << arg_name;
+        auto type = inst.picked_kernel().GetOutputDeclType(arg_name);
+        if (!x_out->AsArg().type) {
+          need_correct_place &= false;
+        } else {
+          if (out_types.empty()) {
+            out_types.push_back(x_out->AsArg().type->target());
+          } else {
+            if (out_types[0] != x_out->AsArg().type->target()) {
+              need_correct_place &= false;
+            }
+          }
+        }
+      }
+
+      auto this_type = inst.picked_kernel().target();
+      bool io_target_same = (in_types[0] == out_types[0]);
+      need_correct_place &= (io_target_same && (in_types[0] != this_type));
+      if (need_correct_place) {
+        // update this kernel's valid place;
+        UpdateTarget(inst, in_types[0]);
+      }
+    }
+  }
+
+  // Update me's kUnk fields by other's fields.
+  void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) {  // NOLINT
+    auto new_place = inst.place();
+    new_place.target = new_target;
+    std::vector<Place> places;
+    places.push_back(new_place);
+    inst.ResetKernels(places);
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/node.cc b/lite/core/mir/node.cc
index 4a90e530a46c4d42d2ba032da1828973dfc1bcef..52fd39182a7132777231929d49c319bb961cf7f9 100644
--- a/lite/core/mir/node.cc
+++ b/lite/core/mir/node.cc
@@ -53,6 +53,11 @@ void mir::Node::Stmt::ResetOp(const cpp::OpDesc &op_desc,
   }
   valid_kernels_ = op_->CreateKernels(valid_places);
 }
+void mir::Node::Stmt::ResetKernels(const std::vector<Place> &valid_places) {
+  CHECK(op_) << "change valid place failed, not created op";
+  valid_kernels_.clear();
+  valid_kernels_ = op_->CreateKernels(valid_places);
+}
 
 mir::Node::Arg &mir::Node::AsArg(const std::string &name, int id) {
   auto &x = AsArg();
diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h
index e2c8a68bde6ee18506de73a7531716695b3d54f1..e7c44d2be689a9d890158c097e198314413d1ba3 100644
--- a/lite/core/mir/node.h
+++ b/lite/core/mir/node.h
@@ -53,6 +53,7 @@ class Node {
                  const std::vector<Place>& valid_places,
                  lite::Scope* scope = nullptr);
 
+    void ResetKernels(const std::vector<Place>& valid_places);
     std::string op_type() const { return op_info()->Type(); }
     const OpInfo* op_info() const;
     OpInfo* mutable_op_info();
diff --git a/lite/core/mir/static_kernel_pick_pass.cc b/lite/core/mir/static_kernel_pick_pass.cc
old mode 100644
new mode 100755
diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h
index 26ea72cb25e50110ebbeba52d265236730e2ecdf..bb103647c3f389b304ae7d0aa1089843fa781a0f 100755
--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -76,6 +76,7 @@ class Optimizer {
 #endif
           "static_kernel_pick_pass",        // pick original kernel from graph
           "variable_place_inference_pass",  // inference arg/var's
+          "kernel_place_correct_pass",
                                             // info(target/precision/layout/device)
                                             // using kernel info
           "argument_type_display_pass",     // debug pass: show arg-type-node's
diff --git a/lite/core/program.cc b/lite/core/program.cc
index 93ea2137a8431db3602ed34b6845a19c45e92b8a..8b1c6687463f3ca04ffb924efff8b814ae711415 100755
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -148,7 +148,7 @@ void RuntimeProgram::Run() {
 #ifdef LITE_WITH_PROFILE
 #ifdef LITE_WITH_PRECISION_PROFILE
 #ifndef LITE_WITH_FPGA
-// LITE_PRECISION_PROFILE(inst)
+    LITE_PRECISION_PROFILE(inst)
 #endif
 #endif  // LITE_WITH_PRECISION_PROFILE
 #endif  // LITE_WITH_PROFILE
diff --git a/lite/kernels/arm/lookup_table_compute.cc b/lite/kernels/arm/lookup_table_compute.cc
old mode 100755
new mode 100644
index fa7e2c0c3ae4580f5d19e82f7c48c74db3058847..6c0523ab600ae6352fc4d7716bc2a248d19ea8b5
--- a/lite/kernels/arm/lookup_table_compute.cc
+++ b/lite/kernels/arm/lookup_table_compute.cc
@@ -28,7 +28,6 @@ namespace arm {
 
 void LookupTableCompute::Run() {
   auto& param = this->Param<param_t>();
-  auto& ctx = this->ctx_->template As<ARMContext>();
   // inputs
   auto w = param.W;
   auto ids = param.Ids;
@@ -76,3 +75,13 @@ REGISTER_LITE_KERNEL(lookup_table,
     .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
+REGISTER_LITE_KERNEL(lookup_table_v2,
+                     kARM,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::arm::LookupTableCompute,
+                     def)
+    .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
diff --git a/lite/kernels/fpga/calib_compute.h b/lite/kernels/fpga/calib_compute.h
old mode 100755
new mode 100644
diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc
old mode 100644
new mode 100755
index 2293267f021d5a7bc003e69f3be84d8205ce2746..06e317c253cb06778162e2fa7ed08456fb4f6f17
--- a/lite/kernels/fpga/conv_compute.cc
+++ b/lite/kernels/fpga/conv_compute.cc
@@ -71,6 +71,13 @@ void ConvCompute::PrepareForRun() {
     if (param.fuse_relu) {
       conv_param.activeParam.type = zynqmp::TYPE_RELU;
     }
+
+    // conv_param.filter->saveToFile("conv_filter_", true);
+    // if (param.bias != nullptr) {
+    //   std::cout << "param.bias != nullptr" << std::endl;
+    //   conv_param.bias()->saveToFile("conv_bias_", true);
+    // }
+
     conv_pe_.init();
     conv_pe_.apply();
   }
@@ -79,26 +86,18 @@ void ConvCompute::PrepareForRun() {
 void ConvCompute::Run() {
   auto& param = this->Param<param_t>();
   if (param.x->ZynqTensor()->shape().channel() != 1 &&
-      param.groups == param.x->ZynqTensor()->shape().channel()) {
+      param.groups == param.x->ZynqTensor()->shape().channel()) {
     dw_conv_pe_.dispatch();
+#ifdef FPGA_PRINT_TENSOR
+    zynqmp::DepthwiseConvParam& dwconv_param = dw_conv_pe_.param();
+    Debugger::get_instance().registerOutput("dwconv", dwconv_param.output);
+#endif
   } else {
-    zynqmp::ConvParam& conv_param = conv_pe_.param();
-
-    if (conv_param.output->shape().channel() == 12 &&
-        conv_param.output->shape().height() == 13) {
-      conv_param.input->saveToFile("conv_in", true);
-      conv_param.output->saveToFile("conv_o", true);
-    }
-
+    // zynqmp::ConvParam& conv_param = conv_pe_.param();
    conv_pe_.dispatch();
-    if (conv_param.output->shape().channel() == 12 &&
-        conv_param.output->shape().height() == 13) {
-      // conv_param.input->saveToFile("conv_in", true);
-      conv_param.output->saveToFile("conv_out", true);
-    }
 #ifdef FPGA_PRINT_TENSOR
-    // zynqmp::ConvParam& conv_param = conv_pe_.param();
+    zynqmp::ConvParam& conv_param = conv_pe_.param();
     Debugger::get_instance().registerOutput("conv", conv_param.output);
 #endif
   }
@@ -122,3 +121,17 @@ REGISTER_LITE_KERNEL(
                            PRECISION(kFP16),
                            DATALAYOUT(kNHWC))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(
+    depthwise_conv2d, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::ConvCompute, def)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kFPGA),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Output",
+               {LiteType::GetTensorTy(TARGET(kFPGA),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC))})
+    .Finalize();
diff --git a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc
index f2276cabf8445c64ea02a1dbdc761586bc5a1f9b..d22cc7abacc2ecd80e54aa5c62a7e57671b920c9 100755
--- a/lite/kernels/fpga/elementwise_compute.cc
+++ b/lite/kernels/fpga/elementwise_compute.cc
@@ -125,7 +125,10 @@ REGISTER_LITE_KERNEL(elementwise_add,
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
-    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kFPGA),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC))})
     .BindOutput("Out",
                 {LiteType::GetTensorTy(TARGET(kFPGA),
                                        PRECISION(kFP16),
diff --git a/lite/kernels/fpga/elementwise_compute_test.cc b/lite/kernels/fpga/elementwise_compute_test.cc
old mode 100644
new mode 100755
index add60f64602105d317c3657985c0011aff246608..97b64091bb4cd54c42e721fb1c75d01c331a6ae0
--- a/lite/kernels/fpga/elementwise_compute_test.cc
+++ b/lite/kernels/fpga/elementwise_compute_test.cc
@@ -93,18 +93,22 @@ void elementwise_compute_ref(const operators::ElementwiseParam& param,
   }
   // do elementwise add/sub/max...
   if (elt_type == "add") {
-    for (int i = 0; i < batch; ++i) {
-      for (int j = 0; j < channels; ++j) {
-        int offset = (i * channels + j) * num;
-        const dtype* din_ptr = x_data + offset;
-        const dtype diny_data = y_data[j];
-        dtype* dout_ptr = out_data + offset;
-        for (int k = 0; k < num; ++k) {
-          *dout_ptr = sum(*din_ptr, diny_data);
-          dout_ptr++;
-          din_ptr++;
-        }
-      }
+    // for (int i = 0; i < batch; ++i) {
+    //   for (int j = 0; j < channels; ++j) {
+    //     int offset = (i * channels + j) * num;
+    //     const dtype* din_ptr = x_data + offset;
+    //     const dtype diny_data = y_data[j];
+    //     dtype* dout_ptr = out_data + offset;
+    //     for (int k = 0; k < num; ++k) {
+    //       *dout_ptr = zynqmp::float_to_half(sum(zynqmp::half_to_float(*din_ptr), zynqmp::half_to_float(diny_data)));
+    //       dout_ptr++;
+    //       din_ptr++;
+    //     }
+    //   }
+    // }
+    int count = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
+    for (int i = 0; i < count; ++i) {
+      out_data[i] = zynqmp::float_to_half(sum(zynqmp::half_to_float(x_data[i]), zynqmp::half_to_float(y_data[i])));
     }
   } else if (elt_type == "sub") {
     for (int i = 0; i < batch; ++i) {
@@ -148,9 +152,9 @@ TEST(elementwise_add, compute) {
   lite::Tensor x, y, output, output_ref;
 
   for (auto n : {1}) {
-    for (auto c : {8}) {
-      for (auto h : {8}) {
-        for (auto w : {8}) {
+    for (auto h : {72}) {
+      for (auto w : {192}) {
+        for (auto c : {24}) {
           for (auto axis : {0}) {
             for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
               auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
@@ -174,10 +178,16 @@ TEST(elementwise_add, compute) {
               auto* output_ref_data =
                   output_ref.mutable_data<float16>(TARGET(kFPGA));
               for (int i = 0; i < x_dim.production(); i++) {
-                x_data[i] = zynqmp::float_to_half(i);
+                float sign = i % 3 == 0 ? -0.03 : 0.05f;
+                float x = sign * (i % 128);
+                std::cout << "x:" << x << std::endl;
+                x_data[i] = zynqmp::float_to_half(x);
               }
               for (int i = 0; i < y_dim.production(); i++) {
-                y_data[i] = zynqmp::float_to_half(i);
+                float sign = i % 3 == 0 ? -0.03 : 0.05f;
+                float y = sign * (i % 128);
+                std::cout << "y:" << y << std::endl;
+                y_data[i] = zynqmp::float_to_half(y);
               }
               param.X = &x;
               param.Y = &y;
@@ -190,7 +200,8 @@ TEST(elementwise_add, compute) {
               elementwise_compute_ref<float16>(param, "add", "");
               for (int i = 0; i < output.dims().production(); i++) {
-                EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
+                std::cout << "output_data:" << zynqmp::half_to_float(output_data[i]) << ",output_ref_data:" << zynqmp::half_to_float(output_ref_data[i]) << std::endl;
+                EXPECT_NEAR(zynqmp::half_to_float(output_data[i]), zynqmp::half_to_float(output_ref_data[i]), 1e-5);
               }
             }
           }
@@ -209,73 +220,73 @@ TEST(fusion_elementwise_add_activation_fpga, retrive_op) {
   ASSERT_TRUE(fusion_elementwise_add_activation.front());
 }
 
-TEST(fusion_elementwise_add_activation_fpga, init) {
-  ElementwiseAddActivationCompute fusion_elementwise_add_activation;
-  ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
-  ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
-}
+// TEST(fusion_elementwise_add_activation_fpga, init) {
+//   ElementwiseAddActivationCompute fusion_elementwise_add_activation;
+//   ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
+//   ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
+// }
 
-TEST(fusion_elementwise_add_activation_fpga, compute) {
-  ElementwiseAddActivationCompute fusion_elementwise_add_activation;
-  operators::FusionElementwiseActivationParam param;
-  lite::Tensor x, y, output, output_ref;
+// TEST(fusion_elementwise_add_activation_fpga, compute) {
+//   ElementwiseAddActivationCompute fusion_elementwise_add_activation;
+//   operators::FusionElementwiseActivationParam param;
+//   lite::Tensor x, y, output, output_ref;
 
-  for (auto act_type : {"relu"}) {
-    for (auto n : {1}) {
-      for (auto c : {8}) {
-        for (auto h : {8}) {
-          for (auto w : {8}) {
-            for (auto axis : {0}) {
-              for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
-                auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
-                auto y_dim = DDim(yd);
-                int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
+// for (auto act_type : {"relu"}) {
+//   for (auto n : {1}) {
+//     for (auto c : {8}) {
+//       for (auto h : {8}) {
+//         for (auto w : {8}) {
+//           for (auto axis : {0}) {
+//             for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
+//               auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
+//               auto y_dim = DDim(yd);
+//               int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
 
-                if (axis_t + y_dim.size() > 4) continue;
-                bool flag = false;
-                for (int i = 0; i < y_dim.size(); i++) {
-                  if (x_dim[i + axis_t] != y_dim[i]) flag = true;
-                }
-                if (flag) continue;
+//               if (axis_t + y_dim.size() > 4) continue;
+//               bool flag = false;
+//               for (int i = 0; i < y_dim.size(); i++) {
+//                 if (x_dim[i + axis_t] != y_dim[i]) flag = true;
+//               }
+//               if (flag) continue;
 
-                x.Resize(x_dim);
-                y.Resize(y_dim);
-                output.Resize(x_dim);
-                output_ref.Resize(x_dim);
-                auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
-                auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
-                auto* output_data = output.mutable_data<float16>(TARGET(kFPGA));
-                auto* output_ref_data =
-                    output_ref.mutable_data<float16>(TARGET(kFPGA));
-                for (int i = 0; i < x_dim.production(); i++) {
-                  float sign = i % 3 == 0 ? -1.0f : 1.0f;
-                  x_data[i] = zynqmp::float_to_half(i * sign);
-                }
-                for (int i = 0; i < y_dim.production(); i++) {
-                  float sign = i % 2 == 0 ? 0.5f : -0.5f;
-                  y_data[i] = zynqmp::float_to_half(i * sign);
-                }
-                param.X = &x;
-                param.Y = &y;
-                param.axis = axis;
-                param.Out = &output;
-                param.act_type = act_type;
-                fusion_elementwise_add_activation.SetParam(param);
-                fusion_elementwise_add_activation.PrepareForRun();
-                fusion_elementwise_add_activation.Run();
-                param.Out = &output_ref;
-                elementwise_compute_ref<float16>(param, "add", act_type);
-                for (int i = 0; i < output.dims().production(); i++) {
-                  EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
+//               x.Resize(x_dim);
+//               y.Resize(y_dim);
+//               output.Resize(x_dim);
+//               output_ref.Resize(x_dim);
+//               auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
+//               auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
+//               auto* output_data = output.mutable_data<float16>(TARGET(kFPGA));
+//               auto* output_ref_data =
+//                   output_ref.mutable_data<float16>(TARGET(kFPGA));
+//               for (int i = 0; i < x_dim.production(); i++) {
+//                 float sign = i % 3 == 0 ? -1.0f : 1.0f;
+//                 x_data[i] = zynqmp::float_to_half(i * sign);
+//               }
+//               for (int i = 0; i < y_dim.production(); i++) {
+//                 float sign = i % 2 == 0 ? 0.5f : -0.5f;
+//                 y_data[i] = zynqmp::float_to_half(i * sign);
+//               }
+//               param.X = &x;
+//               param.Y = &y;
+//               param.axis = axis;
+//               param.Out = &output;
+//               param.act_type = act_type;
+//               fusion_elementwise_add_activation.SetParam(param);
+//               fusion_elementwise_add_activation.PrepareForRun();
+//               fusion_elementwise_add_activation.Run();
+//               param.Out = &output_ref;
+//               elementwise_compute_ref<float16>(param, "add", act_type);
+//               for (int i = 0; i < output.dims().production(); i++) {
+//                 EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
+//               }
+//             }
+//           }
+//         }
+//       }
+//     }
+//   }
+// }
+// }
 
 }  // namespace fpga
 }  // namespace kernels
@@ -283,4 +294,4 @@ TEST(fusion_elementwise_add_activation_fpga, compute)
 }  // namespace paddle
 
 USE_LITE_KERNEL(elementwise_add, kFPGA, kFP16, kNHWC, def);
-USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
+// USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
diff --git a/lite/kernels/fpga/gru_compute.h b/lite/kernels/fpga/gru_compute.h
old mode 100755
new mode 100644
diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc
index 57a76dee97ca889cd645a2c8f81b5a2354f9b11f..4554c24e07de656b948826c2fa6f9526f61daaa6 100755
--- a/lite/kernels/fpga/io_copy_compute.cc
+++ b/lite/kernels/fpga/io_copy_compute.cc
@@ -191,8 +191,6 @@ class IoCopyFpgaToHostCHWCompute
     param.y->ZynqTensor()->flush();
     auto out_lod = param.y->mutable_lod();
     *out_lod = param.x->lod();
-    // param.x->ZynqTensor()->saveToFile("io_x", true);
-    // param.y->ZynqTensor()->saveToFile("io_y", true);
   }
   std::string doc() const override { return "Copy IO from FPGA to HOST"; }
 };
diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc
index c889df17cb72a6d3e8ab02efc729ecc93fb38a5f..afd14ccb4b4a9a4f1e93e1e38840035fb18186bb 100644
--- a/lite/kernels/fpga/prior_box_compute.cc
+++ b/lite/kernels/fpga/prior_box_compute.cc
@@ -78,7 +78,6 @@ void PriorBoxCompute::PrepareForRun() {
   param.boxes->mutable_data<float>();
   param.variances->mutable_data<float>();
-
   zynqmp::PriorBoxParam& priobox_param = pe_.param();
   priobox_param.input = param.input->ZynqTensor();
   priobox_param.image = param.image->ZynqTensor();
diff --git a/lite/kernels/fpga/reshape_compute.cc b/lite/kernels/fpga/reshape_compute.cc
old mode 100755
new mode 100644
diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc
old mode 100755
new mode 100644
diff --git a/lite/kernels/host/reshape_compute.cc b/lite/kernels/host/reshape_compute.cc
index 7a826ed32b02a85860038482d8ca55c5db32a9bf..10c50d20b9c52f72d09c4519716e2defb047a23f 100644
--- a/lite/kernels/host/reshape_compute.cc
+++ b/lite/kernels/host/reshape_compute.cc
@@ -63,26 +63,6 @@ REGISTER_LITE_KERNEL(reshape,
                          DATALAYOUT(kAny))})
     .Finalize();
 
-// REGISTER_LITE_KERNEL(reshape,
-//                      kFPGA,
-//                      kFP16,
-//                      kNHWC,
-//                      paddle::lite::kernels::host::ReshapeCompute,
-//                      def)
-//     .BindInput("X",
-//                {LiteType::GetTensorTy(
-//                    TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
-//     .BindInput("ShapeTensor",
-//                {LiteType::GetTensorTy(
-//                    TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
-//     .BindInput("Shape",
-//                {LiteType::GetTensorTy(
-//                    TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))})
-//     .BindOutput("Out",
-//                 {LiteType::GetTensorTy(
-//                     TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))})
-//     .Finalize();
-
 REGISTER_LITE_KERNEL(reshape2,
                      kHost,
                      kAny,
diff --git a/lite/operators/concat_op.cc b/lite/operators/concat_op.cc
old mode 100644
new mode 100755
diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc
old mode 100644
new mode 100755
diff --git a/lite/operators/fake_quantize_range_abs_max.cc b/lite/operators/fake_quantize_range_abs_max.cc
index a8ce3f75a59fec5b032c60f51177f428bd15fe0d..ebf7e41f4b1af6f6961da07fe95caece19fa59f5 100644
--- a/lite/operators/fake_quantize_range_abs_max.cc
+++ b/lite/operators/fake_quantize_range_abs_max.cc
@@ -23,3 +23,5 @@ namespace operators {}  // namespace operators
 
 REGISTER_LITE_OP(fake_quantize_range_abs_max,
                  paddle::lite::operators::FakeQuantizeRangeMaxAbsOpLite);
+REGISTER_LITE_OP(fake_quantize_abs_max,
+                 paddle::lite::operators::FakeQuantizeRangeMaxAbsOpLite);
diff --git a/lite/operators/fake_quantize_range_abs_max.h b/lite/operators/fake_quantize_range_abs_max.h
index 726731595a9c4b7cd2e30db911230cc2f00b5b92..f68d1e20f6e60bb5aa99a2402ea8c9f88aa18470 100644
--- a/lite/operators/fake_quantize_range_abs_max.h
+++ b/lite/operators/fake_quantize_range_abs_max.h
@@ -40,13 +40,15 @@ class FakeQuantizeRangeMaxAbsOpLite : public OpLite {
 
   bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override {
     auto x = op_desc.Input("X").front();
-    auto in_scale = op_desc.Input("InScale").front();
+    if (op_desc.HasInput("InScale")) {
+      auto in_scale = op_desc.Input("InScale").front();
+      param_.in_scale = scope->FindVar(in_scale)->GetMutable<lite::Tensor>();
+    }
 
     auto out = op_desc.Output("Out").front();
     auto out_scale = op_desc.Output("OutScale").front();
 
     param_.x = scope->FindVar(x)->GetMutable<lite::Tensor>();
-    param_.in_scale = scope->FindVar(in_scale)->GetMutable<lite::Tensor>();
     param_.out = scope->FindVar(out)->GetMutable<lite::Tensor>();
     param_.out_scale = scope->FindVar(out_scale)->GetMutable<lite::Tensor>();
diff --git a/lite/tools/build.sh b/lite/tools/build.sh
index 6121186e7c983145f2f9f450f6a23ea1957bb496..e1610b60d3b1b104699ab175bca3bb3cf81bd40b 100755
--- a/lite/tools/build.sh
+++ b/lite/tools/build.sh
@@ -13,7 +13,7 @@ readonly NUM_PROC=${LITE_BUILD_THREADS:-4}
 
 # global variables
-BUILD_EXTRA=ON
+BUILD_EXTRA=OFF
 BUILD_JAVA=ON
 BUILD_PYTHON=OFF
 BUILD_DIR=$(pwd)
diff --git a/mobile/src/fpga/KD/pes/conv_pe.hpp b/mobile/src/fpga/KD/pes/conv_pe.hpp
old mode 100644
new mode 100755
index 5ef89e920e60cd2ef1c57e1f342a342a4149563f..388672a99325c2d04d87c90fa5a6b556b676a820
--- a/mobile/src/fpga/KD/pes/conv_pe.hpp
+++ b/mobile/src/fpga/KD/pes/conv_pe.hpp
@@ -29,7 +29,6 @@ namespace zynqmp {
 class ConvPE : public PE {
  public:
   bool init() {
-    std::cout << "Conv init" << std::endl;
     return true;
   }