diff --git a/README.md b/README.md index b446883f7ebbe3f5abd8f64c11a6e259e70f27e8..add63566f2632a0e535504a94da0605ce0618bc7 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ ![support os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-yellow.svg) ![QQGroup](https://img.shields.io/badge/QQ_Group-1045148026-52B6EF?style=social&logo=tencent-qq&logoColor=000&logoWidth=20) -集成飞桨智能视觉领域**图像分类**、**目标检测**、**语义分割**、**实例分割**任务能力,将深度学习开发全流程从**数据准备**、**模型训练与优化**到**多端部署**端到端打通,无需分别安装不同功能模块,并提供统一任务API接口,以**低代码**的形式为开发者提供飞桨全流程开发的最佳体验。 +集成飞桨智能视觉领域**图像分类**、**目标检测**、**语义分割**、**实例分割**任务能力,将深度学习开发全流程从**数据准备**、**模型训练与优化**到**多端部署**端到端打通,并提供**统一任务API接口**及**图形化开发界面Demo**。开发者无需分别安装不同套件,以**低代码**的形式即可快速完成飞桨全流程开发。 **PaddleX** 经过**质检**、**安防**、**巡检**、**遥感**、**零售**、**医疗**等十多个行业实际应用场景验证,沉淀产业实际经验,**并提供丰富的案例实践教程**,全程助力开发者产业实践落地。 @@ -48,8 +48,6 @@ pip install paddlex -i https://mirror.baidu.com/pypi/simple - 前往[PaddleX GUI使用教程](./docs/gui/how_to_use.md)了解PaddleX GUI使用详情。 -- https://aistudio.baidu.com/aistudio/projectdetail/440197 - ## 产品模块说明 diff --git a/deploy/cpp/CMakeLists.txt b/deploy/cpp/CMakeLists.txt index 7fe49585cd17ccb076436753d8031f7fba5f6147..349afa2cae5bf40721cafdf38bbf28ddd621beeb 100644 --- a/deploy/cpp/CMakeLists.txt +++ b/deploy/cpp/CMakeLists.txt @@ -305,6 +305,19 @@ add_executable(segmenter demo/segmenter.cpp src/transforms.cpp src/paddlex.cpp s ADD_DEPENDENCIES(segmenter ext-yaml-cpp) target_link_libraries(segmenter ${DEPS}) +add_executable(video_classifier demo/video_classifier.cpp src/transforms.cpp src/paddlex.cpp src/visualize.cpp) +ADD_DEPENDENCIES(video_classifier ext-yaml-cpp) +target_link_libraries(video_classifier ${DEPS}) + +add_executable(video_detector demo/video_detector.cpp src/transforms.cpp src/paddlex.cpp src/visualize.cpp) +ADD_DEPENDENCIES(video_detector ext-yaml-cpp) +target_link_libraries(video_detector ${DEPS}) + +add_executable(video_segmenter demo/video_segmenter.cpp src/transforms.cpp src/paddlex.cpp src/visualize.cpp) +ADD_DEPENDENCIES(video_segmenter ext-yaml-cpp) +target_link_libraries(video_segmenter ${DEPS}) + + if (WIN32 AND WITH_MKL) add_custom_command(TARGET classifier POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./paddlex_inference/Release/mklml.dll @@ -326,7 +339,27 @@ if (WIN32 AND WITH_MKL) COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mkldnn/lib/mkldnn.dll ./paddlex_inference/Release/mkldnn.dll COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./release/mklml.dll COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./release/libiomp5md.dll - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mkldnn/lib/mkldnn.dll ./release/mkldnn.dll + ) + add_custom_command(TARGET video_classifier POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./paddlex_inference/Release/mklml.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./paddlex_inference/Release/libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mkldnn/lib/mkldnn.dll ./paddlex_inference/Release/mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./release/mklml.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different 
${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./release/libiomp5md.dll + ) + add_custom_command(TARGET video_detector POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./paddlex_inference/Release/mklml.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./paddlex_inference/Release/libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mkldnn/lib/mkldnn.dll ./paddlex_inference/Release/mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./release/mklml.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./release/libiomp5md.dll + ) + add_custom_command(TARGET video_segmenter POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./paddlex_inference/Release/mklml.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./paddlex_inference/Release/libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mkldnn/lib/mkldnn.dll ./paddlex_inference/Release/mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./release/mklml.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./release/libiomp5md.dll ) # for encryption if (EXISTS "${ENCRYPTION_DIR}/lib/pmodel-decrypt.dll") @@ -342,6 +375,18 @@ if (WIN32 AND WITH_MKL) COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ENCRYPTION_DIR}/lib/pmodel-decrypt.dll ./pmodel-decrypt.dll COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ENCRYPTION_DIR}/lib/pmodel-decrypt.dll ./release/pmodel-decrypt.dll ) + add_custom_command(TARGET video_classifier POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ENCRYPTION_DIR}/lib/pmodel-decrypt.dll ./pmodel-decrypt.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ENCRYPTION_DIR}/lib/pmodel-decrypt.dll ./release/pmodel-decrypt.dll + ) + add_custom_command(TARGET video_detector POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ENCRYPTION_DIR}/lib/pmodel-decrypt.dll ./pmodel-decrypt.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ENCRYPTION_DIR}/lib/pmodel-decrypt.dll ./release/pmodel-decrypt.dll + ) + add_custom_command(TARGET video_segmenter POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ENCRYPTION_DIR}/lib/pmodel-decrypt.dll ./pmodel-decrypt.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ENCRYPTION_DIR}/lib/pmodel-decrypt.dll ./release/pmodel-decrypt.dll + ) endif() endif() diff --git a/deploy/cpp/demo/classifier.cpp b/deploy/cpp/demo/classifier.cpp index db3687492789f47a3bb49643b87f9b946f05137d..cf3bb5ccf64c43ec42d59a9b73fdced6b50b8dc5 100644 --- a/deploy/cpp/demo/classifier.cpp +++ b/deploy/cpp/demo/classifier.cpp @@ -37,7 +37,6 @@ DEFINE_int32(batch_size, 1, "Batch size of infering"); DEFINE_int32(thread_num, omp_get_num_procs(), "Number of preprocessing threads"); -DEFINE_bool(use_ir_optim, true, "use ir optimization"); int main(int argc, char** argv) { // Parsing command-line @@ -52,16 +51,15 @@ int main(int argc, char** argv) { return -1; } - // 加载模型 + // Load model PaddleX::Model model; model.Init(FLAGS_model_dir, FLAGS_use_gpu, FLAGS_use_trt, FLAGS_gpu_id, - FLAGS_key, - FLAGS_use_ir_optim); + FLAGS_key); - 
// 进行预测 + // Predict int imgs = 1; if (FLAGS_image_list != "") { std::ifstream inf(FLAGS_image_list); @@ -69,7 +67,7 @@ int main(int argc, char** argv) { std::cerr << "Fail to open file " << FLAGS_image_list << std::endl; return -1; } - // 多batch预测 + // Mini-batch predict std::string image_path; std::vector image_paths; while (getline(inf, image_path)) { @@ -77,7 +75,7 @@ int main(int argc, char** argv) { } imgs = image_paths.size(); for (int i = 0; i < image_paths.size(); i += FLAGS_batch_size) { - // 读图像 + // Read image int im_vec_size = std::min(static_cast(image_paths.size()), i + FLAGS_batch_size); std::vector im_vec(im_vec_size - i); diff --git a/deploy/cpp/demo/detector.cpp b/deploy/cpp/demo/detector.cpp index 32fbaafddc9cdbcfddf69164197143238bf26ca4..ef7fd782715bef5d9cc1dae43c87ceaa123e914f 100644 --- a/deploy/cpp/demo/detector.cpp +++ b/deploy/cpp/demo/detector.cpp @@ -43,10 +43,9 @@ DEFINE_double(threshold, DEFINE_int32(thread_num, omp_get_num_procs(), "Number of preprocessing threads"); -DEFINE_bool(use_ir_optim, true, "use ir optimization"); int main(int argc, char** argv) { - // 解析命令行参数 + // Parsing command-line google::ParseCommandLineFlags(&argc, &argv, true); if (FLAGS_model_dir == "") { @@ -57,17 +56,16 @@ int main(int argc, char** argv) { std::cerr << "--image or --image_list need to be defined" << std::endl; return -1; } - // 加载模型 + // Load model PaddleX::Model model; model.Init(FLAGS_model_dir, FLAGS_use_gpu, FLAGS_use_trt, FLAGS_gpu_id, - FLAGS_key, - FLAGS_use_ir_optim); + FLAGS_key); int imgs = 1; std::string save_dir = "output"; - // 进行预测 + // Predict if (FLAGS_image_list != "") { std::ifstream inf(FLAGS_image_list); if (!inf) { @@ -92,7 +90,7 @@ int main(int argc, char** argv) { im_vec[j - i] = std::move(cv::imread(image_paths[j], 1)); } model.predict(im_vec, &results, thread_num); - // 输出结果目标框 + // Output predicted bounding boxes for (int j = 0; j < im_vec_size - i; ++j) { for (int k = 0; k < results[j].boxes.size(); ++k) { std::cout << "image file: " << image_paths[i + j] << ", "; @@ -106,7 +104,7 @@ int main(int argc, char** argv) { << results[j].boxes[k].coordinate[3] << ")" << std::endl; } } - // 可视化 + // Visualize results for (int j = 0; j < im_vec_size - i; ++j) { cv::Mat vis_img = PaddleX::Visualize( im_vec[j], results[j], model.labels, FLAGS_threshold); @@ -120,7 +118,7 @@ int main(int argc, char** argv) { PaddleX::DetResult result; cv::Mat im = cv::imread(FLAGS_image, 1); model.predict(im, &result); - // 输出结果目标框 + // Output predicted bounding boxes for (int i = 0; i < result.boxes.size(); ++i) { std::cout << "image file: " << FLAGS_image << std::endl; std::cout << ", predict label: " << result.boxes[i].category @@ -132,7 +130,7 @@ int main(int argc, char** argv) { << result.boxes[i].coordinate[3] << ")" << std::endl; } - // 可视化 + // Visualize results cv::Mat vis_img = PaddleX::Visualize(im, result, model.labels, FLAGS_threshold); std::string save_path = diff --git a/deploy/cpp/demo/segmenter.cpp b/deploy/cpp/demo/segmenter.cpp index b3b8fad9ac2dce33722c71d9d50d354349298230..d13a328f5beecc90fe9257a4f32ee63a8fe609a5 100644 --- a/deploy/cpp/demo/segmenter.cpp +++ b/deploy/cpp/demo/segmenter.cpp @@ -39,10 +39,9 @@ DEFINE_int32(batch_size, 1, "Batch size of infering"); DEFINE_int32(thread_num, omp_get_num_procs(), "Number of preprocessing threads"); -DEFINE_bool(use_ir_optim, false, "use ir optimization"); int main(int argc, char** argv) { - // 解析命令行参数 + // Parsing command-line google::ParseCommandLineFlags(&argc, &argv, true); if (FLAGS_model_dir == "") { @@ 
-54,16 +53,15 @@ int main(int argc, char** argv) { return -1; } - // 加载模型 + // Load model PaddleX::Model model; model.Init(FLAGS_model_dir, FLAGS_use_gpu, FLAGS_use_trt, FLAGS_gpu_id, - FLAGS_key, - FLAGS_use_ir_optim); + FLAGS_key); int imgs = 1; - // 进行预测 + // Predict if (FLAGS_image_list != "") { std::ifstream inf(FLAGS_image_list); if (!inf) { @@ -88,7 +86,7 @@ int main(int argc, char** argv) { im_vec[j - i] = std::move(cv::imread(image_paths[j], 1)); } model.predict(im_vec, &results, thread_num); - // 可视化 + // Visualize results for (int j = 0; j < im_vec_size - i; ++j) { cv::Mat vis_img = PaddleX::Visualize(im_vec[j], results[j], model.labels); @@ -102,7 +100,7 @@ int main(int argc, char** argv) { PaddleX::SegResult result; cv::Mat im = cv::imread(FLAGS_image, 1); model.predict(im, &result); - // 可视化 + // Visualize results cv::Mat vis_img = PaddleX::Visualize(im, result, model.labels); std::string save_path = PaddleX::generate_save_path(FLAGS_save_dir, FLAGS_image); diff --git a/deploy/cpp/demo/video_classifier.cpp b/deploy/cpp/demo/video_classifier.cpp new file mode 100644 index 0000000000000000000000000000000000000000..96be867d40800455184b7938dc829e8a0b8f8390 --- /dev/null +++ b/deploy/cpp/demo/video_classifier.cpp @@ -0,0 +1,186 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include // NOLINT +#include +#include +#include +#include +#include + +#include "include/paddlex/paddlex.h" +#include "include/paddlex/visualize.h" + +#if defined(__arm__) || defined(__aarch64__) +#include +#endif + +using namespace std::chrono; // NOLINT + +DEFINE_string(model_dir, "", "Path of inference model"); +DEFINE_bool(use_gpu, false, "Infering with GPU or CPU"); +DEFINE_bool(use_trt, false, "Infering with TensorRT"); +DEFINE_int32(gpu_id, 0, "GPU card id"); +DEFINE_string(key, "", "key of encryption"); +DEFINE_bool(use_camera, false, "Infering with Camera"); +DEFINE_int32(camera_id, 0, "Camera id"); +DEFINE_string(video_path, "", "Path of input video"); +DEFINE_bool(show_result, false, "show the result of each frame with a window"); +DEFINE_bool(save_result, true, "save the result of each frame to a video"); +DEFINE_string(save_dir, "output", "Path to save visualized image"); + +int main(int argc, char** argv) { + // Parsing command-line + google::ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_model_dir == "") { + std::cerr << "--model_dir need to be defined" << std::endl; + return -1; + } + if (FLAGS_video_path == "" & FLAGS_use_camera == false) { + std::cerr << "--video_path or --use_camera need to be defined" << std::endl; + return -1; + } + + // Load model + PaddleX::Model model; + model.Init(FLAGS_model_dir, + FLAGS_use_gpu, + FLAGS_use_trt, + FLAGS_gpu_id, + FLAGS_key); + + // Open video + cv::VideoCapture capture; + if (FLAGS_use_camera) { + capture.open(FLAGS_camera_id); + if (!capture.isOpened()) { + std::cout << "Can not open the camera " + << FLAGS_camera_id << "." 
+ << std::endl; + return -1; + } + } else { + capture.open(FLAGS_video_path); + if (!capture.isOpened()) { + std::cout << "Can not open the video " + << FLAGS_video_path << "." + << std::endl; + return -1; + } + } + + // Create a VideoWriter + cv::VideoWriter video_out; + std::string video_out_path; + if (FLAGS_save_result) { + // Get video information: resolution, fps + int video_width = static_cast(capture.get(CV_CAP_PROP_FRAME_WIDTH)); + int video_height = static_cast(capture.get(CV_CAP_PROP_FRAME_HEIGHT)); + int video_fps = static_cast(capture.get(CV_CAP_PROP_FPS)); + int video_fourcc; + if (FLAGS_use_camera) { + video_fourcc = 828601953; + } else { + video_fourcc = static_cast(capture.get(CV_CAP_PROP_FOURCC)); + } + + if (FLAGS_use_camera) { + time_t now = time(0); + video_out_path = + PaddleX::generate_save_path(FLAGS_save_dir, + std::to_string(now) + ".mp4"); + } else { + video_out_path = + PaddleX::generate_save_path(FLAGS_save_dir, FLAGS_video_path); + } + video_out.open(video_out_path.c_str(), + video_fourcc, + video_fps, + cv::Size(video_width, video_height), + true); + if (!video_out.isOpened()) { + std::cout << "Create video writer failed!" << std::endl; + return -1; + } + } + + PaddleX::ClsResult result; + cv::Mat frame; + int key; + while (capture.read(frame)) { + if (FLAGS_show_result || FLAGS_use_camera) { + key = cv::waitKey(1); + // When pressing `ESC`, then exit program and result video is saved + if (key == 27) { + break; + } + } else if (frame.empty()) { + break; + } + // Begin to predict + model.predict(frame, &result); + // Visualize results + cv::Mat vis_img = frame.clone(); + auto colormap = PaddleX::GenerateColorMap(model.labels.size()); + int c1 = colormap[3 * result.category_id + 0]; + int c2 = colormap[3 * result.category_id + 1]; + int c3 = colormap[3 * result.category_id + 2]; + cv::Scalar text_color = cv::Scalar(c1, c2, c3); + std::string text = result.category; + text += std::to_string(static_cast(result.score * 100)) + "%"; + int font_face = cv::FONT_HERSHEY_SIMPLEX; + double font_scale = 0.5f; + float thickness = 0.5; + cv::Size text_size = + cv::getTextSize(text, font_face, font_scale, thickness, nullptr); + cv::Point origin; + origin.x = frame.cols / 2; + origin.y = frame.rows / 2; + cv::Rect text_back = cv::Rect(origin.x, + origin.y - text_size.height, + text_size.width, + text_size.height); + cv::rectangle(vis_img, text_back, text_color, -1); + cv::putText(vis_img, + text, + origin, + font_face, + font_scale, + cv::Scalar(255, 255, 255), + thickness); + if (FLAGS_show_result || FLAGS_use_camera) { + cv::imshow("video_classifier", vis_img); + } + if (FLAGS_save_result) { + video_out.write(vis_img); + } + std::cout << "Predict label: " << result.category + << ", label_id:" << result.category_id + << ", score: " << result.score << std::endl; + } + capture.release(); + if (FLAGS_save_result) { + video_out.release(); + std::cout << "Visualized output saved as " << video_out_path << std::endl; + } + if (FLAGS_show_result || FLAGS_use_camera) { + cv::destroyAllWindows(); + } + return 0; +} diff --git a/deploy/cpp/demo/video_detector.cpp b/deploy/cpp/demo/video_detector.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ee4d5bdb138d03020042e60d41ded0ca1efde46d --- /dev/null +++ b/deploy/cpp/demo/video_detector.cpp @@ -0,0 +1,159 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include // NOLINT +#include +#include +#include +#include +#include + +#include "include/paddlex/paddlex.h" +#include "include/paddlex/visualize.h" + +#if defined(__arm__) || defined(__aarch64__) +#include +#endif + +using namespace std::chrono; // NOLINT + +DEFINE_string(model_dir, "", "Path of inference model"); +DEFINE_bool(use_gpu, false, "Infering with GPU or CPU"); +DEFINE_bool(use_trt, false, "Infering with TensorRT"); +DEFINE_int32(gpu_id, 0, "GPU card id"); +DEFINE_bool(use_camera, false, "Infering with Camera"); +DEFINE_int32(camera_id, 0, "Camera id"); +DEFINE_string(video_path, "", "Path of input video"); +DEFINE_bool(show_result, false, "show the result of each frame with a window"); +DEFINE_bool(save_result, true, "save the result of each frame to a video"); +DEFINE_string(key, "", "key of encryption"); +DEFINE_string(save_dir, "output", "Path to save visualized image"); +DEFINE_double(threshold, + 0.5, + "The minimum scores of target boxes which are shown"); + +int main(int argc, char** argv) { + // Parsing command-line + google::ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_model_dir == "") { + std::cerr << "--model_dir need to be defined" << std::endl; + return -1; + } + if (FLAGS_video_path == "" & FLAGS_use_camera == false) { + std::cerr << "--video_path or --use_camera need to be defined" << std::endl; + return -1; + } + // Load model + PaddleX::Model model; + model.Init(FLAGS_model_dir, + FLAGS_use_gpu, + FLAGS_use_trt, + FLAGS_gpu_id, + FLAGS_key); + // Open video + cv::VideoCapture capture; + if (FLAGS_use_camera) { + capture.open(FLAGS_camera_id); + if (!capture.isOpened()) { + std::cout << "Can not open the camera " + << FLAGS_camera_id << "." + << std::endl; + return -1; + } + } else { + capture.open(FLAGS_video_path); + if (!capture.isOpened()) { + std::cout << "Can not open the video " + << FLAGS_video_path << "." + << std::endl; + return -1; + } + } + + // Create a VideoWriter + cv::VideoWriter video_out; + std::string video_out_path; + if (FLAGS_save_result) { + // Get video information: resolution, fps + int video_width = static_cast(capture.get(CV_CAP_PROP_FRAME_WIDTH)); + int video_height = static_cast(capture.get(CV_CAP_PROP_FRAME_HEIGHT)); + int video_fps = static_cast(capture.get(CV_CAP_PROP_FPS)); + int video_fourcc; + if (FLAGS_use_camera) { + video_fourcc = 828601953; + } else { + video_fourcc = static_cast(capture.get(CV_CAP_PROP_FOURCC)); + } + + if (FLAGS_use_camera) { + time_t now = time(0); + video_out_path = + PaddleX::generate_save_path(FLAGS_save_dir, + std::to_string(now) + ".mp4"); + } else { + video_out_path = + PaddleX::generate_save_path(FLAGS_save_dir, FLAGS_video_path); + } + video_out.open(video_out_path.c_str(), + video_fourcc, + video_fps, + cv::Size(video_width, video_height), + true); + if (!video_out.isOpened()) { + std::cout << "Create video writer failed!" 
<< std::endl; + return -1; + } + } + + PaddleX::DetResult result; + cv::Mat frame; + int key; + while (capture.read(frame)) { + if (FLAGS_show_result || FLAGS_use_camera) { + key = cv::waitKey(1); + // When pressing `ESC`, then exit program and result video is saved + if (key == 27) { + break; + } + } else if (frame.empty()) { + break; + } + // Begin to predict + model.predict(frame, &result); + // Visualize results + cv::Mat vis_img = + PaddleX::Visualize(frame, result, model.labels, FLAGS_threshold); + if (FLAGS_show_result || FLAGS_use_camera) { + cv::imshow("video_detector", vis_img); + } + if (FLAGS_save_result) { + video_out.write(vis_img); + } + result.clear(); + } + capture.release(); + if (FLAGS_save_result) { + std::cout << "Visualized output saved as " << video_out_path << std::endl; + video_out.release(); + } + if (FLAGS_show_result || FLAGS_use_camera) { + cv::destroyAllWindows(); + } + return 0; +} diff --git a/deploy/cpp/demo/video_segmenter.cpp b/deploy/cpp/demo/video_segmenter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6a835117cd1434b5f26e0fb660e6fe07ef56e607 --- /dev/null +++ b/deploy/cpp/demo/video_segmenter.cpp @@ -0,0 +1,157 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include +#include "include/paddlex/paddlex.h" +#include "include/paddlex/visualize.h" + +#if defined(__arm__) || defined(__aarch64__) +#include +#endif + +using namespace std::chrono; // NOLINT + +DEFINE_string(model_dir, "", "Path of inference model"); +DEFINE_bool(use_gpu, false, "Infering with GPU or CPU"); +DEFINE_bool(use_trt, false, "Infering with TensorRT"); +DEFINE_int32(gpu_id, 0, "GPU card id"); +DEFINE_string(key, "", "key of encryption"); +DEFINE_bool(use_camera, false, "Infering with Camera"); +DEFINE_int32(camera_id, 0, "Camera id"); +DEFINE_string(video_path, "", "Path of input video"); +DEFINE_bool(show_result, false, "show the result of each frame with a window"); +DEFINE_bool(save_result, true, "save the result of each frame to a video"); +DEFINE_string(save_dir, "output", "Path to save visualized image"); + +int main(int argc, char** argv) { + // Parsing command-line + google::ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_model_dir == "") { + std::cerr << "--model_dir need to be defined" << std::endl; + return -1; + } + if (FLAGS_video_path == "" & FLAGS_use_camera == false) { + std::cerr << "--video_path or --use_camera need to be defined" << std::endl; + return -1; + } + + // Load model + PaddleX::Model model; + model.Init(FLAGS_model_dir, + FLAGS_use_gpu, + FLAGS_use_trt, + FLAGS_gpu_id, + FLAGS_key); + // Open video + cv::VideoCapture capture; + if (FLAGS_use_camera) { + capture.open(FLAGS_camera_id); + if (!capture.isOpened()) { + std::cout << "Can not open the camera " + << FLAGS_camera_id << "." 
+ << std::endl; + return -1; + } + } else { + capture.open(FLAGS_video_path); + if (!capture.isOpened()) { + std::cout << "Can not open the video " + << FLAGS_video_path << "." + << std::endl; + return -1; + } + } + + + // Create a VideoWriter + cv::VideoWriter video_out; + std::string video_out_path; + if (FLAGS_save_result) { + // Get video information: resolution, fps + int video_width = static_cast(capture.get(CV_CAP_PROP_FRAME_WIDTH)); + int video_height = static_cast(capture.get(CV_CAP_PROP_FRAME_HEIGHT)); + int video_fps = static_cast(capture.get(CV_CAP_PROP_FPS)); + int video_fourcc; + if (FLAGS_use_camera) { + video_fourcc = 828601953; + } else { + video_fourcc = static_cast(capture.get(CV_CAP_PROP_FOURCC)); + } + + if (FLAGS_use_camera) { + time_t now = time(0); + video_out_path = + PaddleX::generate_save_path(FLAGS_save_dir, + std::to_string(now) + ".mp4"); + } else { + video_out_path = + PaddleX::generate_save_path(FLAGS_save_dir, FLAGS_video_path); + } + video_out.open(video_out_path.c_str(), + video_fourcc, + video_fps, + cv::Size(video_width, video_height), + true); + if (!video_out.isOpened()) { + std::cout << "Create video writer failed!" << std::endl; + return -1; + } + } + + PaddleX::SegResult result; + cv::Mat frame; + int key; + while (capture.read(frame)) { + if (FLAGS_show_result || FLAGS_use_camera) { + key = cv::waitKey(1); + // When pressing `ESC`, then exit program and result video is saved + if (key == 27) { + break; + } + } else if (frame.empty()) { + break; + } + // Begin to predict + model.predict(frame, &result); + // Visualize results + cv::Mat vis_img = PaddleX::Visualize(frame, result, model.labels); + if (FLAGS_show_result || FLAGS_use_camera) { + cv::imshow("video_segmenter", vis_img); + } + if (FLAGS_save_result) { + video_out.write(vis_img); + } + result.clear(); + } + capture.release(); + if (FLAGS_save_result) { + video_out.release(); + std::cout << "Visualized output saved as " << video_out_path << std::endl; + } + if (FLAGS_show_result || FLAGS_use_camera) { + cv::destroyAllWindows(); + } + return 0; +} diff --git a/deploy/cpp/include/paddlex/visualize.h b/deploy/cpp/include/paddlex/visualize.h index c64fa0addcca451db56766db56fe237a8ed35dc0..873cea10ad5f725a4a4c477559de0b659f94a7b5 100644 --- a/deploy/cpp/include/paddlex/visualize.h +++ b/deploy/cpp/include/paddlex/visualize.h @@ -24,8 +24,8 @@ #include // #include #if defined(__arm__) || defined(__aarch64__) // for arm -#include -#include +#include +#include #else #include #include diff --git a/deploy/cpp/scripts/bootstrap.sh b/deploy/cpp/scripts/bootstrap.sh index 283d75928a68a507d852ec61eb89e115e581146f..bb9756204e9e610365f67aa37dc78d1b5eaf80b8 100644 --- a/deploy/cpp/scripts/bootstrap.sh +++ b/deploy/cpp/scripts/bootstrap.sh @@ -7,12 +7,12 @@ if [ ! -d "./paddlex-encryption" ]; then fi # download pre-compiled opencv lib -OPENCV_URL=https://paddleseg.bj.bcebos.com/deploy/docker/opencv3gcc4.8.tar.bz2 -if [ ! -d "./deps/opencv3gcc4.8" ]; then +OPENCV_URL=https://bj.bcebos.com/paddleseg/deploy/opencv3.4.6gcc4.8ffmpeg.tar.gz2 +if [ ! -d "./deps/opencv3.4.6gcc4.8ffmpeg/" ]; then mkdir -p deps cd deps wget -c ${OPENCV_URL} - tar xvfj opencv3gcc4.8.tar.bz2 - rm -rf opencv3gcc4.8.tar.bz2 + tar xvfj opencv3.4.6gcc4.8ffmpeg.tar.gz2 + rm -rf opencv3.4.6gcc4.8ffmpeg.tar.gz2 cd .. 
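+    # note: this prebuilt OpenCV bundle is built with ffmpeg, which the video_* demos added in this change presumably rely on for reading and writing video files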
fi diff --git a/deploy/cpp/scripts/build.sh b/deploy/cpp/scripts/build.sh index e87d7bf4797f1833d88379df0587733958639b06..6d6ad25b24170a27639f9b1d651888c4027dbeed 100644 --- a/deploy/cpp/scripts/build.sh +++ b/deploy/cpp/scripts/build.sh @@ -24,7 +24,7 @@ ENCRYPTION_DIR=$(pwd)/paddlex-encryption # OPENCV 路径, 如果使用自带预编译版本可不修改 sh $(pwd)/scripts/bootstrap.sh # 下载预编译版本的opencv -OPENCV_DIR=$(pwd)/deps/opencv3gcc4.8/ +OPENCV_DIR=$(pwd)/deps/opencv3.4.6gcc4.8ffmpeg/ # 以下无需改动 rm -rf build @@ -42,4 +42,4 @@ cmake .. \ -DCUDNN_LIB=${CUDNN_LIB} \ -DENCRYPTION_DIR=${ENCRYPTION_DIR} \ -DOPENCV_DIR=${OPENCV_DIR} -make +make -j16 diff --git a/deploy/cpp/src/paddlex.cpp b/deploy/cpp/src/paddlex.cpp index 1bd30863e894910581384296edd2f656b79ffe21..47dc5b9e9e9104e2d4983a8ac077e5a0810610cf 100644 --- a/deploy/cpp/src/paddlex.cpp +++ b/deploy/cpp/src/paddlex.cpp @@ -65,7 +65,11 @@ void Model::create_predictor(const std::string& model_dir, config.SwitchUseFeedFetchOps(false); config.SwitchSpecifyInputNames(true); // 开启图优化 +#if defined(__arm__) || defined(__aarch64__) + config.SwitchIrOptim(false); +#else config.SwitchIrOptim(use_ir_optim); +#endif // 开启内存优化 config.EnableMemoryOptim(); if (use_trt) { diff --git a/docs/deploy/nvidia-jetson.md b/docs/deploy/nvidia-jetson.md index 8a187b8f6a8fed1f15cb10b9c8cf8adb8efabc00..5cd4c76b6d24f0308023dcd49fcf053696876b6a 100644 --- a/docs/deploy/nvidia-jetson.md +++ b/docs/deploy/nvidia-jetson.md @@ -57,13 +57,6 @@ CUDA_LIB=/usr/local/cuda/lib64 # CUDNN 的 lib 路径 CUDNN_LIB=/usr/local/cuda/lib64 -# 是否加载加密后的模型 -WITH_ENCRYPTION=OFF - -# OPENCV 路径, 如果使用自带预编译版本可不修改 -sh $(pwd)/scripts/jetson_bootstrap.sh # 下载预编译版本的opencv -OPENCV_DIR=$(pwd)/deps/opencv3/ - # 以下无需改动 rm -rf build mkdir -p build @@ -77,18 +70,13 @@ cmake .. \ -DPADDLE_DIR=${PADDLE_DIR} \ -DWITH_STATIC_LIB=${WITH_STATIC_LIB} \ -DCUDA_LIB=${CUDA_LIB} \ - -DCUDNN_LIB=${CUDNN_LIB} \ - -DENCRYPTION_DIR=${ENCRYPTION_DIR} \ - -DOPENCV_DIR=${OPENCV_DIR} + -DCUDNN_LIB=${CUDNN_LIB} make ``` -**注意:** linux环境下编译会自动下载OPENCV和YAML,如果编译环境无法访问外网,可手动下载: +**注意:** linux环境下编译会自动下载YAML,如果编译环境无法访问外网,可手动下载: -- [opencv3_aarch.tgz](https://bj.bcebos.com/paddlex/deploy/tools/opencv3_aarch.tgz) - [yaml-cpp.zip](https://bj.bcebos.com/paddlex/deploy/deps/yaml-cpp.zip) -opencv3_aarch.tgz文件下载后解压,然后在script/build.sh中指定`OPENCE_DIR`为解压后的路径。 - yaml-cpp.zip文件下载后无需解压,在cmake/yaml.cmake中将`URL https://bj.bcebos.com/paddlex/deploy/deps/yaml-cpp.zip` 中的网址,改为下载文件的路径。 修改脚本设置好主要参数后,执行`build`脚本: @@ -100,7 +88,7 @@ yaml-cpp.zip文件下载后无需解压,在cmake/yaml.cmake中将`URL https:// **在加载模型前,请检查你的模型目录中文件应该包括`model.yml`、`__model__`和`__params__`三个文件。如若不满足这个条件,请参考[模型导出为Inference文档](export_model.md)将模型导出为部署格式。** -编译成功后,预测demo的可执行程序分别为`build/demo/detector`,`build/demo/classifier`,`build/demo/segmenter`,用户可根据自己的模型类型选择,其主要命令参数说明如下: +* 编译成功后,图片预测demo的可执行程序分别为`build/demo/detector`,`build/demo/classifier`,`build/demo/segmenter`,用户可根据自己的模型类型选择,其主要命令参数说明如下: | 参数 | 说明 | | ---- | ---- | @@ -111,10 +99,26 @@ yaml-cpp.zip文件下载后无需解压,在cmake/yaml.cmake中将`URL https:// | use_trt | 是否使用 TensorRT 预测, 支持值为0或1(默认值为0) | | gpu_id | GPU 设备ID, 默认值为0 | | save_dir | 保存可视化结果的路径, 默认值为"output",**classfier无该参数** | -| key | 加密过程中产生的密钥信息,默认值为""表示加载的是未加密的模型 | | batch_size | 预测的批量大小,默认为1 | | thread_num | 预测的线程数,默认为cpu处理器个数 | -| use_ir_optim | 是否使用图优化策略,支持值为0或1(默认值为1,图像分割默认值为0)| + +* 编译成功后,视频预测demo的可执行程序分别为`build/demo/video_detector`,`build/demo/video_classifier`,`build/demo/video_segmenter`,用户可根据自己的模型类型选择,其主要命令参数说明如下: + +| 参数 | 说明 | +| ---- | ---- | +| model_dir | 导出的预测模型所在路径 | +| use_camera | 是否使用摄像头预测,支持值为0或1(默认值为0) | 
+| camera_id | 摄像头设备ID,默认值为0 | +| video_path | 视频文件的路径 | +| use_gpu | 是否使用 GPU 预测, 支持值为0或1(默认值为0) | +| use_trt | 是否使用 TensorRT 预测, 支持值为0或1(默认值为0) | +| gpu_id | GPU 设备ID, 默认值为0 | +| show_result | 对视频文件做预测时,是否在屏幕上实时显示预测可视化结果(因加入了延迟处理,故显示结果不能反映真实的帧率),支持值为0或1(默认值为0) | +| save_result | 是否将每帧的预测可视结果保存为视频文件,支持值为0或1(默认值为1) | +| save_dir | 保存可视化结果的路径, 默认值为"output" | + +**注意:若系统无GUI,则不要将show_result设置为1。当使用摄像头预测时,按`ESC`键可关闭摄像头并推出预测程序。** + ## 样例 @@ -143,3 +147,21 @@ yaml-cpp.zip文件下载后无需解压,在cmake/yaml.cmake中将`URL https:// ./build/demo/detector --model_dir=/root/projects/inference_model --image_list=/root/projects/images_list.txt --use_gpu=1 --save_dir=output --batch_size=2 --thread_num=2 ``` 图片文件`可视化预测结果`会保存在`save_dir`参数设置的目录下。 + +**样例三:** + +使用摄像头预测: + +```shell +./build/demo/video_detector --model_dir=/root/projects/inference_model --use_camera=1 --use_gpu=1 --save_dir=output --save_result=1 +``` +当`save_result`设置为1时,`可视化预测结果`会以视频文件的格式保存在`save_dir`参数设置的目录下。 + +**样例四:** + +对视频文件进行预测: + +```shell +./build/demo/video_detector --model_dir=/root/projects/inference_model --video_path=/path/to/video_file --use_gpu=1 --save_dir=output --show_result=1 --save_result=1 +``` +当`save_result`设置为1时,`可视化预测结果`会以视频文件的格式保存在`save_dir`参数设置的目录下。如果系统有GUI,通过将`show_result`设置为1在屏幕上观看可视化预测结果。 diff --git a/docs/deploy/server/cpp/linux.md b/docs/deploy/server/cpp/linux.md index c7813ede08082555268eba5a46a77cbcd4cab13e..d81569e6d280d06e3637dd13a012e38169b615a2 100644 --- a/docs/deploy/server/cpp/linux.md +++ b/docs/deploy/server/cpp/linux.md @@ -116,7 +116,7 @@ yaml-cpp.zip文件下载后无需解压,在cmake/yaml.cmake中将`URL https:// **在加载模型前,请检查你的模型目录中文件应该包括`model.yml`、`__model__`和`__params__`三个文件。如若不满足这个条件,请参考[模型导出为Inference文档](../../export_model.md)将模型导出为部署格式。** -编译成功后,预测demo的可执行程序分别为`build/demo/detector`,`build/demo/classifier`,`build/demo/segmenter`,用户可根据自己的模型类型选择,其主要命令参数说明如下: +* 编译成功后,图片预测demo的可执行程序分别为`build/demo/detector`,`build/demo/classifier`,`build/demo/segmenter`,用户可根据自己的模型类型选择,其主要命令参数说明如下: | 参数 | 说明 | | ---- | ---- | @@ -130,7 +130,24 @@ yaml-cpp.zip文件下载后无需解压,在cmake/yaml.cmake中将`URL https:// | key | 加密过程中产生的密钥信息,默认值为""表示加载的是未加密的模型 | | batch_size | 预测的批量大小,默认为1 | | thread_num | 预测的线程数,默认为cpu处理器个数 | -| use_ir_optim | 是否使用图优化策略,支持值为0或1(默认值为1,图像分割默认值为0)| + +* 编译成功后,视频预测demo的可执行程序分别为`build/demo/video_detector`,`build/demo/video_classifier`,`build/demo/video_segmenter`,用户可根据自己的模型类型选择,其主要命令参数说明如下: + +| 参数 | 说明 | +| ---- | ---- | +| model_dir | 导出的预测模型所在路径 | +| use_camera | 是否使用摄像头预测,支持值为0或1(默认值为0) | +| camera_id | 摄像头设备ID,默认值为0 | +| video_path | 视频文件的路径 | +| use_gpu | 是否使用 GPU 预测, 支持值为0或1(默认值为0) | +| use_trt | 是否使用 TensorRT 预测, 支持值为0或1(默认值为0) | +| gpu_id | GPU 设备ID, 默认值为0 | +| show_result | 对视频文件做预测时,是否在屏幕上实时显示预测可视化结果(因加入了延迟处理,故显示结果不能反映真实的帧率),支持值为0或1(默认值为0) | +| save_result | 是否将每帧的预测可视结果保存为视频文件,支持值为0或1(默认值为1) | +| save_dir | 保存可视化结果的路径, 默认值为"output"| +| key | 加密过程中产生的密钥信息,默认值为""表示加载的是未加密的模型 | + +**注意:若系统无GUI,则不要将show_result设置为1。当使用摄像头预测时,按`ESC`键可关闭摄像头并推出预测程序。** ## 样例 @@ -138,7 +155,7 @@ yaml-cpp.zip文件下载后无需解压,在cmake/yaml.cmake中将`URL https:// > 关于预测速度的说明:加载模型后前几张图片的预测速度会较慢,这是因为运行启动时涉及到内存显存初始化等步骤,通常在预测20-30张图片后模型的预测速度达到稳定。 -`样例一`: +**样例一:** 不使用`GPU`测试图片 `/root/projects/images/xiaoduxiong.jpeg` @@ -148,7 +165,7 @@ yaml-cpp.zip文件下载后无需解压,在cmake/yaml.cmake中将`URL https:// 图片文件`可视化预测结果`会保存在`save_dir`参数设置的目录下。 -`样例二`: +**样例二:** 使用`GPU`预测多个图片`/root/projects/image_list.txt`,image_list.txt内容的格式如下: ``` @@ -161,3 +178,21 @@ yaml-cpp.zip文件下载后无需解压,在cmake/yaml.cmake中将`URL https:// ./build/demo/detector --model_dir=/root/projects/inference_model 
--image_list=/root/projects/images_list.txt --use_gpu=1 --save_dir=output --batch_size=2 --thread_num=2 ``` 图片文件`可视化预测结果`会保存在`save_dir`参数设置的目录下。 + +**样例三:** + +使用摄像头预测: + +```shell +./build/demo/video_detector --model_dir=/root/projects/inference_model --use_camera=1 --use_gpu=1 --save_dir=output --save_result=1 +``` +当`save_result`设置为1时,`可视化预测结果`会以视频文件的格式保存在`save_dir`参数设置的目录下。 + +**样例四:** + +对视频文件进行预测: + +```shell +./build/demo/video_detector --model_dir=/root/projects/inference_model --video_path=/path/to/video_file --use_gpu=1 --save_dir=output --show_result=1 --save_result=1 +``` +当`save_result`设置为1时,`可视化预测结果`会以视频文件的格式保存在`save_dir`参数设置的目录下。如果系统有GUI,通过将`show_result`设置为1在屏幕上观看可视化预测结果。 diff --git a/docs/deploy/server/cpp/windows.md b/docs/deploy/server/cpp/windows.md index 641d1cba9262e60bf43a152f288e23bda4b74464..4c5ef9e201424cca4b3bcb291ffa74df9c45546b 100644 --- a/docs/deploy/server/cpp/windows.md +++ b/docs/deploy/server/cpp/windows.md @@ -101,7 +101,7 @@ D: cd D:\projects\PaddleX\deploy\cpp\out\build\x64-Release ``` -编译成功后,预测demo的入口程序为`paddlex_inference\detector.exe`,`paddlex_inference\classifier.exe`,`paddlex_inference\segmenter.exe`,用户可根据自己的模型类型选择,其主要命令参数说明如下: +* 编译成功后,图片预测demo的入口程序为`paddlex_inference\detector.exe`,`paddlex_inference\classifier.exe`,`paddlex_inference\segmenter.exe`,用户可根据自己的模型类型选择,其主要命令参数说明如下: | 参数 | 说明 | | ---- | ---- | @@ -114,7 +114,24 @@ cd D:\projects\PaddleX\deploy\cpp\out\build\x64-Release | key | 加密过程中产生的密钥信息,默认值为""表示加载的是未加密的模型 | | batch_size | 预测的批量大小,默认为1 | | thread_num | 预测的线程数,默认为cpu处理器个数 | -| use_ir_optim | 是否使用图优化策略,支持值为0或1(默认值为1,图像分割默认值为0)| + +* 编译成功后,视频预测demo的入口程序为`paddlex_inference\video_detector.exe`,`paddlex_inference\video_classifier.exe`,`paddlex_inference\video_segmenter.exe`,用户可根据自己的模型类型选择,其主要命令参数说明如下: + +| 参数 | 说明 | +| ---- | ---- | +| model_dir | 导出的预测模型所在路径 | +| use_camera | 是否使用摄像头预测,支持值为0或1(默认值为0) | +| camera_id | 摄像头设备ID,默认值为0 | +| video_path | 视频文件的路径 | +| use_gpu | 是否使用 GPU 预测, 支持值为0或1(默认值为0) | +| gpu_id | GPU 设备ID, 默认值为0 | +| show_result | 对视频文件做预测时,是否在屏幕上实时显示预测可视化结果(因加入了延迟处理,故显示结果不能反映真实的帧率),支持值为0或1(默认值为0) | +| save_result | 是否将每帧的预测可视结果保存为视频文件,支持值为0或1(默认值为1) | +| save_dir | 保存可视化结果的路径, 默认值为"output" | +| key | 加密过程中产生的密钥信息,默认值为""表示加载的是未加密的模型 | + +**注意:若系统无GUI,则不要将show_result设置为1。当使用摄像头预测时,按`ESC`键可关闭摄像头并推出预测程序。** + ## 样例 @@ -157,3 +174,18 @@ D:\images\xiaoduxiongn.jpeg ``` `--key`传入加密工具输出的密钥,例如`kLAl1qOs5uRbFt0/RrIDTZW2+tOf5bzvUIaHGF8lJ1c=`, 图片文件可视化预测结果会保存在`save_dir`参数设置的目录下。 + +### 样例四:(使用未加密的模型开启摄像头预测) + +```shell +.\paddlex_inference\video_detector.exe --model_dir=D:\projects\inference_model --use_camera=1 --use_gpu=1 --save_dir=output +``` +当`save_result`设置为1时,`可视化预测结果`会以视频文件的格式保存在`save_dir`参数设置的目录下。 + +### 样例五:(使用未加密的模型对视频文件做预测) + + +```shell +.\paddlex_inference\video_detector.exe --model_dir=D:\projects\inference_model --video_path=D:\projects\video_test.mp4 --use_gpu=1 --show_result=1 --save_dir=output +``` +当`save_result`设置为1时,`可视化预测结果`会以视频文件的格式保存在`save_dir`参数设置的目录下。如果系统有GUI,通过将`show_result`设置为1在屏幕上观看可视化预测结果。 diff --git a/docs/deploy/server/python.md b/docs/deploy/server/python.md index 36b0891176bb9cf86078a3c9f9dfe5b48419613b..36e8d4639bc48400dc46b67e1b811ff42ac3fad1 100644 --- a/docs/deploy/server/python.md +++ b/docs/deploy/server/python.md @@ -30,6 +30,25 @@ image_list = ['xiaoduxiong_test_image/JPEGImages/WeChatIMG110.jpeg', result = predictor.predict(image_list=image_list) ``` +* 视频流预测 +``` +import cv2 +import paddlex as pdx +predictor = pdx.deploy.Predictor('./inference_model') +cap = cv2.VideoCapture(0) +while 
cap.isOpened(): + ret, frame = cap.read() + if ret: + result = predictor.predict(frame) + vis_img = pdx.det.visualize(frame, result, threshold=0.6, save_dir=None) + cv2.imshow('Xiaoduxiong', vis_img) + if cv2.waitKey(1) & 0xFF == ord('q'): + break + else: + break +cap.release() +``` + > 关于预测速度的说明:加载模型后前几张图片的预测速度会较慢,这是因为运行启动时涉及到内存显存初始化等步骤,通常在预测20-30张图片后模型的预测速度达到稳定。 ## 预测性能对比 diff --git a/docs/examples/human_segmentation.md b/docs/examples/human_segmentation.md index b4c707709c9ea0304a44daec085ea4fa1ca2678c..504132bcad5476309d0944fb6d5f94787fb6025f 100644 --- a/docs/examples/human_segmentation.md +++ b/docs/examples/human_segmentation.md @@ -1,12 +1,12 @@ # 人像分割模型 -本教程基于PaddleX核心分割模型实现人像分割,开放预训练模型和测试数据、支持视频流人像分割、提供模型Fine-tune到Paddle Lite移动端部署的全流程应用指南。 +本教程基于PaddleX核心分割模型实现人像分割,开放预训练模型和测试数据、支持视频流人像分割、提供模型Fine-tune到Paddle Lite移动端及Nvidia Jeston嵌入式设备部署的全流程应用指南。 ## 预训练模型和测试数据 #### 预训练模型 -本案例开放了两个在大规模人像数据集上训练好的模型,以满足服务器端场景和移动端场景的需求。使用这些模型可以快速体验视频流人像分割,也可以部署到移动端进行实时人像分割,也可以用于完成模型Fine-tuning。 +本案例开放了两个在大规模人像数据集上训练好的模型,以满足服务器端场景和移动端场景的需求。使用这些模型可以快速体验视频流人像分割,也可以部署到移动端或嵌入式设备进行实时人像分割,也可以用于完成模型Fine-tuning。 | 模型类型 | Checkpoint Parameter | Inference Model | Quant Inference Model | 备注 | | --- | --- | --- | ---| --- | @@ -243,15 +243,17 @@ python quant_offline.py --model_dir output/best_model \ * `--save_dir`: 量化模型保存路径 * `--image_shape`: 网络输入图像大小(w, h) -## Paddle Lite移动端部署 +## 推理部署 + +### Paddle Lite移动端部署 本案例将人像分割模型在移动端进行部署,部署流程展示如下,通用的移动端部署流程参见[Paddle Lite移动端部署](../../docs/deploy/paddlelite/android.md)。 -### 1. 将PaddleX模型导出为inference模型 +#### 1. 将PaddleX模型导出为inference模型 本案例使用humanseg_mobile_quant预训练模型,该模型已经是inference模型,不需要再执行模型导出步骤。如果不使用预训练模型,则执行上一章节`模型训练`中的`模型导出`将自己训练的模型导出为inference格式。 -### 2. 将inference模型优化为Paddle Lite模型 +#### 2. 将inference模型优化为Paddle Lite模型 下载并解压 [模型优化工具opt](https://bj.bcebos.com/paddlex/deploy/lite/model_optimize_tool_11cbd50e.tar.gz),进入模型优化工具opt所在路径后,执行以下命令: @@ -273,16 +275,16 @@ python quant_offline.py --model_dir output/best_model \ 更详细的使用方法和参数含义请参考: [使用opt转化模型](https://paddle-lite.readthedocs.io/zh/latest/user_guides/opt/opt_bin.html) -### 3. 移动端预测 +#### 3. 移动端预测 PaddleX提供了基于PaddleX Android SDK的安卓demo,可供用户体验图像分类、目标检测、实例分割和语义分割,该demo位于`PaddleX/deploy/lite/android/demo`,用户将模型、配置文件和测试图片拷贝至该demo下进行预测。 -#### 3.1 前置依赖 +##### 3.1 前置依赖 * Android Studio 3.4 * Android手机或开发板 -#### 3.2 拷贝模型、配置文件和测试图片 +##### 3.2 拷贝模型、配置文件和测试图片 * 将Lite模型(.nb文件)拷贝到`PaddleX/deploy/lite/android/demo/app/src/main/assets/model/`目录下, 根据.nb文件的名字,修改文件`PaddleX/deploy/lite/android/demo/app/src/main/res/values/strings.xml`中的`MODEL_PATH_DEFAULT`; @@ -290,7 +292,7 @@ PaddleX提供了基于PaddleX Android SDK的安卓demo,可供用户体验图 * 将测试图片拷贝到`PaddleX/deploy/lite/android/demo/app/src/main/assets/images/`目录下,根据图片文件的名字,修改文件`PaddleX/deploy/lite/android/demo/app/src/main/res/values/strings.xml`中的`IMAGE_PATH_DEFAULT`。 -#### 3.3 导入工程并运行 +##### 3.3 导入工程并运行 * 打开Android Studio,在"Welcome to Android Studio"窗口点击"Open an existing Android Studio project",在弹出的路径选择窗口中进入`PaddleX/deploy/lite/android/demo`目录,然后点击右下角的"Open"按钮,导入工程; @@ -303,3 +305,58 @@ PaddleX提供了基于PaddleX Android SDK的安卓demo,可供用户体验图 测试图片及其分割结果如下所示: ![](./images/beauty.png) + +### Nvidia Jetson嵌入式设备部署 + +#### c++部署 + +step 1. 下载PaddleX源码 + +``` +git clone https://github.com/PaddlePaddle/PaddleX +``` + +step 2. 将`PaddleX/examples/human_segmentation/deploy/cpp`下的`human_segmenter.cpp`和`CMakeList.txt`拷贝至`PaddleX/deploy/cpp`目录下,拷贝之前可以将`PaddleX/deploy/cpp`下原本的`CMakeList.txt`做好备份。 + +step 3. 按照[Nvidia Jetson开发板部署](../deploy/nvidia-jetson.md)中的Step2至Step3完成C++预测代码的编译。 + +step 4. 
编译成功后,可执行程为`build/human_segmenter`,其主要命令参数说明如下: + + | 参数 | 说明 | + | ---- | ---- | + | model_dir | 人像分割模型路径 | + | use_gpu | 是否使用 GPU 预测, 支持值为0或1(默认值为0)| + | gpu_id | GPU 设备ID, 默认值为0 | + | use_camera | 是否使用摄像头采集图片,支持值为0或1(默认值为0) | + | camera_id | 摄像头设备ID,默认值为0 | + | video_path | 视频文件的路径 | + | show_result | 对视频文件做预测时,是否在屏幕上实时显示预测可视化结果,支持值为0或1(默认值为0) | + | save_result | 是否将每帧的预测可视结果保存为视频文件,支持值为0或1(默认值为1) | + | image | 待预测的图片路径 | + | save_dir | 保存可视化结果的路径, 默认值为"output"| + +step 5. 推理预测 + + 用于部署推理的模型应为inference格式,本案例使用humanseg_server_inference预训练模型,该模型已经是inference模型,不需要再执行模型导出步骤。如果不使用预训练模型,则执行第2章节`模型训练`中的`模型导出`将自己训练的模型导出为inference格式。 + + * 使用未加密的模型对单张图片做预测 + + 待测试图片位于本案例提供的测试数据中,可以替换成自己的图片。 + + ```shell + ./build/human_segmenter --model_dir=/path/to/humanseg_server_inference --image=/path/to/data/mini_supervisely/Images/pexels-photo-63776.png --use_gpu=1 --save_dir=output + ``` + + * 使用未加密的模型开启摄像头做预测 + + ```shell + ./build/human_segmenter --model_dir=/path/to/humanseg_server_inference --use_camera=1 --save_result=1 --use_gpu=1 --save_dir=output + ``` + + * 使用未加密的模型对视频文件做预测 + + 待测试视频文件位于本案例提供的测试数据中,可以替换成自己的视频文件。 + + ```shell + ./build/human_segmenter --model_dir=/path/to/humanseg_server_inference --video_path=/path/to/data/mini_supervisely/video_test.mp4 --save_result=1 --use_gpu=1 --save_dir=output + ``` diff --git a/docs/examples/meter_reader.md b/docs/examples/meter_reader.md index 4fecce8a74ad5e0d2b4172a5b0f734522722f6ce..670d7d1399b55c672b17ed903663bf26c8a6ef84 100644 --- a/docs/examples/meter_reader.md +++ b/docs/examples/meter_reader.md @@ -245,7 +245,6 @@ step 5. 推理预测: ./build/meter_reader/meter_reader --det_model_dir=/path/to/det_inference_model --seg_model_dir=/path/to/seg_inference_model --use_camera=1 --use_gpu=1 --use_erode=1 --save_dir=output ``` - ## 模型训练 diff --git a/examples/human_segmentation/deploy/cpp/CMakeLists.txt b/examples/human_segmentation/deploy/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc7a68f389710370d7e7bb0aa11f96596d3f8819 --- /dev/null +++ b/examples/human_segmentation/deploy/cpp/CMakeLists.txt @@ -0,0 +1,321 @@ +cmake_minimum_required(VERSION 3.0) +project(PaddleX CXX C) + +option(WITH_MKL "Compile human_segmenter with MKL/OpenBlas support,defaultuseMKL." ON) +option(WITH_GPU "Compile human_segmenter with GPU/CPU, default use CPU." ON) +if (NOT WIN32) + option(WITH_STATIC_LIB "Compile human_segmenter with static/shared library, default use static." OFF) +else() + option(WITH_STATIC_LIB "Compile human_segmenter with static/shared library, default use static." ON) +endif() +option(WITH_TENSORRT "Compile human_segmenter with TensorRT." OFF) +option(WITH_ENCRYPTION "Compile human_segmenter with encryption tool." 
OFF) + +SET(TENSORRT_DIR "" CACHE PATH "Location of libraries") +SET(PADDLE_DIR "" CACHE PATH "Location of libraries") +SET(OPENCV_DIR "" CACHE PATH "Location of libraries") +SET(ENCRYPTION_DIR"" CACHE PATH "Location of libraries") +SET(CUDA_LIB "" CACHE PATH "Location of libraries") + +if (NOT WIN32) + set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +else() + set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/paddlex_inference) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/paddlex_inference) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/paddlex_inference) +endif() + +if (NOT WIN32) + SET(YAML_BUILD_TYPE ON CACHE BOOL "yaml build shared library.") +else() + SET(YAML_BUILD_TYPE OFF CACHE BOOL "yaml build shared library.") +endif() +include(cmake/yaml-cpp.cmake) + +include_directories("${CMAKE_SOURCE_DIR}/") +include_directories("${CMAKE_CURRENT_BINARY_DIR}/ext/yaml-cpp/src/ext-yaml-cpp/include") +link_directories("${CMAKE_CURRENT_BINARY_DIR}/ext/yaml-cpp/lib") + +macro(safe_set_static_flag) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() + + +if (WITH_ENCRYPTION) +add_definitions( -DWITH_ENCRYPTION=${WITH_ENCRYPTION}) +endif() + +if (WITH_MKL) + ADD_DEFINITIONS(-DUSE_MKL) +endif() + +if (NOT DEFINED PADDLE_DIR OR ${PADDLE_DIR} STREQUAL "") + message(FATAL_ERROR "please set PADDLE_DIR with -DPADDLE_DIR=/path/paddle_influence_dir") +endif() + +if (NOT (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")) + if (NOT DEFINED OPENCV_DIR OR ${OPENCV_DIR} STREQUAL "") + message(FATAL_ERROR "please set OPENCV_DIR with -DOPENCV_DIR=/path/opencv") + endif() +endif() + +include_directories("${CMAKE_SOURCE_DIR}/") +include_directories("${PADDLE_DIR}/") +include_directories("${PADDLE_DIR}/third_party/install/protobuf/include") +include_directories("${PADDLE_DIR}/third_party/install/glog/include") +include_directories("${PADDLE_DIR}/third_party/install/gflags/include") +include_directories("${PADDLE_DIR}/third_party/install/xxhash/include") +if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/include") + include_directories("${PADDLE_DIR}/third_party/install/snappy/include") +endif() +if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/include") + include_directories("${PADDLE_DIR}/third_party/install/snappystream/include") +endif() +# zlib does not exist in 1.8.1 +if (EXISTS "${PADDLE_DIR}/third_party/install/zlib/include") + include_directories("${PADDLE_DIR}/third_party/install/zlib/include") +endif() + +include_directories("${PADDLE_DIR}/third_party/boost") +include_directories("${PADDLE_DIR}/third_party/eigen3") + +if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/lib") + link_directories("${PADDLE_DIR}/third_party/install/snappy/lib") +endif() +if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/lib") + link_directories("${PADDLE_DIR}/third_party/install/snappystream/lib") +endif() + +if (EXISTS "${PADDLE_DIR}/third_party/install/zlib/lib") + link_directories("${PADDLE_DIR}/third_party/install/zlib/lib") +endif() + +link_directories("${PADDLE_DIR}/third_party/install/protobuf/lib") +link_directories("${PADDLE_DIR}/third_party/install/glog/lib") +link_directories("${PADDLE_DIR}/third_party/install/gflags/lib") 
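+# the include/link directories above and below expose Paddle's bundled third-party dependencies (protobuf, glog, gflags, xxhash) to the compiler and linker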
+link_directories("${PADDLE_DIR}/third_party/install/xxhash/lib") +link_directories("${PADDLE_DIR}/paddle/lib/") +link_directories("${CMAKE_CURRENT_BINARY_DIR}") + +if (WIN32) + include_directories("${PADDLE_DIR}/paddle/fluid/inference") + include_directories("${PADDLE_DIR}/paddle/include") + link_directories("${PADDLE_DIR}/paddle/fluid/inference") + find_package(OpenCV REQUIRED PATHS ${OPENCV_DIR}/build/ NO_DEFAULT_PATH) + unset(OpenCV_DIR CACHE) +else () + if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") # x86_64 aarch64 + set(OpenCV_INCLUDE_DIRS "/usr/include/opencv4") + file(GLOB OpenCV_LIBS /usr/lib/aarch64-linux-gnu/libopencv_*${CMAKE_SHARED_LIBRARY_SUFFIX}) + message("OpenCV libs: ${OpenCV_LIBS}") + else() + find_package(OpenCV REQUIRED PATHS ${OPENCV_DIR}/share/OpenCV NO_DEFAULT_PATH) + endif() + include_directories("${PADDLE_DIR}/paddle/include") + link_directories("${PADDLE_DIR}/paddle/lib") +endif () +include_directories(${OpenCV_INCLUDE_DIRS}) + +if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + find_package(OpenMP REQUIRED) + if (OPENMP_FOUND) + message("OPENMP FOUND") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${OpenMP_C_FLAGS}") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${OpenMP_CXX_FLAGS}") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${OpenMP_CXX_FLAGS}") + endif() + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + if (WITH_STATIC_LIB) + safe_set_static_flag() + add_definitions(-DSTATIC_LIB) + endif() +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -o2 -fopenmp -std=c++11") + set(CMAKE_STATIC_LIBRARY_PREFIX "") +endif() + +if (WITH_GPU) + if (NOT DEFINED CUDA_LIB OR ${CUDA_LIB} STREQUAL "") + message(FATAL_ERROR "please set CUDA_LIB with -DCUDA_LIB=/path/cuda/lib64") + endif() + if (NOT WIN32) + if (NOT DEFINED CUDNN_LIB) + message(FATAL_ERROR "please set CUDNN_LIB with -DCUDNN_LIB=/path/cudnn/") + endif() + endif(NOT WIN32) +endif() + + +if (NOT WIN32) + if (WITH_TENSORRT AND WITH_GPU) + include_directories("${TENSORRT_DIR}/include") + link_directories("${TENSORRT_DIR}/lib") + endif() +endif(NOT WIN32) + +if (NOT WIN32) + set(NGRAPH_PATH "${PADDLE_DIR}/third_party/install/ngraph") + if(EXISTS ${NGRAPH_PATH}) + include(GNUInstallDirs) + include_directories("${NGRAPH_PATH}/include") + link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}") + set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() +endif() + +if(WITH_MKL) + include_directories("${PADDLE_DIR}/third_party/install/mklml/include") + if (WIN32) + set(MATH_LIB ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.lib + ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.lib) + else () + set(MATH_LIB ${PADDLE_DIR}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + execute_process(COMMAND cp -r ${PADDLE_DIR}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} /usr/lib) + endif () + set(MKLDNN_PATH "${PADDLE_DIR}/third_party/install/mkldnn") + if(EXISTS ${MKLDNN_PATH}) + include_directories("${MKLDNN_PATH}/include") + if (WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) + else () + 
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif () + endif() +else() + set(MATH_LIB ${PADDLE_DIR}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) +endif() + +if (WIN32) + if(EXISTS "${PADDLE_DIR}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(DEPS + ${PADDLE_DIR}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(DEPS + ${PADDLE_DIR}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() +endif() + +if(WITH_STATIC_LIB) + set(DEPS + ${PADDLE_DIR}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + if (NOT WIN32) + set(DEPS + ${PADDLE_DIR}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + set(DEPS + ${PADDLE_DIR}/paddle/lib/paddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() +endif() + +if (NOT WIN32) + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf z xxhash yaml-cpp + ) + if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/lib") + set(DEPS ${DEPS} snappystream) + endif() + if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/lib") + set(DEPS ${DEPS} snappy) + endif() +else() + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags_static libprotobuf xxhash libyaml-cppmt) + + if (EXISTS "${PADDLE_DIR}/third_party/install/zlib/lib") + set(DEPS ${DEPS} zlibstatic) + endif() + set(DEPS ${DEPS} libcmt shlwapi) + if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/lib") + set(DEPS ${DEPS} snappy) + endif() + if (EXISTS "${PADDLE_DIR}/third_party/install/snappystream/lib") + set(DEPS ${DEPS} snappystream) + endif() +endif(NOT WIN32) + +if(WITH_GPU) + if(NOT WIN32) + if (WITH_TENSORRT) + set(DEPS ${DEPS} ${TENSORRT_DIR}/lib/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${TENSORRT_DIR}/lib/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${CUDNN_LIB}/libcudnn${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() +endif() + +if(WITH_ENCRYPTION) + if(NOT WIN32) + include_directories("${ENCRYPTION_DIR}/include") + link_directories("${ENCRYPTION_DIR}/lib") + set(DEPS ${DEPS} ${ENCRYPTION_DIR}/lib/libpmodel-decrypt${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + include_directories("${ENCRYPTION_DIR}/include") + link_directories("${ENCRYPTION_DIR}/lib") + set(DEPS ${DEPS} ${ENCRYPTION_DIR}/lib/pmodel-decrypt${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() +endif() + +if (NOT WIN32) + set(EXTERNAL_LIB "-ldl -lrt -lgomp -lz -lm -lpthread") + set(DEPS ${DEPS} ${EXTERNAL_LIB}) +endif() + +set(DEPS ${DEPS} ${OpenCV_LIBS}) +add_library(paddlex_inference SHARED src/visualize src/transforms.cpp src/paddlex.cpp) +ADD_DEPENDENCIES(paddlex_inference ext-yaml-cpp) +target_link_libraries(paddlex_inference ${DEPS}) + +add_executable(human_segmenter human_segmenter.cpp src/transforms.cpp src/paddlex.cpp src/visualize.cpp) +ADD_DEPENDENCIES(human_segmenter ext-yaml-cpp) +target_link_libraries(human_segmenter ${DEPS}) + + +if (WIN32 AND WITH_MKL) + add_custom_command(TARGET human_segmenter POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./mklml.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./libiomp5md.dll + 
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mkldnn/lib/mkldnn.dll ./mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./release/mklml.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./release/libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mkldnn/lib/mkldnn.dll ./release/mkldnn.dll + ) + # for encryption + if (EXISTS "${ENCRYPTION_DIR}/lib/pmodel-decrypt.dll") + add_custom_command(TARGET human_segmenter POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ENCRYPTION_DIR}/lib/pmodel-decrypt.dll ./pmodel-decrypt.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ENCRYPTION_DIR}/lib/pmodel-decrypt.dll ./release/pmodel-decrypt.dll + ) + endif() +endif() + +file(COPY "${CMAKE_SOURCE_DIR}/include/paddlex/visualize.h" +DESTINATION "${CMAKE_BINARY_DIR}/include/" ) +file(COPY "${CMAKE_SOURCE_DIR}/include/paddlex/config_parser.h" +DESTINATION "${CMAKE_BINARY_DIR}/include/" ) +file(COPY "${CMAKE_SOURCE_DIR}/include/paddlex/transforms.h" +DESTINATION "${CMAKE_BINARY_DIR}/include/" ) +file(COPY "${CMAKE_SOURCE_DIR}/include/paddlex/results.h" +DESTINATION "${CMAKE_BINARY_DIR}/include/" ) +file(COPY "${CMAKE_SOURCE_DIR}/include/paddlex/paddlex.h" +DESTINATION "${CMAKE_BINARY_DIR}/include/" ) diff --git a/examples/human_segmentation/deploy/cpp/human_segmenter.cpp b/examples/human_segmentation/deploy/cpp/human_segmenter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..479c7a7fd469f6fcfa2cf7b980114893a4febd78 --- /dev/null +++ b/examples/human_segmentation/deploy/cpp/human_segmenter.cpp @@ -0,0 +1,208 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
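+// Demo overview: loads a PaddleX segmentation model and predicts on a single image, a video file, or a live camera stream; pixels predicted as background (label 0) are filled with white.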
+ +#include +#include + +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include +#include "include/paddlex/paddlex.h" +#include "include/paddlex/visualize.h" + +#if defined(__arm__) || defined(__aarch64__) +#include +#endif + +using namespace std::chrono; // NOLINT + +DEFINE_string(model_dir, "", "Path of inference model"); +DEFINE_bool(use_gpu, false, "Infering with GPU or CPU"); +DEFINE_bool(use_trt, false, "Infering with TensorRT"); +DEFINE_int32(gpu_id, 0, "GPU card id"); +DEFINE_string(key, "", "key of encryption"); +DEFINE_string(image, "", "Path of test image file"); +DEFINE_bool(use_camera, false, "Infering with Camera"); +DEFINE_int32(camera_id, 0, "Camera id"); +DEFINE_string(video_path, "", "Path of input video"); +DEFINE_bool(show_result, false, "show the result of each frame with a window"); +DEFINE_bool(save_result, true, "save the result of each frame to a video"); +DEFINE_string(save_dir, "output", "Path to save visualized image"); + +int main(int argc, char** argv) { + // Parsing command-line + google::ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_model_dir == "") { + std::cerr << "--model_dir need to be defined" << std::endl; + return -1; + } + if (FLAGS_image == "" & FLAGS_video_path == "" + & FLAGS_use_camera == false) { + std::cerr << "--image or --video_path or --use_camera need to be defined" + << std::endl; + return -1; + } + + // Load model + PaddleX::Model model; + model.Init(FLAGS_model_dir, + FLAGS_use_gpu, + FLAGS_use_trt, + FLAGS_gpu_id, + FLAGS_key); + if (FLAGS_use_camera || FLAGS_video_path != "") { + // Open video + cv::VideoCapture capture; + if (FLAGS_use_camera) { + capture.open(FLAGS_camera_id); + if (!capture.isOpened()) { + std::cout << "Can not open the camera " + << FLAGS_camera_id << "." + << std::endl; + return -1; + } + } else { + capture.open(FLAGS_video_path); + if (!capture.isOpened()) { + std::cout << "Can not open the video " + << FLAGS_video_path << "." + << std::endl; + return -1; + } + } + + // Create a VideoWriter + cv::VideoWriter video_out; + std::string video_out_path; + if (FLAGS_save_result) { + // Get video information: resolution, fps + int video_width = static_cast(capture.get(CV_CAP_PROP_FRAME_WIDTH)); + int video_height = + static_cast(capture.get(CV_CAP_PROP_FRAME_HEIGHT)); + int video_fps = static_cast(capture.get(CV_CAP_PROP_FPS)); + int video_fourcc; + if (FLAGS_use_camera) { + video_fourcc = 828601953; + } else { + video_fourcc = static_cast(capture.get(CV_CAP_PROP_FOURCC)); + } + if (FLAGS_use_camera) { + time_t now = time(0); + video_out_path = + PaddleX::generate_save_path(FLAGS_save_dir, + std::to_string(now) + ".mp4"); + } else { + video_out_path = + PaddleX::generate_save_path(FLAGS_save_dir, FLAGS_video_path); + } + video_out.open(video_out_path.c_str(), + video_fourcc, + video_fps, + cv::Size(video_width, video_height), + true); + if (!video_out.isOpened()) { + std::cout << "Create video writer failed!" 
<< std::endl; + return -1; + } + } + + PaddleX::SegResult result; + cv::Mat frame; + int key; + while (capture.read(frame)) { + if (FLAGS_show_result || FLAGS_use_camera) { + key = cv::waitKey(1); + // When pressing `ESC`, then exit program and result video is saved + if (key == 27) { + break; + } + } else if (frame.empty()) { + break; + } + // Begin to predict + model.predict(frame, &result); + // Visualize results + std::vector label_map(result.label_map.data.begin(), + result.label_map.data.end()); + cv::Mat mask(result.label_map.shape[0], + result.label_map.shape[1], + CV_8UC1, + label_map.data()); + int rows = result.label_map.shape[0]; + int cols = result.label_map.shape[1]; + cv::Mat vis_img = frame.clone(); + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + int category_id = static_cast(mask.at(i, j)); + if (category_id == 0) { + vis_img.at(i, j)[0] = 255; + vis_img.at(i, j)[1] = 255; + vis_img.at(i, j)[2] = 255; + } + } + } + if (FLAGS_show_result || FLAGS_use_camera) { + cv::imshow("human_seg", vis_img); + } + if (FLAGS_save_result) { + video_out.write(vis_img); + } + result.clear(); + } + capture.release(); + if (FLAGS_save_result) { + video_out.release(); + std::cout << "Visualized output saved as " << video_out_path << std::endl; + } + if (FLAGS_show_result || FLAGS_use_camera) { + cv::destroyAllWindows(); + } + } else { + PaddleX::SegResult result; + cv::Mat im = cv::imread(FLAGS_image, 1); + model.predict(im, &result); + // Visualize results + std::vector label_map(result.label_map.data.begin(), + result.label_map.data.end()); + cv::Mat mask(result.label_map.shape[0], + result.label_map.shape[1], + CV_8UC1, + label_map.data()); + int rows = result.label_map.shape[0]; + int cols = result.label_map.shape[1]; + cv::Mat vis_img = im.clone(); + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + int category_id = static_cast(mask.at(i, j)); + if (category_id == 0) { + vis_img.at(i, j)[0] = 255; + vis_img.at(i, j)[1] = 255; + vis_img.at(i, j)[2] = 255; + } + } + } + std::string save_path = + PaddleX::generate_save_path(FLAGS_save_dir, FLAGS_image); + cv::imwrite(save_path, vis_img); + result.clear(); + std::cout << "Visualized output saved as " << save_path << std::endl; + } + return 0; +} diff --git a/examples/meter_reader/deploy/cpp/meter_reader/meter_reader.cpp b/examples/meter_reader/deploy/cpp/meter_reader/meter_reader.cpp index 79307fa05eb7b99c753fd978bcec9f0eb1e2f534..04c6f0e5316e9024c4f103e120a72f2f98f34203 100644 --- a/examples/meter_reader/deploy/cpp/meter_reader/meter_reader.cpp +++ b/examples/meter_reader/deploy/cpp/meter_reader/meter_reader.cpp @@ -51,7 +51,8 @@ DEFINE_string(seg_key, "", "Segmenter model key of encryption"); DEFINE_string(image, "", "Path of test image file"); DEFINE_string(image_list, "", "Path of test image list file"); DEFINE_string(save_dir, "output", "Path to save visualized image"); -DEFINE_double(score_threshold, 0.5, "Detected bbox whose score is lower than this threshlod is filtered"); +DEFINE_double(score_threshold, 0.5, + "Detected bbox whose score is lower than this threshlod is filtered"); void predict(const cv::Mat &input_image, PaddleX::Model *det_model, PaddleX::Model *seg_model, const std::string save_dir, @@ -207,7 +208,7 @@ int main(int argc, char **argv) { return -1; } - // 加载模型 + // Load model PaddleX::Model det_model; det_model.Init(FLAGS_det_model_dir, FLAGS_use_gpu, FLAGS_use_trt, FLAGS_gpu_id, FLAGS_det_key); diff --git a/paddlex/cv/__init__.py b/paddlex/cv/__init__.py index 
3e05213e1ba967e20a454c9c916096db773b1f9e..0d1a546e7c0513619335dd86d6dcdfbfd0f8e042 100644 --- a/paddlex/cv/__init__.py +++ b/paddlex/cv/__init__.py @@ -26,6 +26,7 @@ ResNet50 = models.ResNet50 DarkNet53 = models.DarkNet53 # detection YOLOv3 = models.YOLOv3 +PPYOLO = models.PPYOLO #EAST = models.EAST FasterRCNN = models.FasterRCNN MaskRCNN = models.MaskRCNN diff --git a/paddlex/cv/datasets/dataset.py b/paddlex/cv/datasets/dataset.py index 8f4a5687ce634e2173df4a3685dc51a294e595bf..82a29f5443c56c9caab2ad725e72493e0bc4bd51 100644 --- a/paddlex/cv/datasets/dataset.py +++ b/paddlex/cv/datasets/dataset.py @@ -115,7 +115,7 @@ def multithread_reader(mapper, while not isinstance(sample, EndSignal): batch_data.append(sample) if len(batch_data) == batch_size: - batch_data = generate_minibatch(batch_data) + batch_data = generate_minibatch(batch_data, mapper=mapper) yield batch_data batch_data = [] sample = out_queue.get() @@ -127,11 +127,11 @@ def multithread_reader(mapper, else: batch_data.append(sample) if len(batch_data) == batch_size: - batch_data = generate_minibatch(batch_data) + batch_data = generate_minibatch(batch_data, mapper=mapper) yield batch_data batch_data = [] if not drop_last and len(batch_data) != 0: - batch_data = generate_minibatch(batch_data) + batch_data = generate_minibatch(batch_data, mapper=mapper) yield batch_data batch_data = [] @@ -188,18 +188,21 @@ def multiprocess_reader(mapper, else: batch_data.append(sample) if len(batch_data) == batch_size: - batch_data = generate_minibatch(batch_data) + batch_data = generate_minibatch(batch_data, mapper=mapper) yield batch_data batch_data = [] if len(batch_data) != 0 and not drop_last: - batch_data = generate_minibatch(batch_data) + batch_data = generate_minibatch(batch_data, mapper=mapper) yield batch_data batch_data = [] return queue_reader -def generate_minibatch(batch_data, label_padding_value=255): +def generate_minibatch(batch_data, label_padding_value=255, mapper=None): + if mapper is not None and mapper.batch_transforms is not None: + for op in mapper.batch_transforms: + batch_data = op(batch_data) # if batch_size is 1, do not pad the image if len(batch_data) == 1: return batch_data diff --git a/paddlex/cv/models/__init__.py b/paddlex/cv/models/__init__.py index 3be68c29b016570f5b797f07cc2acc09918b1e8b..679f8bf52cfe4b8a4a611dd5ad7641845e05efba 100644 --- a/paddlex/cv/models/__init__.py +++ b/paddlex/cv/models/__init__.py @@ -38,6 +38,7 @@ from .classifier import HRNet_W18 from .classifier import AlexNet from .base import BaseAPI from .yolo_v3 import YOLOv3 +from .ppyolo import PPYOLO from .faster_rcnn import FasterRCNN from .mask_rcnn import MaskRCNN from .unet import UNet diff --git a/paddlex/cv/models/base.py b/paddlex/cv/models/base.py index 07d36914a11b1e6a1178f00a2ff1b1e6bc9dc6d9..39b183c9e91c3db06634155948f683f9e0e70779 100644 --- a/paddlex/cv/models/base.py +++ b/paddlex/cv/models/base.py @@ -548,6 +548,8 @@ class BaseAPI: current_save_dir = osp.join(save_dir, "epoch_{}".format(i + 1)) if not osp.isdir(current_save_dir): os.makedirs(current_save_dir) + if hasattr(self, 'use_ema'): + self.exe.run(self.ema.apply_program) if eval_dataset is not None and eval_dataset.num_samples > 0: self.eval_metrics, self.eval_details = self.evaluate( eval_dataset=eval_dataset, @@ -574,6 +576,8 @@ class BaseAPI: log_writer.add_scalar( "Metrics/Eval(Epoch): {}".format(k), v, i + 1) self.save_model(save_dir=current_save_dir) + if hasattr(self, 'use_ema'): + self.exe.run(self.ema.restore_program) time_eval_one_epoch = time.time() - 
eval_epoch_start_time eval_epoch_start_time = time.time() if best_model_epoch > 0: diff --git a/paddlex/cv/models/ppyolo.py b/paddlex/cv/models/ppyolo.py new file mode 100644 index 0000000000000000000000000000000000000000..021b2cd3b1dc5bfd2bd5a62f03d53248f749d22c --- /dev/null +++ b/paddlex/cv/models/ppyolo.py @@ -0,0 +1,555 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +import math +import tqdm +import os.path as osp +import numpy as np +from multiprocessing.pool import ThreadPool +import paddle.fluid as fluid +from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter +from paddle.fluid.optimizer import ExponentialMovingAverage +import paddlex.utils.logging as logging +import paddlex +import copy +from paddlex.cv.transforms import arrange_transforms +from paddlex.cv.datasets import generate_minibatch +from .base import BaseAPI +from collections import OrderedDict +from .utils.detection_eval import eval_results, bbox2out + + +class PPYOLO(BaseAPI): + """构建PPYOLO,并实现其训练、评估、预测和模型导出。 + + Args: + num_classes (int): 类别数。默认为80。 + backbone (str): PPYOLO的backbone网络,取值范围为['ResNet50_vd']。默认为'ResNet50_vd'。 + anchors (list|tuple): anchor框的宽度和高度,为None时表示使用默认值 + [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]]。 + anchor_masks (list|tuple): 在计算PPYOLO损失时,使用anchor的mask索引,为None时表示使用默认值 + [[6, 7, 8], [3, 4, 5], [0, 1, 2]]。 + ignore_threshold (float): 在计算PPYOLO损失时,IoU大于`ignore_threshold`的预测框的置信度被忽略。默认为0.7。 + nms_score_threshold (float): 检测框的置信度得分阈值,置信度得分低于阈值的框应该被忽略。默认为0.01。 + nms_topk (int): 进行NMS时,根据置信度保留的最大检测框数。默认为1000。 + nms_keep_topk (int): 进行NMS后,每个图像要保留的总检测框数。默认为100。 + nms_iou_threshold (float): 进行NMS时,用于剔除检测框IOU的阈值。默认为0.45。 + label_smooth (bool): 是否使用label smooth。默认值为False。 + train_random_shapes (list|tuple): 训练时从列表中随机选择图像大小。默认值为[320, 352, 384, 416, 448, 480, 512, 544, 576, 608]。 + """ + + def __init__( + self, + num_classes=80, + backbone='ResNet50_vd', + with_dcn_v2=True, + # YOLO Head + anchors=None, + anchor_masks=None, + use_coord_conv=True, + use_iou_aware=True, + use_spp=True, + use_drop_block=True, + scale_x_y=1.05, + # PPYOLO Loss + ignore_threshold=0.7, + label_smooth=False, + use_iou_loss=True, + # NMS + use_matrix_nms=True, + nms_score_threshold=0.01, + nms_topk=1000, + nms_keep_topk=100, + nms_iou_threshold=0.45, + train_random_shapes=[ + 320, 352, 384, 416, 448, 480, 512, 544, 576, 608 + ]): + self.init_params = locals() + super(PPYOLO, self).__init__('detector') + backbones = ['ResNet50_vd'] + assert backbone in backbones, "backbone should be one of {}".format( + backbones) + self.backbone = backbone + self.num_classes = num_classes + self.anchors = anchors + self.anchor_masks = anchor_masks + if anchors is None: + self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]] + if anchor_masks is None: + self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + 
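When `anchors` and `anchor_masks` are left as `None`, the constructor falls back to the nine standard COCO anchors above and groups them three per detection head. A short illustration of how each mask selects the anchors used by one output level (downsample ratios 32/16/8, matching the `GenerateYoloTarget` configuration later in this file):

```python
anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
           [59, 119], [116, 90], [156, 198], [373, 326]]
anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
downsample_ratios = [32, 16, 8]

# Each head predicts with the three anchors its mask points to:
for ratio, mask in zip(downsample_ratios, anchor_masks):
    print(ratio, [anchors[i] for i in mask])
# 32 [[116, 90], [156, 198], [373, 326]]   largest anchors on the coarsest map
# 16 [[30, 61], [62, 45], [59, 119]]
# 8  [[10, 13], [16, 30], [33, 23]]        smallest anchors on the finest map
```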
self.ignore_threshold = ignore_threshold + self.nms_score_threshold = nms_score_threshold + self.nms_topk = nms_topk + self.nms_keep_topk = nms_keep_topk + self.nms_iou_threshold = nms_iou_threshold + self.label_smooth = label_smooth + self.sync_bn = True + self.train_random_shapes = train_random_shapes + self.fixed_input_shape = None + self.use_fine_grained_loss = False + if use_coord_conv or use_iou_aware or use_spp or use_drop_block or use_iou_loss: + self.use_fine_grained_loss = True + self.use_coord_conv = use_coord_conv + self.use_iou_aware = use_iou_aware + self.use_spp = use_spp + self.use_drop_block = use_drop_block + self.use_iou_loss = use_iou_loss + self.scale_x_y = scale_x_y + self.max_height = 608 + self.max_width = 608 + self.use_matrix_nms = use_matrix_nms + self.use_ema = False + self.with_dcn_v2 = with_dcn_v2 + + def _get_backbone(self, backbone_name): + if backbone_name == 'ResNet50_vd': + backbone = paddlex.cv.nets.ResNet( + norm_type='sync_bn', + layers=50, + freeze_norm=False, + norm_decay=0., + feature_maps=[3, 4, 5], + freeze_at=0, + variant='d', + dcn_v2_stages=[5] if self.with_dcn_v2 else []) + return backbone + + def build_net(self, mode='train'): + model = paddlex.cv.nets.detection.YOLOv3( + backbone=self._get_backbone(self.backbone), + num_classes=self.num_classes, + mode=mode, + anchors=self.anchors, + anchor_masks=self.anchor_masks, + ignore_threshold=self.ignore_threshold, + label_smooth=self.label_smooth, + nms_score_threshold=self.nms_score_threshold, + nms_topk=self.nms_topk, + nms_keep_topk=self.nms_keep_topk, + nms_iou_threshold=self.nms_iou_threshold, + fixed_input_shape=self.fixed_input_shape, + coord_conv=self.use_coord_conv, + iou_aware=self.use_iou_aware, + scale_x_y=self.scale_x_y, + spp=self.use_spp, + drop_block=self.use_drop_block, + use_matrix_nms=self.use_matrix_nms, + use_fine_grained_loss=self.use_fine_grained_loss, + use_iou_loss=self.use_iou_loss, + batch_size=self.batch_size_per_gpu + if hasattr(self, 'batch_size_per_gpu') else 8) + if mode == 'train' and self.use_iou_loss or self.use_iou_aware: + model.max_height = self.max_height + model.max_width = self.max_width + inputs = model.generate_inputs() + model_out = model.build_net(inputs) + outputs = OrderedDict([('bbox', model_out)]) + if mode == 'train': + self.optimizer.minimize(model_out) + outputs = OrderedDict([('loss', model_out)]) + if self.use_ema: + global_steps = _decay_step_counter() + self.ema = ExponentialMovingAverage( + self.ema_decay, thres_steps=global_steps) + self.ema.update() + return inputs, outputs + + def default_optimizer(self, learning_rate, warmup_steps, warmup_start_lr, + lr_decay_epochs, lr_decay_gamma, + num_steps_each_epoch): + if warmup_steps > lr_decay_epochs[0] * num_steps_each_epoch: + logging.error( + "In function train(), parameters should satisfy: warmup_steps <= lr_decay_epochs[0]*num_samples_in_train_dataset", + exit=False) + logging.error( + "See this doc for more information: https://github.com/PaddlePaddle/PaddleX/blob/develop/docs/appendix/parameters.md#notice", + exit=False) + logging.error( + "warmup_steps should less than {} or lr_decay_epochs[0] greater than {}, please modify 'lr_decay_epochs' or 'warmup_steps' in train function". 
+ format(lr_decay_epochs[0] * num_steps_each_epoch, warmup_steps + // num_steps_each_epoch)) + boundaries = [b * num_steps_each_epoch for b in lr_decay_epochs] + values = [(lr_decay_gamma**i) * learning_rate + for i in range(len(lr_decay_epochs) + 1)] + lr_decay = fluid.layers.piecewise_decay( + boundaries=boundaries, values=values) + lr_warmup = fluid.layers.linear_lr_warmup( + learning_rate=lr_decay, + warmup_steps=warmup_steps, + start_lr=warmup_start_lr, + end_lr=learning_rate) + optimizer = fluid.optimizer.Momentum( + learning_rate=lr_warmup, + momentum=0.9, + regularization=fluid.regularizer.L2DecayRegularizer(5e-04)) + return optimizer + + def train(self, + num_epochs, + train_dataset, + train_batch_size=8, + eval_dataset=None, + save_interval_epochs=20, + log_interval_steps=2, + save_dir='output', + pretrain_weights='IMAGENET', + optimizer=None, + learning_rate=1.0 / 8000, + warmup_steps=1000, + warmup_start_lr=0.0, + lr_decay_epochs=[213, 240], + lr_decay_gamma=0.1, + metric=None, + use_vdl=False, + sensitivities_file=None, + eval_metric_loss=0.05, + early_stop=False, + early_stop_patience=5, + resume_checkpoint=None, + use_ema=True, + ema_decay=0.9998): + """训练。 + + Args: + num_epochs (int): 训练迭代轮数。 + train_dataset (paddlex.datasets): 训练数据读取器。 + train_batch_size (int): 训练数据batch大小。目前检测仅支持单卡评估,训练数据batch大小与显卡 + 数量之商为验证数据batch大小。默认值为8。 + eval_dataset (paddlex.datasets): 验证数据读取器。 + save_interval_epochs (int): 模型保存间隔(单位:迭代轮数)。默认为20。 + log_interval_steps (int): 训练日志输出间隔(单位:迭代次数)。默认为10。 + save_dir (str): 模型保存路径。默认值为'output'。 + pretrain_weights (str): 若指定为路径时,则加载路径下预训练模型;若为字符串'IMAGENET', + 则自动下载在ImageNet图片数据上预训练的模型权重;若为字符串'COCO', + 则自动下载在COCO数据集上预训练的模型权重;若为None,则不使用预训练模型。默认为'IMAGENET'。 + optimizer (paddle.fluid.optimizer): 优化器。当该参数为None时,使用默认优化器: + fluid.layers.piecewise_decay衰减策略,fluid.optimizer.Momentum优化方法。 + learning_rate (float): 默认优化器的学习率。默认为1.0/8000。 + warmup_steps (int): 默认优化器进行warmup过程的步数。默认为1000。 + warmup_start_lr (int): 默认优化器warmup的起始学习率。默认为0.0。 + lr_decay_epochs (list): 默认优化器的学习率衰减轮数。默认为[213, 240]。 + lr_decay_gamma (float): 默认优化器的学习率衰减率。默认为0.1。 + metric (bool): 训练过程中评估的方式,取值范围为['COCO', 'VOC']。默认值为None。 + use_vdl (bool): 是否使用VisualDL进行可视化。默认值为False。 + sensitivities_file (str): 若指定为路径时,则加载路径下敏感度信息进行裁剪;若为字符串'DEFAULT', + 则自动下载在ImageNet图片数据上获得的敏感度信息进行裁剪;若为None,则不进行裁剪。默认为None。 + eval_metric_loss (float): 可容忍的精度损失。默认为0.05。 + early_stop (bool): 是否使用提前终止训练策略。默认值为False。 + early_stop_patience (int): 当使用提前终止训练策略时,如果验证集精度在`early_stop_patience`个epoch内 + 连续下降或持平,则终止训练。默认值为5。 + resume_checkpoint (str): 恢复训练时指定上次训练保存的模型路径。若为None,则不会恢复训练。默认值为None。 + + Raises: + ValueError: 评估类型不在指定列表中。 + ValueError: 模型从inference model进行加载。 + """ + if not self.trainable: + raise ValueError("Model is not trainable from load_model method.") + if metric is None: + if isinstance(train_dataset, paddlex.datasets.CocoDetection): + metric = 'COCO' + elif isinstance(train_dataset, paddlex.datasets.VOCDetection) or \ + isinstance(train_dataset, paddlex.datasets.EasyDataDet): + metric = 'VOC' + else: + raise ValueError( + "train_dataset should be datasets.VOCDetection or datasets.COCODetection or datasets.EasyDataDet." 
+ ) + assert metric in ['COCO', 'VOC'], "Metric only support 'VOC' or 'COCO'" + self.metric = metric + + self.labels = train_dataset.labels + # 构建训练网络 + if optimizer is None: + # 构建默认的优化策略 + num_steps_each_epoch = train_dataset.num_samples // train_batch_size + optimizer = self.default_optimizer( + learning_rate=learning_rate, + warmup_steps=warmup_steps, + warmup_start_lr=warmup_start_lr, + lr_decay_epochs=lr_decay_epochs, + lr_decay_gamma=lr_decay_gamma, + num_steps_each_epoch=num_steps_each_epoch) + self.optimizer = optimizer + self.use_ema = use_ema + self.ema_decay = ema_decay + + self.batch_size_per_gpu = int(train_batch_size / + paddlex.env_info['num']) + if self.use_fine_grained_loss: + for transform in train_dataset.transforms.transforms: + if isinstance(transform, paddlex.det.transforms.Resize): + self.max_height = transform.target_size + self.max_width = transform.target_size + break + if train_dataset.transforms.batch_transforms is None: + train_dataset.transforms.batch_transforms = list() + define_random_shape = False + for bt in train_dataset.transforms.batch_transforms: + if isinstance(bt, paddlex.det.transforms.BatchRandomShape): + define_random_shape = True + if not define_random_shape: + if isinstance(self.train_random_shapes, + (list, tuple)) and len(self.train_random_shapes) > 0: + train_dataset.transforms.batch_transforms.append( + paddlex.det.transforms.BatchRandomShape( + random_shapes=self.train_random_shapes)) + if self.use_fine_grained_loss: + self.max_height = max(self.max_height, + max(self.train_random_shapes)) + self.max_width = max(self.max_width, + max(self.train_random_shapes)) + if self.use_fine_grained_loss: + define_generate_target = False + for bt in train_dataset.transforms.batch_transforms: + if isinstance(bt, paddlex.det.transforms.GenerateYoloTarget): + define_generate_target = True + if not define_generate_target: + train_dataset.transforms.batch_transforms.append( + paddlex.det.transforms.GenerateYoloTarget( + anchors=self.anchors, + anchor_masks=self.anchor_masks, + num_classes=self.num_classes, + downsample_ratios=[32, 16, 8])) + # 构建训练、验证、预测网络 + self.build_program() + # 初始化网络权重 + self.net_initialize( + startup_prog=fluid.default_startup_program(), + pretrain_weights=pretrain_weights, + save_dir=save_dir, + sensitivities_file=sensitivities_file, + eval_metric_loss=eval_metric_loss, + resume_checkpoint=resume_checkpoint) + # 训练 + self.train_loop( + num_epochs=num_epochs, + train_dataset=train_dataset, + train_batch_size=train_batch_size, + eval_dataset=eval_dataset, + save_interval_epochs=save_interval_epochs, + log_interval_steps=log_interval_steps, + save_dir=save_dir, + use_vdl=use_vdl, + early_stop=early_stop, + early_stop_patience=early_stop_patience) + + def evaluate(self, + eval_dataset, + batch_size=1, + epoch_id=None, + metric=None, + return_details=False): + """评估。 + + Args: + eval_dataset (paddlex.datasets): 验证数据读取器。 + batch_size (int): 验证数据批大小。默认为1。 + epoch_id (int): 当前评估模型所在的训练轮数。 + metric (bool): 训练过程中评估的方式,取值范围为['COCO', 'VOC']。默认为None, + 根据用户传入的Dataset自动选择,如为VOCDetection,则metric为'VOC'; + 如为COCODetection,则metric为'COCO'。 + return_details (bool): 是否返回详细信息。 + + Returns: + tuple (metrics, eval_details) | dict (metrics): 当return_details为True时,返回(metrics, eval_details), + 当return_details为False时,返回metrics。metrics为dict,包含关键字:'bbox_mmap'或者’bbox_map‘, + 分别表示平均准确率平均值在各个IoU阈值下的结果取平均值的结果(mmAP)、平均准确率平均值(mAP)。 + eval_details为dict,包含关键字:'bbox',对应元素预测结果列表,每个预测结果由图像id、 + 预测框类别id、预测框坐标、预测框得分;’gt‘:真实标注框相关信息。 + """ + arrange_transforms( + 
model_type=self.model_type, + class_name=self.__class__.__name__, + transforms=eval_dataset.transforms, + mode='eval') + if metric is None: + if hasattr(self, 'metric') and self.metric is not None: + metric = self.metric + else: + if isinstance(eval_dataset, paddlex.datasets.CocoDetection): + metric = 'COCO' + elif isinstance(eval_dataset, paddlex.datasets.VOCDetection): + metric = 'VOC' + else: + raise Exception( + "eval_dataset should be datasets.VOCDetection or datasets.COCODetection." + ) + assert metric in ['COCO', 'VOC'], "Metric only support 'VOC' or 'COCO'" + + total_steps = math.ceil(eval_dataset.num_samples * 1.0 / batch_size) + results = list() + + data_generator = eval_dataset.generator( + batch_size=batch_size, drop_last=False) + logging.info( + "Start to evaluating(total_samples={}, total_steps={})...".format( + eval_dataset.num_samples, total_steps)) + for step, data in tqdm.tqdm( + enumerate(data_generator()), total=total_steps): + images = np.array([d[0] for d in data]) + im_sizes = np.array([d[1] for d in data]) + feed_data = {'image': images, 'im_size': im_sizes} + with fluid.scope_guard(self.scope): + outputs = self.exe.run( + self.test_prog, + feed=[feed_data], + fetch_list=list(self.test_outputs.values()), + return_numpy=False) + res = { + 'bbox': (np.array(outputs[0]), + outputs[0].recursive_sequence_lengths()) + } + res_id = [np.array([d[2]]) for d in data] + res['im_id'] = (res_id, []) + if metric == 'VOC': + res_gt_box = [d[3].reshape(-1, 4) for d in data] + res_gt_label = [d[4].reshape(-1, 1) for d in data] + res_is_difficult = [d[5].reshape(-1, 1) for d in data] + res_id = [np.array([d[2]]) for d in data] + res['gt_box'] = (res_gt_box, []) + res['gt_label'] = (res_gt_label, []) + res['is_difficult'] = (res_is_difficult, []) + results.append(res) + logging.debug("[EVAL] Epoch={}, Step={}/{}".format(epoch_id, step + + 1, total_steps)) + box_ap_stats, eval_details = eval_results( + results, metric, eval_dataset.coco_gt, with_background=False) + evaluate_metrics = OrderedDict( + zip(['bbox_mmap' + if metric == 'COCO' else 'bbox_map'], box_ap_stats)) + if return_details: + return evaluate_metrics, eval_details + return evaluate_metrics + + @staticmethod + def _preprocess(images, transforms, model_type, class_name, thread_num=1): + arrange_transforms( + model_type=model_type, + class_name=class_name, + transforms=transforms, + mode='test') + pool = ThreadPool(thread_num) + batch_data = pool.map(transforms, images) + pool.close() + pool.join() + padding_batch = generate_minibatch(batch_data) + im = np.array( + [data[0] for data in padding_batch], + dtype=padding_batch[0][0].dtype) + im_size = np.array([data[1] for data in padding_batch], dtype=np.int32) + + return im, im_size + + @staticmethod + def _postprocess(res, batch_size, num_classes, labels): + clsid2catid = dict({i: i for i in range(num_classes)}) + xywh_results = bbox2out([res], clsid2catid) + preds = [[] for i in range(batch_size)] + for xywh_res in xywh_results: + image_id = xywh_res['image_id'] + del xywh_res['image_id'] + xywh_res['category'] = labels[xywh_res['category_id']] + preds[image_id].append(xywh_res) + + return preds + + def predict(self, img_file, transforms=None): + """预测。 + + Args: + img_file (str|np.ndarray): 预测图像路径,或者是解码后的排列格式为(H, W, C)且类型为float32且为BGR格式的数组。 + transforms (paddlex.det.transforms): 数据预处理操作。 + + Returns: + list: 预测结果列表,每个预测结果由预测框类别标签、 + 预测框类别名称、预测框坐标(坐标格式为[xmin, ymin, w, h])、 + 预测框得分组成。 + """ + if transforms is None and not hasattr(self, 'test_transforms'): + raise 
Exception("transforms need to be defined, now is None.") + if isinstance(img_file, (str, np.ndarray)): + images = [img_file] + else: + raise Exception("img_file must be str/np.ndarray") + + if transforms is None: + transforms = self.test_transforms + im, im_size = PPYOLO._preprocess(images, transforms, self.model_type, + self.__class__.__name__) + + with fluid.scope_guard(self.scope): + result = self.exe.run(self.test_prog, + feed={'image': im, + 'im_size': im_size}, + fetch_list=list(self.test_outputs.values()), + return_numpy=False, + use_program_cache=True) + + res = { + k: (np.array(v), v.recursive_sequence_lengths()) + for k, v in zip(list(self.test_outputs.keys()), result) + } + res['im_id'] = (np.array( + [[i] for i in range(len(images))]).astype('int32'), [[]]) + preds = PPYOLO._postprocess(res, + len(images), self.num_classes, self.labels) + return preds[0] + + def batch_predict(self, img_file_list, transforms=None, thread_num=2): + """预测。 + + Args: + img_file_list (list|tuple): 对列表(或元组)中的图像同时进行预测,列表中的元素可以是图像路径,也可以是解码后的排列格式为(H,W,C) + 且类型为float32且为BGR格式的数组。 + transforms (paddlex.det.transforms): 数据预处理操作。 + thread_num (int): 并发执行各图像预处理时的线程数。 + Returns: + list: 每个元素都为列表,表示各图像的预测结果。在各图像的预测结果列表中,每个预测结果由预测框类别标签、 + 预测框类别名称、预测框坐标(坐标格式为[xmin, ymin, w, h])、 + 预测框得分组成。 + """ + if transforms is None and not hasattr(self, 'test_transforms'): + raise Exception("transforms need to be defined, now is None.") + + if not isinstance(img_file_list, (list, tuple)): + raise Exception("im_file must be list/tuple") + + if transforms is None: + transforms = self.test_transforms + im, im_size = PPYOLO._preprocess(img_file_list, transforms, + self.model_type, + self.__class__.__name__, thread_num) + + with fluid.scope_guard(self.scope): + result = self.exe.run(self.test_prog, + feed={'image': im, + 'im_size': im_size}, + fetch_list=list(self.test_outputs.values()), + return_numpy=False, + use_program_cache=True) + + res = { + k: (np.array(v), v.recursive_sequence_lengths()) + for k, v in zip(list(self.test_outputs.keys()), result) + } + res['im_id'] = (np.array( + [[i] for i in range(len(img_file_list))]).astype('int32'), [[]]) + preds = PPYOLO._postprocess(res, + len(img_file_list), self.num_classes, + self.labels) + return preds diff --git a/paddlex/cv/models/yolo_v3.py b/paddlex/cv/models/yolo_v3.py index c324cf55ed165268c6f7880aae0487412e7f8b3c..76ce6922fa9f29d0fcf2ccc4500c8884e1fb33d6 100644 --- a/paddlex/cv/models/yolo_v3.py +++ b/paddlex/cv/models/yolo_v3.py @@ -15,21 +15,11 @@ from __future__ import absolute_import import math import tqdm -import os.path as osp -import numpy as np -from multiprocessing.pool import ThreadPool -import paddle.fluid as fluid -import paddlex.utils.logging as logging import paddlex -import copy -from paddlex.cv.transforms import arrange_transforms -from paddlex.cv.datasets import generate_minibatch -from .base import BaseAPI -from collections import OrderedDict -from .utils.detection_eval import eval_results, bbox2out +from .ppyolo import PPYOLO -class YOLOv3(BaseAPI): +class YOLOv3(PPYOLO): """构建YOLOv3,并实现其训练、评估、预测和模型导出。 Args: @@ -65,12 +55,12 @@ class YOLOv3(BaseAPI): 320, 352, 384, 416, 448, 480, 512, 544, 576, 608 ]): self.init_params = locals() - super(YOLOv3, self).__init__('detector') backbones = [ 'DarkNet53', 'ResNet34', 'MobileNetV1', 'MobileNetV3_large' ] assert backbone in backbones, "backbone should be one of {}".format( backbones) + super(YOLOv3, self).__init__('detector') self.backbone = backbone self.num_classes = num_classes self.anchors = anchors @@ 
-84,6 +74,16 @@ class YOLOv3(BaseAPI): self.sync_bn = True self.train_random_shapes = train_random_shapes self.fixed_input_shape = None + self.use_fine_grained_loss = False + self.use_coord_conv = False + self.use_iou_aware = False + self.use_spp = False + self.use_drop_block = False + self.use_iou_loss = False + self.scale_x_y = 1. + self.use_matrix_nms = False + self.use_ema = False + self.with_dcn_v2 = False def _get_backbone(self, backbone_name): if backbone_name == 'DarkNet53': @@ -104,59 +104,6 @@ class YOLOv3(BaseAPI): norm_type='sync_bn', model_name=model_name) return backbone - def build_net(self, mode='train'): - model = paddlex.cv.nets.detection.YOLOv3( - backbone=self._get_backbone(self.backbone), - num_classes=self.num_classes, - mode=mode, - anchors=self.anchors, - anchor_masks=self.anchor_masks, - ignore_threshold=self.ignore_threshold, - label_smooth=self.label_smooth, - nms_score_threshold=self.nms_score_threshold, - nms_topk=self.nms_topk, - nms_keep_topk=self.nms_keep_topk, - nms_iou_threshold=self.nms_iou_threshold, - train_random_shapes=self.train_random_shapes, - fixed_input_shape=self.fixed_input_shape) - inputs = model.generate_inputs() - model_out = model.build_net(inputs) - outputs = OrderedDict([('bbox', model_out)]) - if mode == 'train': - self.optimizer.minimize(model_out) - outputs = OrderedDict([('loss', model_out)]) - return inputs, outputs - - def default_optimizer(self, learning_rate, warmup_steps, warmup_start_lr, - lr_decay_epochs, lr_decay_gamma, - num_steps_each_epoch): - if warmup_steps > lr_decay_epochs[0] * num_steps_each_epoch: - logging.error( - "In function train(), parameters should satisfy: warmup_steps <= lr_decay_epochs[0]*num_samples_in_train_dataset", - exit=False) - logging.error( - "See this doc for more information: https://github.com/PaddlePaddle/PaddleX/blob/develop/docs/appendix/parameters.md#notice", - exit=False) - logging.error( - "warmup_steps should less than {} or lr_decay_epochs[0] greater than {}, please modify 'lr_decay_epochs' or 'warmup_steps' in train function". - format(lr_decay_epochs[0] * num_steps_each_epoch, warmup_steps - // num_steps_each_epoch)) - boundaries = [b * num_steps_each_epoch for b in lr_decay_epochs] - values = [(lr_decay_gamma**i) * learning_rate - for i in range(len(lr_decay_epochs) + 1)] - lr_decay = fluid.layers.piecewise_decay( - boundaries=boundaries, values=values) - lr_warmup = fluid.layers.linear_lr_warmup( - learning_rate=lr_decay, - warmup_steps=warmup_steps, - start_lr=warmup_start_lr, - end_lr=learning_rate) - optimizer = fluid.optimizer.Momentum( - learning_rate=lr_warmup, - momentum=0.9, - regularization=fluid.regularizer.L2DecayRegularizer(5e-04)) - return optimizer - def train(self, num_epochs, train_dataset, @@ -214,259 +161,11 @@ class YOLOv3(BaseAPI): ValueError: 评估类型不在指定列表中。 ValueError: 模型从inference model进行加载。 """ - if not self.trainable: - raise ValueError("Model is not trainable from load_model method.") - if metric is None: - if isinstance(train_dataset, paddlex.datasets.CocoDetection): - metric = 'COCO' - elif isinstance(train_dataset, paddlex.datasets.VOCDetection) or \ - isinstance(train_dataset, paddlex.datasets.EasyDataDet): - metric = 'VOC' - else: - raise ValueError( - "train_dataset should be datasets.VOCDetection or datasets.COCODetection or datasets.EasyDataDet." 
- ) - assert metric in ['COCO', 'VOC'], "Metric only support 'VOC' or 'COCO'" - self.metric = metric - - self.labels = train_dataset.labels - # 构建训练网络 - if optimizer is None: - # 构建默认的优化策略 - num_steps_each_epoch = train_dataset.num_samples // train_batch_size - optimizer = self.default_optimizer( - learning_rate=learning_rate, - warmup_steps=warmup_steps, - warmup_start_lr=warmup_start_lr, - lr_decay_epochs=lr_decay_epochs, - lr_decay_gamma=lr_decay_gamma, - num_steps_each_epoch=num_steps_each_epoch) - self.optimizer = optimizer - # 构建训练、验证、预测网络 - self.build_program() - # 初始化网络权重 - self.net_initialize( - startup_prog=fluid.default_startup_program(), - pretrain_weights=pretrain_weights, - save_dir=save_dir, - sensitivities_file=sensitivities_file, - eval_metric_loss=eval_metric_loss, - resume_checkpoint=resume_checkpoint) - # 训练 - self.train_loop( - num_epochs=num_epochs, - train_dataset=train_dataset, - train_batch_size=train_batch_size, - eval_dataset=eval_dataset, - save_interval_epochs=save_interval_epochs, - log_interval_steps=log_interval_steps, - save_dir=save_dir, - use_vdl=use_vdl, - early_stop=early_stop, - early_stop_patience=early_stop_patience) - - def evaluate(self, - eval_dataset, - batch_size=1, - epoch_id=None, - metric=None, - return_details=False): - """评估。 - - Args: - eval_dataset (paddlex.datasets): 验证数据读取器。 - batch_size (int): 验证数据批大小。默认为1。 - epoch_id (int): 当前评估模型所在的训练轮数。 - metric (bool): 训练过程中评估的方式,取值范围为['COCO', 'VOC']。默认为None, - 根据用户传入的Dataset自动选择,如为VOCDetection,则metric为'VOC'; - 如为COCODetection,则metric为'COCO'。 - return_details (bool): 是否返回详细信息。 - - Returns: - tuple (metrics, eval_details) | dict (metrics): 当return_details为True时,返回(metrics, eval_details), - 当return_details为False时,返回metrics。metrics为dict,包含关键字:'bbox_mmap'或者’bbox_map‘, - 分别表示平均准确率平均值在各个IoU阈值下的结果取平均值的结果(mmAP)、平均准确率平均值(mAP)。 - eval_details为dict,包含关键字:'bbox',对应元素预测结果列表,每个预测结果由图像id、 - 预测框类别id、预测框坐标、预测框得分;’gt‘:真实标注框相关信息。 - """ - arrange_transforms( - model_type=self.model_type, - class_name=self.__class__.__name__, - transforms=eval_dataset.transforms, - mode='eval') - if metric is None: - if hasattr(self, 'metric') and self.metric is not None: - metric = self.metric - else: - if isinstance(eval_dataset, paddlex.datasets.CocoDetection): - metric = 'COCO' - elif isinstance(eval_dataset, paddlex.datasets.VOCDetection): - metric = 'VOC' - else: - raise Exception( - "eval_dataset should be datasets.VOCDetection or datasets.COCODetection." 
- ) - assert metric in ['COCO', 'VOC'], "Metric only support 'VOC' or 'COCO'" - - total_steps = math.ceil(eval_dataset.num_samples * 1.0 / batch_size) - results = list() - - data_generator = eval_dataset.generator( - batch_size=batch_size, drop_last=False) - logging.info( - "Start to evaluating(total_samples={}, total_steps={})...".format( - eval_dataset.num_samples, total_steps)) - for step, data in tqdm.tqdm( - enumerate(data_generator()), total=total_steps): - images = np.array([d[0] for d in data]) - im_sizes = np.array([d[1] for d in data]) - feed_data = {'image': images, 'im_size': im_sizes} - with fluid.scope_guard(self.scope): - outputs = self.exe.run( - self.test_prog, - feed=[feed_data], - fetch_list=list(self.test_outputs.values()), - return_numpy=False) - res = { - 'bbox': (np.array(outputs[0]), - outputs[0].recursive_sequence_lengths()) - } - res_id = [np.array([d[2]]) for d in data] - res['im_id'] = (res_id, []) - if metric == 'VOC': - res_gt_box = [d[3].reshape(-1, 4) for d in data] - res_gt_label = [d[4].reshape(-1, 1) for d in data] - res_is_difficult = [d[5].reshape(-1, 1) for d in data] - res_id = [np.array([d[2]]) for d in data] - res['gt_box'] = (res_gt_box, []) - res['gt_label'] = (res_gt_label, []) - res['is_difficult'] = (res_is_difficult, []) - results.append(res) - logging.debug("[EVAL] Epoch={}, Step={}/{}".format(epoch_id, step + - 1, total_steps)) - box_ap_stats, eval_details = eval_results( - results, metric, eval_dataset.coco_gt, with_background=False) - evaluate_metrics = OrderedDict( - zip(['bbox_mmap' - if metric == 'COCO' else 'bbox_map'], box_ap_stats)) - if return_details: - return evaluate_metrics, eval_details - return evaluate_metrics - - @staticmethod - def _preprocess(images, transforms, model_type, class_name, thread_num=1): - arrange_transforms( - model_type=model_type, - class_name=class_name, - transforms=transforms, - mode='test') - pool = ThreadPool(thread_num) - batch_data = pool.map(transforms, images) - pool.close() - pool.join() - padding_batch = generate_minibatch(batch_data) - im = np.array( - [data[0] for data in padding_batch], - dtype=padding_batch[0][0].dtype) - im_size = np.array([data[1] for data in padding_batch], dtype=np.int32) - - return im, im_size - - @staticmethod - def _postprocess(res, batch_size, num_classes, labels): - clsid2catid = dict({i: i for i in range(num_classes)}) - xywh_results = bbox2out([res], clsid2catid) - preds = [[] for i in range(batch_size)] - for xywh_res in xywh_results: - image_id = xywh_res['image_id'] - del xywh_res['image_id'] - xywh_res['category'] = labels[xywh_res['category_id']] - preds[image_id].append(xywh_res) - - return preds - - def predict(self, img_file, transforms=None): - """预测。 - - Args: - img_file (str|np.ndarray): 预测图像路径,或者是解码后的排列格式为(H, W, C)且类型为float32且为BGR格式的数组。 - transforms (paddlex.det.transforms): 数据预处理操作。 - - Returns: - list: 预测结果列表,每个预测结果由预测框类别标签、 - 预测框类别名称、预测框坐标(坐标格式为[xmin, ymin, w, h])、 - 预测框得分组成。 - """ - if transforms is None and not hasattr(self, 'test_transforms'): - raise Exception("transforms need to be defined, now is None.") - if isinstance(img_file, (str, np.ndarray)): - images = [img_file] - else: - raise Exception("img_file must be str/np.ndarray") - - if transforms is None: - transforms = self.test_transforms - im, im_size = YOLOv3._preprocess(images, transforms, self.model_type, - self.__class__.__name__) - - with fluid.scope_guard(self.scope): - result = self.exe.run(self.test_prog, - feed={'image': im, - 'im_size': im_size}, - 
fetch_list=list(self.test_outputs.values()), - return_numpy=False, - use_program_cache=True) - - res = { - k: (np.array(v), v.recursive_sequence_lengths()) - for k, v in zip(list(self.test_outputs.keys()), result) - } - res['im_id'] = (np.array( - [[i] for i in range(len(images))]).astype('int32'), [[]]) - preds = YOLOv3._postprocess(res, - len(images), self.num_classes, self.labels) - return preds[0] - - def batch_predict(self, img_file_list, transforms=None, thread_num=2): - """预测。 - - Args: - img_file_list (list|tuple): 对列表(或元组)中的图像同时进行预测,列表中的元素可以是图像路径,也可以是解码后的排列格式为(H,W,C) - 且类型为float32且为BGR格式的数组。 - transforms (paddlex.det.transforms): 数据预处理操作。 - thread_num (int): 并发执行各图像预处理时的线程数。 - Returns: - list: 每个元素都为列表,表示各图像的预测结果。在各图像的预测结果列表中,每个预测结果由预测框类别标签、 - 预测框类别名称、预测框坐标(坐标格式为[xmin, ymin, w, h])、 - 预测框得分组成。 - """ - if transforms is None and not hasattr(self, 'test_transforms'): - raise Exception("transforms need to be defined, now is None.") - - if not isinstance(img_file_list, (list, tuple)): - raise Exception("im_file must be list/tuple") - - if transforms is None: - transforms = self.test_transforms - im, im_size = YOLOv3._preprocess(img_file_list, transforms, - self.model_type, - self.__class__.__name__, thread_num) - - with fluid.scope_guard(self.scope): - result = self.exe.run(self.test_prog, - feed={'image': im, - 'im_size': im_size}, - fetch_list=list(self.test_outputs.values()), - return_numpy=False, - use_program_cache=True) - res = { - k: (np.array(v), v.recursive_sequence_lengths()) - for k, v in zip(list(self.test_outputs.keys()), result) - } - res['im_id'] = (np.array( - [[i] for i in range(len(img_file_list))]).astype('int32'), [[]]) - preds = YOLOv3._postprocess(res, - len(img_file_list), self.num_classes, - self.labels) - return preds + return super(YOLOv3, self).train( + num_epochs, train_dataset, train_batch_size, eval_dataset, + save_interval_epochs, log_interval_steps, save_dir, + pretrain_weights, optimizer, learning_rate, warmup_steps, + warmup_start_lr, lr_decay_epochs, lr_decay_gamma, metric, use_vdl, + sensitivities_file, eval_metric_loss, early_stop, + early_stop_patience, resume_checkpoint, False) diff --git a/paddlex/cv/nets/detection/iou_aware.py b/paddlex/cv/nets/detection/iou_aware.py new file mode 100644 index 0000000000000000000000000000000000000000..7a85a70a62c41b6a10c78cbcd1250d63cd534349 --- /dev/null +++ b/paddlex/cv/nets/detection/iou_aware.py @@ -0,0 +1,85 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
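`iou_aware.py`, added below, splits the extra IoU-prediction channels off the head output and folds them into the objectness score as `obj^(1 - factor) * iou^factor`, then maps the blended probability back to logit space so the rest of the pipeline can keep applying a sigmoid. A NumPy sketch of that rescaling for a single score (illustrative; the default `factor` here is an assumption, not a value fixed by this file):

```python
import numpy as np

def iou_aware_confidence(obj_prob, iou_prob, factor=0.4, eps=1e-7):
    """Blend objectness with the predicted IoU and return the result as a logit."""
    blended = np.power(obj_prob, 1.0 - factor) * np.power(iou_prob, factor)
    blended = np.clip(blended, eps, 1.0 - eps)
    return np.log(blended / (1.0 - blended))  # inverse sigmoid ("de-sigmoid")
```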
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import fluid + + +def _split_ioup(output, an_num, num_classes): + """ + Split new output feature map to output, predicted iou + along channel dimension + """ + ioup = fluid.layers.slice(output, axes=[1], starts=[0], ends=[an_num]) + ioup = fluid.layers.sigmoid(ioup) + + oriout = fluid.layers.slice( + output, axes=[1], starts=[an_num], ends=[an_num * (num_classes + 6)]) + + return (ioup, oriout) + + +def _de_sigmoid(x, eps=1e-7): + x = fluid.layers.clip(x, eps, 1 / eps) + one = fluid.layers.fill_constant( + shape=[1, 1, 1, 1], dtype=x.dtype, value=1.) + x = fluid.layers.clip((one / x - 1.0), eps, 1 / eps) + x = -fluid.layers.log(x) + return x + + +def _postprocess_output(ioup, output, an_num, num_classes, iou_aware_factor): + """ + post process output objectness score + """ + tensors = [] + stride = output.shape[1] // an_num + for m in range(an_num): + tensors.append( + fluid.layers.slice( + output, + axes=[1], + starts=[stride * m + 0], + ends=[stride * m + 4])) + obj = fluid.layers.slice( + output, axes=[1], starts=[stride * m + 4], ends=[stride * m + 5]) + obj = fluid.layers.sigmoid(obj) + ip = fluid.layers.slice(ioup, axes=[1], starts=[m], ends=[m + 1]) + + new_obj = fluid.layers.pow(obj, ( + 1 - iou_aware_factor)) * fluid.layers.pow(ip, iou_aware_factor) + new_obj = _de_sigmoid(new_obj) + + tensors.append(new_obj) + + tensors.append( + fluid.layers.slice( + output, + axes=[1], + starts=[stride * m + 5], + ends=[stride * m + 5 + num_classes])) + + output = fluid.layers.concat(tensors, axis=1) + + return output + + +def get_iou_aware_score(output, an_num, num_classes, iou_aware_factor): + ioup, output = _split_ioup(output, an_num, num_classes) + output = _postprocess_output(ioup, output, an_num, num_classes, + iou_aware_factor) + return output diff --git a/paddlex/cv/nets/detection/loss/iou_aware_loss.py b/paddlex/cv/nets/detection/loss/iou_aware_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..64796eb7d92543a73a053bc1349ba3806d1eea5e --- /dev/null +++ b/paddlex/cv/nets/detection/loss/iou_aware_loss.py @@ -0,0 +1,77 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
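`iou_aware_loss.py` supervises that IoU-prediction channel: the real IoU between the decoded prediction and its ground-truth box is computed, detached (`stop_gradient = True`), and used as a soft label in a cross-entropy against the predicted IoU. A scalar stand-in for that objective (written in binary form for clarity; the actual op is Paddle's soft-label `cross_entropy`):

```python
import numpy as np

def iou_aware_loss(pred_iou, target_iou, loss_weight=1.0, eps=1e-10):
    """Cross-entropy between the predicted IoU and the measured IoU target."""
    target_iou = np.clip(target_iou, 0.0, 1.0)   # constant target, no gradient
    pred_iou = np.clip(pred_iou, eps, 1.0 - eps)
    ce = -(target_iou * np.log(pred_iou)
           + (1.0 - target_iou) * np.log(1.0 - pred_iou))
    return loss_weight * ce
```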
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import numpy as np +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.initializer import NumpyArrayInitializer + +from paddle import fluid +from .iou_loss import IouLoss + + +class IouAwareLoss(IouLoss): + """ + iou aware loss, see https://arxiv.org/abs/1912.05992 + Args: + loss_weight (float): iou aware loss weight, default is 1.0 + max_height (int): max height of input to support random shape input + max_width (int): max width of input to support random shape input + """ + + def __init__(self, loss_weight=1.0, max_height=608, max_width=608): + super(IouAwareLoss, self).__init__( + loss_weight=loss_weight, + max_height=max_height, + max_width=max_width) + + def __call__(self, + ioup, + x, + y, + w, + h, + tx, + ty, + tw, + th, + anchors, + downsample_ratio, + batch_size, + scale_x_y, + eps=1.e-10): + ''' + Args: + ioup ([Variables]): the predicted iou + x | y | w | h ([Variables]): the output of yolov3 for encoded x|y|w|h + tx |ty |tw |th ([Variables]): the target of yolov3 for encoded x|y|w|h + anchors ([float]): list of anchors for current output layer + downsample_ratio (float): the downsample ratio for current output layer + batch_size (int): training batch size + eps (float): the decimal to prevent the denominator eqaul zero + ''' + + pred = self._bbox_transform(x, y, w, h, anchors, downsample_ratio, + batch_size, False, scale_x_y, eps) + gt = self._bbox_transform(tx, ty, tw, th, anchors, downsample_ratio, + batch_size, True, scale_x_y, eps) + iouk = self._iou(pred, gt, ioup, eps) + iouk.stop_gradient = True + + loss_iou_aware = fluid.layers.cross_entropy( + ioup, iouk, soft_label=True) + loss_iou_aware = loss_iou_aware * self._loss_weight + return loss_iou_aware diff --git a/paddlex/cv/nets/detection/loss/iou_loss.py b/paddlex/cv/nets/detection/loss/iou_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..da1beeaf9b5ad6be4c61c27d71bcac24e37f2b9a --- /dev/null +++ b/paddlex/cv/nets/detection/loss/iou_loss.py @@ -0,0 +1,235 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
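`iou_loss.py` implements the IoU loss described in its docstring below, `loss = 1 - iou * iou` by default, computed between the decoded predicted box and the ground-truth box. A single-box Python sketch with the same defaults (`loss_weight=2.5`, squared IoU term):

```python
def iou_loss(pred, gt, loss_weight=2.5, loss_square=True, eps=1e-10):
    """IoU loss for one pair of [x1, y1, x2, y2] boxes (illustrative sketch)."""
    ix1, iy1 = max(pred[0], gt[0]), max(pred[1], gt[1])
    ix2, iy2 = min(pred[2], gt[2]), min(pred[3], gt[3])
    inter = max(ix2 - ix1, 0.0) * max(iy2 - iy1, 0.0)
    union = ((pred[2] - pred[0]) * (pred[3] - pred[1])
             + (gt[2] - gt[0]) * (gt[3] - gt[1]) - inter + eps)
    iou = inter / union
    return loss_weight * (1.0 - iou * iou if loss_square else 1.0 - iou)
```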
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import numpy as np +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.initializer import NumpyArrayInitializer + +from paddle import fluid + + +class IouLoss(object): + """ + iou loss, see https://arxiv.org/abs/1908.03851 + loss = 1.0 - iou * iou + Args: + loss_weight (float): iou loss weight, default is 2.5 + max_height (int): max height of input to support random shape input + max_width (int): max width of input to support random shape input + ciou_term (bool): whether to add ciou_term + loss_square (bool): whether to square the iou term + """ + + def __init__(self, + loss_weight=2.5, + max_height=608, + max_width=608, + ciou_term=False, + loss_square=True): + self._loss_weight = loss_weight + self._MAX_HI = max_height + self._MAX_WI = max_width + self.ciou_term = ciou_term + self.loss_square = loss_square + + def __call__(self, + x, + y, + w, + h, + tx, + ty, + tw, + th, + anchors, + downsample_ratio, + batch_size, + scale_x_y=1., + ioup=None, + eps=1.e-10): + ''' + Args: + x | y | w | h ([Variables]): the output of yolov3 for encoded x|y|w|h + tx |ty |tw |th ([Variables]): the target of yolov3 for encoded x|y|w|h + anchors ([float]): list of anchors for current output layer + downsample_ratio (float): the downsample ratio for current output layer + batch_size (int): training batch size + eps (float): the decimal to prevent the denominator eqaul zero + ''' + pred = self._bbox_transform(x, y, w, h, anchors, downsample_ratio, + batch_size, False, scale_x_y, eps) + gt = self._bbox_transform(tx, ty, tw, th, anchors, downsample_ratio, + batch_size, True, scale_x_y, eps) + iouk = self._iou(pred, gt, ioup, eps) + if self.loss_square: + loss_iou = 1. - iouk * iouk + else: + loss_iou = 1. 
- iouk + loss_iou = loss_iou * self._loss_weight + + return loss_iou + + def _iou(self, pred, gt, ioup=None, eps=1.e-10): + x1, y1, x2, y2 = pred + x1g, y1g, x2g, y2g = gt + x2 = fluid.layers.elementwise_max(x1, x2) + y2 = fluid.layers.elementwise_max(y1, y2) + + xkis1 = fluid.layers.elementwise_max(x1, x1g) + ykis1 = fluid.layers.elementwise_max(y1, y1g) + xkis2 = fluid.layers.elementwise_min(x2, x2g) + ykis2 = fluid.layers.elementwise_min(y2, y2g) + + intsctk = (xkis2 - xkis1) * (ykis2 - ykis1) + intsctk = intsctk * fluid.layers.greater_than( + xkis2, xkis1) * fluid.layers.greater_than(ykis2, ykis1) + unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g + ) - intsctk + eps + iouk = intsctk / unionk + if self.ciou_term: + ciou = self.get_ciou_term(pred, gt, iouk, eps) + iouk = iouk - ciou + return iouk + + def get_ciou_term(self, pred, gt, iouk, eps): + x1, y1, x2, y2 = pred + x1g, y1g, x2g, y2g = gt + + cx = (x1 + x2) / 2 + cy = (y1 + y2) / 2 + w = (x2 - x1) + fluid.layers.cast((x2 - x1) == 0, 'float32') + h = (y2 - y1) + fluid.layers.cast((y2 - y1) == 0, 'float32') + + cxg = (x1g + x2g) / 2 + cyg = (y1g + y2g) / 2 + wg = x2g - x1g + hg = y2g - y1g + + # A or B + xc1 = fluid.layers.elementwise_min(x1, x1g) + yc1 = fluid.layers.elementwise_min(y1, y1g) + xc2 = fluid.layers.elementwise_max(x2, x2g) + yc2 = fluid.layers.elementwise_max(y2, y2g) + + # DIOU term + dist_intersection = (cx - cxg) * (cx - cxg) + (cy - cyg) * (cy - cyg) + dist_union = (xc2 - xc1) * (xc2 - xc1) + (yc2 - yc1) * (yc2 - yc1) + diou_term = (dist_intersection + eps) / (dist_union + eps) + # CIOU term + ciou_term = 0 + ar_gt = wg / hg + ar_pred = w / h + arctan = fluid.layers.atan(ar_gt) - fluid.layers.atan(ar_pred) + ar_loss = 4. / np.pi / np.pi * arctan * arctan + alpha = ar_loss / (1 - iouk + ar_loss + eps) + alpha.stop_gradient = True + ciou_term = alpha * ar_loss + return diou_term + ciou_term + + def _bbox_transform(self, dcx, dcy, dw, dh, anchors, downsample_ratio, + batch_size, is_gt, scale_x_y, eps): + grid_x = int(self._MAX_WI / downsample_ratio) + grid_y = int(self._MAX_HI / downsample_ratio) + an_num = len(anchors) // 2 + + shape_fmp = fluid.layers.shape(dcx) + shape_fmp.stop_gradient = True + # generate the grid_w x grid_h center of feature map + idx_i = np.array([[i for i in range(grid_x)]]) + idx_j = np.array([[j for j in range(grid_y)]]).transpose() + gi_np = np.repeat(idx_i, grid_y, axis=0) + gi_np = np.reshape(gi_np, newshape=[1, 1, grid_y, grid_x]) + gi_np = np.tile(gi_np, reps=[batch_size, an_num, 1, 1]) + gj_np = np.repeat(idx_j, grid_x, axis=1) + gj_np = np.reshape(gj_np, newshape=[1, 1, grid_y, grid_x]) + gj_np = np.tile(gj_np, reps=[batch_size, an_num, 1, 1]) + gi_max = self._create_tensor_from_numpy(gi_np.astype(np.float32)) + gi = fluid.layers.crop(x=gi_max, shape=dcx) + gi.stop_gradient = True + gj_max = self._create_tensor_from_numpy(gj_np.astype(np.float32)) + gj = fluid.layers.crop(x=gj_max, shape=dcx) + gj.stop_gradient = True + + grid_x_act = fluid.layers.cast(shape_fmp[3], dtype="float32") + grid_x_act.stop_gradient = True + grid_y_act = fluid.layers.cast(shape_fmp[2], dtype="float32") + grid_y_act.stop_gradient = True + if is_gt: + cx = fluid.layers.elementwise_add(dcx, gi) / grid_x_act + cx.gradient = True + cy = fluid.layers.elementwise_add(dcy, gj) / grid_y_act + cy.gradient = True + else: + dcx_sig = fluid.layers.sigmoid(dcx) + dcy_sig = fluid.layers.sigmoid(dcy) + if (abs(scale_x_y - 1.0) > eps): + dcx_sig = scale_x_y * dcx_sig - 0.5 * (scale_x_y - 1) + dcy_sig = scale_x_y * 
dcy_sig - 0.5 * (scale_x_y - 1) + cx = fluid.layers.elementwise_add(dcx_sig, gi) / grid_x_act + cy = fluid.layers.elementwise_add(dcy_sig, gj) / grid_y_act + + anchor_w_ = [anchors[i] for i in range(0, len(anchors)) if i % 2 == 0] + anchor_w_np = np.array(anchor_w_) + anchor_w_np = np.reshape(anchor_w_np, newshape=[1, an_num, 1, 1]) + anchor_w_np = np.tile( + anchor_w_np, reps=[batch_size, 1, grid_y, grid_x]) + anchor_w_max = self._create_tensor_from_numpy( + anchor_w_np.astype(np.float32)) + anchor_w = fluid.layers.crop(x=anchor_w_max, shape=dcx) + anchor_w.stop_gradient = True + anchor_h_ = [anchors[i] for i in range(0, len(anchors)) if i % 2 == 1] + anchor_h_np = np.array(anchor_h_) + anchor_h_np = np.reshape(anchor_h_np, newshape=[1, an_num, 1, 1]) + anchor_h_np = np.tile( + anchor_h_np, reps=[batch_size, 1, grid_y, grid_x]) + anchor_h_max = self._create_tensor_from_numpy( + anchor_h_np.astype(np.float32)) + anchor_h = fluid.layers.crop(x=anchor_h_max, shape=dcx) + anchor_h.stop_gradient = True + # e^tw e^th + exp_dw = fluid.layers.exp(dw) + exp_dh = fluid.layers.exp(dh) + pw = fluid.layers.elementwise_mul(exp_dw, anchor_w) / \ + (grid_x_act * downsample_ratio) + ph = fluid.layers.elementwise_mul(exp_dh, anchor_h) / \ + (grid_y_act * downsample_ratio) + if is_gt: + exp_dw.stop_gradient = True + exp_dh.stop_gradient = True + pw.stop_gradient = True + ph.stop_gradient = True + + x1 = cx - 0.5 * pw + y1 = cy - 0.5 * ph + x2 = cx + 0.5 * pw + y2 = cy + 0.5 * ph + if is_gt: + x1.stop_gradient = True + y1.stop_gradient = True + x2.stop_gradient = True + y2.stop_gradient = True + + return x1, y1, x2, y2 + + def _create_tensor_from_numpy(self, numpy_array): + paddle_array = fluid.layers.create_parameter( + attr=ParamAttr(), + shape=numpy_array.shape, + dtype=numpy_array.dtype, + default_initializer=NumpyArrayInitializer(numpy_array)) + paddle_array.stop_gradient = True + return paddle_array diff --git a/paddlex/cv/nets/detection/loss/yolo_loss.py b/paddlex/cv/nets/detection/loss/yolo_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..4d948600f6f7e00fd05734f64337efa06c208ab4 --- /dev/null +++ b/paddlex/cv/nets/detection/loss/yolo_loss.py @@ -0,0 +1,371 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
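`yolo_loss.py` either delegates to `fluid.layers.yolov3_loss` or computes a fine-grained loss by slicing the head output into per-anchor x, y, w, h, objectness and class channels (see `_split_output`). A NumPy sketch of that channel layout for a single image (illustrative; the anchor-major layout is an assumption consistent with the strided slicing used below):

```python
import numpy as np

def split_head_output(output, an_num=3, num_classes=80):
    """Split one [C, H, W] YOLO head map into per-anchor components.

    Per-anchor channel layout: [tx, ty, tw, th, obj, cls_0 ... cls_{K-1}].
    """
    stride = 5 + num_classes
    assert output.shape[0] == an_num * stride
    per_anchor = output.reshape(an_num, stride, output.shape[1], output.shape[2])
    x, y, w, h, obj = [per_anchor[:, i] for i in range(5)]
    cls = np.moveaxis(per_anchor[:, 5:], 1, -1)  # classes last, like the transpose in _split_output
    return x, y, w, h, obj, cls
```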
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import fluid +try: + from collections.abc import Sequence +except Exception: + from collections import Sequence + + +class YOLOv3Loss(object): + """ + Combined loss for YOLOv3 network + + Args: + batch_size (int): training batch size + ignore_thresh (float): threshold to ignore confidence loss + label_smooth (bool): whether to use label smoothing + use_fine_grained_loss (bool): whether use fine grained YOLOv3 loss + instead of fluid.layers.yolov3_loss + """ + + def __init__(self, + batch_size=8, + ignore_thresh=0.7, + label_smooth=True, + use_fine_grained_loss=False, + iou_loss=None, + iou_aware_loss=None, + downsample=[32, 16, 8], + scale_x_y=1., + match_score=False): + self._batch_size = batch_size + self._ignore_thresh = ignore_thresh + self._label_smooth = label_smooth + self._use_fine_grained_loss = use_fine_grained_loss + self._iou_loss = iou_loss + self._iou_aware_loss = iou_aware_loss + self.downsample = downsample + self.scale_x_y = scale_x_y + self.match_score = match_score + + def __call__(self, outputs, gt_box, gt_label, gt_score, targets, anchors, + anchor_masks, mask_anchors, num_classes, prefix_name): + if self._use_fine_grained_loss: + return self._get_fine_grained_loss( + outputs, targets, gt_box, self._batch_size, num_classes, + mask_anchors, self._ignore_thresh) + else: + losses = [] + for i, output in enumerate(outputs): + scale_x_y = self.scale_x_y if not isinstance( + self.scale_x_y, Sequence) else self.scale_x_y[i] + anchor_mask = anchor_masks[i] + loss = fluid.layers.yolov3_loss( + x=output, + gt_box=gt_box, + gt_label=gt_label, + gt_score=gt_score, + anchors=anchors, + anchor_mask=anchor_mask, + class_num=num_classes, + ignore_thresh=self._ignore_thresh, + downsample_ratio=self.downsample[i], + use_label_smooth=self._label_smooth, + scale_x_y=scale_x_y, + name=prefix_name + "yolo_loss" + str(i)) + + losses.append(fluid.layers.reduce_mean(loss)) + + return {'loss': sum(losses)} + + def _get_fine_grained_loss(self, + outputs, + targets, + gt_box, + batch_size, + num_classes, + mask_anchors, + ignore_thresh, + eps=1.e-10): + """ + Calculate fine grained YOLOv3 loss + + Args: + outputs ([Variables]): List of Variables, output of backbone stages + targets ([Variables]): List of Variables, The targets for yolo + loss calculatation. + gt_box (Variable): The ground-truth boudding boxes. + batch_size (int): The training batch size + num_classes (int): class num of dataset + mask_anchors ([[float]]): list of anchors in each output layer + ignore_thresh (float): prediction bbox overlap any gt_box greater + than ignore_thresh, objectness loss will + be ignored. 
+ + Returns: + Type: dict + xy_loss (Variable): YOLOv3 (x, y) coordinates loss + wh_loss (Variable): YOLOv3 (w, h) coordinates loss + obj_loss (Variable): YOLOv3 objectness score loss + cls_loss (Variable): YOLOv3 classification loss + + """ + + assert len(outputs) == len(targets), \ + "YOLOv3 output layer number not equal target number" + + loss_xys, loss_whs, loss_objs, loss_clss = [], [], [], [] + if self._iou_loss is not None: + loss_ious = [] + if self._iou_aware_loss is not None: + loss_iou_awares = [] + for i, (output, target, + anchors) in enumerate(zip(outputs, targets, mask_anchors)): + downsample = self.downsample[i] + an_num = len(anchors) // 2 + if self._iou_aware_loss is not None: + ioup, output = self._split_ioup(output, an_num, num_classes) + x, y, w, h, obj, cls = self._split_output(output, an_num, + num_classes) + tx, ty, tw, th, tscale, tobj, tcls = self._split_target(target) + + tscale_tobj = tscale * tobj + + scale_x_y = self.scale_x_y if not isinstance( + self.scale_x_y, Sequence) else self.scale_x_y[i] + + if (abs(scale_x_y - 1.0) < eps): + loss_x = fluid.layers.sigmoid_cross_entropy_with_logits( + x, tx) * tscale_tobj + loss_x = fluid.layers.reduce_sum(loss_x, dim=[1, 2, 3]) + loss_y = fluid.layers.sigmoid_cross_entropy_with_logits( + y, ty) * tscale_tobj + loss_y = fluid.layers.reduce_sum(loss_y, dim=[1, 2, 3]) + else: + dx = scale_x_y * fluid.layers.sigmoid(x) - 0.5 * (scale_x_y - + 1.0) + dy = scale_x_y * fluid.layers.sigmoid(y) - 0.5 * (scale_x_y - + 1.0) + loss_x = fluid.layers.abs(dx - tx) * tscale_tobj + loss_x = fluid.layers.reduce_sum(loss_x, dim=[1, 2, 3]) + loss_y = fluid.layers.abs(dy - ty) * tscale_tobj + loss_y = fluid.layers.reduce_sum(loss_y, dim=[1, 2, 3]) + + # NOTE: we refined loss function of (w, h) as L1Loss + loss_w = fluid.layers.abs(w - tw) * tscale_tobj + loss_w = fluid.layers.reduce_sum(loss_w, dim=[1, 2, 3]) + loss_h = fluid.layers.abs(h - th) * tscale_tobj + loss_h = fluid.layers.reduce_sum(loss_h, dim=[1, 2, 3]) + if self._iou_loss is not None: + loss_iou = self._iou_loss(x, y, w, h, tx, ty, tw, th, anchors, + downsample, self._batch_size, + scale_x_y) + loss_iou = loss_iou * tscale_tobj + loss_iou = fluid.layers.reduce_sum(loss_iou, dim=[1, 2, 3]) + loss_ious.append(fluid.layers.reduce_mean(loss_iou)) + + if self._iou_aware_loss is not None: + loss_iou_aware = self._iou_aware_loss( + ioup, x, y, w, h, tx, ty, tw, th, anchors, downsample, + self._batch_size, scale_x_y) + loss_iou_aware = loss_iou_aware * tobj + loss_iou_aware = fluid.layers.reduce_sum( + loss_iou_aware, dim=[1, 2, 3]) + loss_iou_awares.append( + fluid.layers.reduce_mean(loss_iou_aware)) + + loss_obj_pos, loss_obj_neg = self._calc_obj_loss( + output, obj, tobj, gt_box, self._batch_size, anchors, + num_classes, downsample, self._ignore_thresh, scale_x_y) + + loss_cls = fluid.layers.sigmoid_cross_entropy_with_logits(cls, + tcls) + loss_cls = fluid.layers.elementwise_mul(loss_cls, tobj, axis=0) + loss_cls = fluid.layers.reduce_sum(loss_cls, dim=[1, 2, 3, 4]) + + loss_xys.append(fluid.layers.reduce_mean(loss_x + loss_y)) + loss_whs.append(fluid.layers.reduce_mean(loss_w + loss_h)) + loss_objs.append( + fluid.layers.reduce_mean(loss_obj_pos + loss_obj_neg)) + loss_clss.append(fluid.layers.reduce_mean(loss_cls)) + + losses_all = { + "loss_xy": fluid.layers.sum(loss_xys), + "loss_wh": fluid.layers.sum(loss_whs), + "loss_obj": fluid.layers.sum(loss_objs), + "loss_cls": fluid.layers.sum(loss_clss), + } + if self._iou_loss is not None: + losses_all["loss_iou"] = 
fluid.layers.sum(loss_ious) + if self._iou_aware_loss is not None: + losses_all["loss_iou_aware"] = fluid.layers.sum(loss_iou_awares) + return losses_all + + def _split_ioup(self, output, an_num, num_classes): + """ + Split output feature map to output, predicted iou + along channel dimension + """ + ioup = fluid.layers.slice(output, axes=[1], starts=[0], ends=[an_num]) + ioup = fluid.layers.sigmoid(ioup) + oriout = fluid.layers.slice( + output, + axes=[1], + starts=[an_num], + ends=[an_num * (num_classes + 6)]) + return (ioup, oriout) + + def _split_output(self, output, an_num, num_classes): + """ + Split output feature map to x, y, w, h, objectness, classification + along channel dimension + """ + x = fluid.layers.strided_slice( + output, + axes=[1], + starts=[0], + ends=[output.shape[1]], + strides=[5 + num_classes]) + y = fluid.layers.strided_slice( + output, + axes=[1], + starts=[1], + ends=[output.shape[1]], + strides=[5 + num_classes]) + w = fluid.layers.strided_slice( + output, + axes=[1], + starts=[2], + ends=[output.shape[1]], + strides=[5 + num_classes]) + h = fluid.layers.strided_slice( + output, + axes=[1], + starts=[3], + ends=[output.shape[1]], + strides=[5 + num_classes]) + obj = fluid.layers.strided_slice( + output, + axes=[1], + starts=[4], + ends=[output.shape[1]], + strides=[5 + num_classes]) + clss = [] + stride = output.shape[1] // an_num + for m in range(an_num): + clss.append( + fluid.layers.slice( + output, + axes=[1], + starts=[stride * m + 5], + ends=[stride * m + 5 + num_classes])) + cls = fluid.layers.transpose( + fluid.layers.stack( + clss, axis=1), perm=[0, 1, 3, 4, 2]) + + return (x, y, w, h, obj, cls) + + def _split_target(self, target): + """ + split target to x, y, w, h, objectness, classification + along dimension 2 + + target is in shape [N, an_num, 6 + class_num, H, W] + """ + tx = target[:, :, 0, :, :] + ty = target[:, :, 1, :, :] + tw = target[:, :, 2, :, :] + th = target[:, :, 3, :, :] + + tscale = target[:, :, 4, :, :] + tobj = target[:, :, 5, :, :] + + tcls = fluid.layers.transpose( + target[:, :, 6:, :, :], perm=[0, 1, 3, 4, 2]) + tcls.stop_gradient = True + + return (tx, ty, tw, th, tscale, tobj, tcls) + + def _calc_obj_loss(self, output, obj, tobj, gt_box, batch_size, anchors, + num_classes, downsample, ignore_thresh, scale_x_y): + # A prediction bbox overlap any gt_bbox over ignore_thresh, + # objectness loss will be ignored, process as follows: + + # 1. get pred bbox, which is same with YOLOv3 infer mode, use yolo_box here + # NOTE: img_size is set as 1.0 to get noramlized pred bbox + bbox, prob = fluid.layers.yolo_box( + x=output, + img_size=fluid.layers.ones( + shape=[batch_size, 2], dtype="int32"), + anchors=anchors, + class_num=num_classes, + conf_thresh=0., + downsample_ratio=downsample, + clip_bbox=False, + scale_x_y=scale_x_y) + + # 2. 
split pred bbox and gt bbox by sample, calculate IoU between pred bbox + # and gt bbox in each sample + if batch_size > 1: + preds = fluid.layers.split(bbox, batch_size, dim=0) + gts = fluid.layers.split(gt_box, batch_size, dim=0) + else: + preds = [bbox] + gts = [gt_box] + probs = [prob] + ious = [] + for pred, gt in zip(preds, gts): + + def box_xywh2xyxy(box): + x = box[:, 0] + y = box[:, 1] + w = box[:, 2] + h = box[:, 3] + return fluid.layers.stack( + [ + x - w / 2., + y - h / 2., + x + w / 2., + y + h / 2., + ], axis=1) + + pred = fluid.layers.squeeze(pred, axes=[0]) + gt = box_xywh2xyxy(fluid.layers.squeeze(gt, axes=[0])) + ious.append(fluid.layers.iou_similarity(pred, gt)) + + iou = fluid.layers.stack(ious, axis=0) + # 3. Get iou_mask by IoU between gt bbox and prediction bbox, + # Get obj_mask by tobj(holds gt_score), calculate objectness loss + + max_iou = fluid.layers.reduce_max(iou, dim=-1) + iou_mask = fluid.layers.cast(max_iou <= ignore_thresh, dtype="float32") + if self.match_score: + max_prob = fluid.layers.reduce_max(prob, dim=-1) + iou_mask = iou_mask * fluid.layers.cast( + max_prob <= 0.25, dtype="float32") + output_shape = fluid.layers.shape(output) + an_num = len(anchors) // 2 + iou_mask = fluid.layers.reshape(iou_mask, (-1, an_num, output_shape[2], + output_shape[3])) + iou_mask.stop_gradient = True + + # NOTE: tobj holds gt_score, obj_mask holds object existence mask + obj_mask = fluid.layers.cast(tobj > 0., dtype="float32") + obj_mask.stop_gradient = True + + # For positive objectness grids, objectness loss should be calculated + # For negative objectness grids, objectness loss is calculated only iou_mask == 1.0 + loss_obj = fluid.layers.sigmoid_cross_entropy_with_logits(obj, + obj_mask) + loss_obj_pos = fluid.layers.reduce_sum(loss_obj * tobj, dim=[1, 2, 3]) + loss_obj_neg = fluid.layers.reduce_sum( + loss_obj * (1.0 - obj_mask) * iou_mask, dim=[1, 2, 3]) + + return loss_obj_pos, loss_obj_neg diff --git a/paddlex/cv/nets/detection/ops.py b/paddlex/cv/nets/detection/ops.py new file mode 100644 index 0000000000000000000000000000000000000000..b1ff6823092f52d8f595bc7a49db3dde2d447c7a --- /dev/null +++ b/paddlex/cv/nets/detection/ops.py @@ -0,0 +1,270 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
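(Editor's aside, not part of the patch.) `_calc_obj_loss` above skips the background objectness penalty for predictions whose best IoU with any ground-truth box exceeds ignore_thresh, while positive cells stay weighted by the ground-truth score held in tobj. A minimal NumPy sketch of that masking over a flat set of predictions; the helper name, shapes and values are illustrative assumptions.

import numpy as np

def obj_loss_masks(max_iou_per_pred, gt_score_per_pred, ignore_thresh=0.7):
    # iou_mask: 1 for predictions whose best IoU with any gt box is small
    # enough that they are still penalized as background
    iou_mask = (max_iou_per_pred <= ignore_thresh).astype(np.float32)
    # obj_mask: 1 where a ground-truth object was assigned (tobj > 0)
    obj_mask = (gt_score_per_pred > 0.).astype(np.float32)
    # positives always contribute, weighted by gt_score (held in tobj);
    # negatives contribute only where iou_mask == 1
    pos_weight = gt_score_per_pred
    neg_weight = (1.0 - obj_mask) * iou_mask
    return pos_weight, neg_weight

pos_w, neg_w = obj_loss_masks(np.array([0.9, 0.3, 0.1]),
                              np.array([1.0, 0.0, 0.0]))
print(pos_w, neg_w)  # [1. 0. 0.] [0. 1. 1.]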
+ +import numpy as np +from numbers import Integral +import math +import six + +import paddle +from paddle import fluid + + +def DropBlock(input, block_size, keep_prob, is_test): + if is_test: + return input + + def CalculateGamma(input, block_size, keep_prob): + input_shape = fluid.layers.shape(input) + feat_shape_tmp = fluid.layers.slice(input_shape, [0], [3], [4]) + feat_shape_tmp = fluid.layers.cast(feat_shape_tmp, dtype="float32") + feat_shape_t = fluid.layers.reshape(feat_shape_tmp, [1, 1, 1, 1]) + feat_area = fluid.layers.pow(feat_shape_t, factor=2) + + block_shape_t = fluid.layers.fill_constant( + shape=[1, 1, 1, 1], value=block_size, dtype='float32') + block_area = fluid.layers.pow(block_shape_t, factor=2) + + useful_shape_t = feat_shape_t - block_shape_t + 1 + useful_area = fluid.layers.pow(useful_shape_t, factor=2) + + upper_t = feat_area * (1 - keep_prob) + bottom_t = block_area * useful_area + output = upper_t / bottom_t + return output + + gamma = CalculateGamma(input, block_size=block_size, keep_prob=keep_prob) + input_shape = fluid.layers.shape(input) + p = fluid.layers.expand_as(gamma, input) + + input_shape_tmp = fluid.layers.cast(input_shape, dtype="int64") + random_matrix = fluid.layers.uniform_random( + input_shape_tmp, dtype='float32', min=0.0, max=1.0) + one_zero_m = fluid.layers.less_than(random_matrix, p) + one_zero_m.stop_gradient = True + one_zero_m = fluid.layers.cast(one_zero_m, dtype="float32") + + mask_flag = fluid.layers.pool2d( + one_zero_m, + pool_size=block_size, + pool_type='max', + pool_stride=1, + pool_padding=block_size // 2) + mask = 1.0 - mask_flag + + elem_numel = fluid.layers.reduce_prod(input_shape) + elem_numel_m = fluid.layers.cast(elem_numel, dtype="float32") + elem_numel_m.stop_gradient = True + + elem_sum = fluid.layers.reduce_sum(mask) + elem_sum_m = fluid.layers.cast(elem_sum, dtype="float32") + elem_sum_m.stop_gradient = True + + output = input * mask * elem_numel_m / elem_sum_m + return output + + +class MultiClassNMS(object): + def __init__(self, + score_threshold=.05, + nms_top_k=-1, + keep_top_k=100, + nms_threshold=.5, + normalized=False, + nms_eta=1.0, + background_label=0): + super(MultiClassNMS, self).__init__() + self.score_threshold = score_threshold + self.nms_top_k = nms_top_k + self.keep_top_k = keep_top_k + self.nms_threshold = nms_threshold + self.normalized = normalized + self.nms_eta = nms_eta + self.background_label = background_label + + def __call__(self, bboxes, scores): + return fluid.layers.multiclass_nms( + bboxes=bboxes, + scores=scores, + score_threshold=self.score_threshold, + nms_top_k=self.nms_top_k, + keep_top_k=self.keep_top_k, + normalized=self.normalized, + nms_threshold=self.nms_threshold, + nms_eta=self.nms_eta, + background_label=self.background_label) + + +class MatrixNMS(object): + def __init__(self, + score_threshold=.05, + post_threshold=.05, + nms_top_k=-1, + keep_top_k=100, + use_gaussian=False, + gaussian_sigma=2., + normalized=False, + background_label=0): + super(MatrixNMS, self).__init__() + self.score_threshold = score_threshold + self.post_threshold = post_threshold + self.nms_top_k = nms_top_k + self.keep_top_k = keep_top_k + self.normalized = normalized + self.use_gaussian = use_gaussian + self.gaussian_sigma = gaussian_sigma + self.background_label = background_label + + def __call__(self, bboxes, scores): + return paddle.fluid.layers.matrix_nms( + bboxes=bboxes, + scores=scores, + score_threshold=self.score_threshold, + post_threshold=self.post_threshold, + nms_top_k=self.nms_top_k, + 
keep_top_k=self.keep_top_k, + normalized=self.normalized, + use_gaussian=self.use_gaussian, + gaussian_sigma=self.gaussian_sigma, + background_label=self.background_label) + + +class MultiClassSoftNMS(object): + def __init__( + self, + score_threshold=0.01, + keep_top_k=300, + softnms_sigma=0.5, + normalized=False, + background_label=0, ): + super(MultiClassSoftNMS, self).__init__() + self.score_threshold = score_threshold + self.keep_top_k = keep_top_k + self.softnms_sigma = softnms_sigma + self.normalized = normalized + self.background_label = background_label + + def __call__(self, bboxes, scores): + def create_tmp_var(program, name, dtype, shape, lod_level): + return program.current_block().create_var( + name=name, dtype=dtype, shape=shape, lod_level=lod_level) + + def _soft_nms_for_cls(dets, sigma, thres): + """soft_nms_for_cls""" + dets_final = [] + while len(dets) > 0: + maxpos = np.argmax(dets[:, 0]) + dets_final.append(dets[maxpos].copy()) + ts, tx1, ty1, tx2, ty2 = dets[maxpos] + scores = dets[:, 0] + # force remove bbox at maxpos + scores[maxpos] = -1 + x1 = dets[:, 1] + y1 = dets[:, 2] + x2 = dets[:, 3] + y2 = dets[:, 4] + eta = 0 if self.normalized else 1 + areas = (x2 - x1 + eta) * (y2 - y1 + eta) + xx1 = np.maximum(tx1, x1) + yy1 = np.maximum(ty1, y1) + xx2 = np.minimum(tx2, x2) + yy2 = np.minimum(ty2, y2) + w = np.maximum(0.0, xx2 - xx1 + eta) + h = np.maximum(0.0, yy2 - yy1 + eta) + inter = w * h + ovr = inter / (areas + areas[maxpos] - inter) + weight = np.exp(-(ovr * ovr) / sigma) + scores = scores * weight + idx_keep = np.where(scores >= thres) + dets[:, 0] = scores + dets = dets[idx_keep] + dets_final = np.array(dets_final).reshape(-1, 5) + return dets_final + + def _soft_nms(bboxes, scores): + class_nums = scores.shape[-1] + + softnms_thres = self.score_threshold + softnms_sigma = self.softnms_sigma + keep_top_k = self.keep_top_k + + cls_boxes = [[] for _ in range(class_nums)] + cls_ids = [[] for _ in range(class_nums)] + + start_idx = 1 if self.background_label == 0 else 0 + for j in range(start_idx, class_nums): + inds = np.where(scores[:, j] >= softnms_thres)[0] + scores_j = scores[inds, j] + rois_j = bboxes[inds, j, :] if len( + bboxes.shape) > 2 else bboxes[inds, :] + dets_j = np.hstack((scores_j[:, np.newaxis], rois_j)).astype( + np.float32, copy=False) + cls_rank = np.argsort(-dets_j[:, 0]) + dets_j = dets_j[cls_rank] + + cls_boxes[j] = _soft_nms_for_cls( + dets_j, sigma=softnms_sigma, thres=softnms_thres) + cls_ids[j] = np.array([j] * cls_boxes[j].shape[0]).reshape(-1, + 1) + + cls_boxes = np.vstack(cls_boxes[start_idx:]) + cls_ids = np.vstack(cls_ids[start_idx:]) + pred_result = np.hstack([cls_ids, cls_boxes]) + + # Limit to max_per_image detections **over all classes** + image_scores = cls_boxes[:, 0] + if len(image_scores) > keep_top_k: + image_thresh = np.sort(image_scores)[-keep_top_k] + keep = np.where(cls_boxes[:, 0] >= image_thresh)[0] + pred_result = pred_result[keep, :] + + return pred_result + + def _batch_softnms(bboxes, scores): + batch_offsets = bboxes.lod() + bboxes = np.array(bboxes) + scores = np.array(scores) + out_offsets = [0] + pred_res = [] + if len(batch_offsets) > 0: + batch_offset = batch_offsets[0] + for i in range(len(batch_offset) - 1): + s, e = batch_offset[i], batch_offset[i + 1] + pred = _soft_nms(bboxes[s:e], scores[s:e]) + out_offsets.append(pred.shape[0] + out_offsets[-1]) + pred_res.append(pred) + else: + assert len(bboxes.shape) == 3 + assert len(scores.shape) == 3 + for i in range(bboxes.shape[0]): + pred = 
_soft_nms(bboxes[i], scores[i]) + out_offsets.append(pred.shape[0] + out_offsets[-1]) + pred_res.append(pred) + + res = fluid.LoDTensor() + res.set_lod([out_offsets]) + if len(pred_res) == 0: + pred_res = np.array([[1]], dtype=np.float32) + res.set(np.vstack(pred_res).astype(np.float32), fluid.CPUPlace()) + return res + + pred_result = create_tmp_var( + fluid.default_main_program(), + name='softnms_pred_result', + dtype='float32', + shape=[-1, 6], + lod_level=1) + fluid.layers.py_func( + func=_batch_softnms, x=[bboxes, scores], out=pred_result) + return pred_result diff --git a/paddlex/cv/nets/detection/yolo_v3.py b/paddlex/cv/nets/detection/yolo_v3.py index 817c60ef3f8e8cb1c01364689cd13f402e4199c8..01c729a4b673fc990ab4116092e3aeb0bf5587fe 100644 --- a/paddlex/cv/nets/detection/yolo_v3.py +++ b/paddlex/cv/nets/detection/yolo_v3.py @@ -16,25 +16,50 @@ from paddle import fluid from paddle.fluid.param_attr import ParamAttr from paddle.fluid.regularizer import L2Decay from collections import OrderedDict +from .ops import MultiClassNMS, MultiClassSoftNMS, MatrixNMS +from .ops import DropBlock +from .loss.yolo_loss import YOLOv3Loss +from .loss.iou_loss import IouLoss +from .loss.iou_aware_loss import IouAwareLoss +from .iou_aware import get_iou_aware_score +try: + from collections.abc import Sequence +except Exception: + from collections import Sequence class YOLOv3: - def __init__(self, - backbone, - num_classes, - mode='train', - anchors=None, - anchor_masks=None, - ignore_threshold=0.7, - label_smooth=False, - nms_score_threshold=0.01, - nms_topk=1000, - nms_keep_topk=100, - nms_iou_threshold=0.45, - train_random_shapes=[ - 320, 352, 384, 416, 448, 480, 512, 544, 576, 608 - ], - fixed_input_shape=None): + def __init__( + self, + backbone, + mode='train', + # YOLOv3Head + num_classes=80, + anchors=None, + anchor_masks=None, + coord_conv=False, + iou_aware=False, + iou_aware_factor=0.4, + scale_x_y=1., + spp=False, + drop_block=False, + use_matrix_nms=False, + # YOLOv3Loss + batch_size=8, + ignore_threshold=0.7, + label_smooth=False, + use_fine_grained_loss=False, + use_iou_loss=False, + iou_loss_weight=2.5, + iou_aware_loss_weight=1.0, + max_height=608, + max_width=608, + # NMS + nms_score_threshold=0.01, + nms_topk=1000, + nms_keep_topk=100, + nms_iou_threshold=0.45, + fixed_input_shape=None): if anchors is None: anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]] @@ -46,56 +71,114 @@ class YOLOv3: self.mode = mode self.num_classes = num_classes self.backbone = backbone - self.ignore_thresh = ignore_threshold - self.label_smooth = label_smooth - self.nms_score_threshold = nms_score_threshold - self.nms_topk = nms_topk - self.nms_keep_topk = nms_keep_topk - self.nms_iou_threshold = nms_iou_threshold self.norm_decay = 0.0 self.prefix_name = '' - self.train_random_shapes = train_random_shapes + self.use_fine_grained_loss = use_fine_grained_loss self.fixed_input_shape = fixed_input_shape + self.coord_conv = coord_conv + self.iou_aware = iou_aware + self.iou_aware_factor = iou_aware_factor + self.scale_x_y = scale_x_y + self.use_spp = spp + self.drop_block = drop_block - def _head(self, feats): + if use_matrix_nms: + self.nms = MatrixNMS( + background_label=-1, + keep_top_k=nms_keep_topk, + normalized=False, + score_threshold=nms_score_threshold, + post_threshold=0.01) + else: + self.nms = MultiClassNMS( + background_label=-1, + keep_top_k=nms_keep_topk, + nms_threshold=nms_iou_threshold, + nms_top_k=nms_topk, + normalized=False, + 
score_threshold=nms_score_threshold) + self.iou_loss = None + self.iou_aware_loss = None + if use_iou_loss: + self.iou_loss = IouLoss( + loss_weight=iou_loss_weight, + max_height=max_height, + max_width=max_width) + if iou_aware: + self.iou_aware_loss = IouAwareLoss( + loss_weight=iou_aware_loss_weight, + max_height=max_height, + max_width=max_width) + self.yolo_loss = YOLOv3Loss( + batch_size=batch_size, + ignore_thresh=ignore_threshold, + scale_x_y=scale_x_y, + label_smooth=label_smooth, + use_fine_grained_loss=self.use_fine_grained_loss, + iou_loss=self.iou_loss, + iou_aware_loss=self.iou_aware_loss) + self.conv_block_num = 2 + self.block_size = 3 + self.keep_prob = 0.9 + self.downsample = [32, 16, 8] + self.clip_bbox = True + + def _head(self, input, is_train=True): outputs = [] + + # get last out_layer_num blocks in reverse order out_layer_num = len(self.anchor_masks) - blocks = feats[-1:-out_layer_num - 1:-1] - route = None + blocks = input[-1:-out_layer_num - 1:-1] + route = None for i, block in enumerate(blocks): - if i > 0: + if i > 0: # perform concat in first 2 detection_block block = fluid.layers.concat(input=[route, block], axis=1) route, tip = self._detection_block( block, - channel=512 // (2**i), - name=self.prefix_name + 'yolo_block.{}'.format(i)) + channel=64 * (2**out_layer_num) // (2**i), + is_first=i == 0, + is_test=(not is_train), + conv_block_num=self.conv_block_num, + name=self.prefix_name + "yolo_block.{}".format(i)) - num_filters = len(self.anchor_masks[i]) * (self.num_classes + 5) - block_out = fluid.layers.conv2d( - input=tip, - num_filters=num_filters, - filter_size=1, - stride=1, - padding=0, - act=None, - param_attr=ParamAttr(name=self.prefix_name + - 'yolo_output.{}.conv.weights'.format(i)), - bias_attr=ParamAttr( - regularizer=L2Decay(0.0), - name=self.prefix_name + - 'yolo_output.{}.conv.bias'.format(i))) - outputs.append(block_out) + # out channel number = mask_num * (5 + class_num) + if self.iou_aware: + num_filters = len(self.anchor_masks[i]) * ( + self.num_classes + 6) + else: + num_filters = len(self.anchor_masks[i]) * ( + self.num_classes + 5) + with fluid.name_scope('yolo_output'): + block_out = fluid.layers.conv2d( + input=tip, + num_filters=num_filters, + filter_size=1, + stride=1, + padding=0, + act=None, + param_attr=ParamAttr( + name=self.prefix_name + + "yolo_output.{}.conv.weights".format(i)), + bias_attr=ParamAttr( + regularizer=L2Decay(0.), + name=self.prefix_name + + "yolo_output.{}.conv.bias".format(i))) + outputs.append(block_out) if i < len(blocks) - 1: + # do not perform upsample in the last detection_block route = self._conv_bn( input=route, ch_out=256 // (2**i), filter_size=1, stride=1, padding=0, - name=self.prefix_name + 'yolo_transition.{}'.format(i)) + is_test=(not is_train), + name=self.prefix_name + "yolo_transition.{}".format(i)) + # upsample route = self._upsample(route) + return outputs def _parse_anchors(self, anchors): @@ -116,6 +199,54 @@ class YOLOv3: assert mask < anchor_num, "anchor mask index overflow" self.mask_anchors[-1].extend(anchors[mask]) + def _create_tensor_from_numpy(self, numpy_array): + paddle_array = fluid.layers.create_global_var( + shape=numpy_array.shape, value=0., dtype=numpy_array.dtype) + fluid.layers.assign(numpy_array, paddle_array) + return paddle_array + + def _add_coord(self, input, is_test=True): + if not self.coord_conv: + return input + + # NOTE: here is used for exporting model for TensorRT inference, + # only support batch_size=1 for input shape should be fixed, + # and we create tensor 
with fixed shape from numpy array + if is_test and input.shape[2] > 0 and input.shape[3] > 0: + batch_size = 1 + grid_x = int(input.shape[3]) + grid_y = int(input.shape[2]) + idx_i = np.array( + [[i / (grid_x - 1) * 2.0 - 1 for i in range(grid_x)]], + dtype='float32') + gi_np = np.repeat(idx_i, grid_y, axis=0) + gi_np = np.reshape(gi_np, newshape=[1, 1, grid_y, grid_x]) + gi_np = np.tile(gi_np, reps=[batch_size, 1, 1, 1]) + + x_range = self._create_tensor_from_numpy(gi_np.astype(np.float32)) + x_range.stop_gradient = True + y_range = self._create_tensor_from_numpy( + gi_np.transpose([0, 1, 3, 2]).astype(np.float32)) + y_range.stop_gradient = True + + # NOTE: in training mode, H and W is variable for random shape, + # implement add_coord with shape as Variable + else: + input_shape = fluid.layers.shape(input) + b = input_shape[0] + h = input_shape[2] + w = input_shape[3] + + x_range = fluid.layers.range(0, w, 1, 'float32') / ((w - 1.) / 2.) + x_range = x_range - 1. + x_range = fluid.layers.unsqueeze(x_range, [0, 1, 2]) + x_range = fluid.layers.expand(x_range, [b, 1, h, 1]) + x_range.stop_gradient = True + y_range = fluid.layers.transpose(x_range, [0, 1, 3, 2]) + y_range.stop_gradient = True + + return fluid.layers.concat([input, x_range, y_range], axis=1) + def _conv_bn(self, input, ch_out, @@ -151,18 +282,52 @@ class YOLOv3: out = fluid.layers.leaky_relu(x=out, alpha=0.1) return out + def _spp_module(self, input, is_test=True, name=""): + output1 = input + output2 = fluid.layers.pool2d( + input=output1, + pool_size=5, + pool_stride=1, + pool_padding=2, + ceil_mode=False, + pool_type='max') + output3 = fluid.layers.pool2d( + input=output1, + pool_size=9, + pool_stride=1, + pool_padding=4, + ceil_mode=False, + pool_type='max') + output4 = fluid.layers.pool2d( + input=output1, + pool_size=13, + pool_stride=1, + pool_padding=6, + ceil_mode=False, + pool_type='max') + output = fluid.layers.concat( + input=[output1, output2, output3, output4], axis=1) + return output + def _upsample(self, input, scale=2, name=None): out = fluid.layers.resize_nearest( input=input, scale=float(scale), name=name) return out - def _detection_block(self, input, channel, name=None): - assert channel % 2 == 0, "channel({}) cannot be divided by 2 in detection block({})".format( - channel, name) + def _detection_block(self, + input, + channel, + conv_block_num=2, + is_first=False, + is_test=True, + name=None): + assert channel % 2 == 0, \ + "channel {} cannot be divided by 2 in detection block {}" \ + .format(channel, name) - is_test = False if self.mode == 'train' else True conv = input - for i in range(2): + for j in range(conv_block_num): + conv = self._add_coord(conv, is_test=is_test) conv = self._conv_bn( conv, channel, @@ -170,7 +335,17 @@ class YOLOv3: stride=1, padding=0, is_test=is_test, - name='{}.{}.0'.format(name, i)) + name='{}.{}.0'.format(name, j)) + if self.use_spp and is_first and j == 1: + conv = self._spp_module(conv, is_test=is_test, name="spp") + conv = self._conv_bn( + conv, + 512, + filter_size=1, + stride=1, + padding=0, + is_test=is_test, + name='{}.{}.spp.conv'.format(name, j)) conv = self._conv_bn( conv, channel * 2, @@ -178,7 +353,21 @@ class YOLOv3: stride=1, padding=1, is_test=is_test, - name='{}.{}.1'.format(name, i)) + name='{}.{}.1'.format(name, j)) + if self.drop_block and j == 0 and not is_first: + conv = DropBlock( + conv, + block_size=self.block_size, + keep_prob=self.keep_prob, + is_test=is_test) + + if self.drop_block and is_first: + conv = DropBlock( + conv, + 
block_size=self.block_size, + keep_prob=self.keep_prob, + is_test=is_test) + conv = self._add_coord(conv, is_test=is_test) route = self._conv_bn( conv, channel, @@ -187,8 +376,9 @@ class YOLOv3: padding=0, is_test=is_test, name='{}.2'.format(name)) + new_route = self._add_coord(route, is_test=is_test) tip = self._conv_bn( - route, + new_route, channel * 2, filter_size=3, stride=1, @@ -197,54 +387,44 @@ class YOLOv3: name='{}.tip'.format(name)) return route, tip - def _get_loss(self, inputs, gt_box, gt_label, gt_score): - losses = [] - downsample = 32 - for i, input in enumerate(inputs): - loss = fluid.layers.yolov3_loss( - x=input, - gt_box=gt_box, - gt_label=gt_label, - gt_score=gt_score, - anchors=self.anchors, - anchor_mask=self.anchor_masks[i], - class_num=self.num_classes, - ignore_thresh=self.ignore_thresh, - downsample_ratio=downsample, - use_label_smooth=self.label_smooth, - name=self.prefix_name + 'yolo_loss' + str(i)) - losses.append(fluid.layers.reduce_mean(loss)) - downsample //= 2 - return sum(losses) + def _get_loss(self, inputs, gt_box, gt_label, gt_score, targets): + loss = self.yolo_loss(inputs, gt_box, gt_label, gt_score, targets, + self.anchors, self.anchor_masks, + self.mask_anchors, self.num_classes, + self.prefix_name) + total_loss = fluid.layers.sum(list(loss.values())) + return total_loss def _get_prediction(self, inputs, im_size): boxes = [] scores = [] - downsample = 32 for i, input in enumerate(inputs): + if self.iou_aware: + input = get_iou_aware_score(input, + len(self.anchor_masks[i]), + self.num_classes, + self.iou_aware_factor) + scale_x_y = self.scale_x_y if not isinstance( + self.scale_x_y, Sequence) else self.scale_x_y[i] + box, score = fluid.layers.yolo_box( x=input, img_size=im_size, anchors=self.mask_anchors[i], class_num=self.num_classes, - conf_thresh=self.nms_score_threshold, - downsample_ratio=downsample, - name=self.prefix_name + 'yolo_box' + str(i)) + conf_thresh=self.nms.score_threshold, + downsample_ratio=self.downsample[i], + name=self.prefix_name + 'yolo_box' + str(i), + clip_bbox=self.clip_bbox, + scale_x_y=self.scale_x_y) boxes.append(box) scores.append(fluid.layers.transpose(score, perm=[0, 2, 1])) - downsample //= 2 + yolo_boxes = fluid.layers.concat(boxes, axis=1) yolo_scores = fluid.layers.concat(scores, axis=2) - pred = fluid.layers.multiclass_nms( - bboxes=yolo_boxes, - scores=yolo_scores, - score_threshold=self.nms_score_threshold, - nms_top_k=self.nms_topk, - keep_top_k=self.nms_keep_topk, - nms_threshold=self.nms_iou_threshold, - normalized=False, - nms_eta=1.0, - background_label=-1) + if type(self.nms) is MultiClassSoftNMS: + yolo_scores = fluid.layers.transpose(yolo_scores, perm=[0, 2, 1]) + pred = self.nms(bboxes=yolo_boxes, scores=yolo_scores) return pred def generate_inputs(self): @@ -267,6 +447,25 @@ class YOLOv3: dtype='float32', shape=[None, None], name='gt_score') inputs['im_size'] = fluid.data( dtype='int32', shape=[None, 2], name='im_size') + if self.use_fine_grained_loss: + downsample = 32 + for i, mask in enumerate(self.anchor_masks): + if self.fixed_input_shape is not None: + target_shape = [ + self.fixed_input_shape[1] // downsample, + self.fixed_input_shape[0] // downsample + ] + else: + target_shape = [None, None] + inputs['target{}'.format(i)] = fluid.data( + dtype='float32', + lod_level=0, + shape=[ + None, len(mask), 6 + self.num_classes, + target_shape[0], target_shape[1] + ], + name='target{}'.format(i)) + downsample //= 2 elif self.mode == 'eval': inputs['im_size'] = fluid.data( dtype='int32', shape=[None, 
2], name='im_size') @@ -285,28 +484,12 @@ class YOLOv3: def build_net(self, inputs): image = inputs['image'] - if self.mode == 'train': - if isinstance(self.train_random_shapes, - (list, tuple)) and len(self.train_random_shapes) > 0: - import numpy as np - shapes = np.array(self.train_random_shapes) - shapes = np.stack([shapes, shapes], axis=1).astype('float32') - shapes_tensor = fluid.layers.assign(shapes) - index = fluid.layers.uniform_random( - shape=[1], dtype='float32', min=0.0, max=1) - index = fluid.layers.cast( - index * len(self.train_random_shapes), dtype='int32') - shape = fluid.layers.gather(shapes_tensor, index) - shape = fluid.layers.reshape(shape, [-1]) - shape = fluid.layers.cast(shape, dtype='int32') - image = fluid.layers.resize_nearest( - image, out_shape=shape, align_corners=False) feats = self.backbone(image) if isinstance(feats, OrderedDict): feat_names = list(feats.keys()) feats = [feats[name] for name in feat_names] - head_outputs = self._head(feats) + head_outputs = self._head(feats, self.mode == 'train') if self.mode == 'train': gt_box = inputs['gt_box'] gt_label = inputs['gt_label'] @@ -320,8 +503,15 @@ class YOLOv3: whwh = fluid.layers.cast(whwh, dtype='float32') whwh.stop_gradient = True normalized_box = fluid.layers.elementwise_div(gt_box, whwh) + + targets = [] + if self.use_fine_grained_loss: + for i, mask in enumerate(self.anchor_masks): + k = 'target{}'.format(i) + if k in inputs: + targets.append(inputs[k]) return self._get_loss(head_outputs, normalized_box, gt_label, - gt_score) + gt_score, targets) else: im_size = inputs['im_size'] return self._get_prediction(head_outputs, im_size) diff --git a/paddlex/cv/transforms/__init__.py b/paddlex/cv/transforms/__init__.py index c8018a34d8bc03022263c0896ca0386fa6decba2..445ab164546f62dbc992588a4f9252c07df617c1 100644 --- a/paddlex/cv/transforms/__init__.py +++ b/paddlex/cv/transforms/__init__.py @@ -91,7 +91,10 @@ def arrange_transforms(model_type, class_name, transforms, mode='train'): elif model_type == 'segmenter': arrange_transform = seg_transforms.ArrangeSegmenter elif model_type == 'detector': - arrange_name = 'Arrange{}'.format(class_name) + if class_name == "PPYOLO": + arrange_name = 'ArrangeYOLOv3' + else: + arrange_name = 'Arrange{}'.format(class_name) arrange_transform = getattr(det_transforms, arrange_name) else: raise Exception("Unrecognized model type: {}".format(self.model_type)) diff --git a/paddlex/cv/transforms/cls_transforms.py b/paddlex/cv/transforms/cls_transforms.py index 69dcb02dde38aa7ccb97e1349dfae0b9a53b8555..361d9a00649502c522fbe50d3366d95570506e7f 100644 --- a/paddlex/cv/transforms/cls_transforms.py +++ b/paddlex/cv/transforms/cls_transforms.py @@ -46,7 +46,7 @@ class Compose(ClsTransform): raise ValueError('The length of transforms ' + \ 'must be equal or larger than 1!') self.transforms = transforms - + self.batch_transforms = None # 检查transforms里面的操作,目前支持PaddleX定义的或者是imgaug操作 for op in self.transforms: if not isinstance(op, ClsTransform): diff --git a/paddlex/cv/transforms/det_transforms.py b/paddlex/cv/transforms/det_transforms.py index 26ad49fd33b8971c23e9ded9ddfdfa5cd4f973fc..32603bac5141c10c7ceedb59bf438b281f86ccf0 100644 --- a/paddlex/cv/transforms/det_transforms.py +++ b/paddlex/cv/transforms/det_transforms.py @@ -55,6 +55,7 @@ class Compose(DetTransform): raise ValueError('The length of transforms ' + \ 'must be equal or larger than 1!') self.transforms = transforms + self.batch_transforms = None self.use_mixup = False for t in self.transforms: if type(t).__name__ == 
'MixupImage': @@ -1385,3 +1386,187 @@ class ComposedYOLOv3Transforms(Compose): mean=mean, std=std) ] super(ComposedYOLOv3Transforms, self).__init__(transforms) + + +class BatchRandomShape(DetTransform): + """调整图像大小(resize)。 + + 对batch数据中的每张图像全部resize到random_shapes中任意一个大小。 + 注意:当插值方式为“RANDOM”时,则随机选取一种插值方式进行resize。 + + Args: + random_shapes (list): resize大小选择列表。 + 默认为[320, 352, 384, 416, 448, 480, 512, 544, 576, 608]。 + interp (str): resize的插值方式,与opencv的插值方式对应,取值范围为 + ['NEAREST', 'LINEAR', 'CUBIC', 'AREA', 'LANCZOS4', 'RANDOM']。默认为"RANDOM"。 + Raises: + ValueError: 插值方式不在['NEAREST', 'LINEAR', 'CUBIC', + 'AREA', 'LANCZOS4', 'RANDOM']中。 + """ + + # The interpolation mode + interp_dict = { + 'NEAREST': cv2.INTER_NEAREST, + 'LINEAR': cv2.INTER_LINEAR, + 'CUBIC': cv2.INTER_CUBIC, + 'AREA': cv2.INTER_AREA, + 'LANCZOS4': cv2.INTER_LANCZOS4 + } + + def __init__( + self, + random_shapes=[320, 352, 384, 416, 448, 480, 512, 544, 576, 608], + interp='RANDOM'): + if not (interp == "RANDOM" or interp in self.interp_dict): + raise ValueError("interp should be one of {}".format( + self.interp_dict.keys())) + self.random_shapes = random_shapes + self.interp = interp + + def __call__(self, batch_data): + """ + Args: + batch_data (list): 由与图像相关的各种信息组成的batch数据。 + Returns: + list: 由与图像相关的各种信息组成的batch数据。 + """ + shape = np.random.choice(self.random_shapes) + + if self.interp == "RANDOM": + interp = random.choice(list(self.interp_dict.keys())) + else: + interp = self.interp + for data_id, data in enumerate(batch_data): + data_list = list(data) + im = data_list[0] + im = np.swapaxes(im, 1, 0) + im = np.swapaxes(im, 1, 2) + im = resize(im, shape, self.interp_dict[interp]) + im = np.swapaxes(im, 1, 2) + im = np.swapaxes(im, 1, 0) + data_list[0] = im + batch_data[data_id] = tuple(data_list) + return batch_data + + +class GenerateYoloTarget(object): + """生成YOLOv3的ground truth(真实标注框)在不同特征层的位置转换信息。 + 该transform只在YOLOv3计算细粒度loss时使用。 + + Args: + anchors (list|tuple): anchor框的宽度和高度。 + anchor_masks (list|tuple): 在计算损失时,使用anchor的mask索引。 + num_classes (int): 类别数。默认为80。 + iou_thresh (float): iou阈值,当anchor和真实标注框的iou大于该阈值时,计入target。默认为1.0。 + """ + + def __init__(self, + anchors, + anchor_masks, + downsample_ratios, + num_classes=80, + iou_thresh=1.): + super(GenerateYoloTarget, self).__init__() + self.anchors = anchors + self.anchor_masks = anchor_masks + self.downsample_ratios = downsample_ratios + self.num_classes = num_classes + self.iou_thresh = iou_thresh + + def __call__(self, batch_data): + """ + Args: + batch_data (list): 由与图像相关的各种信息组成的batch数据。 + Returns: + list: 由与图像相关的各种信息组成的batch数据。 + 其中,每个数据新添加的字段为: + - target0 (np.ndarray): YOLOv3的ground truth在特征层0的位置转换信息, + 形状为(特征层0的anchor数量, 6+类别数, 特征层0的h, 特征层0的w)。 + - target1 (np.ndarray): YOLOv3的ground truth在特征层1的位置转换信息, + 形状为(特征层1的anchor数量, 6+类别数, 特征层1的h, 特征层1的w)。 + - ... 
+ -targetn (np.ndarray): YOLOv3的ground truth在特征层n的位置转换信息, + 形状为(特征层n的anchor数量, 6+类别数, 特征层n的h, 特征层n的w)。 + n的是大小由anchor_masks的长度决定。 + """ + im = batch_data[0][0] + h = im.shape[1] + w = im.shape[2] + an_hw = np.array(self.anchors) / np.array([[w, h]]) + for data_id, data in enumerate(batch_data): + gt_bbox = data[1] + gt_class = data[2] + gt_score = data[3] + im_shape = data[4] + origin_h = float(im_shape[0]) + origin_w = float(im_shape[1]) + data_list = list(data) + for i, ( + mask, downsample_ratio + ) in enumerate(zip(self.anchor_masks, self.downsample_ratios)): + grid_h = int(h / downsample_ratio) + grid_w = int(w / downsample_ratio) + target = np.zeros( + (len(mask), 6 + self.num_classes, grid_h, grid_w), + dtype=np.float32) + for b in range(gt_bbox.shape[0]): + gx = gt_bbox[b, 0] / float(origin_w) + gy = gt_bbox[b, 1] / float(origin_h) + gw = gt_bbox[b, 2] / float(origin_w) + gh = gt_bbox[b, 3] / float(origin_h) + cls = gt_class[b] + score = gt_score[b] + if gw <= 0. or gh <= 0. or score <= 0.: + continue + # find best match anchor index + best_iou = 0. + best_idx = -1 + for an_idx in range(an_hw.shape[0]): + iou = jaccard_overlap( + [0., 0., gw, gh], + [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]]) + if iou > best_iou: + best_iou = iou + best_idx = an_idx + gi = int(gx * grid_w) + gj = int(gy * grid_h) + # gtbox should be regresed in this layes if best match + # anchor index in anchor mask of this layer + if best_idx in mask: + best_n = mask.index(best_idx) + # x, y, w, h, scale + target[best_n, 0, gj, gi] = gx * grid_w - gi + target[best_n, 1, gj, gi] = gy * grid_h - gj + target[best_n, 2, gj, gi] = np.log( + gw * w / self.anchors[best_idx][0]) + target[best_n, 3, gj, gi] = np.log( + gh * h / self.anchors[best_idx][1]) + target[best_n, 4, gj, gi] = 2.0 - gw * gh + # objectness record gt_score + target[best_n, 5, gj, gi] = score + # classification + target[best_n, 6 + cls, gj, gi] = 1. + # For non-matched anchors, calculate the target if the iou + # between anchor and gt is larger than iou_thresh + if self.iou_thresh < 1: + for idx, mask_i in enumerate(mask): + if mask_i == best_idx: continue + iou = jaccard_overlap( + [0., 0., gw, gh], + [0., 0., an_hw[mask_i, 0], an_hw[mask_i, 1]]) + if iou > self.iou_thresh: + # x, y, w, h, scale + target[idx, 0, gj, gi] = gx * grid_w - gi + target[idx, 1, gj, gi] = gy * grid_h - gj + target[idx, 2, gj, gi] = np.log( + gw * w / self.anchors[mask_i][0]) + target[idx, 3, gj, gi] = np.log( + gh * h / self.anchors[mask_i][1]) + target[idx, 4, gj, gi] = 2.0 - gw * gh + # objectness record gt_score + target[idx, 5, gj, gi] = score + # classification + target[idx, 6 + cls, gj, gi] = 1. + data_list.append(target) + batch_data[data_id] = tuple(data_list) + return batch_data diff --git a/paddlex/cv/transforms/seg_transforms.py b/paddlex/cv/transforms/seg_transforms.py index c22fcb9d6ead11eab6632877fdecfde63e99d2a2..4661eb2e9c8438bde4035287a6a07db64a0cdfe2 100644 --- a/paddlex/cv/transforms/seg_transforms.py +++ b/paddlex/cv/transforms/seg_transforms.py @@ -49,6 +49,7 @@ class Compose(SegTransform): raise ValueError('The length of transforms ' + \ 'must be equal or larger than 1!') self.transforms = transforms + self.batch_transforms = None self.to_rgb = False # 检查transforms里面的操作,目前支持PaddleX定义的或者是imgaug操作 for op in self.transforms: diff --git a/paddlex/det.py b/paddlex/det.py index 1590d051ff530f11eb9bf49836d5f7174c9550e9..4f38068c4b1950450a39f3949adac8021c61da80 100644 --- a/paddlex/det.py +++ b/paddlex/det.py @@ -17,6 +17,7 @@ from . 
import cv FasterRCNN = cv.models.FasterRCNN YOLOv3 = cv.models.YOLOv3 +PPYOLO = cv.models.PPYOLO MaskRCNN = cv.models.MaskRCNN transforms = cv.transforms.det_transforms visualize = cv.models.utils.visualize.visualize_detection diff --git a/tutorials/train/object_detection/ppyolo.py b/tutorials/train/object_detection/ppyolo.py new file mode 100644 index 0000000000000000000000000000000000000000..63b47a95671692e89761251e9a1059cac9b542eb --- /dev/null +++ b/tutorials/train/object_detection/ppyolo.py @@ -0,0 +1,58 @@ +# 环境变量配置,用于控制是否使用GPU +# 说明文档:https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html#gpu +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +from paddlex.det import transforms +import paddlex as pdx + +# 下载和解压昆虫检测数据集 +insect_dataset = 'https://bj.bcebos.com/paddlex/datasets/insect_det.tar.gz' +pdx.utils.download_and_decompress(insect_dataset, path='./') + +# 定义训练和验证时的transforms +# API说明 https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html +train_transforms = transforms.Compose([ + transforms.MixupImage(mixup_epoch=250), transforms.RandomDistort(), + transforms.RandomExpand(), transforms.RandomCrop(), transforms.Resize( + target_size=608, interp='RANDOM'), transforms.RandomHorizontalFlip(), + transforms.Normalize() +]) + +eval_transforms = transforms.Compose([ + transforms.Resize( + target_size=608, interp='CUBIC'), transforms.Normalize() +]) + +# 定义训练和验证所用的数据集 +# API说明:https://paddlex.readthedocs.io/zh_CN/develop/apis/datasets.html#paddlex-datasets-vocdetection +train_dataset = pdx.datasets.VOCDetection( + data_dir='insect_det', + file_list='insect_det/train_list.txt', + label_list='insect_det/labels.txt', + transforms=train_transforms, + shuffle=True) +eval_dataset = pdx.datasets.VOCDetection( + data_dir='insect_det', + file_list='insect_det/val_list.txt', + label_list='insect_det/labels.txt', + transforms=eval_transforms) + +# 初始化模型,并进行训练 +# 可使用VisualDL查看训练指标,参考https://paddlex.readthedocs.io/zh_CN/develop/train/visualdl.html +num_classes = len(train_dataset.labels) + +# API说明: https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-yolov3 +model = pdx.det.PPYOLO(num_classes=num_classes) + +# API说明: https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#train +# 各参数介绍与调整说明:https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html +model.train( + num_epochs=270, + train_dataset=train_dataset, + train_batch_size=8, + eval_dataset=eval_dataset, + learning_rate=0.000125, + lr_decay_epochs=[210, 240], + save_dir='output/ppyolo', + use_vdl=True)
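(Editor's aside, not part of the patch.) Once training finishes, the saved model can be loaded for prediction and visualization. A minimal sketch assuming the standard PaddleX 1.x pdx.load_model / predict / pdx.det.visualize interfaces; the image path below is a placeholder to replace with a real test image.

import paddlex as pdx

# Load the best model saved during training (save_dir='output/ppyolo' above)
model = pdx.load_model('output/ppyolo/best_model')

# Run prediction on a single image; replace the path with an actual image file
image_name = 'insect_det/JPEGImages/xxx.jpg'
result = model.predict(image_name)

# Draw boxes above the score threshold and save the visualization
pdx.det.visualize(image_name, result, threshold=0.3, save_dir='./output/ppyolo')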