Merge remote-tracking branch 'upstream/3.4' into merge-3.4

fa25faa2 · Alexander Alekhin · dbab8d8c · 1067cd06 · fa25faa2 · fa25faa2
52 changed files
--- a/3rdparty/carotene/CMakeLists.txt
+++ b/3rdparty/carotene/CMakeLists.txt
@@ -40,4 +40,5 @@ if(WITH_NEON)
    target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
 endif()

-add_library(carotene STATIC EXCLUDE_FROM_ALL "$<TARGET_OBJECTS:carotene_objs>")
+# Add a dummy source file to fix the Xcode build (see https://gitlab.kitware.com/cmake/cmake/-/issues/17457)
+add_library(carotene STATIC EXCLUDE_FROM_ALL "$<TARGET_OBJECTS:carotene_objs>" "${CAROTENE_SOURCE_DIR}/dummy.cpp")
--- a/3rdparty/carotene/hal/CMakeLists.txt
+++ b/3rdparty/carotene/hal/CMakeLists.txt
@@ -82,7 +82,8 @@ set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS ${carotene_defs})
 #    set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
  endif()

-add_library(tegra_hal STATIC $<TARGET_OBJECTS:carotene_objs>)
+# Add a dummy source file to fix the Xcode build (see https://gitlab.kitware.com/cmake/cmake/-/issues/17457)
+add_library(tegra_hal STATIC $<TARGET_OBJECTS:carotene_objs> "dummy.cpp")
 set_target_properties(tegra_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})
 set(OPENCV_SRC_DIR "${CMAKE_SOURCE_DIR}")
 if(NOT BUILD_SHARED_LIBS)

--- a/3rdparty/carotene/hal/dummy.cpp
+++ b/3rdparty/carotene/hal/dummy.cpp
+// This file is needed for compilation on some platforms, e.g. with the Xcode generator.
+// Related issue: https://gitlab.kitware.com/cmake/cmake/-/issues/17457
--- a/3rdparty/carotene/src/dummy.cpp
+++ b/3rdparty/carotene/src/dummy.cpp
+// This file is needed for compilation on some platforms, e.g. with the Xcode generator.
+// Related issue: https://gitlab.kitware.com/cmake/cmake/-/issues/17457
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -88,7 +88,12 @@ if(CUDA_FOUND)

  message(STATUS "CUDA detected: " ${CUDA_VERSION})

-  set(_generations "Fermi" "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere")
+  OCV_OPTION(CUDA_ENABLE_DEPRECATED_GENERATION "Enable deprecated generations in the list" OFF)
+  set(_generations "Maxwell" "Pascal" "Volta" "Turing" "Ampere")
+  if(CUDA_ENABLE_DEPRECATED_GENERATION)
+    # Insert at the front in one step so the list stays in the previous order: Fermi, Kepler, Maxwell, ...
+    list(INSERT _generations 0 "Fermi" "Kepler")
+  endif()
  set(_arch_fermi   "2.0")
  set(_arch_kepler  "3.0;3.5;3.7")
  set(_arch_maxwell "5.0;5.2")
@@ -209,10 +214,6 @@ if(CUDA_FOUND)
    endif()
  endmacro()

-  macro(ocv_wipeout_deprecated _arch_bin_list)
-    string(REPLACE "2.1" "2.1(2.0)" ${_arch_bin_list} "${${_arch_bin_list}}")
-  endmacro()
-
  set(__cuda_arch_ptx "")
  if(CUDA_GENERATION STREQUAL "Fermi")
    set(__cuda_arch_bin ${_arch_fermi})
@@ -275,7 +276,6 @@ if(CUDA_FOUND)
      )
    endif()
  endif()
-  ocv_wipeout_deprecated(__cuda_arch_bin)

  set(CUDA_ARCH_BIN ${__cuda_arch_bin} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
  set(CUDA_ARCH_PTX ${__cuda_arch_ptx} CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
@@ -283,10 +283,14 @@ if(CUDA_FOUND)
  string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
  string(REGEX REPLACE "\\." "" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}")

-  # Check if user specified 1.0 compute capability: we don't support it
-  if(" ${CUDA_ARCH_BIN} ${CUDA_ARCH_PTX}" MATCHES " 1.0")
-    message(SEND_ERROR "CUDA: 1.0 compute capability is not supported - exclude it from ARCH/PTX list are re-run CMake")
-  endif()
+  # Check if user specified 1.0/2.1 compute capability: we don't support it
+  macro(ocv_wipeout_deprecated_cc target_cc)
+    if(" ${CUDA_ARCH_BIN} ${CUDA_ARCH_PTX}" MATCHES " ${target_cc}")
+      message(SEND_ERROR "CUDA: ${target_cc} compute capability is not supported - exclude it from ARCH/PTX list and re-run CMake")
+    endif()
+  endmacro()
+  ocv_wipeout_deprecated_cc("1.0")
+  ocv_wipeout_deprecated_cc("2.1")

  # NVCC flags to be set
  set(NVCC_FLAGS_EXTRA "")

--- a/doc/js_tutorials/js_assets/opencv_logo.jpg
+++ b/doc/js_tutorials/js_assets/opencv_logo.jpg
--- a/doc/opencv-logo-small.png
+++ b/doc/opencv-logo-small.png
--- a/doc/opencv-logo-white.png
+++ b/doc/opencv-logo-white.png
--- a/doc/opencv-logo.png
+++ b/doc/opencv-logo.png
--- a/doc/opencv-logo2.png
+++ b/doc/opencv-logo2.png
--- a/doc/opencv.bib
+++ b/doc/opencv.bib
@@ -584,6 +584,16 @@
  pages = {1033--1040},
  publisher = {IEEE}
 }
+@article{YM11,
+  author = {Yu, Guoshen and Morel, Jean-Michel},
+  title = {ASIFT: An Algorithm for Fully Affine Invariant Comparison},
+  year = {2011},
+  pages = {11--38},
+  journal = {Image Processing On Line},
+  volume = {1},
+  doi = {10.5201/ipol.2011.my-asift},
+  url = {http://www.ipol.im/pub/algo/my_affine_sift/}
+}
 @inproceedings{LCS11,
  author = {Leutenegger, Stefan and Chli, Margarita and Siegwart, Roland Yves},
  title = {BRISK: Binary robust invariant scalable keypoints},

--- a/doc/opencv.ico
+++ b/doc/opencv.ico
--- a/doc/py_tutorials/py_setup/images/opencv_logo.jpg
+++ b/doc/py_tutorials/py_setup/images/opencv_logo.jpg
--- a/modules/calib3d/src/calibration_handeye.cpp
+++ b/modules/calib3d/src/calibration_handeye.cpp
@@ -712,7 +712,10 @@ void calibrateHandEye(InputArrayOfArrays R_gripper2base, InputArrayOfArrays t_gr
    {
        Mat m = Mat::eye(4, 4, CV_64FC1);
        Mat R = m(Rect(0, 0, 3, 3));
-        R_gripper2base_[i].convertTo(R, CV_64F);
+        if(R_gripper2base_[i].size() == Size(3, 3))
+            R_gripper2base_[i].convertTo(R, CV_64F);
+        else
+            Rodrigues(R_gripper2base_[i], R);

        Mat t = m(Rect(3, 0, 1, 3));
        t_gripper2base_[i].convertTo(t, CV_64F);
@@ -727,7 +730,10 @@ void calibrateHandEye(InputArrayOfArrays R_gripper2base, InputArrayOfArrays t_gr
    {
        Mat m = Mat::eye(4, 4, CV_64FC1);
        Mat R = m(Rect(0, 0, 3, 3));
-        R_target2cam_[i].convertTo(R, CV_64F);
+        if(R_target2cam_[i].size() == Size(3, 3))
+            R_target2cam_[i].convertTo(R, CV_64F);
+        else
+            Rodrigues(R_target2cam_[i], R);

        Mat t = m(Rect(3, 0, 1, 3));
        t_target2cam_[i].convertTo(t, CV_64F);

--- a/modules/calib3d/test/test_calibration_hand_eye.cpp
+++ b/modules/calib3d/test/test_calibration_hand_eye.cpp
@@ -317,7 +317,10 @@ void CV_CalibrateHandEyeTest::simulateData(RNG& rng, int nPoses,
            t_gripper2base_noise.at<double>(2,0) += rng.gaussian(0.001);
        }

-        R_target2cam.push_back(T_target2cam(Rect(0, 0, 3, 3)));
+        // test rvec representation
+        Mat rvec_target2cam;
+        cv::Rodrigues(T_target2cam(Rect(0, 0, 3, 3)), rvec_target2cam);
+        R_target2cam.push_back(rvec_target2cam);
        t_target2cam.push_back(T_target2cam(Rect(3, 0, 1, 3)));
    }
 }

--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -1614,7 +1614,9 @@ elements.
 CV_EXPORTS_W bool checkRange(InputArray a, bool quiet = true, CV_OUT Point* pos = 0,
                            double minVal = -DBL_MAX, double maxVal = DBL_MAX);

-/** @brief converts NaN's to the given number
+/** @brief converts NaNs to the given number
+@param a input/output matrix (CV_32F type).
+@param val value to convert the NaNs to
 */
 CV_EXPORTS_W void patchNaNs(InputOutputArray a, double val = 0);


--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -600,6 +600,14 @@ CV__DNN_INLINE_NS_BEGIN
        static Ptr<RegionLayer> create(const LayerParams& params);
    };

+    /**
+     * @brief Detection output layer.
+     *
+     * The layer size is: @f$ (1 \times 1 \times N \times 7) @f$
+     *    where N is [keep_top_k] parameter multiplied by batch size. Each row is:
+     *    [image_id, label, confidence, xmin, ymin, xmax, ymax]
+     *    where image_id is the index of image input in the batch.
+     */
    class CV_EXPORTS DetectionOutputLayer : public Layer
    {
    public:

--- a/modules/dnn/src/darknet/darknet_io.cpp
+++ b/modules/dnn/src/darknet/darknet_io.cpp
@@ -221,6 +221,10 @@ namespace cv {
                {
                    cv::dnn::LayerParams activation_param;
                    if (type == "relu")
+                    {
+                        activation_param.type = "ReLU";
+                    }
+                    else if (type == "leaky")
                    {
                        activation_param.set<float>("negative_slope", 0.1f);
                        activation_param.type = "ReLU";
@@ -862,24 +866,8 @@ namespace cv {
                    }

                    std::string activation = getParam<std::string>(layer_params, "activation", "linear");
-                    if (activation == "leaky")
-                    {
-                        setParams.setActivation("relu");
-                    }
-                    else if (activation == "swish")
-                    {
-                        setParams.setActivation("swish");
-                    }
-                    else if (activation == "mish")
-                    {
-                        setParams.setActivation("mish");
-                    }
-                    else if (activation == "logistic")
-                    {
-                        setParams.setActivation("logistic");
-                    }
-                    else if (activation != "linear")
-                        CV_Error(cv::Error::StsParseError, "Unsupported activation: " + activation);
+                    if (activation != "linear")
+                        setParams.setActivation(activation);

                    net->out_channels_vec[layers_counter] = tensor_shape[0];
                }

--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -114,18 +114,19 @@ public:
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

-        CV_Assert(inputs.size() > 0);
+        CV_Assert((inputs.size() > outputs.size() && blobs.empty()) ||
+                  (!inputs.empty() && (blobs.size() == 1 || blobs.size() == 2)));
+        MatSize weightShape = blobs.empty() ? inputs[1].size : blobs[0].size;

-        CV_Assert(blobs.size() == 1 || blobs.size() == 2);
        CV_Assert(inputs[0].dims == outputs[0].dims);
-        CV_Assert(blobs[0].dims == kernel_size.size() + 2);
+        CV_Assert(weightShape.dims() == kernel_size.size() + 2);
        for (int i = 0; i < kernel_size.size(); i++) {
-            CV_Assert(blobs[0].size[i + 2] == kernel_size[i]);
+            CV_Assert(weightShape[i + 2] == kernel_size[i]);
        }

        const Mat &input = inputs[0];
        CV_Assert((input.dims == 4 || input.dims == 5) && (input.type() == CV_32F || input.type() == CV_16S));
-        for (size_t i = 0; i < inputs.size(); i++)
+        for (size_t i = 0; i < outputs.size(); i++)
        {
            CV_Assert(inputs[i].type() == input.type());
            CV_Assert((inputs[i].dims == 4 || inputs[i].dims == 5) && inputs[i].size[1] == input.size[1]);
@@ -270,6 +271,7 @@ public:

    MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE
    {
+        CV_Assert(!blobs.empty());
        int dims = inpShape.size();
        int inpD = dims == 5 ? inpShape[2] : 1;
        int inpH = inpShape[dims - 2];
@@ -296,6 +298,8 @@ public:
        {
            if (kernel_size.size() == 3)
                return preferableTarget == DNN_TARGET_CPU;
+            if ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || preferableTarget != DNN_TARGET_MYRIAD) && blobs.empty())
+                return false;
            return (preferableTarget != DNN_TARGET_MYRIAD || dilation.width == dilation.height);
        }
        else
@@ -305,7 +309,7 @@ public:
                return (preferableTarget == DNN_TARGET_CPU && backendId == DNN_BACKEND_OPENCV);
            else if (kernel_size.size() == 2)
                return backendId == DNN_BACKEND_OPENCV ||
-                       backendId == DNN_BACKEND_HALIDE ||
+                       (backendId == DNN_BACKEND_HALIDE && !blobs.empty()) ||
                       (backendId == DNN_BACKEND_VKCOM && haveVulkan());
            else
                return false;
@@ -317,16 +321,16 @@ public:
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
-        CV_Assert(blobs.size() != 0);
-        CV_Assert(!hasBias() || blobs[1].total() == (size_t)blobs[0].size[0]);
-        CV_Assert(inputs.size() == (size_t)1);
+        CV_Assert(!blobs.empty() || inputs.size() > 1);
+        const int* weightShape = blobs.empty() ? &inputs[1][0] : blobs[0].size.p;
+        CV_Assert(!hasBias() || blobs[1].total() == (size_t)weightShape[0]);

        internals.clear();

        CV_Assert(inputs.size() != 0);
        std::vector<int> inpShape(inputs[0].begin() + 2, inputs[0].end());

-        int outCn = blobs[0].size[0];
+        int outCn = weightShape[0];
        std::vector<int> outShape;
        outShape.push_back(inputs[0][0]);
        outShape.push_back(outCn);
@@ -342,10 +346,10 @@ public:
            getConvPoolOutParams(inpShape, kernel_size, strides, padMode, dilations, outShape);
        }

-        int ngroups = inpCn / blobs[0].size[1];
-        if (ngroups == 0 || ngroups * blobs[0].size[1] != inpCn)
+        int ngroups = inpCn / weightShape[1];
+        if (ngroups == 0 || ngroups * weightShape[1] != inpCn)
            CV_Error(Error::StsError, format("Number of input channels should "
-                     "be multiple of %d but got %d", blobs[0].size[1], inpCn));
+                     "be multiple of %d but got %d", weightShape[1], inpCn));
        CV_Assert(ngroups > 0 && inpCn % ngroups == 0 && outCn % ngroups == 0);

        outputs.resize(1, outShape);
@@ -357,15 +361,15 @@ public:
    {
        BaseConvolutionLayerImpl::finalize(inputs_arr, outputs_arr);

-        CV_Assert(!blobs.empty());
-        const int outCn = blobs[0].size[0];
+        std::vector<Mat> inputs;
+        inputs_arr.getMatVector(inputs);
        // prepare weightsMat where each row is aligned and has enough zero padding on the right to
        // use vectorized (i.e. with intrinsics) loops without tail processing
-        Mat wm = blobs[0].reshape(1, outCn);
+        Mat wm = blobs.empty() ? inputs[1].reshape(1, numOutput) : blobs[0].reshape(1, numOutput);
        if( wm.step1() % VEC_ALIGN != 0 )
        {
            int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
-            Mat wm_buffer = Mat(outCn, newcols, wm.type());
+            Mat wm_buffer = Mat(numOutput, newcols, wm.type());
            Mat wm_padding = wm_buffer.colRange(wm.cols, newcols);
            wm_padding.setTo(Scalar::all(0.));
            Mat wm_aligned = wm_buffer.colRange(0, wm.cols);
@@ -373,18 +377,18 @@ public:
            wm = wm_aligned;
        }
        weightsMat = wm;
-        weightsMultipliers.assign(outCn, 1.0);
+        weightsMultipliers.assign(numOutput, 1.0);

-        Mat biasMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat();
-        biasvec.resize(outCn+2);
+        Mat biasMat = hasBias() ? blobs[1].reshape(1, numOutput) : Mat();
+        biasvec.resize(numOutput+2);
        if( biasMat.empty() )
        {
-            for(int i = 0; i < outCn; i++ )
+            for(int i = 0; i < numOutput; i++ )
                biasvec[i] = 0.f;
        }
        else
        {
-            for(int i = 0; i < outCn; i++ )
+            for(int i = 0; i < numOutput; i++ )
                biasvec[i] = biasMat.at<float>(i);
        }
 #ifdef HAVE_OPENCL
@@ -394,7 +398,7 @@ public:

    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
-        if (!activ.empty() && !layer.empty())
+        if ((!activ.empty() && !layer.empty()) || blobs.empty())
            return false;

        activ = layer;
@@ -743,37 +747,48 @@ public:
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
-        CV_Assert_N(inputs.size() == 1, nodes.size() == 1);
+        CV_Assert_N(inputs.size() >= 1, nodes.size() >= 1);
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        std::vector<size_t> dims = ieInpNode->get_shape();
        CV_Assert(dims.size() == 4 || dims.size() == 5);
+        std::shared_ptr<ngraph::Node> ieWeights = nodes.size() > 1 ? nodes[1].dynamicCast<InfEngineNgraphNode>()->node : nullptr;
        const int inpCn = dims[1];
-        const int outCn = blobs[0].size[0];
-        const int inpGroupCn = blobs[0].size[1];
+        const int inpGroupCn = nodes.size() > 1 ? ieWeights->get_shape()[1] : blobs[0].size[1];
        const int group = inpCn / inpGroupCn;

-        std::vector<size_t> kernel_shape = getShape<size_t>(blobs[0]);
+        std::vector<size_t> kernel_shape;
        if (group != 1)
        {
-            kernel_shape[0] /= group;
-            kernel_shape.insert(kernel_shape.begin(), group);
+            kernel_shape.push_back(group);
        }
+        kernel_shape.push_back(numOutput / group);
+        kernel_shape.push_back(inpCn / group);
+        std::copy(kernel_size.begin(), kernel_size.end(), back_inserter(kernel_shape));

-        auto ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, blobs[0].data);
-        if (fusedWeights)
+        if (nodes.size() == 1)
        {
-            if (weightsMat.isContinuous())
-            {
-                ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, weightsMat.data);
-            }
-            else
+            ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, blobs[0].data);
+            if (fusedWeights)
            {
-                Mat newWeights;
-                Mat cvWeights = weightsMat.colRange(0, blobs[0].total() / outCn);
-                cvWeights.copyTo(newWeights);
-                ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, newWeights.data);
+                if (weightsMat.isContinuous())
+                {
+                    ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, weightsMat.data);
+                }
+                else
+                {
+                    Mat newWeights;
+                    Mat cvWeights = weightsMat.colRange(0, blobs[0].total() / numOutput);
+                    cvWeights.copyTo(newWeights);
+                    ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, newWeights.data);
+                }
            }
        }
+        else
+        {
+            auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
+                             ngraph::Shape{kernel_shape.size()}, kernel_shape.data());
+            ieWeights  = std::make_shared<ngraph::op::v1::Reshape>(ieWeights, shape, true);
+        }

        ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT;
        if (!padMode.empty())
@@ -798,11 +813,21 @@ public:
                                pad_type);
        }

-        if (hasBias() || fusedBias)
+        if (hasBias() || fusedBias || nodes.size() == 3)
        {
            std::vector<size_t> shape(conv_node->get_shape().size(), 1);
-            shape[1] = outCn;
-            auto bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), biasvec.data());
+            shape[1] = conv_node->get_shape()[1];
+            std::shared_ptr<ngraph::Node> bias;
+            if (nodes.size() == 3)
+            {
+                auto bias_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
+                                    ngraph::Shape{shape.size()}, shape.data());
+                bias = std::make_shared<ngraph::op::v1::Reshape>(nodes[2].dynamicCast<InfEngineNgraphNode>()->node, bias_shape, true);
+            }
+            else
+            {
+                bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), biasvec.data());
+            }
            auto conv_bias = std::make_shared<ngraph::op::v1::Add>(conv_node, bias, ngraph::op::AutoBroadcastType::NUMPY);
            return Ptr<BackendNode>(new InfEngineNgraphNode(conv_bias));
        }
@@ -1516,6 +1541,26 @@ public:
        for (int i = 0; i < inputs.size(); ++i)
            CV_Assert(inputs[i].u != outputs[0].u);

+        if (blobs.empty())
+        {
+            size_t n = inputs.size() - 1;
+            umat_blobs.resize(n);
+            for (size_t i = 0; i < n; i++)
+            {
+                if (use_half)
+                {
+                    Mat matFP32;
+                    convertFp16(inputs[i + 1], matFP32);
+                    matFP32.copyTo(umat_blobs[i]);
+                }
+                else
+                {
+                    inputs[i + 1].copyTo(umat_blobs[i]);
+                }
+            }
+            inputs.resize(1);
+        }
+
        if (umat_blobs.empty())
        {
            size_t n = blobs.size();
@@ -1526,7 +1571,7 @@ public:
            }
        }

-        if (convolutionOp.empty())
+        if (convolutionOp.empty() || blobs.empty())
        {
            OCL4DNNConvConfig config;
            config.in_shape = shape(inputs[0]);
@@ -1536,7 +1581,7 @@ public:
            config.stride = stride;
            config.dilation = dilation;
            config.group = inputs[0].size[1] / umat_blobs[0].size[1];
-            config.bias_term = (hasBias()) ? true : false;
+            config.bias_term = umat_blobs.size() == 2;
            config.use_half = use_half;

            convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
@@ -1663,16 +1708,37 @@ public:
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

+        int outCn = blobs.empty() ? inputs[1].size[0] : blobs[0].size[0];
+        // Need to align non-const blobs
+        if (blobs.empty())
+        {
+            Mat wm = inputs[1].reshape(1, outCn);
+            if( wm.step1() % VEC_ALIGN != 0 )
+            {
+                wm.copyTo(weightsMat);
+                if (inputs.size() > 2)
+                {
+                    Mat biasMat = inputs[2].reshape(1, outCn);
+                    biasMat.col(0).copyTo(biasvec);
+                    biasvec.resize(outCn + 2);
+                }
+                else
+                {
+                    biasvec.resize(outCn + 2, 0);
+                }
+            }
+        }
+
        /*printf("conv %s: input (%d x %d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n",
               name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2], inputs[0].size[3],
               kernel.width, kernel.height, pad.width, pad.height,
               stride.width, stride.height, dilation.width, dilation.height);*/
-        CV_Assert_N(inputs.size() == (size_t)1, inputs[0].size[1] % blobs[0].size[1] == 0,
+        int inpGroupCn = blobs.empty() ? inputs[1].size[1] : blobs[0].size[1];
+        CV_Assert_N(inputs.size() >= (size_t)1, inputs[0].size[1] % inpGroupCn == 0,
                    outputs.size() == 1, inputs[0].data != outputs[0].data);

-        int ngroups = inputs[0].size[1]/blobs[0].size[1];
+        int ngroups = inputs[0].size[1] / inpGroupCn;
        CV_Assert(outputs[0].size[1] % ngroups == 0);
-        int outCn = blobs[0].size[0];

        reluslope.clear();
        if( activ )
@@ -1810,11 +1876,11 @@ public:
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
-        CV_Assert(inputs.size() == outputs.size());
+        CV_Assert(inputs.size() == outputs.size() || inputs.size() == outputs.size() + blobs.size());

        int64 flops = 0;
        int karea = std::accumulate(kernel_size.begin(), kernel_size.end(), 1, std::multiplies<size_t>());
-        for (int i = 0; i < inputs.size(); i++)
+        for (int i = 0; i < outputs.size(); i++)
        {
            flops += total(outputs[i])*(CV_BIG_INT(2)*karea*inputs[i][1] + 1);
        }

--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -587,7 +587,7 @@ public:
        }
        else
        {
-            std::vector<size_t> data = {(size_t)ieInpNode->get_shape()[0], (size_t)blobs[0].size[1]};
+            std::vector<int64_t> data = {(int64_t)ieInpNode->get_shape()[0], (int64_t)blobs[0].size[1]};
            auto new_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, data.data());
            auto inp = std::make_shared<ngraph::op::v1::Reshape>(ieInpNode, new_shape, true);


--- a/modules/dnn/src/layers/permute_layer.cpp
+++ b/modules/dnn/src/layers/permute_layer.cpp
@@ -397,8 +397,9 @@ public:
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        std::vector<int64_t> order(_order.begin(), _order.end());
        auto tr_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                       ngraph::Shape({_order.size()}), _order.data());
+                       ngraph::Shape({order.size()}), order.data());
        auto transpose = std::make_shared<ngraph::op::Transpose>(ieInpNode, tr_axes);
        return Ptr<BackendNode>(new InfEngineNgraphNode(transpose));
    }

--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -167,6 +167,10 @@ public:

    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
+#ifdef HAVE_OPENCL
+        ocl_exec_cache.clear();
+#endif
+
        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
@@ -221,26 +225,33 @@ public:
    }

 #ifdef HAVE_OPENCL
-    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
+    struct OpenCLExecInfo
    {
-        std::vector<UMat> inputs;
-        std::vector<UMat> outputs;
+        std::string kernel_name;
+        std::string build_opts;
+        size_t local_size[2];
+        size_t global_size[2];

-        inputs_.getUMatVector(inputs);
-        outputs_.getUMatVector(outputs);
+        OpenCLExecInfo()
+        {
+            local_size[0] = local_size[1] = 0;
+            global_size[0] = global_size[1] = 0;
+        }
+    };
+    std::vector<OpenCLExecInfo> ocl_exec_cache;
+
+    void ocl_prepare(const std::vector<UMat>& inputs, const std::vector<UMat>& outputs)
+    {
+        CV_TRACE_FUNCTION();

        CV_Assert(outputs.size() == finalSliceRanges.size());
+        ocl_exec_cache.resize(outputs.size());

        const UMat& input = inputs[0];
-        if (input.dims > 5)
-        {
-            CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << input.dims << ". Fallback to CPU");
-            return false;
-        }
+        const int dims = input.dims;

        size_t WSZ = 128;

-        const int dims = input.dims;
        const int elemSize = (int)input.elemSize();
        String opts0 = cv::format(
                "-DDIMS=%d -DELEMSIZE=%d",
@@ -250,10 +261,11 @@ public:
        {
            opts0 += cv::format(" -DSRC_STEP_%d=%d", d, (int)input.step[dims - 1 - d]);
        }
-        String kname = cv::format("slice_%d", dims);
        for (size_t i = 0; i < outputs.size(); i++)
        {
-            UMat& output = outputs[i];
+            OpenCLExecInfo& ocl = ocl_exec_cache[i];
+
+            const UMat& output = outputs[i];
            const std::vector<Range>& range = finalSliceRanges[i];

            String opts = opts0;
@@ -269,6 +281,8 @@ public:
                CV_CheckEQ(range[d].size(), (int)output.size[d], "");
            }

+            const size_t param_LIMIT_BLOCK_SIZE_PER_WG = WSZ * 64;
+
            int block_dims = 0;
            size_t block_size = elemSize;
            for (int i = dims - 1; i >= 0; --i)
@@ -277,12 +291,14 @@ public:
                    break;
                block_size *= output.size[i];
                block_dims++;
+                if (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG)
+                    break;
            }

            const size_t total = output.total() * elemSize;
            size_t num_blocks = total / block_size;

-            if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= WSZ * 64))
+            if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG))
            {
                // use 1D copy mode
                opts += cv::format(" -DUSE_COPY_1D=1");
@@ -352,23 +368,98 @@ public:

            opts += cv::format(" -DWSZ=%d", (int)WSZ);

-            size_t local[] = { WSZ, 1 };
-            size_t global[] = { WSZ, num_blocks };
+            std::ostringstream kernel_suffix;
+            kernel_suffix << dims << 'x' << elemSize << "_bsz" << block_size;
+            kernel_suffix << "__src_";
+            for (int d = 0; d < dims; d++)
+            {
+                kernel_suffix << input.size[dims - 1 - d] << '_';
+            }
+            kernel_suffix << '_';
+            /*for (int d = 0; d < dims; d++)
+            {
+                kernel_suffix << input.step[dims - 1 - d] << '_';
+            }
+            kernel_suffix << '_';*/

-            ocl::Kernel kernel(kname.c_str(), ocl::dnn::slice_oclsrc, opts);
+            kernel_suffix << "dst_";
+            for (int d = 0; d < dims; d++)
+            {
+                kernel_suffix << output.size[dims - 1 - d] << '_';
+            }
+            /*kernel_suffix << '_';
+            for (int d = 0; d < dims; d++)
+            {
+                kernel_suffix << output.step[dims - 1 - d] << '_';
+            }*/
+            kernel_suffix << "_slice_";
+            for (int d = 0; d < dims; d++)
+            {
+                kernel_suffix << range[dims - 1 - d].start << '_';
+            }
+            for (int d = 0; d < dims; d++)
+            {
+                kernel_suffix << '_' << range[dims - 1 - d].end;
+            }
+
+            std::string kernel_suffix_str = kernel_suffix.str();
+            opts += cv::format(" -DSLICE_KERNEL_SUFFIX=%s", kernel_suffix_str.c_str());
+
+            ocl.kernel_name = cv::format("slice_%s", kernel_suffix_str.c_str());
+            ocl.build_opts = opts;
+            ocl.local_size[0] = WSZ;
+            ocl.local_size[1] = 1;
+            ocl.global_size[0] = WSZ;
+            ocl.global_size[1] = num_blocks;
+        }  // for outputs.size()
+    }  // ocl_prepare
+
+    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
+    {
+        CV_TRACE_FUNCTION();
+
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inputs_.getUMatVector(inputs);
+        outputs_.getUMatVector(outputs);
+
+        CV_Assert(outputs.size() == finalSliceRanges.size());
+
+        const UMat& input = inputs[0];
+        const int dims = input.dims;
+        if (dims > 5)
+        {
+            CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << dims << ". Fallback to CPU");
+            return false;
+        }
+
+        if (ocl_exec_cache.empty())
+        {
+            ocl_prepare(inputs, outputs);
+        }
+        CV_CheckEQ(ocl_exec_cache.size(), outputs.size(), "");
+
+        for (size_t i = 0; i < outputs.size(); i++)
+        {
+            const OpenCLExecInfo& ocl = ocl_exec_cache[i];
+
+            UMat& output = outputs[i];
+
+            ocl::Kernel kernel(ocl.kernel_name.c_str(), ocl::dnn::slice_oclsrc, ocl.build_opts);
            if (kernel.empty())
                return false;
            bool ret = kernel.args(
                    ocl::KernelArg::PtrReadOnly(input),
                    ocl::KernelArg::PtrWriteOnly(output)
                )
-                .run(2, global, local, false);
+                .run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
            if (!ret)
                return false;
        }  // for outputs.size()

        return true;
-        }
+    }  // forward_ocl
 #endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE

--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -1003,10 +1003,13 @@ void ONNXImporter::populateNet(Net dstNet)
            CV_Assert(node_proto.input_size() >= 2);
            layerParams.type = "Convolution";
            for (int j = 1; j < node_proto.input_size(); j++) {
-                layerParams.blobs.push_back(getBlob(node_proto, constBlobs, j));
+                if (constBlobs.find(node_proto.input(j)) != constBlobs.end())
+                {
+                    layerParams.blobs.push_back(getBlob(node_proto, constBlobs, j));
+                }
            }
-            layerParams.set("num_output", layerParams.blobs[0].size[0]);
-            layerParams.set("bias_term", node_proto.input_size() == 3);
+            int outCn = layerParams.blobs.empty() ? outShapes[node_proto.input(1)][0] : layerParams.blobs[0].size[0];
+            layerParams.set("num_output", outCn);
        }
        else if (layer_type == "ConvTranspose")
        {

--- a/modules/dnn/src/opencl/slice.cl
+++ b/modules/dnn/src/opencl/slice.cl
@@ -48,19 +48,85 @@ global: <WSZ, number_of_copy_blocks, 1>
 #define BLOCK_COLS_X4 (BLOCK_COLS / 4)
 #define BLOCK_COLS_X16 (BLOCK_COLS / 16)

-#ifdef USE_COPY_1D
-
-static inline
-__attribute__((always_inline))
-void copy_block_1d(
+__attribute__((reqd_work_group_size(WSZ, 1, 1)))
+__kernel void
+CONCAT(slice_, SLICE_KERNEL_SUFFIX)(
    __global const uchar* src0,
-    const uint src_offset,
-    __global uchar* dst0,
-    const uint dst_offset
+    __global uchar* dst0
 )
 {
-    __global const uchar* src = src0 + src_offset;
-    __global uchar* dst = dst0 + dst_offset;
+    uint block_id = get_global_id(1);
+    uint dst_offset0 = block_id * BLOCK_SIZE;
+    uint src_offset0 = 0;
+
+    {  // calculate src_offset0
+
+#define CALC_SRC_INDEX(dim) \
+    { \
+    uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \
+    CONCAT(idx_, dim) = block_id / plane_sz; \
+    block_id = block_id - CONCAT(idx_, dim) * plane_sz; \
+    }
+#define UPDATE_SRC_OFFSET(dim) \
+    src_offset0 = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset0);
+/*
+    if (get_global_id(0) == 0 && get_global_id(1) == 0) \
+        printf("(%d, %d): @%d src_offset0=%d   idx_dim=%d   block_id=%d\n", \
+            get_global_id(0), get_global_id(1), \
+            dim, src_offset0, CONCAT(idx_, dim), block_id \
+        );
+*/
+
+#if DIMS > 5
+#error "invalid configuration"
+#endif
+#if DIMS > 4
+    uint idx_4 = 0;
+#if BLOCK_DIMS <= 4
+    CALC_SRC_INDEX(4)
+#endif
+    UPDATE_SRC_OFFSET(4)
+#endif
+#if DIMS > 3
+    uint idx_3 = 0;
+#if BLOCK_DIMS <= 3
+    CALC_SRC_INDEX(3)
+#endif
+    UPDATE_SRC_OFFSET(3)
+#endif
+#if DIMS > 2
+    uint idx_2 = 0;
+#if BLOCK_DIMS <= 2
+    CALC_SRC_INDEX(2)
+#endif
+    UPDATE_SRC_OFFSET(2)
+#endif
+#if DIMS > 1
+    uint idx_1 = 0;
+#if BLOCK_DIMS <= 1
+    CALC_SRC_INDEX(1)
+#endif
+    UPDATE_SRC_OFFSET(1)
+#endif
+#if DIMS > 0
+    uint idx_0 = 0;
+    UPDATE_SRC_OFFSET(0)
+#endif
+
+/*
+    if (get_global_id(0) == 0)
+        printf("(%d, %d): src_offset0=%d dst_offset0=%d\n",
+            get_global_id(0), get_global_id(1),
+            src_offset0, dst_offset0
+        );
+*/
+
+    }  // calculate src_offset0
+
+#ifdef USE_COPY_1D
+    {  // copy_block_1d
+    __global const uchar* src = src0 + src_offset0;
+    __global uchar* dst = dst0 + dst_offset0;

    uint processed = 0;

@@ -70,8 +136,9 @@ void copy_block_1d(
        uint i = get_local_id(0) * 16;  // uchar16
        while (i < BLOCK_COLS_X16 * 16)
        {
-            uint4 idx = (uint4)(i, i + 16 * WSZ, i + 32 * WSZ, i + 48 * WSZ);
-            idx = select((uint4)i, idx, idx < (BLOCK_COLS_X16 * 16));
+            uint4 idx0 = (uint4)i;
+            uint4 idx = idx0 + (uint4)(0, 16 * WSZ, 32 * WSZ, 48 * WSZ);
+            idx = select(idx0, idx, idx < (BLOCK_COLS_X16 * 16));

            uchar16 a0 = vload16(0, src + idx.s0);
            uchar16 a1 = vload16(0, src + idx.s1);
@@ -97,8 +164,9 @@ void copy_block_1d(
        uint i = get_local_id(0) * 4 + processed;  // uchar4
        while (i < BLOCK_COLS_X4 * 4)
        {
-            uint4 idx = (uint4)(i, i + 4 * WSZ, i + 8 * WSZ, i + 12 * WSZ);
-            idx = select((uint4)i, idx, idx < (BLOCK_COLS_X4 * 4));
+            uint4 idx0 = (uint4)i;
+            uint4 idx = idx0 + (uint4)(0, 4 * WSZ, 8 * WSZ, 12 * WSZ);
+            idx = select(idx0, idx, idx < (BLOCK_COLS_X4 * 4));

            uchar4 a0 = vload4(0, src + idx.s0);
            uchar4 a1 = vload4(0, src + idx.s1);
@@ -130,19 +198,11 @@ void copy_block_1d(
        }
    }
 #endif
-}
+    }  // copy_block_1d

-#else  // USE_COPY_1D
+#else

-static inline
-__attribute__((always_inline))
-void copy_block_2d(
-    __global const uchar* src0,
-    const uint src_offset0,
-    __global uchar* dst0,
-    const uint dst_offset0
-)
-{
+    {  // copy_block_2d
    __global const uchar* src = src0 + src_offset0;
    __global uchar* dst = dst0 + dst_offset0;

@@ -199,85 +259,6 @@ void copy_block_2d(
 #endif  // BLOCK_COLS_FILL_X4 != BLOCK_COLS
        i += WSZ * 4;
    }
-}
-
-#endif  // USE_COPY_1D
-
-__kernel void
-CONCAT(slice_, DIMS)(
-    __global const uchar* src,
-    __global uchar* dst
-)
-{
-    uint block_id = get_global_id(1);
-
-    uint dst_offset = block_id * BLOCK_SIZE;
-
-    uint src_offset = 0;
-
-#define CALC_SRC_INDEX(dim) \
-    { \
-    uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \
-    CONCAT(idx_, dim) = block_id / plane_sz; \
-    block_id = block_id - CONCAT(idx_, dim) * plane_sz; \
-    }
-#define UPDATE_SRC_OFFSET(dim) \
-    src_offset = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset);
-/*
-    if (get_global_id(0) == 0 && get_global_id(1) == 0) \
-        printf("(%d, %d): @%d src_offset=%d   idx_dim=%d   block_id=%d\n", \
-            get_global_id(0), get_global_id(1), \
-            dim, src_offset, CONCAT(idx_, dim), block_id \
-        );
-*/
-
-#if DIMS > 5
-#error "invalid configuration"
-#endif
-#if DIMS > 4
-    uint idx_4 = 0;
-#if BLOCK_DIMS <= 4
-    CALC_SRC_INDEX(4)
-#endif
-    UPDATE_SRC_OFFSET(4)
-#endif
-#if DIMS > 3
-    uint idx_3 = 0;
-#if BLOCK_DIMS <= 3
-    CALC_SRC_INDEX(3)
-#endif
-    UPDATE_SRC_OFFSET(3)
-#endif
-#if DIMS > 2
-    uint idx_2 = 0;
-#if BLOCK_DIMS <= 2
-    CALC_SRC_INDEX(2)
-#endif
-    UPDATE_SRC_OFFSET(2)
-#endif
-#if DIMS > 1
-    uint idx_1 = 0;
-#if BLOCK_DIMS <= 1
-    CALC_SRC_INDEX(1)
-#endif
-    UPDATE_SRC_OFFSET(1)
-#endif
-#if DIMS > 0
-    uint idx_0 = 0;
-    UPDATE_SRC_OFFSET(0)
-#endif
-
-/*
-    if (get_global_id(0) == 0)
-        printf("(%d, %d): src_offset=%d dst_offset=%d\n",
-            get_global_id(0), get_global_id(1),
-            src_offset, dst_offset
-        );
-*/
-
-#ifdef USE_COPY_1D
-    copy_block_1d(src, src_offset, dst, dst_offset);
-#else
-    copy_block_2d(src, src_offset, dst, dst_offset);
+    }  // copy_block_2d
 #endif
 }
--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
@@ -784,6 +784,11 @@ TEST_P(Test_Darknet_layers, connected)
    testDarknetLayer("connected", true);
 }

+// Runs the testDarknetLayer helper for the "relu" test case, using the helper's
+// default value for its second argument.
+TEST_P(Test_Darknet_layers, relu)
+{
+    testDarknetLayer("relu");
+}
+
 INSTANTIATE_TEST_CASE_P(/**/, Test_Darknet_layers, dnnBackendsAndTargets());

 }} // namespace
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -1133,6 +1133,9 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy)
    const Backend backendId = get<0>(GetParam());
    const Target targetId = get<1>(GetParam());

+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+
    if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        throw SkipTestException("No support for async forward");

@@ -1143,9 +1146,8 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy)
    else
        FAIL() << "Unknown backendId";

-    std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
    Net netDefault = readNet(_tf("layer_convolution.caffemodel"), _tf("layer_convolution.prototxt"));
-    Net net = readNet(_tf("layer_convolution" + suffix + ".xml"), _tf("layer_convolution" + suffix + ".bin"));
+    Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin"));

    Mat inp = blobFromNPY(_tf("blob.npy"));

@@ -1165,7 +1167,10 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy)

    std::vector<int> outLayers = net.getUnconnectedOutLayers();
    ASSERT_EQ(net.getLayer(outLayers[0])->name, "output");
-    ASSERT_EQ(net.getLayer(outLayers[0])->type, "Convolution");
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
+        ASSERT_EQ(net.getLayer(outLayers[0])->type, "Convolution");
+    else
+        ASSERT_EQ(net.getLayer(outLayers[0])->type, "Add");
 }

 TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8)
@@ -1173,6 +1178,9 @@ TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8)
    const Backend backendId = get<0>(GetParam());
    const Target targetId = get<1>(GetParam());

+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+
    if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        throw SkipTestException("No support for async forward");

@@ -1189,12 +1197,10 @@ TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8)
    randu(inputs[0], 0, 255);
    inputs[0].convertTo(inputs[1], CV_32F);

-    std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
-
    Mat outs[2];
    for (int i = 0; i < 2; ++i)
    {
-        Net net = readNet(_tf("layer_convolution" + suffix + ".xml"), _tf("layer_convolution" + suffix + ".bin"));
+        Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin"));
        net.setPreferableBackend(backendId);
        net.setPreferableTarget(targetId);
        net.setInput(inputs[i]);
@@ -1210,6 +1216,9 @@ TEST_P(Layer_Test_Convolution_DLDT, multithreading)
    const Backend backendId = get<0>(GetParam());
    const Target targetId = get<1>(GetParam());

+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+
    if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        throw SkipTestException("No support for async forward");

@@ -1220,9 +1229,8 @@ TEST_P(Layer_Test_Convolution_DLDT, multithreading)
    else
        FAIL() << "Unknown backendId";

-    std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
-    std::string xmlPath = _tf("layer_convolution" + suffix + ".xml");
-    std::string binPath = _tf("layer_convolution" + suffix + ".bin");
+    std::string xmlPath = _tf("layer_convolution.xml");
+    std::string binPath = _tf("layer_convolution.bin");
    Net firstNet = readNet(xmlPath, binPath);
    Net secondNet = readNet(xmlPath, binPath);
    Mat inp = blobFromNPY(_tf("blob.npy"));
@@ -1281,8 +1289,7 @@ TEST_P(Test_DLDT_two_inputs_3dim, as_IR)
    int secondInpType = get<1>(GetParam());
    Target targetId = get<2>(GetParam());

-    std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
-    Net net = readNet(_tf("net_two_inputs" + suffix + ".xml"), _tf("net_two_inputs.bin"));
+    Net net = readNet(_tf("net_two_inputs.xml"), _tf("net_two_inputs.bin"));
    std::vector<int> inpSize = get<3>(GetParam());
    Mat firstInp(3, inpSize.data(), firstInpType);
    Mat secondInp(3, inpSize.data(), secondInpType);

--- a/modules/dnn/test/test_misc.cpp
+++ b/modules/dnn/test/test_misc.cpp
@@ -444,12 +444,14 @@ TEST_P(Async, model_optimizer_pipeline_set_and_forward_single)
    const Backend backendId = get<0>(get<1>(GetParam()));
    const Target targetId = get<1>(get<1>(GetParam()));

+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+
    if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        throw SkipTestException("No support for async forward");

-    const std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
-    const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
-    const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+    const std::string& model = findDataFile("dnn/layers/layer_convolution.bin");
+    const std::string& proto = findDataFile("dnn/layers/layer_convolution.xml");

    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
        setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API);
@@ -503,12 +505,14 @@ TEST_P(Async, model_optimizer_pipeline_set_and_forward_all)
    const Backend backendId = get<0>(get<1>(GetParam()));
    const Target targetId = get<1>(get<1>(GetParam()));

+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+
    if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        throw SkipTestException("No support for async forward");

-    const std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
-    const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
-    const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+    const std::string& model = findDataFile("dnn/layers/layer_convolution.bin");
+    const std::string& proto = findDataFile("dnn/layers/layer_convolution.xml");

    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
        setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API);
@@ -677,9 +681,11 @@ TEST_P(Test_Model_Optimizer, forward_two_nets)
    const Backend backendId = get<0>(GetParam());
    const Target targetId = get<1>(GetParam());

-    const std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
-    const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
-    const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+
+    const std::string& model = findDataFile("dnn/layers/layer_convolution.bin");
+    const std::string& proto = findDataFile("dnn/layers/layer_convolution.xml");

    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
        setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API);
@@ -716,12 +722,14 @@ TEST_P(Test_Model_Optimizer, readFromBuffer)
    const Backend backendId = get<0>(GetParam());
    const Target targetId = get<1>(GetParam());

+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+
    if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        throw SkipTestException("No support for async forward");

-    const std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
-    const std::string& weightsFile = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
-    const std::string& modelFile = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+    const std::string& weightsFile = findDataFile("dnn/layers/layer_convolution.bin");
+    const std::string& modelFile = findDataFile("dnn/layers/layer_convolution.xml");

    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
        setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API);
@@ -769,8 +777,11 @@ TEST_P(Test_Model_Optimizer, flexible_inputs)
    const Backend backendId = get<0>(GetParam());
    const Target targetId = get<1>(GetParam());

-    const std::string& model = findDataFile("dnn/layers/layer_convolution_fp16.bin");
-    const std::string& proto = findDataFile("dnn/layers/layer_convolution_fp16.xml");
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+
+    const std::string& model = findDataFile("dnn/layers/layer_convolution.bin");
+    const std::string& proto = findDataFile("dnn/layers/layer_convolution.xml");

    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
        setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API);

--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -114,6 +114,62 @@ TEST_P(Test_ONNX_layers, Convolution)
    testONNXModels("convolution");
 }

+// Convolution model whose weights are supplied at run time through net.setInput().
+TEST_P(Test_ONNX_layers, Convolution_variable_weight)
+{
+    const bool isInferenceEngine = backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ||
+                                   backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019;
+    if (isInferenceEngine && target == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+
+    const String modelName = "conv_variable_w";
+    Net net = readNetFromONNX(_tf("models/" + modelName + ".onnx"));
+    ASSERT_FALSE(net.empty());
+
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);
+
+    // Two recorded data sets are pushed through the same network instance.
+    for (int sample = 0; sample < 2; ++sample)
+    {
+        const String inputPrefix = "data/input_" + modelName + format("_%d", sample);
+        const String outputPrefix = "data/output_" + modelName + format("_%d", sample);
+
+        Mat input = blobFromNPY(_tf(inputPrefix + "_0.npy"));
+        Mat weights = blobFromNPY(_tf(inputPrefix + "_1.npy"));
+        Mat ref = blobFromNPY(_tf(outputPrefix + ".npy"));
+
+        net.setInput(input, "0");
+        net.setInput(weights, "1");
+
+        Mat out = net.forward();
+        normAssert(ref, out, "", default_l1, default_lInf);
+    }
+}
+
+// Convolution model whose weights and bias are both supplied at run time through
+// net.setInput().
+TEST_P(Test_ONNX_layers, Convolution_variable_weight_bias)
+{
+    const bool isInferenceEngine = backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ||
+                                   backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019;
+    if (isInferenceEngine && target == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+
+    const String modelName = "conv_variable_wb";
+    Net net = readNetFromONNX(_tf("models/" + modelName + ".onnx"));
+    ASSERT_FALSE(net.empty());
+
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);
+
+    // Two recorded data sets are pushed through the same network instance.
+    for (int sample = 0; sample < 2; ++sample)
+    {
+        const String inputPrefix = "data/input_" + modelName + format("_%d", sample);
+        const String outputPrefix = "data/output_" + modelName + format("_%d", sample);
+
+        Mat input = blobFromNPY(_tf(inputPrefix + "_0.npy"));
+        Mat weights = blobFromNPY(_tf(inputPrefix + "_1.npy"));
+        Mat bias = blobFromNPY(_tf(inputPrefix + "_2.npy"));
+        Mat ref = blobFromNPY(_tf(outputPrefix + ".npy"));
+
+        net.setInput(input, "0");
+        net.setInput(weights, "1");
+        net.setInput(bias, "bias");
+
+        Mat out = net.forward();
+        normAssert(ref, out, "", default_l1, default_lInf);
+    }
+}
+
 TEST_P(Test_ONNX_layers, Gather)
 {
    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD)

--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@@ -245,6 +245,31 @@ typedef Feature2D DescriptorExtractor;
 //! @{


+/** @brief Wrapper that makes a detector/extractor affine invariant,
+described as ASIFT in @cite YM11 .
+*/
+class CV_EXPORTS_W AffineFeature : public Feature2D
+{
+public:
+    /**
+    @param backend The detector/extractor you want to use as backend.
+    @param maxTilt The highest power index of tilt factor. 5 is used in the paper as tilt sampling range n.
+    @param minTilt The lowest power index of tilt factor. 0 is used in the paper.
+    @param tiltStep Tilt sampling step \f$\delta_t\f$ in Algorithm 1 in the paper.
+    @param rotateStepBase Rotation sampling step factor b in Algorithm 1 in the paper.
+    */
+    CV_WRAP static Ptr<AffineFeature> create(const Ptr<Feature2D>& backend,
+        int maxTilt = 5, int minTilt = 0, float tiltStep = 1.4142135623730951f, float rotateStepBase = 72);
+
+    /** @brief Sets the tilt and roll parameters of the simulated views.
+    @param tilts Tilt factor of each view.
+    @param rolls Roll of each view. The implementation in affine_feature.cpp asserts that
+    tilts and rolls have the same number of elements.
+    */
+    CV_WRAP virtual void setViewParams(const std::vector<float>& tilts, const std::vector<float>& rolls) = 0;
+    /** @brief Retrieves the tilt and roll parameters currently in use.
+    @param tilts Output vector of tilt factors.
+    @param rolls Output vector of rolls.
+    */
+    CV_WRAP virtual void getViewParams(std::vector<float>& tilts, std::vector<float>& rolls) const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+typedef AffineFeature AffineFeatureDetector;
+typedef AffineFeature AffineDescriptorExtractor;
+
+
 /** @brief Class for extracting keypoints and computing descriptors using the Scale Invariant Feature Transform
 (SIFT) algorithm by D. Lowe @cite Lowe04 .
 */

--- a/modules/features2d/src/affine_feature.cpp
+++ b/modules/features2d/src/affine_feature.cpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// This file is based on code issued with the following license.
+/*********************************************************************
+* Software License Agreement (BSD License)
+*
+*  Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+*  Copyright (C) 2008-2013, Willow Garage Inc., all rights reserved.
+*  Copyright (C) 2013, Evgeny Toropov, all rights reserved.
+*  Third party copyrights are property of their respective owners.
+*
+*  Redistribution and use in source and binary forms, with or without
+*  modification, are permitted provided that the following conditions
+*  are met:
+*
+*   * Redistributions of source code must retain the above copyright
+*     notice, this list of conditions and the following disclaimer.
+*   * Redistributions in binary form must reproduce the above
+*     copyright notice, this list of conditions and the following
+*     disclaimer in the documentation and/or other materials provided
+*     with the distribution.
+*   * The name of the copyright holders may not be used to endorse
+*     or promote products derived from this software without specific
+*     prior written permission.
+*
+*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+*  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+*  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+*  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+*  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+*  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+*  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+*  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+*  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+*  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+*  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+*  POSSIBILITY OF SUCH DAMAGE.
+*********************************************************************/
+
+/*
+ Guoshen Yu, Jean-Michel Morel, ASIFT: An Algorithm for Fully Affine
+ Invariant Comparison,  Image Processing On Line, 1 (2011), pp. 11–38.
+ https://doi.org/10.5201/ipol.2011.my-asift
+ */
+
+#include "precomp.hpp"
+#include <iostream>
+namespace cv {
+
+// Concrete AffineFeature. Holds the wrapped backend together with the table of
+// (tilt, roll) views and forwards descriptor metadata queries to the backend.
+class AffineFeature_Impl CV_FINAL : public AffineFeature
+{
+public:
+    explicit AffineFeature_Impl(const Ptr<Feature2D>& backend,
+            int maxTilt, int minTilt, float tiltStep, float rotateStepBase);
+
+    // Descriptor size, type and matching norm are whatever the backend reports.
+    int descriptorSize() const CV_OVERRIDE { return backend_->descriptorSize(); }
+    int descriptorType() const CV_OVERRIDE { return backend_->descriptorType(); }
+    int defaultNorm() const CV_OVERRIDE { return backend_->defaultNorm(); }
+
+    void detectAndCompute(InputArray image, InputArray mask, std::vector<KeyPoint>& keypoints,
+            OutputArray descriptors, bool useProvidedKeypoints=false) CV_OVERRIDE;
+
+    void setViewParams(const std::vector<float>& tilts, const std::vector<float>& rolls) CV_OVERRIDE;
+    void getViewParams(std::vector<float>& tilts, std::vector<float>& rolls) const CV_OVERRIDE;
+
+protected:
+    // Distributes keypoints into per-view buckets, using KeyPoint::class_id as the view index.
+    void splitKeypointsByView(const std::vector<KeyPoint>& keypoints_,
+            std::vector< std::vector<KeyPoint> >& keypointsByView) const;
+
+    const Ptr<Feature2D> backend_;
+    int maxTilt_;
+    int minTilt_;
+    float tiltStep_;
+    float rotateStepBase_;
+
+    // tilts_[i] and rolls_[i] together describe view i.
+    std::vector<float> tilts_;  // tilt factors
+    std::vector<float> rolls_;  // roll factors
+
+private:
+    // Declared but intentionally unusable: instances are neither copied nor assigned.
+    AffineFeature_Impl(const AffineFeature_Impl &); // copy disabled
+    AffineFeature_Impl& operator=(const AffineFeature_Impl &); // assign disabled
+};
+
+// Builds the default view table, one (tilt, roll) pair per simulated view.
+//
+// Power index k in [minTilt, maxTilt] stands for tilt = tiltStep^k. When minTilt is 0
+// the unwarped view (tilt 1, roll 0) is added first. For every remaining index the
+// rolls are sampled in [0, 180) with step rotateStepBase / tilt.
+AffineFeature_Impl::AffineFeature_Impl(const Ptr<FeatureDetector>& backend,
+        int maxTilt, int minTilt, float tiltStep, float rotateStepBase)
+    : backend_(backend), maxTilt_(maxTilt), minTilt_(minTilt), tiltStep_(tiltStep), rotateStepBase_(rotateStepBase)
+{
+    int i = minTilt_;
+    if( i == 0 )
+    {
+        tilts_.push_back(1);
+        rolls_.push_back(0);
+        i++;
+    }
+    // Seed tilt with tiltStep^(i-1) so the first loop iteration below yields
+    // tiltStep^i, i.e. the tilt that belongs to the first requested power index.
+    // For i <= 1 the seed stays 1.
+    float tilt = 1;
+    for( int k = 1; k < i; k++ )
+        tilt *= tiltStep_;
+    for( ; i <= maxTilt_; i++ )
+    {
+        tilt *= tiltStep_;
+        float rotateStep = rotateStepBase_ / tilt;
+        int rollN = cvFloor(180.0f / rotateStep);
+        // Keep rolls strictly below 180: drop the last sample when it lands exactly on it.
+        if( rollN * rotateStep == 180.0f )
+            rollN--;
+        for( int j = 0; j <= rollN; j++ )
+        {
+            tilts_.push_back(tilt);
+            rolls_.push_back(rotateStep * j);
+        }
+    }
+}
+
+// Replaces the view table with caller-supplied values; both vectors must be equally long.
+void AffineFeature_Impl::setViewParams(const std::vector<float>& tilts,
+        const std::vector<float>& rolls)
+{
+    CV_Assert(tilts.size() == rolls.size());
+    // Copy-and-swap: build the new tables first, then exchange them with the members.
+    std::vector<float>(tilts).swap(tilts_);
+    std::vector<float>(rolls).swap(rolls_);
+}
+
+// Copies the current view table into the caller's vectors.
+void AffineFeature_Impl::getViewParams(std::vector<float>& tilts,
+        std::vector<float>& rolls) const
+{
+    std::vector<float>(tilts_).swap(tilts);
+    std::vector<float>(rolls_).swap(rolls);
+}
+
+// Appends each keypoint to keypointsByView[class_id]. The class_id must be a valid
+// view index, i.e. within [0, tilts_.size()).
+void AffineFeature_Impl::splitKeypointsByView(const std::vector<KeyPoint>& keypoints_,
+        std::vector< std::vector<KeyPoint> >& keypointsByView) const
+{
+    typedef std::vector<KeyPoint>::const_iterator KeyPointIt;
+    for( KeyPointIt it = keypoints_.begin(); it != keypoints_.end(); ++it )
+    {
+        const KeyPoint& kp = *it;
+        CV_Assert( kp.class_id >= 0 && kp.class_id < (int)tilts_.size() );
+        keypointsByView[kp.class_id].push_back(kp);
+    }
+}
+
+// Parallel loop body that processes the views with indices [range.start, range.end).
+//
+// For view index a it warps the image and mask according to (tilts[a], rolls[a]),
+// runs the backend on the warped image and stores the results in
+// keypointsCollection[a] and descriptorCollection[a]. All members are references to
+// objects owned by the caller, so the body must not outlive them.
+class skewedDetectAndCompute : public ParallelLoopBody
+{
+public:
+    skewedDetectAndCompute(
+        const std::vector<float>& _tilts,
+        const std::vector<float>& _rolls,
+        std::vector< std::vector<KeyPoint> >& _keypointsCollection,
+        std::vector<Mat>& _descriptorCollection,
+        const Mat& _image,
+        const Mat& _mask,
+        const bool _do_keypoints,
+        const bool _do_descriptors,
+        const Ptr<Feature2D>& _backend)
+        : tilts(_tilts),
+          rolls(_rolls),
+          keypointsCollection(_keypointsCollection),
+          descriptorCollection(_descriptorCollection),
+          image(_image),
+          mask(_mask),
+          do_keypoints(_do_keypoints),
+          do_descriptors(_do_descriptors),
+          backend(_backend) {}
+
+    void operator()( const cv::Range& range ) const CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+
+        const int begin = range.start;
+        const int end = range.end;
+
+        for( int a = begin; a < end; a++ )
+        {
+            Mat warpedImage, warpedMask;
+            // pose maps original image coordinates into the warped view; invPose maps back.
+            Matx23f pose, invPose;
+            affineSkew(tilts[a], rolls[a], warpedImage, warpedMask, pose);
+            invertAffineTransform(pose, invPose);
+
+            std::vector<KeyPoint> wKeypoints;
+            Mat wDescriptors;
+            if( !do_keypoints )
+            {
+                // Keypoints were provided by the caller: move them into the warped view.
+                const std::vector<KeyPoint>& keypointsInView = keypointsCollection[a];
+                if( keypointsInView.size() == 0 ) // when there are no keypoints in this affine view
+                    continue;
+
+                std::vector<Point2f> pts_, pts;
+                KeyPoint::convert(keypointsInView, pts_);
+                transform(pts_, pts, pose);
+                wKeypoints.resize(keypointsInView.size());
+                for( size_t wi = 0; wi < wKeypoints.size(); wi++ )
+                {
+                    wKeypoints[wi] = keypointsInView[wi];
+                    wKeypoints[wi].pt = pts[wi];
+                }
+            }
+            backend->detectAndCompute(warpedImage, warpedMask, wKeypoints, wDescriptors, !do_keypoints);
+            if( do_keypoints )
+            {
+                // KeyPointsFilter::runByPixelsMask( wKeypoints, warpedMask );
+                if( wKeypoints.size() == 0 )
+                {
+                    keypointsCollection[a].clear();
+                    continue;
+                }
+                // Map detected keypoints back to original image coordinates and tag
+                // them with the index of the view they came from.
+                std::vector<Point2f> pts_, pts;
+                KeyPoint::convert(wKeypoints, pts_);
+                transform(pts_, pts, invPose);
+
+                keypointsCollection[a].resize(wKeypoints.size());
+                for( size_t wi = 0; wi < wKeypoints.size(); wi++ )
+                {
+                    keypointsCollection[a][wi] = wKeypoints[wi];
+                    keypointsCollection[a][wi].pt = pts[wi];
+                    keypointsCollection[a][wi].class_id = a;
+                }
+            }
+            if( do_descriptors )
+                wDescriptors.copyTo(descriptorCollection[a]);
+        }
+    }
+private:
+    // Produces the warped image and mask for one view.
+    //   tilt        - horizontal compression factor; the image width is scaled by 1/tilt
+    //   phi         - rotation in degrees (converted to radians internally)
+    //   warpedImage - output: rotated, blurred and resized copy of image
+    //   warpedMask  - output: mask for warpedImage; for the identity view this is the
+    //                 caller's mask unchanged (possibly empty)
+    //   pose        - output: 2x3 transform from original to warped coordinates
+    void affineSkew(float tilt, float phi,
+            Mat& warpedImage, Mat& warpedMask, Matx23f& pose) const
+    {
+        int h = image.size().height;
+        int w = image.size().width;
+        Mat rotImage;
+
+        pose = Matx23f(1,0,0,
+                    0,1,0);
+
+        if( phi == 0 )
+            image.copyTo(rotImage);
+        else
+        {
+            phi = phi * (float)CV_PI / 180;
+            float s = std::sin(phi);
+            float c = std::cos(phi);
+            Matx22f A(c, -s, s, c);
+            // Rotate the image corners to find the bounding box of the rotated image.
+            Matx<float, 4, 2> corners(0, 0, (float)w, 0, (float)w,(float)h, 0, (float)h);
+            Mat tf(corners * A.t());
+            Mat tcorners;
+            tf.convertTo(tcorners, CV_32S);
+            Rect rect = boundingRect(tcorners);
+            h = rect.height; w = rect.width;
+            // Shift by the bounding box origin so the rotated image starts at (0, 0).
+            pose = Matx23f(c, -s, -(float)rect.x,
+                        s,  c, -(float)rect.y);
+            warpAffine(image, rotImage, pose, Size(w, h), INTER_LINEAR, BORDER_REPLICATE);
+        }
+        if( tilt == 1 )
+            warpedImage = rotImage;
+        else
+        {
+            // Blur along x before the horizontal subsampling.
+            float s = 0.8f * sqrt(tilt * tilt - 1);
+            GaussianBlur(rotImage, rotImage, Size(0, 0), s, 0.01);
+            resize(rotImage, warpedImage, Size(0, 0), 1.0/tilt, 1.0, INTER_NEAREST);
+            pose(0, 0) /= tilt;
+            pose(0, 1) /= tilt;
+            pose(0, 2) /= tilt;
+        }
+        if( phi != 0 || tilt != 1 )
+        {
+            // warpAffine needs a real source, so a missing mask is replaced by an
+            // all-255 mask of the original image size.
+            Mat mask0 = mask;
+            if( mask0.empty() )
+                mask0 = Mat(image.size(), CV_8UC1, Scalar(255));
+            warpAffine(mask0, warpedMask, pose, warpedImage.size(), INTER_NEAREST);
+        }
+        else
+        {
+            // Identity view: no warping is needed, but the caller's mask must still
+            // reach the backend instead of being silently dropped.
+            warpedMask = mask;
+        }
+    }
+
+
+    const std::vector<float>& tilts;
+    const std::vector<float>& rolls;
+    std::vector< std::vector<KeyPoint> >& keypointsCollection;
+    std::vector<Mat>& descriptorCollection;
+    const Mat& image;
+    const Mat& mask;
+    const bool do_keypoints;
+    const bool do_descriptors;
+    const Ptr<Feature2D>& backend;
+};
+
+// Runs the backend on every simulated affine view (one per entry of tilts_)
+// in parallel and merges the per-view results.
+//
+// _image               input image; an empty image makes the call a no-op.
+// _mask                optional mask handed to the per-view processing.
+// keypoints            when useProvidedKeypoints is false: cleared, then filled
+//                      with the keypoints of all views, view after view.
+//                      Otherwise: input, passed through splitKeypointsByView().
+// _descriptors         filled only when the caller asked for descriptors.
+// useProvidedKeypoints true to skip detection and use the given keypoints.
+void AffineFeature_Impl::detectAndCompute(InputArray _image, InputArray _mask,
+        std::vector<KeyPoint>& keypoints,
+        OutputArray _descriptors,
+        bool useProvidedKeypoints)
+{
+    CV_TRACE_FUNCTION();
+
+    const bool detectKeypoints = !useProvidedKeypoints;
+    const bool computeDescriptors = _descriptors.needed();
+    Mat image = _image.getMat();
+    Mat mask = _mask.getMat();
+
+    // Nothing to process, or nothing was requested.
+    if( _image.empty() || !(detectKeypoints || computeDescriptors) )
+        return;
+
+    const size_t viewCount = tilts_.size();
+    std::vector< std::vector<KeyPoint> > viewKeypoints(viewCount);
+    std::vector< Mat > viewDescriptors(viewCount);
+
+    if( detectKeypoints )
+        keypoints.clear();
+    else
+        splitKeypointsByView(keypoints, viewKeypoints);
+
+    skewedDetectAndCompute body(tilts_, rolls_, viewKeypoints, viewDescriptors,
+        image, mask, detectKeypoints, computeDescriptors, backend_);
+    parallel_for_(Range(0, (int)viewCount), body);
+
+    // Concatenate the detected keypoints in view order.
+    if( detectKeypoints )
+    {
+        typedef std::vector< std::vector<KeyPoint> >::const_iterator ViewIt;
+        for( ViewIt view = viewKeypoints.begin(); view != viewKeypoints.end(); ++view )
+            keypoints.insert(keypoints.end(), view->begin(), view->end());
+    }
+
+    if( !computeDescriptors )
+        return;
+
+    // Stack the per-view descriptor blocks, in view order, into one matrix
+    // with one row per keypoint.
+    _descriptors.create((int)keypoints.size(), backend_->descriptorSize(), backend_->descriptorType());
+    Mat descriptors = _descriptors.getMat();
+    int nextRow = 0;
+    for( size_t v = 0; v < viewDescriptors.size(); v++ )
+    {
+        const Mat& block = viewDescriptors[v];
+        if( !block.empty() )
+        {
+            Mat roi(descriptors, Rect(0, nextRow, descriptors.cols, block.rows));
+            block.copyTo(roi);
+            nextRow += block.rows;
+        }
+    }
+}
+
+
+// Factory for the affine-invariant wrapper around `backend`.
+// Rejects minTilt >= maxTilt and non-positive tiltStep / rotateStepBase,
+// then forwards all arguments to the AffineFeature_Impl constructor.
+Ptr<AffineFeature> AffineFeature::create(const Ptr<Feature2D>& backend,
+                                         int maxTilt, int minTilt, float tiltStep, float rotateStepBase)
+{
+    CV_Assert(minTilt < maxTilt);
+    CV_Assert(tiltStep > 0);
+    CV_Assert(rotateStepBase > 0);
+
+    Ptr<AffineFeature> feature =
+        makePtr<AffineFeature_Impl>(backend, maxTilt, minTilt, tiltStep, rotateStepBase);
+    return feature;
+}
+
+// Default algorithm name: the Feature2D base name followed by ".AffineFeature".
+String AffineFeature::getDefaultName() const
+{
+    const String baseName = Feature2D::getDefaultName();
+    return baseName + ".AffineFeature";
+}
+
+} // namespace
--- a/modules/features2d/test/test_affine_feature.cpp
+++ b/modules/features2d/test/test_affine_feature.cpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "test_precomp.hpp"
+
+// #define GENERATE_DATA // generate data in debug mode
+
+namespace opencv_test { namespace {
+
+#ifndef GENERATE_DATA
+// Returns true when two keypoints agree within fixed tolerances on position,
+// size, angle and response, and share the low 16 bits of the octave field.
+// class_id is deliberately not compared.
+static bool isSimilarKeypoints( const KeyPoint& p1, const KeyPoint& p2 )
+{
+    const float maxPtDif = 1.f;
+    const float maxSizeDif = 1.f;
+    const float maxAngleDif = 2.f;
+    const float maxResponseDif = 0.1f;
+
+    float dist = (float)cv::norm( p1.pt - p2.pt );
+    // fabs() is used for every float difference: an unqualified abs() may
+    // resolve to the integer overload and truncate the difference, which
+    // would make the 0.1 response tolerance meaningless.
+    return (dist < maxPtDif &&
+            fabs(p1.size - p2.size) < maxSizeDif &&
+            fabs(p1.angle - p2.angle) < maxAngleDif &&
+            fabs(p1.response - p2.response) < maxResponseDif &&
+            (p1.octave & 0xffff) == (p2.octave & 0xffff)     // do not care about sublayers and class_id
+            );
+}
+#endif
+
+// Regression test for AffineFeature wrapped around SIFT.
+// With GENERATE_DATA defined it (re)writes the reference keypoints and
+// descriptors; otherwise it compares freshly computed results against them.
+TEST(Features2d_AFFINE_FEATURE, regression)
+{
+    Mat image = imread(cvtest::findDataFile("features2d/tsukuba.png"));
+    string xml = cvtest::TS::ptr()->get_data_path() + "asift/regression_cpp.xml.gz";
+    ASSERT_FALSE(image.empty());
+
+    Mat gray;
+    cvtColor(image, gray, COLOR_BGR2GRAY);
+
+    // Default ASIFT generates too large descriptors. This test uses a small maxTilt to limit the size of the test data.
+    Ptr<AffineFeature> ext = AffineFeature::create(SIFT::create(), 2, 0, 1.4142135623730951f, 144.0f);
+    Mat mpt, msize, mangle, mresponse, moctave, mclass_id;
+#ifdef GENERATE_DATA
+    // calculate
+    vector<KeyPoint> calcKeypoints;
+    Mat calcDescriptors;
+    ext->detectAndCompute(gray, Mat(), calcKeypoints, calcDescriptors, false);
+
+    // create keypoints XML
+    FileStorage fs(xml, FileStorage::WRITE);
+    ASSERT_TRUE(fs.isOpened()) << xml;
+    std::cout << "Creating keypoints XML..." << std::endl;
+
+    // Mat dimensions and at<>() indices are int; convert the count once.
+    const int count = (int)calcKeypoints.size();
+    mpt = Mat(count, 2, CV_32F);
+    msize = Mat(count, 1, CV_32F);
+    mangle = Mat(count, 1, CV_32F);
+    mresponse = Mat(count, 1, CV_32F);
+    moctave = Mat(count, 1, CV_32S);
+    mclass_id = Mat(count, 1, CV_32S);
+
+    for( int i = 0; i < count; i++ )
+    {
+        const KeyPoint& key = calcKeypoints[i];
+        mpt.at<float>(i, 0) = key.pt.x;
+        mpt.at<float>(i, 1) = key.pt.y;
+        msize.at<float>(i, 0) = key.size;
+        mangle.at<float>(i, 0) = key.angle;
+        mresponse.at<float>(i, 0) = key.response;
+        moctave.at<int>(i, 0) = key.octave;
+        mclass_id.at<int>(i, 0) = key.class_id;
+    }
+
+    fs << "keypoints_pt" << mpt;
+    fs << "keypoints_size" << msize;
+    fs << "keypoints_angle" << mangle;
+    fs << "keypoints_response" << mresponse;
+    fs << "keypoints_octave" << moctave;
+    fs << "keypoints_class_id" << mclass_id;
+
+    // create descriptor XML
+    fs << "descriptors" << calcDescriptors;
+    fs.release();
+#else
+    const float badCountsRatio = 0.01f;
+    const float badDescriptorDist = 1.0f;
+    const float maxBadKeypointsRatio = 0.15f;
+    const float maxBadDescriptorRatio = 0.15f;
+
+    // read keypoints
+    vector<KeyPoint> validKeypoints;
+    Mat validDescriptors;
+    FileStorage fs(xml, FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened()) << xml;
+
+    fs["keypoints_pt"] >> mpt;
+    ASSERT_EQ(mpt.type(), CV_32F);
+    fs["keypoints_size"] >> msize;
+    ASSERT_EQ(msize.type(), CV_32F);
+    fs["keypoints_angle"] >> mangle;
+    ASSERT_EQ(mangle.type(), CV_32F);
+    fs["keypoints_response"] >> mresponse;
+    ASSERT_EQ(mresponse.type(), CV_32F);
+    fs["keypoints_octave"] >> moctave;
+    ASSERT_EQ(moctave.type(), CV_32S);
+    fs["keypoints_class_id"] >> mclass_id;
+    ASSERT_EQ(mclass_id.type(), CV_32S);
+
+    // All per-keypoint arrays are indexed with the same row below, so they
+    // must have the same number of rows.
+    ASSERT_EQ(mpt.cols, 2);
+    ASSERT_EQ(msize.rows, mpt.rows);
+    ASSERT_EQ(mangle.rows, mpt.rows);
+    ASSERT_EQ(mresponse.rows, mpt.rows);
+    ASSERT_EQ(moctave.rows, mpt.rows);
+    ASSERT_EQ(mclass_id.rows, mpt.rows);
+
+    validKeypoints.resize(mpt.rows);
+    for( int i = 0; i < (int)validKeypoints.size(); i++ )
+    {
+        validKeypoints[i].pt.x = mpt.at<float>(i, 0);
+        validKeypoints[i].pt.y = mpt.at<float>(i, 1);
+        validKeypoints[i].size = msize.at<float>(i, 0);
+        validKeypoints[i].angle = mangle.at<float>(i, 0);
+        validKeypoints[i].response = mresponse.at<float>(i, 0);
+        validKeypoints[i].octave = moctave.at<int>(i, 0);
+        validKeypoints[i].class_id = mclass_id.at<int>(i, 0);
+    }
+
+    // read descriptors
+    fs["descriptors"] >> validDescriptors;
+    fs.release();
+    // The descriptor comparison below reads one float row per keypoint.
+    ASSERT_EQ(validDescriptors.type(), CV_32F);
+    ASSERT_EQ(validDescriptors.rows, (int)validKeypoints.size());
+
+    // calc and compare keypoints
+    vector<KeyPoint> calcKeypoints;
+    ext->detectAndCompute(gray, Mat(), calcKeypoints, noArray(), false);
+    ASSERT_FALSE(calcKeypoints.empty()) << "No keypoints detected.";
+
+    float countRatio = (float)validKeypoints.size() / (float)calcKeypoints.size();
+    ASSERT_LT(countRatio, 1 + badCountsRatio) << "Bad keypoints count ratio.";
+    ASSERT_GT(countRatio, 1 - badCountsRatio) << "Bad keypoints count ratio.";
+
+    int badPointCount = 0, commonPointCount = max((int)validKeypoints.size(), (int)calcKeypoints.size());
+    for( size_t v = 0; v < validKeypoints.size(); v++ )
+    {
+        int nearestIdx = -1;
+        float minDist = std::numeric_limits<float>::max();
+        float angleDistOfNearest = std::numeric_limits<float>::max();
+
+        // Nearest computed keypoint within the same view (class_id);
+        // ties in position are broken by the smaller angle difference.
+        for( size_t c = 0; c < calcKeypoints.size(); c++ )
+        {
+            if( validKeypoints[v].class_id != calcKeypoints[c].class_id )
+                continue;
+            float curDist = (float)cv::norm( calcKeypoints[c].pt - validKeypoints[v].pt );
+            if( curDist < minDist )
+            {
+                minDist = curDist;
+                nearestIdx = (int)c;
+                // fabs(): keep the float difference, never the integer abs()
+                angleDistOfNearest = fabs( calcKeypoints[c].angle - validKeypoints[v].angle );
+            }
+            else if( curDist == minDist ) // the keypoints whose positions are same but angles are different
+            {
+                float angleDist = fabs( calcKeypoints[c].angle - validKeypoints[v].angle );
+                if( angleDist < angleDistOfNearest )
+                {
+                    nearestIdx = (int)c;
+                    angleDistOfNearest = angleDist;
+                }
+            }
+        }
+        if( nearestIdx == -1 || !isSimilarKeypoints( validKeypoints[v], calcKeypoints[nearestIdx] ) )
+            badPointCount++;
+    }
+    float badKeypointsRatio = (float)badPointCount / (float)commonPointCount;
+    std::cout << "badKeypointsRatio: " << badKeypointsRatio << std::endl;
+    ASSERT_LT( badKeypointsRatio , maxBadKeypointsRatio ) << "Bad accuracy!";
+
+    // Calc and compare descriptors. This uses validKeypoints for extraction.
+    Mat calcDescriptors;
+    ext->detectAndCompute(gray, Mat(), validKeypoints, calcDescriptors, true);
+
+    // Guard the raw ptr<float>() reads below against a shape or type mismatch.
+    ASSERT_EQ(calcDescriptors.type(), CV_32F);
+    ASSERT_EQ(calcDescriptors.rows, (int)validKeypoints.size());
+    ASSERT_EQ(calcDescriptors.cols, validDescriptors.cols);
+
+    int dim = validDescriptors.cols;
+    int badDescriptorCount = 0;
+    L1<float> distance;
+
+    for( int i = 0; i < (int)validKeypoints.size(); i++ )
+    {
+        float dist = distance( validDescriptors.ptr<float>(i), calcDescriptors.ptr<float>(i), dim );
+        if( dist > badDescriptorDist )
+            badDescriptorCount++;
+    }
+    float badDescriptorRatio = (float)badDescriptorCount / (float)validKeypoints.size();
+    std::cout << "badDescriptorRatio: " << badDescriptorRatio << std::endl;
+    ASSERT_LT( badDescriptorRatio, maxBadDescriptorRatio ) << "Too many descriptors mismatched.";
+#endif
+}
+
+}} // namespace
--- a/modules/flann/include/opencv2/flann.hpp
+++ b/modules/flann/include/opencv2/flann.hpp
@@ -191,8 +191,28 @@ public:
            KDTreeIndexParams( int trees = 4 );
        };
        @endcode
+        - **HierarchicalClusteringIndexParams** When passing an object of this type the index constructed
+        will be a hierarchical tree of clusters, dividing each set of points into n clusters whose centers
+        are picked among the points without further refinement of their position.
+        This algorithm fits floating-point, integer and binary vectors. :
+        @code
+        struct HierarchicalClusteringIndexParams : public IndexParams
+        {
+            HierarchicalClusteringIndexParams(
+                int branching = 32,
+                flann_centers_init_t centers_init = CENTERS_RANDOM,
+                int trees = 4,
+                int leaf_size = 100);
+
+        };
+        @endcode
        - **KMeansIndexParams** When passing an object of this type the index constructed will be a
-        hierarchical k-means tree. :
+        hierarchical k-means tree (one tree by default), dividing each set of points into n clusters
+        whose barycenters are refined iteratively.
+        Note that this algorithm has been extended to support binary vectors as an alternative
+        to LSH when knn search speed is the criterion. It will also outperform LSH when processing
+        directly (i.e. without the use of MCA/PCA) datasets whose points share mostly the same values
+        for most of the dimensions. It is recommended to set more than one tree with binary data. :
        @code
        struct KMeansIndexParams : public IndexParams
        {
@@ -201,6 +221,13 @@ public:
                int iterations = 11,
                flann_centers_init_t centers_init = CENTERS_RANDOM,
                float cb_index = 0.2 );
+
+            KMeansIndexParams(
+                int branching,
+                int iterations,
+                flann_centers_init_t centers_init,
+                float cb_index,
+                int trees );
        };
        @endcode
        - **CompositeIndexParams** When using a parameters object of this type the index created
@@ -219,7 +246,8 @@ public:
        - **LshIndexParams** When using a parameters object of this type the index created uses
        multi-probe LSH (by Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity Search
        by Qin Lv, William Josephson, Zhe Wang, Moses Charikar, Kai Li., Proceedings of the 33rd
-        International Conference on Very Large Data Bases (VLDB). Vienna, Austria. September 2007) :
+        International Conference on Very Large Data Bases (VLDB). Vienna, Austria. September 2007).
+        This algorithm is designed for binary vectors. :
        @code
        struct LshIndexParams : public IndexParams
        {

--- a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
+++ b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
@@ -404,34 +404,16 @@ public:
     */
    virtual ~HierarchicalClusteringIndex()
    {
-        free_elements();
-
        if (root!=NULL) {
            delete[] root;
        }

        if (indices!=NULL) {
+            free_indices();
            delete[] indices;
        }
    }

-
-    /**
-     * Release the inner elements of indices[]
-     */
-    void free_elements()
-    {
-        if (indices!=NULL) {
-            for(int i=0; i<trees_; ++i) {
-                if (indices[i]!=NULL) {
-                    delete[] indices[i];
-                    indices[i] = NULL;
-                }
-            }
-        }
-    }
-
-
    /**
     *  Returns size of index.
     */
@@ -467,7 +449,7 @@ public:
            throw FLANNException("Branching factor must be at least 2");
        }

-        free_elements();
+        free_indices();

        for (int i=0; i<trees_; ++i) {
            indices[i] = new int[size_];
@@ -503,13 +485,12 @@ public:

    void loadIndex(FILE* stream) CV_OVERRIDE
    {
-        free_elements();
-
        if (root!=NULL) {
            delete[] root;
        }

        if (indices!=NULL) {
+            free_indices();
            delete[] indices;
        }

@@ -650,6 +631,20 @@ private:
    }


+    /**
+     * Frees each per-tree array stored in indices[] and resets the slot to
+     * NULL. The outer indices array itself stays allocated.
+     */
+    void free_indices()
+    {
+        if (indices==NULL) return;
+
+        for (int t=0; t<trees_; ++t) {
+            if (indices[t]==NULL) continue;
+            delete[] indices[t];
+            indices[t] = NULL;
+        }
+    }


    void computeLabels(int* dsindices, int indices_length,  int* centers, int centers_length, int* labels, DistanceType& cost)

--- a/modules/flann/include/opencv2/flann/kmeans_index.h
+++ b/modules/flann/include/opencv2/flann/kmeans_index.h
@@ -57,8 +57,8 @@ namespace cvflann

 struct KMeansIndexParams : public IndexParams
 {
-    KMeansIndexParams(int branching = 32, int iterations = 11,
-                      flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 )
+    void indexParams(int branching, int iterations,
+                     flann_centers_init_t centers_init, float cb_index, int trees)
    {
        (*this)["algorithm"] = FLANN_INDEX_KMEANS;
        // branching factor
@@ -69,6 +69,20 @@ struct KMeansIndexParams : public IndexParams
        (*this)["centers_init"] = centers_init;
        // cluster boundary index. Used when searching the kmeans tree
        (*this)["cb_index"] = cb_index;
+        // number of kmeans trees to search in
+        (*this)["trees"] = trees;
+    }
+
+    KMeansIndexParams(int branching = 32, int iterations = 11,
+                      flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 )
+    {
+        indexParams(branching, iterations, centers_init, cb_index, 1);
+    }
+
+    KMeansIndexParams(int branching, int iterations,
+                      flann_centers_init_t centers_init, float cb_index, int trees)
+    {
+        indexParams(branching, iterations, centers_init, cb_index, trees);
    }
 };

@@ -347,6 +361,7 @@ public:
        veclen_ = dataset_.cols;

        branching_ = get_param(params,"branching",32);
+        trees_ = get_param(params,"trees",1);
        iterations_ = get_param(params,"iterations",11);
        if (iterations_<0) {
            iterations_ = (std::numeric_limits<int>::max)();
@@ -367,6 +382,13 @@ public:
        }
        cb_index_ = 0.4f;

+        root_ = new KMeansNodePtr[trees_];
+        indices_ = new int*[trees_];
+
+        for (int i=0; i<trees_; ++i) {
+            root_[i] = NULL;
+            indices_[i] = NULL;
+        }
    }


@@ -382,9 +404,11 @@ public:
    virtual ~KMeansIndex()
    {
        if (root_ != NULL) {
-            free_centers(root_);
+            free_centers();
+            delete[] root_;
        }
        if (indices_!=NULL) {
+            free_indices();
            delete[] indices_;
        }
    }
@@ -429,23 +453,24 @@ public:
            throw FLANNException("Branching factor must be at least 2");
        }

-        indices_ = new int[size_];
-        for (size_t i=0; i<size_; ++i) {
-            indices_[i] = int(i);
-        }
+        free_indices();

-        root_ = pool_.allocate<KMeansNode>();
-        std::memset(root_, 0, sizeof(KMeansNode));
+        for (int i=0; i<trees_; ++i) {
+            indices_[i] = new int[size_];
+            for (size_t j=0; j<size_; ++j) {
+                indices_[i][j] = int(j);
+            }
+            root_[i] = pool_.allocate<KMeansNode>();
+            std::memset(root_[i], 0, sizeof(KMeansNode));

-        if(is_kdtree_distance::val || is_vector_space_distance::val)
-        {
-            computeNodeStatistics(root_, indices_, (unsigned int)size_);
-            computeClustering(root_, indices_, (int)size_, branching_,0);
-        }
-        else
-        {
-            computeBitfieldNodeStatistics(root_, indices_, (unsigned int)size_);
-            computeBitfieldClustering(root_, indices_, (int)size_, branching_,0);
+            if(is_kdtree_distance::val || is_vector_space_distance::val) {
+                computeNodeStatistics(root_[i], indices_[i], (unsigned int)size_);
+                computeClustering(root_[i], indices_[i], (int)size_, branching_,0);
+            }
+            else {
+                computeBitfieldNodeStatistics(root_[i], indices_[i], (unsigned int)size_);
+                computeBitfieldClustering(root_[i], indices_[i], (int)size_, branching_,0);
+            }
        }
    }

@@ -456,35 +481,43 @@ public:
        save_value(stream, iterations_);
        save_value(stream, memoryCounter_);
        save_value(stream, cb_index_);
-        save_value(stream, *indices_, (int)size_);
-
-        save_tree(stream, root_);
+        save_value(stream, trees_);
+        for (int i=0; i<trees_; ++i) {
+            save_value(stream, *indices_[i], (int)size_);
+            save_tree(stream, root_[i], i);
+        }
    }


    void loadIndex(FILE* stream) CV_OVERRIDE
    {
+        // Release everything sized for the current tree count before trees_
+        // is overwritten by the value stored in the stream. Pointers are reset
+        // so that a failure while loading cannot lead to a double free later.
+        if (indices_!=NULL) {
+            free_indices();
+            delete[] indices_;
+            indices_ = NULL;
+        }
+        if (root_!=NULL) {
+            free_centers();
+            delete[] root_;
+            root_ = NULL;
+        }
+
        load_value(stream, branching_);
        load_value(stream, iterations_);
        load_value(stream, memoryCounter_);
        load_value(stream, cb_index_);
-        if (indices_!=NULL) {
-            delete[] indices_;
-        }
-        indices_ = new int[size_];
-        load_value(stream, *indices_, size_);
+        load_value(stream, trees_);

-        if (root_!=NULL) {
-            free_centers(root_);
+        // The stored index may hold a different number of trees than this
+        // object was constructed with, so both arrays are allocated anew;
+        // writing into the old root_ array could run past its end.
+        root_ = new KMeansNodePtr[trees_];
+        indices_ = new int*[trees_];
+        for (int i=0; i<trees_; ++i) {
+            root_[i] = NULL;
+            indices_[i] = NULL;
+        }
+        for (int i=0; i<trees_; ++i) {
+            indices_[i] = new int[size_];
+            load_value(stream, *indices_[i], size_);
+            load_tree(stream, root_[i], i);
        }
-        load_tree(stream, root_);

        index_params_["algorithm"] = getType();
        index_params_["branching"] = branching_;
+        index_params_["trees"] = trees_;
        index_params_["iterations"] = iterations_;
        index_params_["centers_init"] = centers_init_;
        index_params_["cb_index"] = cb_index_;
-
    }


@@ -500,17 +533,21 @@ public:
    void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
    {

-        int maxChecks = get_param(searchParams,"checks",32);
+        const int maxChecks = get_param(searchParams,"checks",32);

        if (maxChecks==FLANN_CHECKS_UNLIMITED) {
-            findExactNN(root_, result, vec);
+            findExactNN(root_[0], result, vec);
        }
        else {
            // Priority queue storing intermediate branches in the best-bin-first search
            Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);

            int checks = 0;
-            findNN(root_, result, vec, checks, maxChecks, heap);
+            for (int i=0; i<trees_; ++i) {
+                findNN(root_[i], result, vec, checks, maxChecks, heap);
+                if ((checks >= maxChecks) && result.full())
+                    break;
+            }

            BranchSt branch;
            while (heap->popMin(branch) && (checks<maxChecks || !result.full())) {
@@ -521,7 +558,6 @@ public:

            CV_Assert(result.full());
        }
-
    }

    /**
@@ -541,7 +577,7 @@ public:
        DistanceType variance;
        KMeansNodePtr* clusters = new KMeansNodePtr[numClusters];

-        int clusterCount = getMinVarianceClusters(root_, clusters, numClusters, variance);
+        int clusterCount = getMinVarianceClusters(root_[0], clusters, numClusters, variance);

        Logger::info("Clusters requested: %d, returning %d\n",numClusters, clusterCount);

@@ -611,23 +647,23 @@ private:



-    void save_tree(FILE* stream, KMeansNodePtr node)
+    void save_tree(FILE* stream, KMeansNodePtr node, int num)
    {
        save_value(stream, *node);
        save_value(stream, *(node->pivot), (int)veclen_);
        if (node->childs==NULL) {
-            int indices_offset = (int)(node->indices - indices_);
+            int indices_offset = (int)(node->indices - indices_[num]);
            save_value(stream, indices_offset);
        }
        else {
            for(int i=0; i<branching_; ++i) {
-                save_tree(stream, node->childs[i]);
+                save_tree(stream, node->childs[i], num);
            }
        }
    }


-    void load_tree(FILE* stream, KMeansNodePtr& node)
+    void load_tree(FILE* stream, KMeansNodePtr& node, int num)
    {
        node = pool_.allocate<KMeansNode>();
        load_value(stream, *node);
@@ -636,12 +672,12 @@ private:
        if (node->childs==NULL) {
            int indices_offset;
            load_value(stream, indices_offset);
-            node->indices = indices_ + indices_offset;
+            node->indices = indices_[num] + indices_offset;
        }
        else {
            node->childs = pool_.allocate<KMeansNodePtr>(branching_);
            for(int i=0; i<branching_; ++i) {
-                load_tree(stream, node->childs[i]);
+                load_tree(stream, node->childs[i], num);
            }
        }
    }
@@ -660,6 +696,32 @@ private:
        }
    }

+    /**
+     * Calls free_centers(node) on the root node of every tree that has one.
+     */
+    void free_centers()
+    {
+        if (root_ == NULL) return;
+
+        for (int t=0; t<trees_; ++t) {
+            if (root_[t] == NULL) continue;
+            free_centers(root_[t]);
+        }
+    }
+
+    /**
+     * Frees each per-tree array stored in indices_[] and resets the slot to
+     * NULL. The outer indices_ array itself stays allocated.
+     */
+    void free_indices()
+    {
+        if (indices_==NULL) return;
+
+        for (int t=0; t<trees_; ++t) {
+            if (indices_[t]==NULL) continue;
+            delete[] indices_[t];
+            indices_[t] = NULL;
+        }
+    }
+
    /**
     * Computes the statistics of a node (mean, radius, variance).
     *
@@ -960,7 +1022,45 @@ private:
    }


-
+    /**
+     * The method responsible with doing the recursive hierarchical clustering on
+     * binary vectors.
+     * As some might have heard that KMeans on binary data doesn't make sense,
+     * it's worth a little explanation of why it actually works fairly well. As
+     * with the Hierarchical Clustering algorithm, we seed several centers for the
+     * current node by picking some of its points. Then, in a first pass, each point
+     * of the node is related to its closest center. Now let's have a look at
+     * the 5 central dimensions of the 9 following points:
+     *
+     * xxxxxx11100xxxxx (1)
+     * xxxxxx11010xxxxx (2)
+     * xxxxxx11001xxxxx (3)
+     * xxxxxx10110xxxxx (4)
+     * xxxxxx10101xxxxx (5)
+     * xxxxxx10011xxxxx (6)
+     * xxxxxx01110xxxxx (7)
+     * xxxxxx01101xxxxx (8)
+     * xxxxxx01011xxxxx (9)
+     * sum   _____
+     * of 1: 66555
+     *
+     * Even if the barycenter notion doesn't apply, we can set a center
+     * xxxxxx11111xxxxx that will better fit the five dimensions we are focusing
+     * on for these points.
+     *
+     * Note that convergence isn't ensured anymore. In practice, using Gonzales
+     * as seeding algorithm should be fine for getting convergence ("iterations"
+     * value can be set to -1). But with KMeans++ seeding you should definitely
+     * set a maximum number of iterations (but make it higher than the "iterations"
+     * default value of 11).
+     *
+     * Params:
+     *     node = the node to cluster
+     *     indices = indices of the points belonging to the current node
+     *     indices_length = number of points in the current node
+     *     branching = the branching factor to use in the clustering
+     *     level = 0 for the root node, it increases with the subdivision levels
+     */
    void computeBitfieldClustering(KMeansNodePtr node, int* indices,
                                   int indices_length, int branching, int level)
    {
@@ -1195,8 +1295,8 @@ private:
        }

        if (node->childs==NULL) {
-            if (checks>=maxChecks) {
-                if (result.full()) return;
+            if ((checks>=maxChecks) && result.full()) {
+                return;
            }
            checks += node->size;
            for (int i=0; i<node->size; ++i) {
@@ -1397,6 +1497,9 @@ private:
    /** The branching factor used in the hierarchical k-means clustering */
    int branching_;

+    /** Number of kmeans trees (default is one) */
+    int trees_;
+
    /** Maximum number of iterations to use when performing k-means clustering */
    int iterations_;

@@ -1432,12 +1535,12 @@ private:
    /**
     * The root node in the tree.
     */
-    KMeansNodePtr root_;
+    KMeansNodePtr* root_;

    /**
     *  Array of indices to vectors in the dataset.
     */
-    int* indices_;
+    int** indices_;

    /**
     * The distance

--- a/modules/imgcodecs/src/grfmt_jpeg2000.cpp
+++ b/modules/imgcodecs/src/grfmt_jpeg2000.cpp
@@ -378,7 +378,7 @@ bool  Jpeg2KDecoder::readComponent8u( uchar *data, void *_buffer,

    for( y = 0; y < yend - ystart; )
    {
-        jas_seqent_t* pix_row = &jas_matrix_get( buffer, y / ystep, 0 );
+        jas_seqent_t* pix_row = jas_matrix_getref( buffer, y / ystep, 0 );
        uchar* dst = data + (y - yoffset) * step - xoffset;

        if( xstep == 1 )
@@ -444,7 +444,7 @@ bool  Jpeg2KDecoder::readComponent16u( unsigned short *data, void *_buffer,

    for( y = 0; y < yend - ystart; )
    {
-        jas_seqent_t* pix_row = &jas_matrix_get( buffer, y / ystep, 0 );
+        jas_seqent_t* pix_row = jas_matrix_getref( buffer, y / ystep, 0 );
        ushort* dst = data + (y - yoffset) * step - xoffset;

        if( xstep == 1 )

--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -2310,7 +2310,7 @@ CV_EXPORTS_W void warpAffine( InputArray src, OutputArray dst,
                              const Scalar& borderValue = Scalar());

 /** @example samples/cpp/warpPerspective_demo.cpp
-An example program shows using cv::findHomography and cv::warpPerspective for image warping
+An example program shows using cv::getPerspectiveTransform and cv::warpPerspective for image warping
 */

 /** @brief Applies a perspective transformation to an image.

--- a/modules/stitching/include/opencv2/stitching.hpp
+++ b/modules/stitching/include/opencv2/stitching.hpp
@@ -272,7 +272,7 @@ public:
    @param pano Final pano.
    @return Status code.
     */
-    Status composePanorama(InputArrayOfArrays images, OutputArray pano);
+    CV_WRAP Status composePanorama(InputArrayOfArrays images, OutputArray pano);

    /** @overload */
    CV_WRAP Status stitch(InputArrayOfArrays images, OutputArray pano);

--- a/modules/stitching/misc/python/test/test_stitching.py
+++ b/modules/stitching/misc/python/test/test_stitching.py
@@ -19,6 +19,7 @@ class stitching_test(NewOpenCVTests):
        self.assertAlmostEqual(pano.shape[0], 685, delta=100, msg="rows: %r" % list(pano.shape))
        self.assertAlmostEqual(pano.shape[1], 1025, delta=100, msg="cols: %r" % list(pano.shape))

+
 class stitching_detail_test(NewOpenCVTests):

    def test_simple(self):
@@ -82,5 +83,37 @@ class stitching_detail_test(NewOpenCVTests):
        timelapser = cv.detail.Timelapser_createDefault(cv.detail.Timelapser_CROP);
        self.assertIsNotNone(timelapser)

+
+class stitching_compose_panorama_test_no_args(NewOpenCVTests):
+    """Checks Stitcher.composePanorama() called without arguments from Python."""
+
+    def test_simple(self):
+        # Register the two sample images first, then compose with no image arguments.
+        img1 = self.get_sample('stitching/a1.png')
+        img2 = self.get_sample('stitching/a2.png')
+
+        stitcher = cv.Stitcher.create(cv.Stitcher_PANORAMA)
+        status = stitcher.estimateTransform((img1, img2))
+        self.assertEqual(status, cv.Stitcher_OK, msg="estimateTransform status: %r" % status)
+
+        result, _ = stitcher.composePanorama()
+        self.assertEqual(result, cv.Stitcher_OK, msg="composePanorama status: %r" % result)
+
+
+class stitching_compose_panorama_args(NewOpenCVTests):
+    """Checks Stitcher.composePanorama(images) called with explicit images from Python."""
+
+    def test_simple(self):
+        img1 = self.get_sample('stitching/a1.png')
+        img2 = self.get_sample('stitching/a2.png')
+
+        stitcher = cv.Stitcher.create(cv.Stitcher_PANORAMA)
+        status = stitcher.estimateTransform((img1, img2))
+        self.assertEqual(status, cv.Stitcher_OK, msg="estimateTransform status: %r" % status)
+
+        result, _ = stitcher.composePanorama((img1, img2))
+        self.assertEqual(result, cv.Stitcher_OK, msg="composePanorama status: %r" % result)
+
+
 if __name__ == '__main__':
    NewOpenCVTests.bootstrap()
--- a/samples/cpp/asift.cpp
+++ b/samples/cpp/asift.cpp
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/features2d.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/calib3d.hpp>
+#include <iostream>
+#include <iomanip>
+
+using namespace std;
+using namespace cv;
+
+// Prints a short description of this sample and its command-line options to stdout.
+// argv[0] is echoed as the program name on the "Usage:" line.
+static void help(char** argv)
+{
+    const char* const programName = argv[0];
+
+    cout << "This is a sample usage of AffineFeature detector/extractor.\n";
+    cout << "And this is a C++ version of samples/python/asift.py\n";
+    cout << "Usage: " << programName << "\n";
+    cout << "     [ --feature=<sift|orb|brisk> ]         # Feature to use.\n";
+    cout << "     [ --flann ]                            # use Flann-based matcher instead of bruteforce.\n";
+    cout << "     [ --maxlines=<number(50 as default)> ] # The maximum number of lines in visualizing the matching result.\n";
+    cout << "     [ --image1=<image1(aero1.jpg as default)> ]\n";
+    cout << "     [ --image2=<image2(aero3.jpg as default)> ] # Path to images to compare.";
+    // Final newline and flush, exactly as the single chained statement did.
+    cout << endl;
+}
+
+// Returns the current tick count divided by the tick frequency; main() treats
+// differences between two calls as seconds (it multiplies by 1000 to print ms).
+static double timer()
+{
+    const double ticks = static_cast<double>(getTickCount());
+    return ticks / getTickFrequency();
+}
+
+// Entry point of the sample.
+// Parses the command line, loads two images as grayscale, extracts features with
+// AffineFeature wrapped around the selected backend, matches the descriptors,
+// estimates a homography with RANSAC and draws the inlier matches side by side.
+// Returns 0 on success (or after printing help) and 1 on any reported error.
+int main(int argc, char** argv)
+{
+    vector<String> fileName;
+    cv::CommandLineParser parser(argc, argv,
+        "{help h ||}"
+        "{feature|brisk|}"
+        "{flann||}"
+        "{maxlines|50|}"
+        "{image1|aero1.jpg|}{image2|aero3.jpg|}");
+    if (parser.has("help"))
+    {
+        help(argv);
+        return 0;
+    }
+    string feature = parser.get<string>("feature");
+    bool useFlann = parser.has("flann");
+    int maxlines = parser.get<int>("maxlines");
+    fileName.push_back(samples::findFile(parser.get<string>("image1")));
+    fileName.push_back(samples::findFile(parser.get<string>("image2")));
+    if (!parser.check())
+    {
+        parser.printErrors();
+        cout << "See --help (or missing '=' between argument name and value?)" << endl;
+        return 1;
+    }
+
+    Mat img1 = imread(fileName[0], IMREAD_GRAYSCALE);
+    Mat img2 = imread(fileName[1], IMREAD_GRAYSCALE);
+    if (img1.empty())
+    {
+        cerr << "Image " << fileName[0] << " is empty or cannot be found" << endl;
+        return 1;
+    }
+    if (img2.empty())
+    {
+        cerr << "Image " << fileName[1] << " is empty or cannot be found" << endl;
+        return 1;
+    }
+
+    // Pick the feature backend and a matcher for it; --flann switches from the
+    // brute-force matcher to a Flann-based one.
+    Ptr<Feature2D> backend;
+    Ptr<DescriptorMatcher> matcher;
+
+    if (feature == "sift")
+    {
+        backend = SIFT::create();
+        if (useFlann)
+            matcher = DescriptorMatcher::create("FlannBased");
+        else
+            matcher = DescriptorMatcher::create("BruteForce");
+    }
+    else if (feature == "orb")
+    {
+        backend = ORB::create();
+        if (useFlann)
+            matcher = makePtr<FlannBasedMatcher>(makePtr<flann::LshIndexParams>(6, 12, 1));
+        else
+            matcher = DescriptorMatcher::create("BruteForce-Hamming");
+    }
+    else if (feature == "brisk")
+    {
+        backend = BRISK::create();
+        if (useFlann)
+            matcher = makePtr<FlannBasedMatcher>(makePtr<flann::LshIndexParams>(6, 12, 1));
+        else
+            matcher = DescriptorMatcher::create("BruteForce-Hamming");
+    }
+    else
+    {
+        cerr << feature << " is not supported. See --help" << endl;
+        return 1;
+    }
+
+    cout << "extracting with " << feature << "..." << endl;
+    Ptr<AffineFeature> ext = AffineFeature::create(backend);
+    vector<KeyPoint> kp1, kp2;
+    Mat desc1, desc2;
+
+    ext->detectAndCompute(img1, Mat(), kp1, desc1);
+    ext->detectAndCompute(img2, Mat(), kp2, desc2);
+    cout << "img1 - " << kp1.size() << " features, "
+         << "img2 - " << kp2.size() << " features"
+         << endl;
+
+    // Without descriptors on both sides there is nothing to match.
+    if (desc1.empty() || desc2.empty())
+    {
+        cerr << "No features were extracted from at least one image, nothing to match" << endl;
+        return 1;
+    }
+
+    cout << "matching with " << (useFlann ? "flann" : "bruteforce") << "..." << endl;
+    double start = timer();
+    // match and draw
+    vector< vector<DMatch> > rawMatches;
+    vector<Point2f> p1, p2;
+    vector<float> distances;
+    matcher->knnMatch(desc1, desc2, rawMatches, 2);
+    // filter_matches: keep a match only when two candidates were returned and
+    // the best one is closer than 0.75 times the distance of the second best.
+    for (size_t i = 0; i < rawMatches.size(); i++)
+    {
+        const vector<DMatch>& m = rawMatches[i];
+        if (m.size() == 2 && m[0].distance < m[1].distance * 0.75)
+        {
+            p1.push_back(kp1[m[0].queryIdx].pt);
+            p2.push_back(kp2[m[0].trainIdx].pt);
+            distances.push_back(m[0].distance);
+        }
+    }
+
+    // A homography needs at least 4 point pairs; bail out with a message instead
+    // of letting the later perspectiveTransform() call fail on an empty matrix.
+    if (p1.size() < 4)
+    {
+        cerr << p1.size() << " matches found, not enough for homography estimation" << endl;
+        return 1;
+    }
+
+    vector<uchar> status;
+    vector< pair<Point2f, Point2f> > pointPairs;
+    Mat H = findHomography(p1, p2, status, RANSAC);
+    if (H.empty())
+    {
+        cerr << "Homography estimation failed" << endl;
+        return 1;
+    }
+
+    // Collect the inlier pairs and compact their distances to the front of
+    // 'distances' so that index k refers to the same match in both containers.
+    int inliers = 0;
+    for (size_t i = 0; i < status.size(); i++)
+    {
+        if (status[i])
+        {
+            pointPairs.push_back(make_pair(p1[i], p2[i]));
+            distances[inliers] = distances[i];
+            // CV_Assert(inliers <= (int)i);
+            inliers++;
+        }
+    }
+    distances.resize(inliers);
+
+    cout << "execution time: " << fixed << setprecision(2) << (timer()-start)*1000 << " ms" << endl;
+    cout << inliers << " / " << status.size() << " inliers/matched" << endl;
+
+    cout << "visualizing..." << endl;
+    // Order the inliers by ascending distance so the closest matches are drawn first.
+    vector<int> indices(inliers);
+    cv::sortIdx(distances, indices, SORT_EVERY_ROW+SORT_ASCENDING);
+
+    // explore_match: put both images next to each other on one canvas.
+    int h1 = img1.size().height;
+    int w1 = img1.size().width;
+    int h2 = img2.size().height;
+    int w2 = img2.size().width;
+    Mat vis = Mat::zeros(max(h1, h2), w1+w2, CV_8U);
+    img1.copyTo(Mat(vis, Rect(0, 0, w1, h1)));
+    img2.copyTo(Mat(vis, Rect(w1, 0, w2, h2)));
+    cvtColor(vis, vis, COLOR_GRAY2BGR);
+
+    // Map the corners of img1 through H and shift them by w1 in x, because
+    // img2 is placed to the right of img1 on the canvas.
+    vector<Point2f> corners(4);
+    corners[0] = Point2f(0, 0);
+    corners[1] = Point2f((float)w1, 0);
+    corners[2] = Point2f((float)w1, (float)h1);
+    corners[3] = Point2f(0, (float)h1);
+    vector<Point2i> icorners;
+    perspectiveTransform(corners, corners, H);
+    transform(corners, corners, Matx23f(1,0,(float)w1,0,1,0));
+    Mat(corners).convertTo(icorners, CV_32S);
+    polylines(vis, icorners, true, Scalar(255,255,255));
+
+    // Draw at most 'maxlines' inlier matches, closest first.
+    for (int i = 0; i < min(inliers, maxlines); i++)
+    {
+        int idx = indices[i];
+        const Point2f& pi1 = pointPairs[idx].first;
+        const Point2f& pi2 = pointPairs[idx].second;
+        circle(vis, pi1, 2, Scalar(0,255,0), -1);
+        circle(vis, pi2 + Point2f((float)w1,0), 2, Scalar(0,255,0), -1);
+        line(vis, pi1, pi2 + Point2f((float)w1,0), Scalar(0,255,0));
+    }
+    if (inliers > maxlines)
+        cout << "only " << maxlines << " inliers are visualized" << endl;
+    imshow("affine find_obj", vis);
+
+    // Mat vis2 = Mat::zeros(max(h1, h2), w1+w2, CV_8U);
+    // Mat warp1;
+    // warpPerspective(img1, warp1, H, Size(w1, h1));
+    // warp1.copyTo(Mat(vis2, Rect(0, 0, w1, h1)));
+    // img2.copyTo(Mat(vis2, Rect(w1, 0, w2, h2)));
+    // imshow("warped", vis2);
+
+    waitKey();
+    cout << "done" << endl;
+    return 0;
+}
--- a/samples/cpp/warpPerspective_demo.cpp
+++ b/samples/cpp/warpPerspective_demo.cpp
@@ -8,7 +8,6 @@
 #include "opencv2/imgproc.hpp"
 #include "opencv2/imgcodecs.hpp"
 #include "opencv2/highgui.hpp"
-#include "opencv2/calib3d.hpp"
 #include <iostream>

 using namespace std;
@@ -36,6 +35,7 @@ Mat warping(Mat image, Size warped_image_size, vector< Point2f> srcPoints, vecto
 String windowTitle = "Perspective Transformation Demo";
 String labels[4] = { "TL","TR","BR","BL" };
 vector< Point2f> roi_corners;
+vector< Point2f> midpoints(4);
 vector< Point2f> dst_corners(4);
 int roiIndex = 0;
 bool dragging;
@@ -99,21 +99,26 @@ int main(int argc, char** argv)

            imshow( windowTitle, image );

+            midpoints[0] = (roi_corners[0] + roi_corners[1]) / 2;
+            midpoints[1] = (roi_corners[1] + roi_corners[2]) / 2;
+            midpoints[2] = (roi_corners[2] + roi_corners[3]) / 2;
+            midpoints[3] = (roi_corners[3] + roi_corners[0]) / 2;
+
            dst_corners[0].x = 0;
            dst_corners[0].y = 0;
-            dst_corners[1].x = (float)std::max(norm(roi_corners[0] - roi_corners[1]), norm(roi_corners[2] - roi_corners[3]));
+            dst_corners[1].x = (float)norm(midpoints[1] - midpoints[3]);
            dst_corners[1].y = 0;
-            dst_corners[2].x = (float)std::max(norm(roi_corners[0] - roi_corners[1]), norm(roi_corners[2] - roi_corners[3]));
-            dst_corners[2].y = (float)std::max(norm(roi_corners[1] - roi_corners[2]), norm(roi_corners[3] - roi_corners[0]));
+            dst_corners[2].x = dst_corners[1].x;
+            dst_corners[2].y = (float)norm(midpoints[0] - midpoints[2]);
            dst_corners[3].x = 0;
-            dst_corners[3].y = (float)std::max(norm(roi_corners[1] - roi_corners[2]), norm(roi_corners[3] - roi_corners[0]));
+            dst_corners[3].y = dst_corners[2].y;

            Size warped_image_size = Size(cvRound(dst_corners[2].x), cvRound(dst_corners[2].y));

-            Mat H = findHomography(roi_corners, dst_corners); //get homography
+            Mat M = getPerspectiveTransform(roi_corners, dst_corners);

            Mat warped_image;
-            warpPerspective(original_image, warped_image, H, warped_image_size); // do perspective transformation
+            warpPerspective(original_image, warped_image, M, warped_image_size); // do perspective transformation

            imshow("Warped Image", warped_image);
        }

--- a/samples/data/opencv-logo-white.png
+++ b/samples/data/opencv-logo-white.png
--- a/samples/data/opencv-logo.png
+++ b/samples/data/opencv-logo.png
--- a/samples/winrt/ImageManipulations/assets/StoreLogo.png
+++ b/samples/winrt/ImageManipulations/assets/StoreLogo.png
--- a/samples/winrt/ImageManipulations/assets/opencv-logo-150.png
+++ b/samples/winrt/ImageManipulations/assets/opencv-logo-150.png
--- a/samples/winrt/ImageManipulations/assets/opencv-logo-30.png
+++ b/samples/winrt/ImageManipulations/assets/opencv-logo-30.png
--- a/samples/winrt/ImageManipulations/assets/windows-sdk.scale-100.png
+++ b/samples/winrt/ImageManipulations/assets/windows-sdk.scale-100.png
--- a/samples/winrt/JavaScript/images/logo.scale-100.png
+++ b/samples/winrt/JavaScript/images/logo.scale-100.png
--- a/samples/winrt/JavaScript/images/smalllogo.scale-100.png
+++ b/samples/winrt/JavaScript/images/smalllogo.scale-100.png
--- a/samples/winrt/JavaScript/images/windows-sdk.png
+++ b/samples/winrt/JavaScript/images/windows-sdk.png
--- a/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/Logo.png
+++ b/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/Logo.png
--- a/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/SmallLogo.png
+++ b/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/SmallLogo.png
--- a/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/StoreLogo.png
+++ b/samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/StoreLogo.png