Merge pull request #15692 from alalek:core_tls_handle_thread_termination

bad4e5c3 · Alexander Alekhin · e86c888a · 17e2bf57 · bad4e5c3 · bad4e5c3
14 changed file
--- a/modules/core/include/opencv2/core/private.hpp
+++ b/modules/core/include/opencv2/core/private.hpp
@@ -53,6 +53,10 @@

 #include <opencv2/core/utils/trace.hpp>

+#ifdef ENABLE_INSTRUMENTATION
+#include "opencv2/core/utils/instrumentation.hpp"
+#endif
+
 #ifdef HAVE_EIGEN
 #  if defined __GNUC__ && defined __APPLE__
 #    pragma GCC diagnostic ignored "-Wshadow"

--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -63,30 +63,6 @@
 namespace cv
 {

-#ifdef CV_COLLECT_IMPL_DATA
-CV_EXPORTS void setImpl(int flags); // set implementation flags and reset storage arrays
-CV_EXPORTS void addImpl(int flag, const char* func = 0); // add implementation and function name to storage arrays
-// Get stored implementation flags and functions names arrays
-// Each implementation entry correspond to function name entry, so you can find which implementation was executed in which function
-CV_EXPORTS int getImpl(std::vector<int> &impl, std::vector<String> &funName);
-
-CV_EXPORTS bool useCollection(); // return implementation collection state
-CV_EXPORTS void setUseCollection(bool flag); // set implementation collection state
-
-#define CV_IMPL_PLAIN  0x01 // native CPU OpenCV implementation
-#define CV_IMPL_OCL    0x02 // OpenCL implementation
-#define CV_IMPL_IPP    0x04 // IPP implementation
-#define CV_IMPL_MT     0x10 // multithreaded implementation
-
-#define CV_IMPL_ADD(impl)                                                   \
-    if(cv::useCollection())                                                 \
-    {                                                                       \
-        cv::addImpl(impl, CV_Func);                                         \
-    }
-#else
-#define CV_IMPL_ADD(impl)
-#endif
-
 //! @addtogroup core_utils
 //! @{

@@ -726,61 +702,6 @@ private:
    AutoLock& operator = (const AutoLock&);
 };

-// TLS interface
-class CV_EXPORTS TLSDataContainer
-{
-protected:
-    TLSDataContainer();
-    virtual ~TLSDataContainer();
-
-    void  gatherData(std::vector<void*> &data) const;
-#if OPENCV_ABI_COMPATIBILITY > 300
-    void* getData() const;
-    void  release();
-
-private:
-#else
-    void  release();
-
-public:
-    void* getData() const;
-#endif
-    virtual void* createDataInstance() const = 0;
-    virtual void  deleteDataInstance(void* pData) const = 0;
-
-    int key_;
-
-public:
-    void cleanup(); //! Release created TLS data container objects. It is similar to release() call, but it keeps TLS container valid.
-};
-
-// Main TLS data class
-template <typename T>
-class TLSData : protected TLSDataContainer
-{
-public:
-    inline TLSData()        {}
-    inline ~TLSData()       { release();            } // Release key and delete associated data
-    inline T* get() const   { return (T*)getData(); } // Get data associated with key
-    inline T& getRef() const { T* ptr = (T*)getData(); CV_Assert(ptr); return *ptr; } // Get data associated with key
-
-    // Get data from all threads
-    inline void gather(std::vector<T*> &data) const
-    {
-        std::vector<void*> &dataVoid = reinterpret_cast<std::vector<void*>&>(data);
-        gatherData(dataVoid);
-    }
-
-    inline void cleanup() { TLSDataContainer::cleanup(); }
-
-private:
-    virtual void* createDataInstance() const CV_OVERRIDE {return new T;}                // Wrapper to allocate data by template
-    virtual void  deleteDataInstance(void* pData) const CV_OVERRIDE {delete (T*)pData;} // Wrapper to release data by template
-
-    // Disable TLS copy operations
-    TLSData(TLSData &) {}
-    TLSData& operator =(const TLSData &) {return *this;}
-};

 /** @brief Designed for command line parsing

@@ -1199,88 +1120,6 @@ public:
    std::vector<Node<OBJECT>*> m_childs;
 };

-// Instrumentation external interface
-namespace instr
-{
-
-#if !defined OPENCV_ABI_CHECK
-
-enum TYPE
-{
-    TYPE_GENERAL = 0,   // OpenCV API function, e.g. exported function
-    TYPE_MARKER,        // Information marker
-    TYPE_WRAPPER,       // Wrapper function for implementation
-    TYPE_FUN,           // Simple function call
-};
-
-enum IMPL
-{
-    IMPL_PLAIN = 0,
-    IMPL_IPP,
-    IMPL_OPENCL,
-};
-
-struct NodeDataTls
-{
-    NodeDataTls()
-    {
-        m_ticksTotal = 0;
-    }
-    uint64      m_ticksTotal;
-};
-
-class CV_EXPORTS NodeData
-{
-public:
-    NodeData(const char* funName = 0, const char* fileName = NULL, int lineNum = 0, void* retAddress = NULL, bool alwaysExpand = false, cv::instr::TYPE instrType = TYPE_GENERAL, cv::instr::IMPL implType = IMPL_PLAIN);
-    NodeData(NodeData &ref);
-    ~NodeData();
-    NodeData& operator=(const NodeData&);
-
-    cv::String          m_funName;
-    cv::instr::TYPE     m_instrType;
-    cv::instr::IMPL     m_implType;
-    const char*         m_fileName;
-    int                 m_lineNum;
-    void*               m_retAddress;
-    bool                m_alwaysExpand;
-    bool                m_funError;
-
-    volatile int         m_counter;
-    volatile uint64      m_ticksTotal;
-    TLSData<NodeDataTls> m_tls;
-    int                  m_threads;
-
-    // No synchronization
-    double getTotalMs()   const { return ((double)m_ticksTotal / cv::getTickFrequency()) * 1000; }
-    double getMeanMs()    const { return (((double)m_ticksTotal/m_counter) / cv::getTickFrequency()) * 1000; }
-};
-bool operator==(const NodeData& lhs, const NodeData& rhs);
-
-typedef Node<NodeData> InstrNode;
-
-CV_EXPORTS InstrNode* getTrace();
-
-#endif // !defined OPENCV_ABI_CHECK
-
-
-CV_EXPORTS bool       useInstrumentation();
-CV_EXPORTS void       setUseInstrumentation(bool flag);
-CV_EXPORTS void       resetTrace();
-
-enum FLAGS
-{
-    FLAGS_NONE              = 0,
-    FLAGS_MAPPING           = 0x01,
-    FLAGS_EXPAND_SAME_NAMES = 0x02,
-};
-
-CV_EXPORTS void       setFlags(FLAGS modeFlags);
-static inline void    setFlags(int modeFlags) { setFlags((FLAGS)modeFlags); }
-CV_EXPORTS FLAGS      getFlags();
-
-} // namespace instr
-

 namespace samples {

@@ -1355,6 +1194,13 @@ CV_EXPORTS int getThreadID();

 } //namespace cv

+#ifdef CV_COLLECT_IMPL_DATA
+#include "opencv2/core/utils/instrumentation.hpp"
+#else
+/// Collect implementation data on OpenCV function call. Requires ENABLE_IMPL_COLLECTION build option.
+#define CV_IMPL_ADD(impl)
+#endif
+
 #ifndef DISABLE_OPENCV_24_COMPATIBILITY
 #include "opencv2/core/core_c.h"
 #endif

--- a/modules/core/include/opencv2/core/utils/instrumentation.hpp
+++ b/modules/core/include/opencv2/core/utils/instrumentation.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UTILS_INSTR_HPP
+#define OPENCV_UTILS_INSTR_HPP
+
+#include <opencv2/core/utility.hpp>
+#include <opencv2/core/utils/tls.hpp>
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+#ifdef CV_COLLECT_IMPL_DATA
+CV_EXPORTS void setImpl(int flags); // set implementation flags and reset storage arrays
+CV_EXPORTS void addImpl(int flag, const char* func = 0); // add implementation and function name to storage arrays
+// Get stored implementation flags and functions names arrays
+// Each implementation entry correspond to function name entry, so you can find which implementation was executed in which function
+CV_EXPORTS int getImpl(std::vector<int> &impl, std::vector<String> &funName);
+
+CV_EXPORTS bool useCollection(); // return implementation collection state
+CV_EXPORTS void setUseCollection(bool flag); // set implementation collection state
+
+#define CV_IMPL_PLAIN  0x01 // native CPU OpenCV implementation
+#define CV_IMPL_OCL    0x02 // OpenCL implementation
+#define CV_IMPL_IPP    0x04 // IPP implementation
+#define CV_IMPL_MT     0x10 // multithreaded implementation
+
+#undef CV_IMPL_ADD
+#define CV_IMPL_ADD(impl)                                                   \
+    if(cv::useCollection())                                                 \
+    {                                                                       \
+        cv::addImpl(impl, CV_Func);                                         \
+    }
+#endif
+
+// Instrumentation external interface
+namespace instr
+{
+
+#if !defined OPENCV_ABI_CHECK
+
+enum TYPE
+{
+    TYPE_GENERAL = 0,   // OpenCV API function, e.g. exported function
+    TYPE_MARKER,        // Information marker
+    TYPE_WRAPPER,       // Wrapper function for implementation
+    TYPE_FUN,           // Simple function call
+};
+
+enum IMPL
+{
+    IMPL_PLAIN = 0,
+    IMPL_IPP,
+    IMPL_OPENCL,
+};
+
+struct NodeDataTls
+{
+    NodeDataTls()
+    {
+        m_ticksTotal = 0;
+    }
+    uint64      m_ticksTotal;
+};
+
+class CV_EXPORTS NodeData
+{
+public:
+    NodeData(const char* funName = 0, const char* fileName = NULL, int lineNum = 0, void* retAddress = NULL, bool alwaysExpand = false, cv::instr::TYPE instrType = TYPE_GENERAL, cv::instr::IMPL implType = IMPL_PLAIN);
+    NodeData(NodeData &ref);
+    ~NodeData();
+    NodeData& operator=(const NodeData&);
+
+    cv::String          m_funName;
+    cv::instr::TYPE     m_instrType;
+    cv::instr::IMPL     m_implType;
+    const char*         m_fileName;
+    int                 m_lineNum;
+    void*               m_retAddress;
+    bool                m_alwaysExpand;
+    bool                m_funError;
+
+    volatile int         m_counter;
+    volatile uint64      m_ticksTotal;
+    TLSDataAccumulator<NodeDataTls> m_tls;
+    int                  m_threads;
+
+    // No synchronization
+    double getTotalMs()   const { return ((double)m_ticksTotal / cv::getTickFrequency()) * 1000; }
+    double getMeanMs()    const { return (((double)m_ticksTotal/m_counter) / cv::getTickFrequency()) * 1000; }
+};
+bool operator==(const NodeData& lhs, const NodeData& rhs);
+
+typedef Node<NodeData> InstrNode;
+
+CV_EXPORTS InstrNode* getTrace();
+
+#endif // !defined OPENCV_ABI_CHECK
+
+
+CV_EXPORTS bool       useInstrumentation();
+CV_EXPORTS void       setUseInstrumentation(bool flag);
+CV_EXPORTS void       resetTrace();
+
+enum FLAGS
+{
+    FLAGS_NONE              = 0,
+    FLAGS_MAPPING           = 0x01,
+    FLAGS_EXPAND_SAME_NAMES = 0x02,
+};
+
+CV_EXPORTS void       setFlags(FLAGS modeFlags);
+static inline void    setFlags(int modeFlags) { setFlags((FLAGS)modeFlags); }
+CV_EXPORTS FLAGS      getFlags();
+
+} // namespace instr
+
+//! @}
+
+} // namespace
+
+#endif // OPENCV_UTILS_TLS_HPP
--- a/modules/core/include/opencv2/core/utils/tls.hpp
+++ b/modules/core/include/opencv2/core/utils/tls.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UTILS_TLS_HPP
+#define OPENCV_UTILS_TLS_HPP
+
+#include <opencv2/core/utility.hpp>
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+namespace details { class TlsStorage; }
+
+/** TLS container base implementation
+ *
+ * Don't use directly.
+ *
+ * @sa TLSData, TLSDataAccumulator templates
+ */
+class CV_EXPORTS TLSDataContainer
+{
+protected:
+    TLSDataContainer();
+    virtual ~TLSDataContainer();
+
+    /// @deprecated use detachData() instead
+    void  gatherData(std::vector<void*> &data) const;
+    /// get TLS data and detach all data from threads (similar to cleanup() call)
+    void  detachData(std::vector<void*>& data);
+
+    void* getData() const;
+    void  release();
+
+protected:
+    virtual void* createDataInstance() const = 0;
+    virtual void  deleteDataInstance(void* pData) const = 0;
+
+#if OPENCV_ABI_COMPATIBILITY > 300
+private:
+#else
+public:
+#endif
+    int key_;
+
+    friend class cv::details::TlsStorage;  // core/src/system.cpp
+
+public:
+    void cleanup(); //!< Release created TLS data container objects. It is similar to release() call, but it keeps TLS container valid.
+
+private:
+    // Disable copy/assign (noncopyable pattern)
+    TLSDataContainer(TLSDataContainer &);
+    TLSDataContainer& operator =(const TLSDataContainer &);
+};
+
+
+/** @brief Simple TLS data class
+ *
+ * @sa TLSDataAccumulator
+ */
+template <typename T>
+class TLSData : protected TLSDataContainer
+{
+public:
+    inline TLSData() {}
+    inline ~TLSData() { release(); }
+
+    inline T* get() const   { return (T*)getData(); }  //!< Get data associated with key
+    inline T& getRef() const { T* ptr = (T*)getData(); CV_DbgAssert(ptr); return *ptr; }  //!< Get data associated with key
+
+    /// Release associated thread data
+    inline void cleanup()
+    {
+        TLSDataContainer::cleanup();
+    }
+
+protected:
+    /// Wrapper to allocate data by template
+    virtual void* createDataInstance() const CV_OVERRIDE { return new T; }
+    /// Wrapper to release data by template
+    virtual void  deleteDataInstance(void* pData) const CV_OVERRIDE { delete (T*)pData; }
+};
+
+
+/// TLS data accumulator with gathering methods
+template <typename T>
+class TLSDataAccumulator : public TLSData<T>
+{
+    mutable cv::Mutex mutex;
+    mutable std::vector<T*> dataFromTerminatedThreads;
+    std::vector<T*> detachedData;
+    bool cleanupMode;
+public:
+    TLSDataAccumulator() : cleanupMode(false) {}
+    ~TLSDataAccumulator()
+    {
+        release();
+    }
+
+    /** @brief Get data from all threads
+     * @deprecated replaced by detachData()
+     *
+     * Lifetime of vector data is valid until next detachData()/cleanup()/release() calls
+     *
+     * @param[out] data result buffer (should be empty)
+     */
+    void gather(std::vector<T*> &data) const
+    {
+        CV_Assert(cleanupMode == false);  // state is not valid
+        CV_Assert(data.empty());
+        {
+            std::vector<void*> &dataVoid = reinterpret_cast<std::vector<void*>&>(data);
+            TLSDataContainer::gatherData(dataVoid);
+        }
+        {
+            AutoLock lock(mutex);
+            data.reserve(data.size() + dataFromTerminatedThreads.size());
+            for (typename std::vector<T*>::const_iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+            {
+                data.push_back((T*)*i);
+            }
+        }
+    }
+
+    /** @brief Get and detach data from all threads
+     *
+     * Call cleanupDetachedData() when returned vector is not needed anymore.
+     *
+     * @return Vector with associated data. Content is preserved (including lifetime of attached data pointers) until next detachData()/cleanupDetachedData()/cleanup()/release() calls
+     */
+    std::vector<T*>& detachData()
+    {
+        CV_Assert(cleanupMode == false);  // state is not valid
+        std::vector<void*> dataVoid;
+        {
+            TLSDataContainer::detachData(dataVoid);
+        }
+        {
+            AutoLock lock(mutex);
+            detachedData.reserve(dataVoid.size() + dataFromTerminatedThreads.size());
+            for (typename std::vector<T*>::const_iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+            {
+                detachedData.push_back((T*)*i);
+            }
+            dataFromTerminatedThreads.clear();
+            for (typename std::vector<void*>::const_iterator i = dataVoid.begin(); i != dataVoid.end(); ++i)
+            {
+                detachedData.push_back((T*)(void*)*i);
+            }
+        }
+        dataVoid.clear();
+        return detachedData;
+    }
+
+    /// Release associated thread data returned by detachData() call
+    void cleanupDetachedData()
+    {
+        AutoLock lock(mutex);
+        cleanupMode = true;
+        _cleanupDetachedData();
+        cleanupMode = false;
+    }
+
+    /// Release associated thread data
+    void cleanup()
+    {
+        cleanupMode = true;
+        TLSDataContainer::cleanup();
+
+        AutoLock lock(mutex);
+        _cleanupDetachedData();
+        _cleanupTerminatedData();
+        cleanupMode = false;
+    }
+
+    /// Release associated thread data and free TLS key
+    void release()
+    {
+        cleanupMode = true;
+        TLSDataContainer::release();
+        {
+            AutoLock lock(mutex);
+            _cleanupDetachedData();
+            _cleanupTerminatedData();
+        }
+    }
+
+protected:
+    // synchronized
+    void _cleanupDetachedData()
+    {
+        for (typename std::vector<T*>::iterator i = detachedData.begin(); i != detachedData.end(); ++i)
+        {
+            deleteDataInstance((T*)*i);
+        }
+        detachedData.clear();
+    }
+
+    // synchronized
+    void _cleanupTerminatedData()
+    {
+        for (typename std::vector<T*>::iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+        {
+            deleteDataInstance((T*)*i);
+        }
+        dataFromTerminatedThreads.clear();
+    }
+
+protected:
+    virtual void* createDataInstance() const CV_OVERRIDE
+    {
+        // Note: we can collect all allocated data here, but this would require raced mutex locks
+        return new T;
+    }
+    virtual void  deleteDataInstance(void* pData) const CV_OVERRIDE
+    {
+        if (cleanupMode)
+        {
+            delete (T*)pData;
+        }
+        else
+        {
+            AutoLock lock(mutex);
+            dataFromTerminatedThreads.push_back((T*)pData);
+        }
+    }
+};
+
+
+//! @}
+
+} // namespace
+
+#endif // OPENCV_UTILS_TLS_HPP
--- a/modules/core/include/opencv2/core/utils/trace.private.hpp
+++ b/modules/core/include/opencv2/core/utils/trace.private.hpp
@@ -9,6 +9,8 @@

 #include <opencv2/core/utils/logger.hpp>

+#include <opencv2/core/utils/tls.hpp>
+
 #include "trace.hpp"

 //! @cond IGNORED
@@ -332,7 +334,7 @@ public:
    Mutex mutexCreate;
    Mutex mutexCount;

-    TLSData<TraceManagerThreadLocal> tls;
+    TLSDataAccumulator<TraceManagerThreadLocal> tls;

    cv::Ptr<TraceStorage> trace_storage;
 private:

--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -909,19 +909,19 @@ bool haveOpenCL()

 bool useOpenCL()
 {
-    CoreTLSData* data = getCoreTlsData().get();
-    if( data->useOpenCL < 0 )
+    CoreTLSData& data = getCoreTlsData();
+    if (data.useOpenCL < 0)
    {
        try
        {
-            data->useOpenCL = (int)(haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available()) ? 1 : 0;
+            data.useOpenCL = (int)(haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available()) ? 1 : 0;
        }
        catch (...)
        {
-            data->useOpenCL = 0;
+            data.useOpenCL = 0;
        }
    }
-    return data->useOpenCL > 0;
+    return data.useOpenCL > 0;
 }

 #ifdef HAVE_OPENCL
@@ -937,14 +937,14 @@ void setUseOpenCL(bool flag)
 {
    CV_TRACE_FUNCTION();

-    CoreTLSData* data = getCoreTlsData().get();
+    CoreTLSData& data = getCoreTlsData();
    if (!flag)
    {
-        data->useOpenCL = 0;
+        data.useOpenCL = 0;
    }
    else if( haveOpenCL() )
    {
-        data->useOpenCL = (Device::getDefault().ptr() != NULL) ? 1 : 0;
+        data.useOpenCL = (Device::getDefault().ptr() != NULL) ? 1 : 0;
    }
 }

@@ -1655,7 +1655,7 @@ size_t Device::profilingTimerResolution() const
 const Device& Device::getDefault()
 {
    const Context& ctx = Context::getDefault();
-    int idx = getCoreTlsData().get()->device;
+    int idx = getCoreTlsData().device;
    const Device& device = ctx.device(idx);
    return device;
 }
@@ -2562,9 +2562,10 @@ void attachContext(const String& platformName, void* platformID, void* context,
    CV_OCL_CHECK(clRetainContext((cl_context)context));

    // clear command queue, if any
-    getCoreTlsData().get()->oclQueue.finish();
+    CoreTLSData& data = getCoreTlsData();
+    data.oclQueue.finish();
    Queue q;
-    getCoreTlsData().get()->oclQueue = q;
+    data.oclQueue = q;

    return;
 } // attachContext()
@@ -2752,7 +2753,7 @@ void* Queue::ptr() const

 Queue& Queue::getDefault()
 {
-    Queue& q = getCoreTlsData().get()->oclQueue;
+    Queue& q = getCoreTlsData().oclQueue;
    if( !q.p && haveOpenCL() )
        q.create(Context::getDefault());
    return q;

--- a/modules/core/src/ovx.cpp
+++ b/modules/core/src/ovx.cpp
@@ -76,13 +76,13 @@ bool haveOpenVX()
 bool useOpenVX()
 {
 #ifdef HAVE_OPENVX
-    CoreTLSData* data = getCoreTlsData().get();
-    if( data->useOpenVX < 0 )
+    CoreTLSData& data = getCoreTlsData();
+    if (data.useOpenVX < 0)
    {
        // enabled (if available) by default
-        data->useOpenVX = haveOpenVX() ? 1 : 0;
+        data.useOpenVX = haveOpenVX() ? 1 : 0;
    }
-    return data->useOpenVX > 0;
+    return data.useOpenVX > 0;
 #else
    return false;
 #endif
@@ -93,8 +93,8 @@ void setUseOpenVX(bool flag)
 #ifdef HAVE_OPENVX
    if( haveOpenVX() )
    {
-        CoreTLSData* data = getCoreTlsData().get();
-        data->useOpenVX = flag ? 1 : 0;
+        CoreTLSData& data = getCoreTlsData();
+        data.useOpenVX = flag ? 1 : 0;
    }
 #else
    CV_Assert(!flag && "OpenVX support isn't enabled at compile time");

--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -354,7 +354,7 @@ struct CoreTLSData
 #endif
 };

-TLSData<CoreTLSData>& getCoreTlsData();
+CoreTLSData& getCoreTlsData();

 #if defined(BUILD_SHARED_LIBS)
 #if defined _WIN32 || defined WINCE

--- a/modules/core/src/rand.cpp
+++ b/modules/core/src/rand.cpp
@@ -770,7 +770,7 @@ void RNG::fill( InputOutputArray _mat, int disttype,

 cv::RNG& cv::theRNG()
 {
-    return getCoreTlsData().get()->rng;
+    return getCoreTlsData().rng;
 }

 void cv::setRNGSeed(int seed)

--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -50,6 +50,9 @@

 #include <opencv2/core/utils/logger.hpp>

+#include <opencv2/core/utils/tls.hpp>
+#include <opencv2/core/utils/instrumentation.hpp>
+
 namespace cv {

 static Mutex* __initialization_mutex = NULL;
@@ -1375,6 +1378,8 @@ bool Mutex::trylock() { return impl->trylock(); }

 //////////////////////////////// thread-local storage ////////////////////////////////

+namespace details {
+
 #ifdef _WIN32
 #ifdef _MSC_VER
 #pragma warning(disable:4505) // unreferenced local function has been removed
@@ -1436,9 +1441,10 @@ void  TlsAbstraction::SetData(void *pData)
 }
 #endif
 #else // _WIN32
+static void opencv_tls_destructor(void* pData);
 TlsAbstraction::TlsAbstraction()
 {
-    CV_Assert(pthread_key_create(&tlsKey, NULL) == 0);
+    CV_Assert(pthread_key_create(&tlsKey, opencv_tls_destructor) == 0);
 }
 TlsAbstraction::~TlsAbstraction()
 {
@@ -1479,42 +1485,46 @@ public:
    }
    ~TlsStorage()
    {
-        for(size_t i = 0; i < threads.size(); i++)
-        {
-            if(threads[i])
-            {
-                /* Current architecture doesn't allow proper global objects release, so this check can cause crashes
-
-                // Check if all slots were properly cleared
-                for(size_t j = 0; j < threads[i]->slots.size(); j++)
-                {
-                    CV_Assert(threads[i]->slots[j] == 0);
-                }
-                */
-                delete threads[i];
-            }
-        }
-        threads.clear();
+        // TlsStorage object should not be released
+        // There is no reliable way to avoid problems caused by static initialization order fiasco
+        CV_LOG_FATAL(NULL, "TlsStorage::~TlsStorage() call is not expected");
    }

-    void releaseThread()
+    void releaseThread(void* tlsValue = NULL)
    {
+        ThreadData *pTD = tlsValue == NULL ? (ThreadData*)tls.GetData() : (ThreadData*)tlsValue;
+        if (pTD == NULL)
+            return;  // no OpenCV TLS data for this thread
        AutoLock guard(mtxGlobalAccess);
-        ThreadData *pTD = (ThreadData*)tls.GetData();
-        for(size_t i = 0; i < threads.size(); i++)
+        for (size_t i = 0; i < threads.size(); i++)
        {
-            if(pTD == threads[i])
+            if (pTD == threads[i])
            {
-                threads[i] = 0;
-                break;
+                threads[i] = NULL;
+                if (tlsValue == NULL)
+                    tls.SetData(0);
+                std::vector<void*>& thread_slots = pTD->slots;
+                for (size_t slotIdx = 0; slotIdx < thread_slots.size(); slotIdx++)
+                {
+                    void* pData = thread_slots[slotIdx];
+                    thread_slots[slotIdx] = NULL;
+                    if (!pData)
+                        continue;
+                    TLSDataContainer* container = tlsSlots[slotIdx].container;
+                    if (container)
+                        container->deleteDataInstance(pData);
+                    else
+                        CV_LOG_ERROR(NULL, "TLS: container for slotIdx=" << slotIdx << " is NULL. Can't release thread data");
+                }
+                delete pTD;
+                return;
            }
        }
-        tls.SetData(0);
-        delete pTD;
+        CV_LOG_WARNING(NULL, "TLS: Can't release thread TLS data (unknown pointer or data race): " << (void*)pTD);
    }

    // Reserve TLS storage index
-    size_t reserveSlot()
+    size_t reserveSlot(TLSDataContainer* container)
    {
        AutoLock guard(mtxGlobalAccess);
        CV_Assert(tlsSlotsSize == tlsSlots.size());
@@ -1522,15 +1532,15 @@ public:
        // Find unused slots
        for(size_t slot = 0; slot < tlsSlotsSize; slot++)
        {
-            if(!tlsSlots[slot])
+            if (tlsSlots[slot].container == NULL)
            {
-                tlsSlots[slot] = 1;
+                tlsSlots[slot].container = container;
                return slot;
            }
        }

        // Create new slot
-        tlsSlots.push_back(1); tlsSlotsSize++;
+        tlsSlots.push_back(TlsSlotInfo(container)); tlsSlotsSize++;
        return tlsSlotsSize - 1;
    }

@@ -1555,7 +1565,9 @@ public:
        }

        if (!keepSlot)
-            tlsSlots[slotIdx] = 0;
+        {
+            tlsSlots[slotIdx].container = NULL;  // mark slot as free (see reserveSlot() implementation)
+        }
    }

    // Get data by TLS storage index
@@ -1604,8 +1616,26 @@ public:
            tls.SetData((void*)threadData);
            {
                AutoLock guard(mtxGlobalAccess);
-                threadData->idx = threads.size();
-                threads.push_back(threadData);
+
+                bool found = false;
+                // Find unused slots
+                for(size_t slot = 0; slot < threads.size(); slot++)
+                {
+                    if (threads[slot] == NULL)
+                    {
+                        threadData->idx = (int)slot;
+                        threads[slot] = threadData;
+                        found = true;
+                        break;
+                    }
+                }
+
+                if (!found)
+                {
+                    // Create new slot
+                    threadData->idx = threads.size();
+                    threads.push_back(threadData);
+                }
            }
        }

@@ -1622,8 +1652,14 @@ private:

    Mutex  mtxGlobalAccess;           // Shared objects operation guard
    size_t tlsSlotsSize;              // equal to tlsSlots.size() in synchronized sections
-                                      // without synchronization this counter doesn't desrease - it is used for slotIdx sanity checks
-    std::vector<int> tlsSlots;        // TLS keys state
+                                      // without synchronization this counter doesn't decrease - it is used for slotIdx sanity checks
+
+    struct TlsSlotInfo
+    {
+        TlsSlotInfo(TLSDataContainer* _container) : container(_container) {}
+        TLSDataContainer* container;  // attached container (to dispose data of terminated threads)
+    };
+    std::vector<struct TlsSlotInfo> tlsSlots;  // TLS keys state
    std::vector<ThreadData*> threads; // Array for all allocated data. Thread data pointers are placed here to allow data cleanup
 };

@@ -1633,9 +1669,19 @@ static TlsStorage &getTlsStorage()
    CV_SINGLETON_LAZY_INIT_REF(TlsStorage, new TlsStorage())
 }

+#ifndef _WIN32  // pthread key destructor
+static void opencv_tls_destructor(void* pData)
+{
+    getTlsStorage().releaseThread(pData);
+}
+#endif
+
+} // namespace details
+using namespace details;
+
 TLSDataContainer::TLSDataContainer()
 {
-    key_ = (int)getTlsStorage().reserveSlot(); // Reserve key from TLS storage
+    key_ = (int)getTlsStorage().reserveSlot(this); // Reserve key from TLS storage
 }

 TLSDataContainer::~TLSDataContainer()
@@ -1648,11 +1694,17 @@ void TLSDataContainer::gatherData(std::vector<void*> &data) const
    getTlsStorage().gather(key_, data);
 }

+void TLSDataContainer::detachData(std::vector<void*> &data)
+{
+    getTlsStorage().releaseSlot(key_, data, true);
+}
+
 void TLSDataContainer::release()
 {
-    std::vector<void*> data;
-    data.reserve(32);
-    getTlsStorage().releaseSlot(key_, data); // Release key and get stored data for proper destruction
+    if (key_ == -1)
+        return;  // already released
+    std::vector<void*> data; data.reserve(32);
+    getTlsStorage().releaseSlot(key_, data, false); // Release key and get stored data for proper destruction
    key_ = -1;
    for(size_t i = 0; i < data.size(); i++)  // Delete all associated data
        deleteDataInstance(data[i]);
@@ -1660,8 +1712,7 @@ void TLSDataContainer::release()

 void TLSDataContainer::cleanup()
 {
-    std::vector<void*> data;
-    data.reserve(32);
+    std::vector<void*> data; data.reserve(32);
    getTlsStorage().releaseSlot(key_, data, true); // Extract stored data with removal from TLS tables
    for(size_t i = 0; i < data.size(); i++)  // Delete all associated data
        deleteDataInstance(data[i]);
@@ -1680,11 +1731,16 @@ void* TLSDataContainer::getData() const
    return pData;
 }

-TLSData<CoreTLSData>& getCoreTlsData()
+static TLSData<CoreTLSData>& getCoreTlsDataTLS()
 {
    CV_SINGLETON_LAZY_INIT_REF(TLSData<CoreTLSData>, new TLSData<CoreTLSData>())
 }

+CoreTLSData& getCoreTlsData()
+{
+    return getCoreTlsDataTLS().getRef();
+}
+
 #if defined CVAPI_EXPORTS && defined _WIN32 && !defined WINCE
 #ifdef WINRT
    #pragma warning(disable:4447) // Disable warning 'main' signature found without threading model
@@ -2338,12 +2394,12 @@ String getIppVersion()
 bool useIPP()
 {
 #ifdef HAVE_IPP
-    CoreTLSData* data = getCoreTlsData().get();
-    if(data->useIPP < 0)
+    CoreTLSData& data = getCoreTlsData();
+    if (data.useIPP < 0)
    {
-        data->useIPP = getIPPSingleton().useIPP;
+        data.useIPP = getIPPSingleton().useIPP;
    }
-    return (data->useIPP > 0);
+    return (data.useIPP > 0);
 #else
    return false;
 #endif
@@ -2351,24 +2407,24 @@ bool useIPP()

 void setUseIPP(bool flag)
 {
-    CoreTLSData* data = getCoreTlsData().get();
+    CoreTLSData& data = getCoreTlsData();
 #ifdef HAVE_IPP
-    data->useIPP = (getIPPSingleton().useIPP)?flag:false;
+    data.useIPP = (getIPPSingleton().useIPP)?flag:false;
 #else
    CV_UNUSED(flag);
-    data->useIPP = false;
+    data.useIPP = false;
 #endif
 }

 bool useIPP_NotExact()
 {
 #ifdef HAVE_IPP
-    CoreTLSData* data = getCoreTlsData().get();
-    if(data->useIPP_NE < 0)
+    CoreTLSData& data = getCoreTlsData();
+    if (data.useIPP_NE < 0)
    {
-        data->useIPP_NE = getIPPSingleton().useIPP_NE;
+        data.useIPP_NE = getIPPSingleton().useIPP_NE;
    }
-    return (data->useIPP_NE > 0);
+    return (data.useIPP_NE > 0);
 #else
    return false;
 #endif
@@ -2376,12 +2432,12 @@ bool useIPP_NotExact()

 void setUseIPP_NotExact(bool flag)
 {
-    CoreTLSData* data = getCoreTlsData().get();
+    CoreTLSData& data = getCoreTlsData();
 #ifdef HAVE_IPP
-    data->useIPP_NE = flag;
+    data.useIPP_NE = flag;
 #else
    CV_UNUSED(flag);
-    data->useIPP_NE = false;
+    data.useIPP_NE = false;
 #endif
 }

@@ -2407,7 +2463,7 @@ namespace tegra {

 bool useTegra()
 {
-    cv::CoreTLSData* data = cv::getCoreTlsData().get();
+    cv::CoreTLSData* data = cv::getCoreTlsData();

    if (data->useTegra < 0)
    {
@@ -2423,7 +2479,7 @@ bool useTegra()

 void setUseTegra(bool flag)
 {
-    cv::CoreTLSData* data = cv::getCoreTlsData().get();
+    cv::CoreTLSData* data = cv::getCoreTlsData();
    data->useTegra = flag;
 }


--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -43,6 +43,8 @@
 #include "opencl_kernels_core.hpp"
 #include "umatrix.hpp"

+#include <opencv2/core/utils/tls.hpp>
+
 ///////////////////////////////// UMat implementation ///////////////////////////////

 namespace cv {

--- a/modules/core/test/test_utils.cpp
+++ b/modules/core/test/test_utils.cpp
@@ -4,6 +4,8 @@
 #include "test_precomp.hpp"
 #include "opencv2/core/utils/logger.hpp"

+#include "test_utils_tls.impl.hpp"
+
 namespace opencv_test { namespace {

 static const char * const keys =

--- a/modules/core/test/test_utils_tls.impl.hpp
+++ b/modules/core/test/test_utils_tls.impl.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// This is .hpp file included from test_utils.cpp
+
+#ifdef CV_CXX11
+#include <thread>  // std::thread
+#endif
+
+#include "opencv2/core/utils/tls.hpp"
+
+namespace opencv_test { namespace {
+
+class TLSReporter
+{
+public:
+    static int g_last_id;
+    static int g_allocated;
+
+    int id;
+
+    TLSReporter()
+    {
+        id = CV_XADD(&g_last_id, 1);
+        CV_XADD(&g_allocated, 1);
+    }
+    ~TLSReporter()
+    {
+        CV_XADD(&g_allocated, -1);
+    }
+};
+
+int TLSReporter::g_last_id = 0;
+int TLSReporter::g_allocated = 0;
+
+#ifdef CV_CXX11
+
+template<typename T>
+static void callNThreadsWithTLS(int N, TLSData<T>& tls)
+{
+    std::vector<std::thread> threads(N);
+    for (int i = 0; i < N; i++)
+    {
+        threads[i] = std::thread([&]() {
+            TLSReporter* pData = tls.get();
+            (void)pData;
+        });
+    }
+    for (int i = 0; i < N; i++)
+    {
+        threads[i].join();
+    }
+    threads.clear();
+}
+
+TEST(Core_TLS, HandleThreadTermination)
+{
+    const int init_id = TLSReporter::g_last_id;
+    const int init_allocated = TLSReporter::g_allocated;
+
+    const int N = 4;
+    TLSData<TLSReporter> tls;
+
+    // use TLS
+    ASSERT_NO_THROW(callNThreadsWithTLS(N, tls));
+
+    EXPECT_EQ(init_id + N, TLSReporter::g_last_id);
+    EXPECT_EQ(init_allocated + 0, TLSReporter::g_allocated);
+}
+
+
+static void testTLSAccumulator(bool detachFirst)
+{
+    const int init_id = TLSReporter::g_last_id;
+    const int init_allocated = TLSReporter::g_allocated;
+
+    const int N = 4;
+    TLSDataAccumulator<TLSReporter> tls;
+
+    {  // empty TLS checks
+        std::vector<TLSReporter*>& data0 = tls.detachData();
+        EXPECT_EQ((size_t)0, data0.size());
+        tls.cleanupDetachedData();
+    }
+
+    // use TLS
+    ASSERT_NO_THROW(callNThreadsWithTLS(N, tls));
+
+    EXPECT_EQ(init_id + N, TLSReporter::g_last_id);
+    EXPECT_EQ(init_allocated + N, TLSReporter::g_allocated);
+
+    if (detachFirst)
+    {
+        std::vector<TLSReporter*>& data1 = tls.detachData();
+        EXPECT_EQ((size_t)N, data1.size());
+
+        // no data through gather after detachData()
+        std::vector<TLSReporter*> data2;
+        tls.gather(data2);
+        EXPECT_EQ((size_t)0, data2.size());
+
+        tls.cleanupDetachedData();
+
+        EXPECT_EQ(init_id + N, TLSReporter::g_last_id);
+        EXPECT_EQ(init_allocated + 0, TLSReporter::g_allocated);
+        EXPECT_EQ((size_t)0, data1.size());
+    }
+    else
+    {
+        std::vector<TLSReporter*> data2;
+        tls.gather(data2);
+        EXPECT_EQ((size_t)N, data2.size());
+
+        std::vector<TLSReporter*>& data1 = tls.detachData();
+        EXPECT_EQ((size_t)N, data1.size());
+
+        tls.cleanupDetachedData();
+
+        EXPECT_EQ((size_t)0, data1.size());
+        // data2 is not empty, but it has invalid contents
+        EXPECT_EQ((size_t)N, data2.size());
+    }
+
+    EXPECT_EQ(init_id + N, TLSReporter::g_last_id);
+    EXPECT_EQ(init_allocated + 0, TLSReporter::g_allocated);
+}
+
+TEST(Core_TLS, AccumulatorHoldData_detachData) { testTLSAccumulator(true); }
+TEST(Core_TLS, AccumulatorHoldData_gather) { testTLSAccumulator(false); }
+
+#endif
+
+}}  // namespace
--- a/modules/imgproc/src/histogram.cpp
+++ b/modules/imgproc/src/histogram.cpp
@@ -45,6 +45,8 @@

 #include "opencv2/core/openvx/ovx_defs.hpp"

+#include "opencv2/core/utils/tls.hpp"
+
 namespace cv
 {