/*M/////////////////////////////////////////////////////////////////////////////////////// // // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // // By downloading, copying, installing or using the software you agree to this license. // If you do not agree to this license, do not download, install, // copy or use the software. // // // License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2013, OpenCV Foundation, all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // // * Redistribution's of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. // // This software is provided by the copyright holders and contributors "as is" and // any express or implied warranties, including, but not limited to, the implied // warranties of merchantability and fitness for a particular purpose are disclaimed. 
// In no event shall the OpenCV Foundation or contributors be liable for any direct, // indirect, incidental, special, exemplary, or consequential damages // (including, but not limited to, procurement of substitute goods or services; // loss of use, data, or profits; or business interruption) however caused // and on any theory of liability, whether in contract, strict liability, // or tort (including negligence or otherwise) arising in any way out of // the use of this software, even if advised of the possibility of such damage. // //M*/ #include "precomp.hpp" #include #include #include #include // std::cerr #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp" #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp" #ifdef HAVE_OPENCL #include "opencv2/core/opencl/runtime/opencl_core.hpp" #else // TODO FIXIT: This file can't be build without OPENCL /* Part of the file is an extract from the standard OpenCL headers from Khronos site. Below is the original copyright. */ /******************************************************************************* * Copyright (c) 2008 - 2012 The Khronos Group Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and/or associated documentation files (the * "Materials"), to deal in the Materials without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Materials, and to * permit persons to whom the Materials are furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Materials. * * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. ******************************************************************************/ #if 0 //defined __APPLE__ #define HAVE_OPENCL 1 #else #undef HAVE_OPENCL #endif #define OPENCV_CL_NOT_IMPLEMENTED -1000 #ifdef HAVE_OPENCL #if defined __APPLE__ #include #else #include #endif static const bool g_haveOpenCL = true; #else extern "C" { struct _cl_platform_id { int dummy; }; struct _cl_device_id { int dummy; }; struct _cl_context { int dummy; }; struct _cl_command_queue { int dummy; }; struct _cl_mem { int dummy; }; struct _cl_program { int dummy; }; struct _cl_kernel { int dummy; }; struct _cl_event { int dummy; }; struct _cl_sampler { int dummy; }; typedef struct _cl_platform_id * cl_platform_id; typedef struct _cl_device_id * cl_device_id; typedef struct _cl_context * cl_context; typedef struct _cl_command_queue * cl_command_queue; typedef struct _cl_mem * cl_mem; typedef struct _cl_program * cl_program; typedef struct _cl_kernel * cl_kernel; typedef struct _cl_event * cl_event; typedef struct _cl_sampler * cl_sampler; typedef int cl_int; typedef unsigned cl_uint; #if defined (_WIN32) && defined(_MSC_VER) typedef __int64 cl_long; typedef unsigned __int64 cl_ulong; #else typedef long cl_long; typedef unsigned long cl_ulong; #endif typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. 
*/ typedef cl_ulong cl_bitfield; typedef cl_bitfield cl_device_type; typedef cl_uint cl_platform_info; typedef cl_uint cl_device_info; typedef cl_bitfield cl_device_fp_config; typedef cl_uint cl_device_mem_cache_type; typedef cl_uint cl_device_local_mem_type; typedef cl_bitfield cl_device_exec_capabilities; typedef cl_bitfield cl_command_queue_properties; typedef intptr_t cl_device_partition_property; typedef cl_bitfield cl_device_affinity_domain; typedef intptr_t cl_context_properties; typedef cl_uint cl_context_info; typedef cl_uint cl_command_queue_info; typedef cl_uint cl_channel_order; typedef cl_uint cl_channel_type; typedef cl_bitfield cl_mem_flags; typedef cl_uint cl_mem_object_type; typedef cl_uint cl_mem_info; typedef cl_bitfield cl_mem_migration_flags; typedef cl_uint cl_image_info; typedef cl_uint cl_buffer_create_type; typedef cl_uint cl_addressing_mode; typedef cl_uint cl_filter_mode; typedef cl_uint cl_sampler_info; typedef cl_bitfield cl_map_flags; typedef cl_uint cl_program_info; typedef cl_uint cl_program_build_info; typedef cl_uint cl_program_binary_type; typedef cl_int cl_build_status; typedef cl_uint cl_kernel_info; typedef cl_uint cl_kernel_arg_info; typedef cl_uint cl_kernel_arg_address_qualifier; typedef cl_uint cl_kernel_arg_access_qualifier; typedef cl_bitfield cl_kernel_arg_type_qualifier; typedef cl_uint cl_kernel_work_group_info; typedef cl_uint cl_event_info; typedef cl_uint cl_command_type; typedef cl_uint cl_profiling_info; typedef struct _cl_image_format { cl_channel_order image_channel_order; cl_channel_type image_channel_data_type; } cl_image_format; typedef struct _cl_image_desc { cl_mem_object_type image_type; size_t image_width; size_t image_height; size_t image_depth; size_t image_array_size; size_t image_row_pitch; size_t image_slice_pitch; cl_uint num_mip_levels; cl_uint num_samples; cl_mem buffer; } cl_image_desc; typedef struct _cl_buffer_region { size_t origin; size_t size; } cl_buffer_region; 
//////////////////////////////////////////////////////////

// Numeric constants of the OpenCL host API, declared locally so that this
// file does not need the OpenCL SDK headers. Each directive sits on its own
// line; the values themselves are unchanged.

// Error codes
#define CL_SUCCESS 0
#define CL_DEVICE_NOT_FOUND -1
#define CL_DEVICE_NOT_AVAILABLE -2
#define CL_COMPILER_NOT_AVAILABLE -3
#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
#define CL_OUT_OF_RESOURCES -5
#define CL_OUT_OF_HOST_MEMORY -6
#define CL_PROFILING_INFO_NOT_AVAILABLE -7
#define CL_MEM_COPY_OVERLAP -8
#define CL_IMAGE_FORMAT_MISMATCH -9
#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
#define CL_BUILD_PROGRAM_FAILURE -11
#define CL_MAP_FAILURE -12
#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13
#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
#define CL_COMPILE_PROGRAM_FAILURE -15
#define CL_LINKER_NOT_AVAILABLE -16
#define CL_LINK_PROGRAM_FAILURE -17
#define CL_DEVICE_PARTITION_FAILED -18
#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19

#define CL_INVALID_VALUE -30
#define CL_INVALID_DEVICE_TYPE -31
#define CL_INVALID_PLATFORM -32
#define CL_INVALID_DEVICE -33
#define CL_INVALID_CONTEXT -34
#define CL_INVALID_QUEUE_PROPERTIES -35
#define CL_INVALID_COMMAND_QUEUE -36
#define CL_INVALID_HOST_PTR -37
#define CL_INVALID_MEM_OBJECT -38
#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
#define CL_INVALID_IMAGE_SIZE -40
#define CL_INVALID_SAMPLER -41
#define CL_INVALID_BINARY -42
#define CL_INVALID_BUILD_OPTIONS -43
#define CL_INVALID_PROGRAM -44
#define CL_INVALID_PROGRAM_EXECUTABLE -45
#define CL_INVALID_KERNEL_NAME -46
#define CL_INVALID_KERNEL_DEFINITION -47
#define CL_INVALID_KERNEL -48
#define CL_INVALID_ARG_INDEX -49
#define CL_INVALID_ARG_VALUE -50
#define CL_INVALID_ARG_SIZE -51
#define CL_INVALID_KERNEL_ARGS -52
#define CL_INVALID_WORK_DIMENSION -53
#define CL_INVALID_WORK_GROUP_SIZE -54
#define CL_INVALID_WORK_ITEM_SIZE -55
#define CL_INVALID_GLOBAL_OFFSET -56
#define CL_INVALID_EVENT_WAIT_LIST -57
#define CL_INVALID_EVENT -58
#define CL_INVALID_OPERATION -59
#define CL_INVALID_GL_OBJECT -60
#define CL_INVALID_BUFFER_SIZE -61
#define CL_INVALID_MIP_LEVEL -62
#define CL_INVALID_GLOBAL_WORK_SIZE -63
#define CL_INVALID_PROPERTY -64
#define CL_INVALID_IMAGE_DESCRIPTOR -65
#define CL_INVALID_COMPILER_OPTIONS -66
#define CL_INVALID_LINKER_OPTIONS -67
#define CL_INVALID_DEVICE_PARTITION_COUNT -68

/*#define CL_VERSION_1_0 1
#define CL_VERSION_1_1 1
#define CL_VERSION_1_2 1*/

// Boolean and blocking flags
#define CL_FALSE 0
#define CL_TRUE 1
#define CL_BLOCKING CL_TRUE
#define CL_NON_BLOCKING CL_FALSE

// Platform queries
#define CL_PLATFORM_PROFILE 0x0900
#define CL_PLATFORM_VERSION 0x0901
#define CL_PLATFORM_NAME 0x0902
#define CL_PLATFORM_VENDOR 0x0903
#define CL_PLATFORM_EXTENSIONS 0x0904

// Device type bits
#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
#define CL_DEVICE_TYPE_CPU (1 << 1)
#define CL_DEVICE_TYPE_GPU (1 << 2)
#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
#define CL_DEVICE_TYPE_CUSTOM (1 << 4)
#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF

// Device queries
#define CL_DEVICE_TYPE 0x1000
#define CL_DEVICE_VENDOR_ID 0x1001
#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
#define CL_DEVICE_ADDRESS_BITS 0x100D
#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
#define CL_DEVICE_IMAGE_SUPPORT 0x1016
#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
#define CL_DEVICE_MAX_SAMPLERS 0x1018
#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
#define CL_DEVICE_ENDIAN_LITTLE 0x1026
#define CL_DEVICE_AVAILABLE 0x1027
#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
#define CL_DEVICE_QUEUE_PROPERTIES 0x102A
#define CL_DEVICE_NAME 0x102B
#define CL_DEVICE_VENDOR 0x102C
#define CL_DRIVER_VERSION 0x102D
#define CL_DEVICE_PROFILE 0x102E
#define CL_DEVICE_VERSION 0x102F
#define CL_DEVICE_EXTENSIONS 0x1030
#define CL_DEVICE_PLATFORM 0x1031
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
#define CL_DEVICE_HALF_FP_CONFIG 0x1033
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
#define CL_DEVICE_OPENCL_C_VERSION 0x103D
#define CL_DEVICE_LINKER_AVAILABLE 0x103E
#define CL_DEVICE_BUILT_IN_KERNELS 0x103F
#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040
#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041
#define CL_DEVICE_PARENT_DEVICE 0x1042
#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043
#define CL_DEVICE_PARTITION_PROPERTIES 0x1044
#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045
#define CL_DEVICE_PARTITION_TYPE 0x1046
#define CL_DEVICE_REFERENCE_COUNT 0x1047
#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048
#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049
#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A
#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B

// Floating-point capability bits
#define CL_FP_DENORM (1 << 0)
#define CL_FP_INF_NAN (1 << 1)
#define CL_FP_ROUND_TO_NEAREST (1 << 2)
#define CL_FP_ROUND_TO_ZERO (1 << 3)
#define CL_FP_ROUND_TO_INF (1 << 4)
#define CL_FP_FMA (1 << 5)
#define CL_FP_SOFT_FLOAT (1 << 6)
#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7)

#define CL_NONE 0x0
#define CL_READ_ONLY_CACHE 0x1
#define CL_READ_WRITE_CACHE 0x2
#define CL_LOCAL 0x1
#define CL_GLOBAL 0x2
#define CL_EXEC_KERNEL (1 << 0)
#define CL_EXEC_NATIVE_KERNEL (1 << 1)
#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
#define CL_QUEUE_PROFILING_ENABLE (1 << 1)

// Context queries and device partitioning
#define CL_CONTEXT_REFERENCE_COUNT 0x1080
#define CL_CONTEXT_DEVICES 0x1081
#define CL_CONTEXT_PROPERTIES 0x1082
#define CL_CONTEXT_NUM_DEVICES 0x1083
#define CL_CONTEXT_PLATFORM 0x1084
#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085

#define CL_DEVICE_PARTITION_EQUALLY 0x1086
#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087
#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088
#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0)
#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1)
#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2)
#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3)
#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4)
#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)

// Command-queue queries
#define CL_QUEUE_CONTEXT 0x1090
#define CL_QUEUE_DEVICE 0x1091
#define CL_QUEUE_REFERENCE_COUNT 0x1092
#define CL_QUEUE_PROPERTIES 0x1093

// Memory-object flags
#define CL_MEM_READ_WRITE (1 << 0)
#define CL_MEM_WRITE_ONLY (1 << 1)
#define CL_MEM_READ_ONLY (1 << 2)
#define CL_MEM_USE_HOST_PTR (1 << 3)
#define CL_MEM_ALLOC_HOST_PTR (1 << 4)
#define CL_MEM_COPY_HOST_PTR (1 << 5)
// reserved (1 << 6)
#define CL_MEM_HOST_WRITE_ONLY (1 << 7)
#define CL_MEM_HOST_READ_ONLY (1 << 8)
#define CL_MEM_HOST_NO_ACCESS (1 << 9)
#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0)
#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1)

// Image channel orders
#define CL_R 0x10B0
#define CL_A 0x10B1
#define CL_RG 0x10B2
#define CL_RA 0x10B3
#define CL_RGB 0x10B4
#define CL_RGBA 0x10B5
#define CL_BGRA 0x10B6
#define CL_ARGB 0x10B7
#define CL_INTENSITY 0x10B8
#define CL_LUMINANCE 0x10B9
#define CL_Rx 0x10BA
#define CL_RGx 0x10BB
#define CL_RGBx 0x10BC
#define CL_DEPTH 0x10BD
#define CL_DEPTH_STENCIL 0x10BE

// Image channel data types
#define CL_SNORM_INT8 0x10D0
#define CL_SNORM_INT16 0x10D1
#define CL_UNORM_INT8 0x10D2
#define CL_UNORM_INT16 0x10D3
#define CL_UNORM_SHORT_565 0x10D4
#define CL_UNORM_SHORT_555 0x10D5
#define CL_UNORM_INT_101010 0x10D6
#define CL_SIGNED_INT8 0x10D7
#define CL_SIGNED_INT16 0x10D8
#define CL_SIGNED_INT32 0x10D9
#define CL_UNSIGNED_INT8 0x10DA
#define CL_UNSIGNED_INT16 0x10DB
#define CL_UNSIGNED_INT32 0x10DC
#define CL_HALF_FLOAT 0x10DD
#define CL_FLOAT 0x10DE
#define CL_UNORM_INT24 0x10DF

// Memory-object kinds and queries
#define CL_MEM_OBJECT_BUFFER 0x10F0
#define CL_MEM_OBJECT_IMAGE2D 0x10F1
#define CL_MEM_OBJECT_IMAGE3D 0x10F2
#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3
#define CL_MEM_OBJECT_IMAGE1D 0x10F4
#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5
#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6

#define CL_MEM_TYPE 0x1100
#define CL_MEM_FLAGS 0x1101
#define CL_MEM_SIZE 0x1102
#define CL_MEM_HOST_PTR 0x1103
#define CL_MEM_MAP_COUNT 0x1104
#define CL_MEM_REFERENCE_COUNT 0x1105
#define CL_MEM_CONTEXT 0x1106
#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107
#define CL_MEM_OFFSET 0x1108

// Image queries
#define CL_IMAGE_FORMAT 0x1110
#define CL_IMAGE_ELEMENT_SIZE 0x1111
#define CL_IMAGE_ROW_PITCH 0x1112
#define CL_IMAGE_SLICE_PITCH 0x1113
#define CL_IMAGE_WIDTH 0x1114
#define CL_IMAGE_HEIGHT 0x1115
#define CL_IMAGE_DEPTH 0x1116
#define CL_IMAGE_ARRAY_SIZE 0x1117
#define CL_IMAGE_BUFFER 0x1118
#define CL_IMAGE_NUM_MIP_LEVELS 0x1119
#define CL_IMAGE_NUM_SAMPLES 0x111A

// Sampler addressing / filtering modes and queries
#define CL_ADDRESS_NONE 0x1130
#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
#define CL_ADDRESS_CLAMP 0x1132
#define CL_ADDRESS_REPEAT 0x1133
#define CL_ADDRESS_MIRRORED_REPEAT 0x1134

#define CL_FILTER_NEAREST 0x1140
#define CL_FILTER_LINEAR 0x1141

#define CL_SAMPLER_REFERENCE_COUNT 0x1150
#define CL_SAMPLER_CONTEXT 0x1151
#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
#define CL_SAMPLER_ADDRESSING_MODE 0x1153
#define CL_SAMPLER_FILTER_MODE 0x1154

// Map flags
#define CL_MAP_READ (1 << 0)
#define CL_MAP_WRITE (1 << 1)
#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)

// Program queries and build status
#define CL_PROGRAM_REFERENCE_COUNT 0x1160
#define CL_PROGRAM_CONTEXT 0x1161
#define CL_PROGRAM_NUM_DEVICES 0x1162
#define CL_PROGRAM_DEVICES 0x1163
#define CL_PROGRAM_SOURCE 0x1164
#define CL_PROGRAM_BINARY_SIZES 0x1165
#define CL_PROGRAM_BINARIES 0x1166
#define CL_PROGRAM_NUM_KERNELS 0x1167
#define CL_PROGRAM_KERNEL_NAMES 0x1168
#define CL_PROGRAM_BUILD_STATUS 0x1181
#define CL_PROGRAM_BUILD_OPTIONS 0x1182
#define CL_PROGRAM_BUILD_LOG 0x1183
#define CL_PROGRAM_BINARY_TYPE 0x1184
#define CL_PROGRAM_BINARY_TYPE_NONE 0x0
#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1
#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2
#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4

#define CL_BUILD_SUCCESS 0
#define CL_BUILD_NONE -1
#define CL_BUILD_ERROR -2
#define CL_BUILD_IN_PROGRESS -3

// Kernel queries
#define CL_KERNEL_FUNCTION_NAME 0x1190
#define CL_KERNEL_NUM_ARGS 0x1191
#define CL_KERNEL_REFERENCE_COUNT 0x1192
#define CL_KERNEL_CONTEXT 0x1193
#define CL_KERNEL_PROGRAM 0x1194
#define CL_KERNEL_ATTRIBUTES 0x1195
#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196
#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197
#define CL_KERNEL_ARG_TYPE_NAME 0x1198
#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199
#define CL_KERNEL_ARG_NAME 0x119A
#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B
#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C
#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D
#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E
#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0
#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1
#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2
#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3
#define CL_KERNEL_ARG_TYPE_NONE 0
#define CL_KERNEL_ARG_TYPE_CONST (1 << 0)
#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1)
#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2)
#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4
#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5

// Event queries, command types and execution status
#define CL_EVENT_COMMAND_QUEUE 0x11D0
#define CL_EVENT_COMMAND_TYPE 0x11D1
#define CL_EVENT_REFERENCE_COUNT 0x11D2
#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
#define CL_EVENT_CONTEXT 0x11D4

#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
#define CL_COMMAND_TASK 0x11F1
#define CL_COMMAND_NATIVE_KERNEL 0x11F2
#define CL_COMMAND_READ_BUFFER 0x11F3
#define CL_COMMAND_WRITE_BUFFER 0x11F4
#define CL_COMMAND_COPY_BUFFER 0x11F5
#define CL_COMMAND_READ_IMAGE 0x11F6
#define CL_COMMAND_WRITE_IMAGE 0x11F7
#define CL_COMMAND_COPY_IMAGE 0x11F8
#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
#define CL_COMMAND_MAP_BUFFER 0x11FB
#define CL_COMMAND_MAP_IMAGE 0x11FC
#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
#define CL_COMMAND_MARKER 0x11FE
#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
#define CL_COMMAND_READ_BUFFER_RECT 0x1201
#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202
#define CL_COMMAND_COPY_BUFFER_RECT 0x1203
#define CL_COMMAND_USER 0x1204
#define CL_COMMAND_BARRIER 0x1205
#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206
#define CL_COMMAND_FILL_BUFFER 0x1207
#define CL_COMMAND_FILL_IMAGE 0x1208

#define CL_COMPLETE 0x0
#define CL_RUNNING 0x1
#define CL_SUBMITTED 0x2
#define CL_QUEUED 0x3

#define CL_BUFFER_CREATE_TYPE_REGION 0x1220

// Profiling queries
#define CL_PROFILING_COMMAND_QUEUED 0x1280
#define CL_PROFILING_COMMAND_SUBMIT 0x1281
#define CL_PROFILING_COMMAND_START 0x1282
#define CL_PROFILING_COMMAND_END 0x1283

// Calling convention used for OpenCL callback function pointers.
#define CL_CALLBACK CV_STDCALL

// Starts out false; the initOpenCLAndLoad() variants in this file assign it
// when they first attempt to load the OpenCL runtime library.
static volatile bool g_haveOpenCL = false;
static const char* oclFuncToCheck = "clEnqueueReadBufferRect"; #if defined(__APPLE__) #include static void* initOpenCLAndLoad(const char* funcname) { static bool initialized = false; static void* handle = 0; if (!handle) { if(!initialized) { const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME"); oclpath = oclpath && strlen(oclpath) > 0 ? oclpath : "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL"; handle = dlopen(oclpath, RTLD_LAZY); initialized = true; g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0; if( g_haveOpenCL ) fprintf(stderr, "Succesffuly loaded OpenCL v1.1+ runtime from %s\n", oclpath); else fprintf(stderr, "Failed to load OpenCL runtime\n"); } if(!handle) return 0; } return funcname && handle ? dlsym(handle, funcname) : 0; } #elif defined WIN32 || defined _WIN32 #ifndef _WIN32_WINNT // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?) #define _WIN32_WINNT 0x0400 // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx #endif #include #if (_WIN32_WINNT >= 0x0602) #include #endif #undef small #undef min #undef max #undef abs static void* initOpenCLAndLoad(const char* funcname) { static bool initialized = false; static HMODULE handle = 0; if (!handle) { if(!initialized) { handle = LoadLibraryA("OpenCL.dll"); initialized = true; g_haveOpenCL = handle != 0 && GetProcAddress(handle, oclFuncToCheck) != 0; } if(!handle) return 0; } return funcname ? (void*)GetProcAddress(handle, funcname) : 0; } #elif defined(__linux) #include #include static void* initOpenCLAndLoad(const char* funcname) { static bool initialized = false; static void* handle = 0; if (!handle) { if(!initialized) { handle = dlopen("libOpenCL.so", RTLD_LAZY); if(!handle) handle = dlopen("libCL.so", RTLD_LAZY); initialized = true; g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0; } if(!handle) return 0; } return funcname ? 
(void*)dlsym(handle, funcname) : 0; } #else static void* initOpenCLAndLoad(const char*) { return 0; } #endif #define OCL_FUNC(rettype, funcname, argsdecl, args) \ typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \ static rettype funcname argsdecl \ { \ static funcname##_t funcname##_p = 0; \ if( !funcname##_p ) \ { \ funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \ if( !funcname##_p ) \ return OPENCV_CL_NOT_IMPLEMENTED; \ } \ return funcname##_p args; \ } #define OCL_FUNC_P(rettype, funcname, argsdecl, args) \ typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \ static rettype funcname argsdecl \ { \ static funcname##_t funcname##_p = 0; \ if( !funcname##_p ) \ { \ funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \ if( !funcname##_p ) \ { \ if( errcode_ret ) \ *errcode_ret = OPENCV_CL_NOT_IMPLEMENTED; \ return 0; \ } \ } \ return funcname##_p args; \ } OCL_FUNC(cl_int, clGetPlatformIDs, (cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms), (num_entries, platforms, num_platforms)) OCL_FUNC(cl_int, clGetPlatformInfo, (cl_platform_id platform, cl_platform_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret), (platform, param_name, param_value_size, param_value, param_value_size_ret)) OCL_FUNC(cl_int, clGetDeviceInfo, (cl_device_id device, cl_device_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret), (device, param_name, param_value_size, param_value, param_value_size_ret)) OCL_FUNC(cl_int, clGetDeviceIDs, (cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, cl_device_id * devices, cl_uint * num_devices), (platform, device_type, num_entries, devices, num_devices)) OCL_FUNC_P(cl_context, clCreateContext, (const cl_context_properties * properties, cl_uint num_devices, const cl_device_id * devices, void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *), void * user_data, cl_int * errcode_ret), 
(properties, num_devices, devices, pfn_notify, user_data, errcode_ret)) OCL_FUNC(cl_int, clReleaseContext, (cl_context context), (context)) /* OCL_FUNC(cl_int, clRetainContext, (cl_context context), (context)) OCL_FUNC_P(cl_context, clCreateContextFromType, (const cl_context_properties * properties, cl_device_type device_type, void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *), void * user_data, cl_int * errcode_ret), (properties, device_type, pfn_notify, user_data, errcode_ret)) OCL_FUNC(cl_int, clGetContextInfo, (cl_context context, cl_context_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret), (context, param_name, param_value_size, param_value, param_value_size_ret)) */ OCL_FUNC_P(cl_command_queue, clCreateCommandQueue, (cl_context context, cl_device_id device, cl_command_queue_properties properties, cl_int * errcode_ret), (context, device, properties, errcode_ret)) OCL_FUNC(cl_int, clReleaseCommandQueue, (cl_command_queue command_queue), (command_queue)) OCL_FUNC_P(cl_mem, clCreateBuffer, (cl_context context, cl_mem_flags flags, size_t size, void * host_ptr, cl_int * errcode_ret), (context, flags, size, host_ptr, errcode_ret)) /* OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue)) OCL_FUNC(cl_int, clGetCommandQueueInfo, (cl_command_queue command_queue, cl_command_queue_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret), (command_queue, param_name, param_value_size, param_value, param_value_size_ret)) OCL_FUNC_P(cl_mem, clCreateSubBuffer, (cl_mem buffer, cl_mem_flags flags, cl_buffer_create_type buffer_create_type, const void * buffer_create_info, cl_int * errcode_ret), (buffer, flags, buffer_create_type, buffer_create_info, errcode_ret)) */ OCL_FUNC_P(cl_mem, clCreateImage, (cl_context context, cl_mem_flags flags, const cl_image_format * image_format, const cl_image_desc * image_desc, void * host_ptr, cl_int * 
errcode_ret), (context, flags, image_format, image_desc, host_ptr, errcode_ret)) OCL_FUNC_P(cl_mem, clCreateImage2D, (cl_context context, cl_mem_flags flags, const cl_image_format * image_format, size_t image_width, size_t image_height, size_t image_row_pitch, void * host_ptr, cl_int *errcode_ret), (context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret)) /* OCL_FUNC(cl_int, clGetSupportedImageFormats, (cl_context context, cl_mem_flags flags, cl_mem_object_type image_type, cl_uint num_entries, cl_image_format * image_formats, cl_uint * num_image_formats), (context, flags, image_type, num_entries, image_formats, num_image_formats)) OCL_FUNC(cl_int, clGetMemObjectInfo, (cl_mem memobj, cl_mem_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret), (memobj, param_name, param_value_size, param_value, param_value_size_ret)) OCL_FUNC(cl_int, clGetImageInfo, (cl_mem image, cl_image_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret), (image, param_name, param_value_size, param_value, param_value_size_ret)) OCL_FUNC(cl_int, clCreateKernelsInProgram, (cl_program program, cl_uint num_kernels, cl_kernel * kernels, cl_uint * num_kernels_ret), (program, num_kernels, kernels, num_kernels_ret)) OCL_FUNC(cl_int, clRetainKernel, (cl_kernel kernel), (kernel)) OCL_FUNC(cl_int, clGetKernelArgInfo, (cl_kernel kernel, cl_uint arg_indx, cl_kernel_arg_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret), (kernel, arg_indx, param_name, param_value_size, param_value, param_value_size_ret)) OCL_FUNC(cl_int, clEnqueueReadImage, (cl_command_queue command_queue, cl_mem image, cl_bool blocking_read, const size_t * origin[3], const size_t * region[3], size_t row_pitch, size_t slice_pitch, void * ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, image, blocking_read, origin, region, 
row_pitch, slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event)) OCL_FUNC(cl_int, clEnqueueWriteImage, (cl_command_queue command_queue, cl_mem image, cl_bool blocking_write, const size_t * origin[3], const size_t * region[3], size_t input_row_pitch, size_t input_slice_pitch, const void * ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, image, blocking_write, origin, region, input_row_pitch, input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event)) OCL_FUNC(cl_int, clEnqueueFillImage, (cl_command_queue command_queue, cl_mem image, const void * fill_color, const size_t * origin[3], const size_t * region[3], cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, image, fill_color, origin, region, num_events_in_wait_list, event_wait_list, event)) OCL_FUNC(cl_int, clEnqueueCopyImage, (cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image, const size_t * src_origin[3], const size_t * dst_origin[3], const size_t * region[3], cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, src_image, dst_image, src_origin, dst_origin, region, num_events_in_wait_list, event_wait_list, event)) OCL_FUNC(cl_int, clEnqueueCopyImageToBuffer, (cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer, const size_t * src_origin[3], const size_t * region[3], size_t dst_offset, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, src_image, dst_buffer, src_origin, region, dst_offset, num_events_in_wait_list, event_wait_list, event)) */ OCL_FUNC(cl_int, clEnqueueCopyBufferToImage, (cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image, size_t src_offset, const size_t dst_origin[3], const size_t region[3], cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, src_buffer, dst_image, 
src_offset, dst_origin, region, num_events_in_wait_list, event_wait_list, event)) OCL_FUNC(cl_int, clFlush, (cl_command_queue command_queue), (command_queue)) /* OCL_FUNC_P(void*, clEnqueueMapImage, (cl_command_queue command_queue, cl_mem image, cl_bool blocking_map, cl_map_flags map_flags, const size_t * origin[3], const size_t * region[3], size_t * image_row_pitch, size_t * image_slice_pitch, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event, cl_int * errcode_ret), (command_queue, image, blocking_map, map_flags, origin, region, image_row_pitch, image_slice_pitch, num_events_in_wait_list, event_wait_list, event, errcode_ret)) */ /* OCL_FUNC(cl_int, clRetainProgram, (cl_program program), (program)) OCL_FUNC(cl_int, clGetKernelInfo, (cl_kernel kernel, cl_kernel_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret), (kernel, param_name, param_value_size, param_value, param_value_size_ret)) OCL_FUNC(cl_int, clRetainMemObject, (cl_mem memobj), (memobj)) */ OCL_FUNC(cl_int, clReleaseMemObject, (cl_mem memobj), (memobj)) OCL_FUNC_P(cl_program, clCreateProgramWithSource, (cl_context context, cl_uint count, const char ** strings, const size_t * lengths, cl_int * errcode_ret), (context, count, strings, lengths, errcode_ret)) OCL_FUNC_P(cl_program, clCreateProgramWithBinary, (cl_context context, cl_uint num_devices, const cl_device_id * device_list, const size_t * lengths, const unsigned char ** binaries, cl_int * binary_status, cl_int * errcode_ret), (context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret)) OCL_FUNC(cl_int, clReleaseProgram, (cl_program program), (program)) OCL_FUNC(cl_int, clBuildProgram, (cl_program program, cl_uint num_devices, const cl_device_id * device_list, const char * options, void (CL_CALLBACK * pfn_notify)(cl_program, void *), void * user_data), (program, num_devices, device_list, options, pfn_notify, user_data)) OCL_FUNC(cl_int, clGetProgramInfo, 
(cl_program program, cl_program_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret), (program, param_name, param_value_size, param_value, param_value_size_ret)) OCL_FUNC(cl_int, clGetProgramBuildInfo, (cl_program program, cl_device_id device, cl_program_build_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret), (program, device, param_name, param_value_size, param_value, param_value_size_ret)) OCL_FUNC_P(cl_kernel, clCreateKernel, (cl_program program, const char * kernel_name, cl_int * errcode_ret), (program, kernel_name, errcode_ret)) OCL_FUNC(cl_int, clReleaseKernel, (cl_kernel kernel), (kernel)) OCL_FUNC(cl_int, clSetKernelArg, (cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void * arg_value), (kernel, arg_index, arg_size, arg_value)) OCL_FUNC(cl_int, clGetKernelWorkGroupInfo, (cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret), (kernel, device, param_name, param_value_size, param_value, param_value_size_ret)) OCL_FUNC(cl_int, clFinish, (cl_command_queue command_queue), (command_queue)) OCL_FUNC(cl_int, clEnqueueReadBuffer, (cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, size_t offset, size_t size, void * ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event)) OCL_FUNC(cl_int, clEnqueueReadBufferRect, (cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, const size_t * buffer_offset, const size_t * host_offset, const size_t * region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void * ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, buffer, blocking_read, buffer_offset, 
host_offset, region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event)) OCL_FUNC(cl_int, clEnqueueWriteBuffer, (cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, size_t offset, size_t size, const void * ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event)) OCL_FUNC(cl_int, clEnqueueWriteBufferRect, (cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, const size_t * buffer_offset, const size_t * host_offset, const size_t * region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, const void * ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, buffer, blocking_write, buffer_offset, host_offset, region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event)) /*OCL_FUNC(cl_int, clEnqueueFillBuffer, (cl_command_queue command_queue, cl_mem buffer, const void * pattern, size_t pattern_size, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, buffer, pattern, pattern_size, offset, size, num_events_in_wait_list, event_wait_list, event))*/ OCL_FUNC(cl_int, clEnqueueCopyBuffer, (cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, src_buffer, dst_buffer, src_offset, dst_offset, size, num_events_in_wait_list, event_wait_list, event)) OCL_FUNC(cl_int, clEnqueueCopyBufferRect, (cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, const size_t * src_origin, const size_t * dst_origin, const 
size_t * region, size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, size_t dst_slice_pitch, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region, src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch, num_events_in_wait_list, event_wait_list, event)) OCL_FUNC_P(void*, clEnqueueMapBuffer, (cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, cl_map_flags map_flags, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event, cl_int * errcode_ret), (command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret)) OCL_FUNC(cl_int, clEnqueueUnmapMemObject, (cl_command_queue command_queue, cl_mem memobj, void * mapped_ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event)) OCL_FUNC(cl_int, clEnqueueNDRangeKernel, (cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t * global_work_offset, const size_t * global_work_size, const size_t * local_work_size, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event)) OCL_FUNC(cl_int, clEnqueueTask, (cl_command_queue command_queue, cl_kernel kernel, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event), (command_queue, kernel, num_events_in_wait_list, event_wait_list, event)) OCL_FUNC(cl_int, clSetEventCallback, (cl_event event, cl_int command_exec_callback_type , void (CL_CALLBACK *pfn_event_notify) (cl_event event, cl_int event_command_exec_status, void *user_data), void *user_data), (event, command_exec_callback_type, 
pfn_event_notify, user_data)) OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event)) } #endif #ifndef CL_VERSION_1_2 #define CL_VERSION_1_2 #endif #endif namespace cv { namespace ocl { struct UMat2D { UMat2D(const UMat& m) { offset = (int)m.offset; step = (int)m.step; rows = m.rows; cols = m.cols; } int offset; int step; int rows; int cols; }; struct UMat3D { UMat3D(const UMat& m) { offset = (int)m.offset; step = (int)m.step.p[1]; slicestep = (int)m.step.p[0]; slices = (int)m.size.p[0]; rows = m.size.p[1]; cols = m.size.p[2]; } int offset; int slicestep; int step; int slices; int rows; int cols; }; // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 ) { static uint64 table[256]; static bool initialized = false; if( !initialized ) { for( int i = 0; i < 256; i++ ) { uint64 c = i; for( int j = 0; j < 8; j++ ) c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1); table[i] = c; } initialized = true; } uint64 crc = ~crc0; for( size_t idx = 0; idx < size; idx++ ) crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8); return ~crc; } struct HashKey { typedef uint64 part; HashKey(part _a, part _b) : a(_a), b(_b) {} part a, b; }; inline bool operator == (const HashKey& h1, const HashKey& h2) { return h1.a == h2.a && h1.b == h2.b; } inline bool operator < (const HashKey& h1, const HashKey& h2) { return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b); } static bool g_isOpenCLInitialized = false; static bool g_isOpenCLAvailable = false; bool haveOpenCL() { if (!g_isOpenCLInitialized) { try { cl_uint n = 0; g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS; } catch (...) 
{ g_isOpenCLAvailable = false; } g_isOpenCLInitialized = true; } return g_isOpenCLAvailable; } bool useOpenCL() { CoreTLSData* data = coreTlsData.get(); if( data->useOpenCL < 0 ) data->useOpenCL = (int)haveOpenCL(); return data->useOpenCL > 0; } void setUseOpenCL(bool flag) { if( haveOpenCL() ) { CoreTLSData* data = coreTlsData.get(); data->useOpenCL = flag ? 1 : 0; } } #ifdef HAVE_CLAMDBLAS class AmdBlasHelper { public: static AmdBlasHelper & getInstance() { static AmdBlasHelper amdBlas; return amdBlas; } bool isAvailable() const { return g_isAmdBlasAvailable; } ~AmdBlasHelper() { try { clAmdBlasTeardown(); } catch (...) { } } protected: AmdBlasHelper() { if (!g_isAmdBlasInitialized) { AutoLock lock(m); if (!g_isAmdBlasInitialized && haveOpenCL()) { try { g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess; } catch (...) { g_isAmdBlasAvailable = false; } } else g_isAmdBlasAvailable = false; g_isAmdBlasInitialized = true; } } private: static Mutex m; static bool g_isAmdBlasInitialized; static bool g_isAmdBlasAvailable; }; bool AmdBlasHelper::g_isAmdBlasAvailable = false; bool AmdBlasHelper::g_isAmdBlasInitialized = false; Mutex AmdBlasHelper::m; bool haveAmdBlas() { return AmdBlasHelper::getInstance().isAvailable(); } #else bool haveAmdBlas() { return false; } #endif #ifdef HAVE_CLAMDFFT class AmdFftHelper { public: static AmdFftHelper & getInstance() { static AmdFftHelper amdFft; return amdFft; } bool isAvailable() const { return g_isAmdFftAvailable; } ~AmdFftHelper() { try { // clAmdFftTeardown(); } catch (...) 
{ } } protected: AmdFftHelper() { if (!g_isAmdFftInitialized) { AutoLock lock(m); if (!g_isAmdFftInitialized && haveOpenCL()) { try { CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS); g_isAmdFftAvailable = true; } catch (const Exception &) { g_isAmdFftAvailable = false; } } else g_isAmdFftAvailable = false; g_isAmdFftInitialized = true; } } private: static clAmdFftSetupData setupData; static Mutex m; static bool g_isAmdFftInitialized; static bool g_isAmdFftAvailable; }; clAmdFftSetupData AmdFftHelper::setupData; bool AmdFftHelper::g_isAmdFftAvailable = false; bool AmdFftHelper::g_isAmdFftInitialized = false; Mutex AmdFftHelper::m; bool haveAmdFft() { return AmdFftHelper::getInstance().isAvailable(); } #else bool haveAmdFft() { return false; } #endif void finish2() { Queue::getDefault().finish(); } #define IMPLEMENT_REFCOUNTABLE() \ void addref() { CV_XADD(&refcount, 1); } \ void release() { if( CV_XADD(&refcount, -1) == 1 ) delete this; } \ int refcount struct Platform::Impl { Impl() { refcount = 1; handle = 0; initialized = false; } ~Impl() {} void init() { if( !initialized ) { //cl_uint num_entries cl_uint n = 0; if( clGetPlatformIDs(1, &handle, &n) < 0 || n == 0 ) handle = 0; if( handle != 0 ) { char buf[1000]; size_t len = 0; clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len); buf[len] = '\0'; vendor = String(buf); } initialized = true; } } IMPLEMENT_REFCOUNTABLE(); cl_platform_id handle; String vendor; bool initialized; }; Platform::Platform() { p = 0; } Platform::~Platform() { if(p) p->release(); } Platform::Platform(const Platform& pl) { p = (Impl*)pl.p; if(p) p->addref(); } Platform& Platform::operator = (const Platform& pl) { Impl* newp = (Impl*)pl.p; if(newp) newp->addref(); if(p) p->release(); p = newp; return *this; } void* Platform::ptr() const { return p ? 
p->handle : 0; } Platform& Platform::getDefault() { static Platform p; if( !p.p ) { p.p = new Impl; p.p->init(); } return p; } /////////////////////////////////////////////////////////////////////////////////// struct Device::Impl { Impl(void* d) { handle = (cl_device_id)d; refcount = 1; } template _TpOut getProp(cl_device_info prop) const { _TpCL temp=_TpCL(); size_t sz = 0; return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) >= 0 && sz == sizeof(temp) ? _TpOut(temp) : _TpOut(); } bool getBoolProp(cl_device_info prop) const { cl_bool temp = CL_FALSE; size_t sz = 0; return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) >= 0 && sz == sizeof(temp) ? temp != 0 : false; } String getStrProp(cl_device_info prop) const { char buf[1024]; size_t sz=0; return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) >= 0 && sz < sizeof(buf) ? String(buf) : String(); } IMPLEMENT_REFCOUNTABLE(); cl_device_id handle; }; Device::Device() { p = 0; } Device::Device(void* d) { p = 0; set(d); } Device::Device(const Device& d) { p = d.p; if(p) p->addref(); } Device& Device::operator = (const Device& d) { Impl* newp = (Impl*)d.p; if(newp) newp->addref(); if(p) p->release(); p = newp; return *this; } Device::~Device() { if(p) p->release(); } void Device::set(void* d) { if(p) p->release(); p = new Impl(d); } void* Device::ptr() const { return p ? p->handle : 0; } String Device::name() const { return p ? p->getStrProp(CL_DEVICE_NAME) : String(); } String Device::extensions() const { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); } String Device::version() const { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); } String Device::vendor() const { return p ? p->getStrProp(CL_DEVICE_VENDOR) : String(); } String Device::OpenCL_C_Version() const { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); } String Device::OpenCLVersion() const { return p ? 
p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); } String Device::deviceVersion() const { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); } String Device::driverVersion() const { return p ? p->getStrProp(CL_DRIVER_VERSION) : String(); } int Device::type() const { return p ? p->getProp(CL_DEVICE_TYPE) : 0; } int Device::addressBits() const { return p ? p->getProp(CL_DEVICE_ADDRESS_BITS) : 0; } bool Device::available() const { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; } bool Device::compilerAvailable() const { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; } bool Device::linkerAvailable() const #ifdef CL_VERSION_1_2 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; } #else { CV_REQUIRE_OPENCL_1_2_ERROR; } #endif int Device::doubleFPConfig() const { return p ? p->getProp(CL_DEVICE_DOUBLE_FP_CONFIG) : 0; } int Device::singleFPConfig() const { return p ? p->getProp(CL_DEVICE_SINGLE_FP_CONFIG) : 0; } int Device::halfFPConfig() const #ifdef CL_VERSION_1_2 { return p ? p->getProp(CL_DEVICE_HALF_FP_CONFIG) : 0; } #else { CV_REQUIRE_OPENCL_1_2_ERROR; } #endif bool Device::endianLittle() const { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; } bool Device::errorCorrectionSupport() const { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; } int Device::executionCapabilities() const { return p ? p->getProp(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; } size_t Device::globalMemCacheSize() const { return p ? p->getProp(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; } int Device::globalMemCacheType() const { return p ? p->getProp(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; } int Device::globalMemCacheLineSize() const { return p ? p->getProp(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; } size_t Device::globalMemSize() const { return p ? p->getProp(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; } size_t Device::localMemSize() const { return p ? p->getProp(CL_DEVICE_LOCAL_MEM_SIZE) : 0; } int Device::localMemType() const { return p ? 
p->getProp(CL_DEVICE_LOCAL_MEM_TYPE) : 0; } bool Device::hostUnifiedMemory() const { return p ? p->getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY) : false; } bool Device::imageSupport() const { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; } size_t Device::image2DMaxWidth() const { return p ? p->getProp(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; } size_t Device::image2DMaxHeight() const { return p ? p->getProp(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; } size_t Device::image3DMaxWidth() const { return p ? p->getProp(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; } size_t Device::image3DMaxHeight() const { return p ? p->getProp(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; } size_t Device::image3DMaxDepth() const { return p ? p->getProp(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; } size_t Device::imageMaxBufferSize() const #ifdef CL_VERSION_1_2 { return p ? p->getProp(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; } #else { CV_REQUIRE_OPENCL_1_2_ERROR; } #endif size_t Device::imageMaxArraySize() const #ifdef CL_VERSION_1_2 { return p ? p->getProp(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; } #else { CV_REQUIRE_OPENCL_1_2_ERROR; } #endif int Device::maxClockFrequency() const { return p ? p->getProp(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; } int Device::maxComputeUnits() const { return p ? p->getProp(CL_DEVICE_MAX_COMPUTE_UNITS) : 0; } int Device::maxConstantArgs() const { return p ? p->getProp(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; } size_t Device::maxConstantBufferSize() const { return p ? p->getProp(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; } size_t Device::maxMemAllocSize() const { return p ? p->getProp(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; } size_t Device::maxParameterSize() const { return p ? p->getProp(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; } int Device::maxReadImageArgs() const { return p ? p->getProp(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; } int Device::maxWriteImageArgs() const { return p ? p->getProp(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; } int Device::maxSamplers() const { return p ? 
p->getProp(CL_DEVICE_MAX_SAMPLERS) : 0; } size_t Device::maxWorkGroupSize() const { return p ? p->getProp(CL_DEVICE_MAX_WORK_GROUP_SIZE) : 0; } int Device::maxWorkItemDims() const { return p ? p->getProp(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; } void Device::maxWorkItemSizes(size_t* sizes) const { if(p) { const int MAX_DIMS = 32; size_t retsz = 0; clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES, MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz); } } int Device::memBaseAddrAlign() const { return p ? p->getProp(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; } int Device::nativeVectorWidthChar() const { return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; } int Device::nativeVectorWidthShort() const { return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; } int Device::nativeVectorWidthInt() const { return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; } int Device::nativeVectorWidthLong() const { return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; } int Device::nativeVectorWidthFloat() const { return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; } int Device::nativeVectorWidthDouble() const { return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; } int Device::nativeVectorWidthHalf() const { return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; } int Device::preferredVectorWidthChar() const { return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; } int Device::preferredVectorWidthShort() const { return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; } int Device::preferredVectorWidthInt() const { return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; } int Device::preferredVectorWidthLong() const { return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; } int Device::preferredVectorWidthFloat() const { return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; } int Device::preferredVectorWidthDouble() const { return p ? 
p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; } int Device::preferredVectorWidthHalf() const { return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; } size_t Device::printfBufferSize() const #ifdef CL_VERSION_1_2 { return p ? p->getProp(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; } #else { CV_REQUIRE_OPENCL_1_2_ERROR; } #endif size_t Device::profilingTimerResolution() const { return p ? p->getProp(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; } const Device& Device::getDefault() { const Context2& ctx = Context2::getDefault(); int idx = coreTlsData.get()->device; return ctx.device(idx); } ///////////////////////////////////////////////////////////////////////////////////////// template inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param) { ::size_t required; cl_int err = f(obj, name, 0, NULL, &required); if (err != CL_SUCCESS) return err; param.clear(); if (required > 0) { AutoBuffer buf(required + 1); char* ptr = (char*)buf; // cleanup is not needed err = f(obj, name, required, ptr, NULL); if (err != CL_SUCCESS) return err; param = ptr; } return CL_SUCCESS; } static void split(const std::string &s, char delim, std::vector &elems) { elems.clear(); if (s.size() == 0) return; std::istringstream ss(s); std::string item; while (!ss.eof()) { std::getline(ss, item, delim); elems.push_back(item); } } // Layout: :: // Sample: AMD:GPU: // Sample: AMD:GPU:Tahiti // Sample: :GPU|CPU: = '' = ':' = '::' static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr, std::string& platform, std::vector& deviceTypes, std::string& deviceNameOrID) { std::vector parts; split(configurationStr, ':', parts); if (parts.size() > 3) { std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl; return false; } if (parts.size() > 2) deviceNameOrID = parts[2]; if (parts.size() > 1) { split(parts[1], '|', deviceTypes); } if (parts.size() > 0) { platform = parts[0]; } return true; } static cl_device_id 
selectOpenCLDevice()
{
    // Picks an OpenCL device according to the OPENCV_OPENCL_DEVICE
    // environment variable (platform : device types : device name or ID).
    // Without a configuration, GPU devices are tried first, then CPU devices.
    // Returns NULL when nothing matches or the configuration is invalid.
    std::string platform;
    std::vector deviceTypes;
    std::string deviceName;
    const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
    if (configuration)
    {
        if (!parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName))
            return NULL;
    }

    bool isID = false;
    int deviceID = -1;
    if (deviceName.length() == 1)
    // We limit ID range to 0..9, because we want to write:
    // - '2500' to mean i5-2500
    // - '8350' to mean AMD FX-8350
    // - '650' to mean GeForce 650
    // To extend ID range change condition to '> 0'
    {
        isID = true;
        for (size_t i = 0; i < deviceName.length(); i++)
        {
            // isdigit() requires a value representable as unsigned char;
            // a plain char from the environment may be negative.
            if (!isdigit((unsigned char)deviceName[i]))
            {
                isID = false;
                break;
            }
        }
        if (isID)
        {
            deviceID = atoi(deviceName.c_str());
            CV_Assert(deviceID >= 0);
        }
    }

    cl_int status = CL_SUCCESS;
    std::vector platforms;
    {
        // Enumerate all platforms: first the count, then the handles.
        cl_uint numPlatforms = 0;
        status = clGetPlatformIDs(0, NULL, &numPlatforms);
        CV_Assert(status == CL_SUCCESS);
        if (numPlatforms == 0)
            return NULL;
        platforms.resize((size_t)numPlatforms);
        status = clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms);
        CV_Assert(status == CL_SUCCESS);
        platforms.resize(numPlatforms);
    }

    int selectedPlatform = -1;
    if (platform.length() > 0)
    {
        // The requested platform matches by substring of CL_PLATFORM_NAME.
        for (size_t i = 0; i < platforms.size(); i++)
        {
            std::string name;
            status = getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name);
            CV_Assert(status == CL_SUCCESS);
            if (name.find(platform) != std::string::npos)
            {
                selectedPlatform = (int)i;
                break;
            }
        }
        if (selectedPlatform == -1)
        {
            std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
            goto not_found;
        }
    }

    if (deviceTypes.size() == 0)
    {
        if (!isID)
        {
            deviceTypes.push_back("GPU");
            deviceTypes.push_back("CPU");
        }
        else
        {
            deviceTypes.push_back("ALL");
        }
    }
    for (size_t t = 0; t < deviceTypes.size(); t++)
    {
        int deviceType = 0;
        if (deviceTypes[t] == "GPU")
        {
            deviceType = Device::TYPE_GPU;
        }
        else if (deviceTypes[t] == "CPU")
        {
            deviceType = Device::TYPE_CPU;
        }
        else if (deviceTypes[t] == "ACCELERATOR")
        {
            deviceType = Device::TYPE_ACCELERATOR;
        }
        else if (deviceTypes[t] == "ALL")
        {
            deviceType = Device::TYPE_ALL;
        }
        else
        {
            std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
            goto not_found;
        }

        std::vector devices; // TODO Use clReleaseDevice to cleanup
        // Collect devices of this type from the selected platform, or from
        // every platform when none was requested.
        for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
                (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
                i++)
        {
            cl_uint count = 0;
            status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
            CV_Assert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
            if (count == 0)
                continue;
            size_t base = devices.size();
            devices.resize(base + count);
            status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
            CV_Assert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
        }

        // With an ID only that index is examined; otherwise the first device
        // whose name contains deviceName wins.
        for (size_t i = (isID ? deviceID : 0);
             (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
             i++)
        {
            std::string name;
            status = getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name);
            CV_Assert(status == CL_SUCCESS);
            if (isID || name.find(deviceName) != std::string::npos)
            {
                // TODO check for OpenCL 1.1
                return devices[i];
            }
        }
    }
not_found:
    std::cerr << "ERROR: Required OpenCL device not found, check configuration: " << (configuration == NULL ? "" : configuration) << std::endl
            << " Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
            << " Device types: ";
    for (size_t t = 0; t < deviceTypes.size(); t++)
    {
        std::cerr << deviceTypes[t] << " ";
    }
    std::cerr << std::endl << " Device name: " << (deviceName.length() == 0 ?
"any" : deviceName) << std::endl; return NULL; } struct Context2::Impl { Impl() { refcount = 1; handle = 0; } void setDefault() { CV_Assert(handle == NULL); cl_device_id d = selectOpenCLDevice(); if (d == NULL) return; cl_platform_id pl = NULL; cl_int status = clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL); CV_Assert(status == CL_SUCCESS); cl_context_properties prop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)pl, 0 }; // !!! in the current implementation force the number of devices to 1 !!! int nd = 1; handle = clCreateContext(prop, nd, &d, 0, 0, &status); CV_Assert(status == CL_SUCCESS); bool ok = handle != 0 && status >= 0; if( ok ) { devices.resize(nd); devices[0].set(d); } else { handle = NULL; } } Impl(int dtype0) { refcount = 1; handle = 0; cl_int retval = 0; cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr(); cl_context_properties prop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)pl, 0 }; cl_uint i, nd0 = 0, nd = 0; int dtype = dtype0 & 15; clGetDeviceIDs( pl, dtype, 0, 0, &nd0 ); if(retval < 0) return; AutoBuffer dlistbuf(nd0*2+1); cl_device_id* dlist = (cl_device_id*)(void**)dlistbuf; cl_device_id* dlist_new = dlist + nd0; clGetDeviceIDs( pl, dtype, nd0, dlist, &nd0 ); String name0; for(i = 0; i < nd0; i++) { Device d(dlist[i]); if( !d.available() || !d.compilerAvailable() ) continue; if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() ) continue; if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() ) continue; String name = d.name(); if( nd != 0 && name != name0 ) continue; name0 = name; dlist_new[nd++] = dlist[i]; } if(nd == 0) return; // !!! in the current implementation force the number of devices to 1 !!! 
nd = 1; handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval); bool ok = handle != 0 && retval >= 0; if( ok ) { devices.resize(nd); for( i = 0; i < nd; i++ ) devices[i].set(dlist_new[i]); } } ~Impl() { if(handle) clReleaseContext(handle); devices.clear(); } Program getProg(const ProgramSource2& src, const String& buildflags, String& errmsg) { String prefix = Program::getPrefix(buildflags); HashKey k(src.hash(), crc64((const uchar*)prefix.c_str(), prefix.size())); phash_t::iterator it = phash.find(k); if( it != phash.end() ) return it->second; //String filename = format("%08x%08x_%08x%08x.clb2", Program prog(src, buildflags, errmsg); if(prog.ptr()) phash.insert(std::pair(k, prog)); return prog; } IMPLEMENT_REFCOUNTABLE(); cl_context handle; std::vector devices; typedef ProgramSource2::hash_t hash_t; struct HashKey { HashKey(hash_t _a, hash_t _b) : a(_a), b(_b) {} bool operator < (const HashKey& k) const { return a < k.a || (a == k.a && b < k.b); } bool operator == (const HashKey& k) const { return a == k.a && b == k.b; } bool operator != (const HashKey& k) const { return a != k.a || b != k.b; } hash_t a, b; }; typedef std::map phash_t; phash_t phash; }; Context2::Context2() { p = 0; } Context2::Context2(int dtype) { p = 0; create(dtype); } bool Context2::create() { if( !haveOpenCL() ) return false; if(p) p->release(); p = new Impl(); if(!p->handle) { delete p; p = 0; } return p != 0; } bool Context2::create(int dtype0) { if( !haveOpenCL() ) return false; if(p) p->release(); p = new Impl(dtype0); if(!p->handle) { delete p; p = 0; } return p != 0; } Context2::~Context2() { if (p) { p->release(); p = NULL; } } Context2::Context2(const Context2& c) { p = (Impl*)c.p; if(p) p->addref(); } Context2& Context2::operator = (const Context2& c) { Impl* newp = (Impl*)c.p; if(newp) newp->addref(); if(p) p->release(); p = newp; return *this; } void* Context2::ptr() const { return p == NULL ? NULL : p->handle; } size_t Context2::ndevices() const { return p ? 
p->devices.size() : 0; } const Device& Context2::device(size_t idx) const { static Device dummy; return !p || idx >= p->devices.size() ? dummy : p->devices[idx]; } Context2& Context2::getDefault(bool initialize) { static Context2 ctx; if(!ctx.p && haveOpenCL()) { if (!ctx.p) ctx.p = new Impl(); if (initialize) { // do not create new Context2 right away. // First, try to retrieve existing context of the same type. // In its turn, Platform::getContext() may call Context2::create() // if there is no such context. if (ctx.p->handle == NULL) ctx.p->setDefault(); } } return ctx; } Program Context2::getProg(const ProgramSource2& prog, const String& buildopts, String& errmsg) { return p ? p->getProg(prog, buildopts, errmsg) : Program(); } void initializeContextFromHandle(Context2& ctx, void* platform, void* _context, void* _device) { cl_context context = (cl_context)_context; cl_device_id device = (cl_device_id)_device; // cleanup old context Context2::Impl* impl = ctx._getImpl(); if (impl->handle) { cl_int status = clReleaseContext(impl->handle); (void)status; } impl->devices.clear(); impl->handle = context; impl->devices.resize(1); impl->devices[0].set(device); Platform& p = Platform::getDefault(); Platform::Impl* pImpl = p._getImpl(); pImpl->handle = (cl_platform_id)platform; } struct Queue::Impl { Impl(const Context2& c, const Device& d) { refcount = 1; const Context2* pc = &c; cl_context ch = (cl_context)pc->ptr(); if( !ch ) { pc = &Context2::getDefault(); ch = (cl_context)pc->ptr(); } cl_device_id dh = (cl_device_id)d.ptr(); if( !dh ) dh = (cl_device_id)pc->device(0).ptr(); cl_int retval = 0; handle = clCreateCommandQueue(ch, dh, 0, &retval); } ~Impl() { #ifdef _WIN32 if (!cv::__termination) #endif { if(handle) { clFinish(handle); clReleaseCommandQueue(handle); } } } IMPLEMENT_REFCOUNTABLE(); cl_command_queue handle; bool initialized; }; Queue::Queue() { p = 0; } Queue::Queue(const Context2& c, const Device& d) { p = 0; create(c, d); } Queue::Queue(const Queue& q) { 
p = q.p; if(p) p->addref(); } Queue& Queue::operator = (const Queue& q) { Impl* newp = (Impl*)q.p; if(newp) newp->addref(); if(p) p->release(); p = newp; return *this; } Queue::~Queue() { if(p) p->release(); } bool Queue::create(const Context2& c, const Device& d) { if(p) p->release(); p = new Impl(c, d); return p->handle != 0; } void Queue::finish() { if(p && p->handle) clFinish(p->handle); } void* Queue::ptr() const { return p ? p->handle : 0; } Queue& Queue::getDefault() { Queue& q = coreTlsData.get()->oclQueue; if( !q.p && haveOpenCL() ) q.create(Context2::getDefault()); return q; } static cl_command_queue getQueue(const Queue& q) { cl_command_queue qq = (cl_command_queue)q.ptr(); if(!qq) qq = (cl_command_queue)Queue::getDefault().ptr(); return qq; } KernelArg::KernelArg() : flags(0), m(0), obj(0), sz(0), wscale(1) { } KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, const void* _obj, size_t _sz) : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale) { } KernelArg KernelArg::Constant(const Mat& m) { CV_Assert(m.isContinuous()); return KernelArg(CONSTANT, 0, 1, m.data, m.total()*m.elemSize()); } struct Kernel::Impl { Impl(const char* kname, const Program& prog) { e = 0; refcount = 1; cl_program ph = (cl_program)prog.ptr(); cl_int retval = 0; handle = ph != 0 ? 
clCreateKernel(ph, kname, &retval) : 0; for( int i = 0; i < MAX_ARRS; i++ ) u[i] = 0; haveTempDstUMats = false; } void cleanupUMats() { for( int i = 0; i < MAX_ARRS; i++ ) if( u[i] ) { if( CV_XADD(&u[i]->urefcount, -1) == 1 ) u[i]->currAllocator->deallocate(u[i]); u[i] = 0; } nu = 0; haveTempDstUMats = false; } void addUMat(const UMat& m, bool dst) { CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0); u[nu] = m.u; CV_XADD(&m.u->urefcount, 1); nu++; if(dst && m.u->tempUMat()) haveTempDstUMats = true; } void finit() { cleanupUMats(); if(e) { clReleaseEvent(e); e = 0; } release(); } ~Impl() { if(handle) clReleaseKernel(handle); } IMPLEMENT_REFCOUNTABLE(); cl_kernel handle; cl_event e; enum { MAX_ARRS = 16 }; UMatData* u[MAX_ARRS]; int nu; bool haveTempDstUMats; }; }} extern "C" { static void CL_CALLBACK oclCleanupCallback(cl_event, cl_int, void *p) { ((cv::ocl::Kernel::Impl*)p)->finit(); } } namespace cv { namespace ocl { Kernel::Kernel() { p = 0; } Kernel::Kernel(const char* kname, const Program& prog) { p = 0; create(kname, prog); } Kernel::Kernel(const char* kname, const ProgramSource2& src, const String& buildopts, String* errmsg) { p = 0; create(kname, src, buildopts, errmsg); } Kernel::Kernel(const Kernel& k) { p = k.p; if(p) p->addref(); } Kernel& Kernel::operator = (const Kernel& k) { Impl* newp = (Impl*)k.p; if(newp) newp->addref(); if(p) p->release(); p = newp; return *this; } Kernel::~Kernel() { if(p) p->release(); } bool Kernel::create(const char* kname, const Program& prog) { if(p) p->release(); p = new Impl(kname, prog); if(p->handle == 0) { p->release(); p = 0; } return p != 0; } bool Kernel::create(const char* kname, const ProgramSource2& src, const String& buildopts, String* errmsg) { if(p) { p->release(); p = 0; } String tempmsg; if( !errmsg ) errmsg = &tempmsg; const Program& prog = Context2::getDefault().getProg(src, buildopts, *errmsg); return create(kname, prog); } void* Kernel::ptr() const { return p ? 
p->handle : 0; } bool Kernel::empty() const { return ptr() == 0; } int Kernel::set(int i, const void* value, size_t sz) { CV_Assert(i >= 0); if( i == 0 ) p->cleanupUMats(); if( !p || !p->handle || clSetKernelArg(p->handle, (cl_uint)i, sz, value) < 0 ) return -1; return i+1; } int Kernel::set(int i, const Image2D& image2D) { cl_mem h = (cl_mem)image2D.ptr(); return set(i, &h, sizeof(h)); } int Kernel::set(int i, const UMat& m) { return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0)); } int Kernel::set(int i, const KernelArg& arg) { CV_Assert( i >= 0 ); if( !p || !p->handle ) return -1; if( i == 0 ) p->cleanupUMats(); if( arg.m ) { int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) + ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0); bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0; cl_mem h = (cl_mem)arg.m->handle(accessFlags); if (ptronly) clSetKernelArg(p->handle, (cl_uint)i++, sizeof(h), &h); else if( arg.m->dims <= 2 ) { UMat2D u2d(*arg.m); clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h); clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step); clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset); i += 3; if( !(arg.flags & KernelArg::NO_SIZE) ) { int cols = u2d.cols*arg.wscale; clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows); clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols); i += 2; } } else { UMat3D u3d(*arg.m); clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h); clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep); clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step); clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset); i += 4; if( !(arg.flags & KernelArg::NO_SIZE) ) { int cols = u3d.cols*arg.wscale; clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows); clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows); clSetKernelArg(p->handle, 
(cl_uint)(i+2), sizeof(u3d.cols), &cols);
                i += 3;
            }
        }
        // remember the UMat so its reference can be dropped after the kernel ran
        p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
        return i;
    }
    // plain object argument
    clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
    return i+1;
}

// Enqueues the kernel over a dims-dimensional range on queue q (or the default
// queue when q is empty).
//  _globalsize  requested global size per dimension (must not be NULL); each
//               entry is rounded up to a multiple of the local size
//  _localsize   local size per dimension, or NULL; when NULL a built-in value
//               (64 / 16,8 / 8,4,4) is used for the rounding only, and NULL is
//               still what is passed to clEnqueueNDRangeKernel
//  sync         wait for completion before returning; forced to true when a
//               temporary destination UMat is among the arguments
// Returns false when the kernel is empty, a previous asynchronous run is still
// pending (p->e != 0), or the enqueue fails. Returns true without enqueuing
// anything when the product of the requested global sizes is 0.
bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
                 bool sync, const Queue& q)
{
    if(!p || !p->handle || p->e != 0)
        return false;

    cl_command_queue qq = getQueue(q);
    size_t offset[CV_MAX_DIM] = {0}, globalsize[CV_MAX_DIM] = {1,1,1};
    size_t total = 1;
    CV_Assert(_globalsize != 0);
    for (int i = 0; i < dims; i++)
    {
        size_t val = _localsize ? _localsize[i] :
            dims == 1 ? 64 : dims == 2 ? (16>>i) : dims == 3 ? (8>>(int)(i>0)) : 1;
        CV_Assert( val > 0 );
        total *= _globalsize[i];
        globalsize[i] = ((_globalsize[i] + val - 1)/val)*val;
    }
    if( total == 0 )
        return true;
    if( p->haveTempDstUMats )
        sync = true;
    cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
                                           offset, globalsize, _localsize, 0, 0,
                                           sync ? 0 : &p->e);
    if( sync || retval < 0 )
    {
        clFinish(qq);
        p->cleanupUMats();
    }
    else
    {
        // asynchronous: keep the Impl alive until the completion callback runs
        p->addref();
        clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p);
    }
    return retval >= 0;
}

// Same as run(), but enqueues the kernel with clEnqueueTask.
bool Kernel::runTask(bool sync, const Queue& q)
{
    if(!p || !p->handle || p->e != 0)
        return false;

    cl_command_queue qq = getQueue(q);
    cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
    if( sync || retval < 0 )
    {
        clFinish(qq);
        p->cleanupUMats();
    }
    else
    {
        p->addref();
        clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p);
    }
    return retval >= 0;
}

// Queries CL_KERNEL_WORK_GROUP_SIZE for the default device; 0 on failure or
// when the Kernel is empty.
size_t Kernel::workGroupSize() const
{
    if(!p)
        return 0;
    size_t val = 0, retsz = 0;
    cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
    return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE,
                                    sizeof(val), &val, &retsz) >= 0 ?
val : 0; } size_t Kernel::preferedWorkGroupSizeMultiple() const { if(!p) return 0; size_t val = 0, retsz = 0; cl_device_id dev = (cl_device_id)Device::getDefault().ptr(); return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz) >= 0 ? val : 0; } bool Kernel::compileWorkGroupSize(size_t wsz[]) const { if(!p || !wsz) return 0; size_t retsz = 0; cl_device_id dev = (cl_device_id)Device::getDefault().ptr(); return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0]*3), wsz, &retsz) >= 0; } size_t Kernel::localMemSize() const { if(!p) return 0; size_t retsz = 0; cl_ulong val = 0; cl_device_id dev = (cl_device_id)Device::getDefault().ptr(); return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz) >= 0 ? (size_t)val : 0; } //////////////////////////////////////////////////////////////////////////////////////// struct Program::Impl { Impl(const ProgramSource2& _src, const String& _buildflags, String& errmsg) { refcount = 1; const Context2& ctx = Context2::getDefault(); src = _src; buildflags = _buildflags; const String& srcstr = src.source(); const char* srcptr = srcstr.c_str(); size_t srclen = srcstr.size(); cl_int retval = 0; handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval); if( handle && retval >= 0 ) { int i, n = (int)ctx.ndevices(); AutoBuffer deviceListBuf(n+1); void** deviceList = deviceListBuf; for( i = 0; i < n; i++ ) deviceList[i] = ctx.device(i).ptr(); retval = clBuildProgram(handle, n, (const cl_device_id*)deviceList, buildflags.c_str(), 0, 0); if( retval < 0 ) { size_t retsz = 0; retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], CL_PROGRAM_BUILD_LOG, 0, 0, &retsz); if( retval >= 0 && retsz > 1 ) { AutoBuffer bufbuf(retsz + 16); char* buf = bufbuf; retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz); if( 
retval >= 0 ) { errmsg = String(buf); printf("OpenCL program can not be built: %s", errmsg.c_str()); } } if( handle ) { clReleaseProgram(handle); handle = NULL; } } } } Impl(const String& _buf, const String& _buildflags) { refcount = 1; handle = 0; buildflags = _buildflags; if(_buf.empty()) return; String prefix0 = Program::getPrefix(buildflags); const Context2& ctx = Context2::getDefault(); const Device& dev = Device::getDefault(); const char* pos0 = _buf.c_str(); const char* pos1 = strchr(pos0, '\n'); if(!pos1) return; const char* pos2 = strchr(pos1+1, '\n'); if(!pos2) return; const char* pos3 = strchr(pos2+1, '\n'); if(!pos3) return; size_t prefixlen = (pos3 - pos0)+1; String prefix(pos0, prefixlen); if( prefix != prefix0 ) return; const uchar* bin = (uchar*)(pos3+1); void* devid = dev.ptr(); size_t codelen = _buf.length() - prefixlen; cl_int binstatus = 0, retval = 0; handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), 1, (cl_device_id*)&devid, &codelen, &bin, &binstatus, &retval); } String store() { if(!handle) return String(); size_t progsz = 0, retsz = 0; String prefix = Program::getPrefix(buildflags); size_t prefixlen = prefix.length(); if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) < 0) return String(); AutoBuffer bufbuf(prefixlen + progsz + 16); uchar* buf = bufbuf; memcpy(buf, prefix.c_str(), prefixlen); buf += prefixlen; if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) < 0) return String(); buf[progsz] = (uchar)'\0'; return String((const char*)(uchar*)bufbuf, prefixlen + progsz); } ~Impl() { if( handle ) clReleaseProgram(handle); } IMPLEMENT_REFCOUNTABLE(); ProgramSource2 src; String buildflags; cl_program handle; }; Program::Program() { p = 0; } Program::Program(const ProgramSource2& src, const String& buildflags, String& errmsg) { p = 0; create(src, buildflags, errmsg); } Program::Program(const Program& prog) { p = prog.p; if(p) p->addref(); } Program& Program::operator = (const 
Program& prog)
{
    // reference the incoming Impl before releasing the one currently held
    Impl* newp = (Impl*)prog.p;
    if(newp)
        newp->addref();
    if(p)
        p->release();
    p = newp;
    return *this;
}

Program::~Program()
{
    if(p)
        p->release();
}

// Builds the program from source. Returns false (and leaves the Program empty)
// when no program handle was obtained; errmsg is handed to the Impl constructor.
bool Program::create(const ProgramSource2& src,
            const String& buildflags, String& errmsg)
{
    if(p)
        p->release();
    p = new Impl(src, buildflags, errmsg);
    if(!p->handle)
    {
        p->release();
        p = 0;
    }
    return p != 0;
}

// Returns the source the program was created from, or a static empty
// ProgramSource2 when the Program is empty.
const ProgramSource2& Program::source() const
{
    static ProgramSource2 dummy;
    return p ? p->src : dummy;
}

// Returns the raw cl_program handle, or 0 when the Program is empty.
void* Program::ptr() const
{
    return p ? p->handle : 0;
}

// Re-creates the program from text previously produced by write().
// Unlike create(), the Impl is kept even when its handle is 0; the return
// value tells whether a handle was obtained.
bool Program::read(const String& bin, const String& buildflags)
{
    if(p)
        p->release();
    p = new Impl(bin, buildflags);
    return p->handle != 0;
}

// Stores the serialised program in bin; false when the Program is empty or the
// serialised text is empty.
bool Program::write(String& bin) const
{
    if(!p)
        return false;
    bin = p->store();
    return !bin.empty();
}

// Prefix for this program's build flags; empty string when the Program is empty.
String Program::getPrefix() const
{
    if(!p)
        return String();
    return getPrefix(p->buildflags);
}

// Three-line header (device name, driver version, build flags) identifying the
// environment a serialised program belongs to. Uses device 0 of the default context.
String Program::getPrefix(const String& buildflags)
{
    const Context2& ctx = Context2::getDefault();
    const Device& dev = ctx.device(0);
    return format("name=%s\ndriver=%s\nbuildflags=%s\n",
                  dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
}

////////////////////////////////////////////////////////////////////////////////////////

// Reference-counted holder of a program's source text and its crc64 hash.
struct ProgramSource2::Impl
{
    Impl(const char* _src)
    {
        init(String(_src));
    }
    Impl(const String& _src)
    {
        init(_src);
    }
    // Stores the text and hashes its bytes.
    void init(const String& _src)
    {
        refcount = 1;
        src = _src;
        h = crc64((uchar*)src.c_str(), src.size());
    }

    IMPLEMENT_REFCOUNTABLE();
    String src;
    ProgramSource2::hash_t h;
};


ProgramSource2::ProgramSource2()
{
    p = 0;
}

ProgramSource2::ProgramSource2(const char* prog)
{
    p = new Impl(prog);
}

ProgramSource2::ProgramSource2(const String& prog)
{
    p = new Impl(prog);
}

ProgramSource2::~ProgramSource2()
{
    if(p)
        p->release();
}

ProgramSource2::ProgramSource2(const ProgramSource2& prog)
{
    p = prog.p;
    if(p)
        p->addref();
}

// Assignment: references the incoming Impl before releasing the current one.
ProgramSource2& ProgramSource2::operator = (const ProgramSource2& prog)
{
    Impl* newp = (Impl*)prog.p;
    if(newp)
        newp->addref();
    if(p)
        p->release();
p = newp; return *this; }

// Returns the stored source text, or a static empty string when no Impl is attached.
const String& ProgramSource2::source() const
{
    static String dummy;
    return p ? p->src : dummy;
}

// Returns the stored hash, or 0 when no Impl is attached.
ProgramSource2::hash_t ProgramSource2::hash() const
{
    return p ? p->h : 0;
}

//////////////////////////////////////////////////////////////////////////////////////////////

// MatAllocator that creates OpenCL buffers (clCreateBuffer) for UMatData and
// delegates to the standard Mat allocator when OpenCL is not used or buffer
// creation fails.
class OpenCLAllocator : public MatAllocator
{
public:
    OpenCLAllocator() { matStdAllocator = Mat::getStdAllocator(); }

    // Plain host allocation through the standard Mat allocator.
    UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step, int flags) const
    {
        UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags);
        return u;
    }

    // Chooses the clCreateBuffer flags (always CL_MEM_READ_WRITE) and the initial
    // UMatData flags: COPY_ON_MAP unless device 0 reports host-unified memory.
    // The incoming flags argument is not used.
    void getBestFlags(const Context2& ctx, int /*flags*/, int& createFlags, int& flags0) const
    {
        const Device& dev = ctx.device(0);
        createFlags = CL_MEM_READ_WRITE;

        if( dev.hostUnifiedMemory() )
            flags0 = 0;
        else
            flags0 = UMatData::COPY_ON_MAP;
    }

    // Allocates a device buffer for an array of the given shape and type.
    // step, when not NULL, receives the byte step of each dimension of a
    // densely packed layout. data must be NULL. Falls back to defaultAllocate()
    // when OpenCL is not in use or the buffer cannot be created.
    UMatData* allocate(int dims, const int* sizes, int type,
                       void* data, size_t* step, int flags) const
    {
        if(!useOpenCL())
            return defaultAllocate(dims, sizes, type, data, step, flags);
        CV_Assert(data == 0);
        // total size in bytes, innermost dimension first
        size_t total = CV_ELEM_SIZE(type);
        for( int i = dims-1; i >= 0; i-- )
        {
            if( step )
                step[i] = total;
            total *= sizes[i];
        }

        Context2& ctx = Context2::getDefault();
        int createFlags = 0, flags0 = 0;
        getBestFlags(ctx, flags, createFlags, flags0);

        cl_int retval = 0;
        void* handle = clCreateBuffer((cl_context)ctx.ptr(),
                                      createFlags, total, 0, &retval);
        if( !handle || retval < 0 )
            return defaultAllocate(dims, sizes, type, data, step, flags);
        UMatData* u = new UMatData(this);
        u->data = 0;
        u->size = total;
        u->handle = handle;
        u->flags = flags0;

        return u;
    }

    // Makes sure u has a device buffer. When it has none, a buffer is created
    // over the existing host data (u->origdata must be set) and this allocator
    // becomes u's current allocator. Returns false when u is NULL or no buffer
    // could be created.
    bool allocate(UMatData* u, int accessFlags) const
    {
        if(!u)
            return false;

        UMatDataAutoLock lock(u);

        if(u->handle == 0)
        {
            CV_Assert(u->origdata != 0);
            Context2& ctx = Context2::getDefault();
            int createFlags = 0, flags0 = 0;
            getBestFlags(ctx, accessFlags, createFlags, flags0);

            cl_context ctx_handle = (cl_context)ctx.ptr();
            cl_int retval = 0;
            int tempUMatFlags = UMatData::TEMP_UMAT;
u->handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
                                       u->size, u->origdata, &retval);
            // wrapping the host memory failed: unless ACCESS_FAST was requested,
            // fall back to a device buffer initialised with a copy of the data
            if((!u->handle || retval < 0) && !(accessFlags & ACCESS_FAST))
            {
                u->handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|createFlags,
                                           u->size, u->origdata, &retval);
                tempUMatFlags = UMatData::TEMP_COPIED_UMAT;
            }
            if(!u->handle || retval < 0)
                return false;
            u->prevAllocator = u->currAllocator;
            u->currAllocator = this;
            u->flags |= tempUMatFlags;
        }
        if(accessFlags & ACCESS_WRITE)
            u->markHostCopyObsolete(true);
        return true;
    }

    /*void sync(UMatData* u) const
    {
        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
        UMatDataAutoLock lock(u);

        if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
        {
            if( u->tempCopiedUMat() )
            {
                clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
                                    u->size, u->origdata, 0, 0, 0);
            }
            else
            {
                cl_int retval = 0;
                void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
                                                (CL_MAP_READ | CL_MAP_WRITE),
                                                0, u->size, 0, 0, 0, &retval);
                clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
                clFinish(q);
            }
            u->markHostCopyObsolete(false);
        }
        else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
        {
            clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
                                 u->size, u->data, 0, 0, 0);
        }
    }*/

    // Releases the device buffer of u. Requires u->handle != 0 and
    // u->urefcount == 0.
    //  - temporary UMat (buffer created over existing host data): if the host
    //    copy is obsolete and still referenced, the data is first brought back
    //    to the host; then the buffer is released, the previous allocator is
    //    restored and, when u->refcount == 0, u is handed to it for deallocation.
    //  - otherwise: u->refcount must be 0; the host copy (if owned), the buffer
    //    and u itself are freed.
    void deallocate(UMatData* u) const
    {
        if(!u)
            return;

        CV_Assert(u->urefcount >= 0);
        CV_Assert(u->refcount >= 0);

        // TODO: !!! when we add Shared Virtual Memory Support,
        // this function (as well as the others) should be corrected
        CV_Assert(u->handle != 0 && u->urefcount == 0);
        if(u->tempUMat())
        {
//            UMatDataAutoLock lock(u);
            if( u->hostCopyObsolete() && u->refcount > 0 )
            {
                cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
                if( u->tempCopiedUMat() )
                {
                    // separate device copy: read it back into the host memory
                    clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
                                        u->size, u->origdata, 0, 0, 0);
                }
                else
                {
                    // buffer wraps the host memory: a blocking map/unmap pair
                    // followed by clFinish is issued instead of a read
                    cl_int retval = 0;
                    void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
                                                    (CL_MAP_READ | CL_MAP_WRITE),
                                                    0, u->size, 0, 0, 0, &retval);
                    clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
                    clFinish(q);
                }
            }
            u->markHostCopyObsolete(false);
            clReleaseMemObject((cl_mem)u->handle);
            u->handle = 0;
            u->currAllocator = u->prevAllocator;
            if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
                fastFree(u->data);
            u->data = u->origdata;
            if(u->refcount == 0)
                u->currAllocator->deallocate(u);
        }
        else
        {
            CV_Assert(u->refcount == 0);
            if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
            {
                fastFree(u->data);
                u->data = 0;
            }
            clReleaseMemObject((cl_mem)u->handle);
            u->handle = 0;
            delete u;
        }
    }

    // Makes the contents of u accessible through u->data on the host.
    // With ACCESS_WRITE the device copy is marked obsolete.
    void map(UMatData* u, int accessFlags) const
    {
        if(!u)
            return;

        CV_Assert( u->handle != 0 );

        UMatDataAutoLock autolock(u);

        if(accessFlags & ACCESS_WRITE)
            u->markDeviceCopyObsolete(true);

        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();

        if( u->refcount == 0 )
        {
            if( !u->copyOnMap() )
            {
                CV_Assert(u->data == 0);
                // because there can be other map requests for the same UMat with different access flags,
                // we use the universal (read-write) access mode.
cl_int retval = 0;
                u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
                                                     (CL_MAP_READ | CL_MAP_WRITE),
                                                     0, u->size, 0, 0, 0, &retval);
                if(u->data && retval >= 0)
                {
                    u->markHostCopyObsolete(false);
                    return;
                }

                // if map failed, switch to copy-on-map mode for the particular buffer
                u->flags |= UMatData::COPY_ON_MAP;
            }

            // copy-on-map mode: the host side uses its own heap buffer
            if(!u->data)
            {
                u->data = (uchar*)fastMalloc(u->size);
                u->markHostCopyObsolete(true);
            }
        }

        // refresh the host buffer from the device when a reader needs current data
        if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
        {
            CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
                                           u->size, u->data, 0, 0, 0) >= 0 );
            u->markHostCopyObsolete(false);
        }
    }

    // Ends host access started by map(): a mapped buffer is unmapped (and
    // u->data reset to 0); in copy-on-map mode the host buffer is written back
    // to the device when the device copy is obsolete. Both copies are marked
    // up to date afterwards.
    void unmap(UMatData* u) const
    {
        if(!u)
            return;

        CV_Assert(u->handle != 0);

        UMatDataAutoLock autolock(u);

        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
        cl_int retval = 0;
        if( !u->copyOnMap() && u->data )
        {
            CV_Assert( (retval = clEnqueueUnmapMemObject(q,
                                (cl_mem)u->handle, u->data, 0, 0, 0)) >= 0 );
            clFinish(q);
            u->data = 0;
        }
        else if( u->copyOnMap() && u->deviceCopyObsolete() )
        {
            CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
                                u->size, u->data, 0, 0, 0)) >= 0 );
        }
        u->markDeviceCopyObsolete(false);
        u->markHostCopyObsolete(false);
    }

    // Analyses a dims-dimensional copy region.
    //  sz                 region size per dimension; sz[dims-1] is used as the
    //                     innermost extent and compared with the steps, so it is
    //                     presumably expressed in bytes -- TODO confirm
    //  srcofs / dstofs    region offsets per dimension, either may be NULL
    //  srcstep / dststep  steps per dimension
    // Outputs: total (product of sz), srcrawofs / dstrawofs (linear offsets) and,
    // only when the region is not continuous, the new_* arrays reordered for the
    // OpenCL rectangular-copy calls. Returns true when the accumulated size
    // equals both steps at every outer dimension.
    bool checkContinuous(int dims, const size_t sz[],
                         const size_t srcofs[], const size_t srcstep[],
                         const size_t dstofs[], const size_t dststep[],
                         size_t& total, size_t new_sz[],
                         size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
                         size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
    {
        bool iscontinuous = true;
        srcrawofs = srcofs ? srcofs[dims-1] : 0;
        dstrawofs = dstofs ? dstofs[dims-1] : 0;
        total = sz[dims-1];
        for( int i = dims-2; i >= 0; i-- )
        {
            if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
                iscontinuous = false;
            total *= sz[i];
            if( srcofs )
                srcrawofs += srcofs[i]*srcstep[i];
            if( dstofs )
                dstrawofs += dstofs[i]*dststep[i];
        }

        if( !iscontinuous )
        {
            // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
if( dims == 2 )
            {
                // 2D region: {rows, cols} becomes {cols, rows, 1}
                new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
                // we assume that new_... arrays are initialized by caller
                // with 0's, so there is no else branch
                if( srcofs )
                {
                    new_srcofs[0] = srcofs[1];
                    new_srcofs[1] = srcofs[0];
                    new_srcofs[2] = 0;
                }

                if( dstofs )
                {
                    new_dstofs[0] = dstofs[1];
                    new_dstofs[1] = dstofs[0];
                    new_dstofs[2] = 0;
                }

                // row pitch only; slice pitch is left at 0
                new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
                new_dststep[0] = dststep[0]; new_dststep[1] = 0;
            }
            else
            {
                // we could check for dims == 3 here,
                // but from user perspective this one is more informative
                CV_Assert(dims <= 3);
                new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
                if( srcofs )
                {
                    new_srcofs[0] = srcofs[2];
                    new_srcofs[1] = srcofs[1];
                    new_srcofs[2] = srcofs[0];
                }

                if( dstofs )
                {
                    new_dstofs[0] = dstofs[2];
                    new_dstofs[1] = dstofs[1];
                    new_dstofs[2] = dstofs[0];
                }

                // [0] = row pitch, [1] = slice pitch
                new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
                new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
            }
        }
        return iscontinuous;
    }

    // Copies a region of u into host memory at dstptr.
    // When u has a host copy that is not obsolete, the copy is done by the
    // standard allocator; otherwise the region is read from the device buffer
    // with a blocking read (plain or rectangular, depending on continuity).
    void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
                  const size_t srcofs[], const size_t srcstep[],
                  const size_t dststep[]) const
    {
        if(!u)
            return;
        UMatDataAutoLock autolock(u);

        if( u->data && !u->hostCopyObsolete() )
        {
            Mat::getStdAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
            return;
        }
        CV_Assert( u->handle != 0 );

        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();

        size_t total = 0, new_sz[] = {0, 0, 0};
        size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
        size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};

        // the destination is a raw pointer, so no destination offsets are passed
        bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
                                            total, new_sz,
                                            srcrawofs, new_srcofs, new_srcstep,
                                            dstrawofs, new_dstofs, new_dststep);
        if( iscontinuous )
        {
            CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
                                           srcrawofs, total, dstptr, 0, 0, 0) >= 0 );
        }
        else
        {
            CV_Assert( clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
                            new_srcofs, new_dstofs, new_sz, new_srcstep[0],
new_srcstep[1], new_dststep[0], new_dststep[1], dstptr,
                            0, 0, 0) >= 0 );
        }
    }

    // Copies host memory at srcptr into a region of u.
    // Writes into the cached host copy (through the standard allocator) when one
    // exists and either the whole buffer is overwritten or the device copy is
    // the out-of-date one; otherwise performs a blocking write to the device
    // buffer followed by clFinish.
    void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
                const size_t dstofs[], const size_t dststep[],
                const size_t srcstep[]) const
    {
        if(!u)
            return;

        // there should be no user-visible CPU copies of the UMat which we are going to copy to
        CV_Assert(u->refcount == 0 || u->tempUMat());

        size_t total = 0, new_sz[] = {0, 0, 0};
        size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
        size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};

        // the source is a raw pointer, so no source offsets are passed
        bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
                                            total, new_sz,
                                            srcrawofs, new_srcofs, new_srcstep,
                                            dstrawofs, new_dstofs, new_dststep);

        UMatDataAutoLock autolock(u);

        // if there is cached CPU copy of the GPU matrix,
        // we could use it as a destination.
        // we can do it in 2 cases:
        //    1. we overwrite the whole content
        //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
        if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
        {
            Mat::getStdAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
            u->markHostCopyObsolete(false);
            u->markDeviceCopyObsolete(true);
            return;
        }

        CV_Assert( u->handle != 0 );
        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();

        if( iscontinuous )
        {
            CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle,
                CL_TRUE, dstrawofs, total, srcptr, 0, 0, 0) >= 0 );
        }
        else
        {
            CV_Assert( clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
                new_dstofs, new_srcofs, new_sz, new_dststep[0], new_dststep[1],
                new_srcstep[0], new_srcstep[1], srcptr, 0, 0, 0) >= 0 );
        }

        u->markHostCopyObsolete(true);
        u->markDeviceCopyObsolete(false);

        clFinish(q);
    }

    // Copies a region from src to dst. Depending on where the current data of
    // each side lives this becomes an upload, a download, or a device-to-device
    // buffer copy. _sync requests clFinish after the device-to-device copy.
    void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
              const size_t srcofs[], const size_t srcstep[],
              const size_t dstofs[], const size_t dststep[], bool _sync) const
    {
        if(!src || !dst)
            return;

        size_t total = 0, new_sz[] = {0, 0, 0};
        size_t srcrawofs = 0,
new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0}; size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0}; bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep, total, new_sz, srcrawofs, new_srcofs, new_srcstep, dstrawofs, new_dstofs, new_dststep); UMatDataAutoLock src_autolock(src); UMatDataAutoLock dst_autolock(dst); if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) ) { upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep); return; } if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) ) { download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep); dst->markHostCopyObsolete(false); dst->markDeviceCopyObsolete(true); return; } // there should be no user-visible CPU copies of the UMat which we are going to copy to CV_Assert(dst->refcount == 0); cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); if( iscontinuous ) { CV_Assert( clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle, srcrawofs, dstrawofs, total, 0, 0, 0) >= 0 ); } else { cl_int retval; CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle, new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1], new_dststep[0], new_dststep[1], 0, 0, 0)) >= 0 ); } dst->markHostCopyObsolete(true); dst->markDeviceCopyObsolete(false); if( _sync ) clFinish(q); } MatAllocator* matStdAllocator; }; MatAllocator* getOpenCLAllocator() { static OpenCLAllocator allocator; return &allocator; } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// static void getDevices(std::vector& devices,cl_platform_id& platform) { cl_int status = CL_SUCCESS; cl_uint numDevices = 0; status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices); CV_Assert(status == CL_SUCCESS); if (numDevices == 0) return; 
devices.resize((size_t)numDevices); status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices); CV_Assert(status == CL_SUCCESS); devices.resize(numDevices); } struct PlatformInfo2::Impl { Impl(void* id) { handle = *(cl_platform_id*)id; getDevices(devices, handle); } String getStrProp(cl_device_info prop) const { char buf[1024]; size_t sz=0; return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) >= 0 && sz < sizeof(buf) ? String(buf) : String(); } IMPLEMENT_REFCOUNTABLE(); std::vector devices; cl_platform_id handle; }; PlatformInfo2::PlatformInfo2() { p = 0; } PlatformInfo2::PlatformInfo2(void* platform_id) { p = new Impl(platform_id); } PlatformInfo2::~PlatformInfo2() { if(p) p->release(); } int PlatformInfo2::deviceNumber() const { return p ? (int)p->devices.size() : 0; } void PlatformInfo2::getDevice(Device& device, int d) const { CV_Assert(d < (int)p->devices.size() ); if(p) device.set(p->devices[d]); } String PlatformInfo2::name() const { return p ? p->getStrProp(CL_PLATFORM_NAME) : String(); } String PlatformInfo2::vendor() const { return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String(); } String PlatformInfo2::version() const { return p ? 
p->getStrProp(CL_PLATFORM_VERSION) : String(); } static void getPlatforms(std::vector& platforms) { cl_int status = CL_SUCCESS; cl_uint numPlatforms = 0; status = clGetPlatformIDs(0, NULL, &numPlatforms); CV_Assert(status == CL_SUCCESS); if (numPlatforms == 0) return; platforms.resize((size_t)numPlatforms); status = clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms); CV_Assert(status == CL_SUCCESS); platforms.resize(numPlatforms); } void getPlatfomsInfo(std::vector& platformsInfo) { std::vector platforms; getPlatforms(platforms); for (size_t i = 0; i < platforms.size(); i++) { platformsInfo.push_back( PlatformInfo2((void*)&platforms[i]) ); } } const char* typeToStr(int t) { static const char* tab[]= { "uchar", "uchar2", "uchar3", "uchar4", "char", "char2", "char3", "char4", "ushort", "ushort2", "ushort3", "ushort4", "short", "short2", "short3", "short4", "int", "int2", "int3", "int4", "float", "float2", "float3", "float4", "double", "double2", "double3", "double4", "?", "?", "?", "?" }; int cn = CV_MAT_CN(t); return cn > 4 ? "?" : tab[CV_MAT_DEPTH(t)*4 + cn-1]; } const char* memopTypeToStr(int t) { static const char* tab[]= { "uchar", "uchar2", "uchar3", "uchar4", "uchar", "uchar2", "uchar3", "uchar4", "ushort", "ushort2", "ushort3", "ushort4", "ushort", "ushort2", "ushort3", "ushort4", "int", "int2", "int3", "int4", "int", "int2", "int3", "int4", "int2", "int4", "?", "int8", "?", "?", "?", "?" }; int cn = CV_MAT_CN(t); return cn > 4 ? "?" : tab[CV_MAT_DEPTH(t)*4 + cn-1]; } const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf) { if( sdepth == ddepth ) return "noconvert"; const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn)); if( ddepth >= CV_32F || (ddepth == CV_32S && sdepth < CV_32S) || (ddepth == CV_16S && sdepth <= CV_8S) || (ddepth == CV_16U && sdepth == CV_8U)) { sprintf(buf, "convert_%s", typestr); } else if( sdepth >= CV_32F ) { sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? 
"_sat" : "")); } else { sprintf(buf, "convert_%s_sat", typestr); } return buf; } template static std::string kerToStr(const Mat & k) { int width = k.cols - 1, depth = k.depth(); const T * const data = reinterpret_cast(k.data); std::ostringstream stream; stream.precision(10); if (depth <= CV_8S) { for (int i = 0; i < width; ++i) stream << (int)data[i] << ", "; stream << (int)data[width]; } else if (depth == CV_32F) { stream.setf(std::ios_base::showpoint); for (int i = 0; i < width; ++i) stream << data[i] << "f, "; stream << data[width] << "f"; } else { for (int i = 0; i < width; ++i) stream << data[i] << ", "; } return stream.str(); } String kernelToStr(InputArray _kernel, int ddepth) { Mat kernel = _kernel.getMat().reshape(1, 1); int depth = kernel.depth(); if (ddepth < 0) ddepth = depth; if (ddepth != depth) kernel.convertTo(kernel, ddepth); typedef std::string (*func_t)(const Mat &); static const func_t funcs[] = { kerToStr, kerToStr, kerToStr,kerToStr, kerToStr, kerToStr, kerToStr, 0 }; const func_t func = funcs[depth]; CV_Assert(func != 0); return cv::format(" -D COEFF=%s", func(kernel).c_str()); } /////////////////////////////////////////////////////////////////////////////////////////////// // deviceVersion has format // OpenCL // by specification // http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html // http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor) { major = minor = 0; if (10 >= deviceVersion.length()) return; const char *pstr = deviceVersion.c_str(); if (0 != strncmp(pstr, "OpenCL ", 7)) return; size_t ppos = deviceVersion.find('.', 7); if (String::npos == ppos) return; String temp = deviceVersion.substr(7, ppos - 7); major = atoi(temp.c_str()); temp = deviceVersion.substr(ppos + 1); minor = atoi(temp.c_str()); } struct Image2D::Impl { Impl(const UMat &src) { handle = 0; refcount = 1; init(src); } ~Impl() { if 
(handle) clReleaseMemObject(handle); } void init(const UMat &src) { cl_image_format format; int err; int depth = src.depth(); int channels = src.channels(); switch(depth) { case CV_8U: format.image_channel_data_type = CL_UNSIGNED_INT8; break; case CV_32S: format.image_channel_data_type = CL_UNSIGNED_INT32; break; case CV_32F: format.image_channel_data_type = CL_FLOAT; break; default: CV_Error(-1, "Image forma is not supported"); break; } switch(channels) { case 1: format.image_channel_order = CL_R; break; case 3: format.image_channel_order = CL_RGB; break; case 4: format.image_channel_order = CL_RGBA; break; default: CV_Error(-1, "Image format is not supported"); break; } #ifdef CL_VERSION_1_2 //this enables backwards portability to //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support int minor, major; parseDeviceVersion(Device::getDefault().deviceVersion(), major, minor); if ((1 < major) || ((1 == major) && (2 <= minor))) { cl_image_desc desc; desc.image_type = CL_MEM_OBJECT_IMAGE2D; desc.image_width = src.cols; desc.image_height = src.rows; desc.image_depth = 0; desc.image_array_size = 1; desc.image_row_pitch = 0; desc.image_slice_pitch = 0; desc.buffer = NULL; desc.num_mip_levels = 0; desc.num_samples = 0; handle = clCreateImage((cl_context)Context2::getDefault().ptr(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err); } else #endif { handle = clCreateImage2D((cl_context)Context2::getDefault().ptr(), CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err); } size_t origin[] = { 0, 0, 0 }; size_t region[] = { src.cols, src.rows, 1 }; cl_mem devData; if (!src.isContinuous()) { devData = clCreateBuffer((cl_context)Context2::getDefault().ptr(), CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, NULL); const size_t roi[3] = {src.cols * src.elemSize(), src.rows, 1}; clEnqueueCopyBufferRect((cl_command_queue)Queue::getDefault().ptr(), (cl_mem)src.handle(ACCESS_READ), devData, origin, origin, roi, src.step, 0, src.cols * 
src.elemSize(), 0, 0, NULL, NULL); clFlush((cl_command_queue)Queue::getDefault().ptr()); } else { devData = (cl_mem)src.handle(ACCESS_READ); } clEnqueueCopyBufferToImage((cl_command_queue)Queue::getDefault().ptr(), devData, handle, 0, origin, region, 0, NULL, 0); if (!src.isContinuous()) { clFlush((cl_command_queue)Queue::getDefault().ptr()); clReleaseMemObject(devData); } } IMPLEMENT_REFCOUNTABLE(); cl_mem handle; }; Image2D::Image2D() { p = NULL; } Image2D::Image2D(const UMat &src) { p = new Impl(src); } Image2D::~Image2D() { if (p) p->release(); } void* Image2D::ptr() const { return p ? p->handle : 0; } }}