diff --git a/CMakeLists.txt b/CMakeLists.txt
index cdb4b3e..429ca0b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,48 +6,78 @@ ENDIF()
 
 project(ctc_release)
 
-IF (NOT APPLE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -O2")
-ENDIF()
-
-IF (APPLE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
-    add_definitions(-DAPPLE)
-ENDIF()
-
 include_directories(include)
 
 FIND_PACKAGE(CUDA 6.5)
+FIND_PACKAGE(Torch)
+
 MESSAGE(STATUS "cuda found ${CUDA_FOUND}")
+MESSAGE(STATUS "Torch found ${Torch_DIR}")
 
-option(WITH_GPU "compile warp-ctc with cuda." ${CUDA_FOUND})
-option(WITH_OMP "compile warp-ctc with openmp." ON)
+option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND})
+option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND})
+option(WITH_OMP "compile warp-ctc with OpenMP." ON)
+option(BUILD_TESTS "build warp-ctc unit tests." ON)
+option(BUILD_SHARED "build warp-ctc shared library." ON)
+
+if(BUILD_SHARED)
+    set(WARPCTC_SHARED "SHARED")
+else(BUILD_SHARED)
+    set(WARPCTC_SHARED "STATIC")
+endif(BUILD_SHARED)
+
+if(WIN32)
+    set(CMAKE_STATIC_LIBRARY_PREFIX lib)
+    set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+    foreach(flag_var
+            CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE)
+        if(${flag_var} MATCHES "/MD")
+            string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+        endif(${flag_var} MATCHES "/MD")
+    endforeach(flag_var)
+else(WIN32)
+    # Set C++ flags
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
+    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2")
+endif(WIN32)
+
+if(APPLE)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+    add_definitions(-DAPPLE)
+endif()
 
-if(NOT WITH_OMP)
+if(WITH_OMP AND NOT APPLE)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+else()
     add_definitions(-DCTC_DISABLE_OMP)
 endif()
 
 # need to be at least 30 or __shfl_down in reduce wont compile
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2")
+set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30")
 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35")
 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50")
 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")
 
-IF (CUDA_VERSION GREATER 7.6)
+IF (CUDA_VERSION VERSION_GREATER "7.6")
     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
 ENDIF()
 
-if (NOT APPLE)
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
+IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0"))
+    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
 ENDIF()
 
-FIND_PACKAGE(Torch)
-
-MESSAGE(STATUS "Torch found ${Torch_DIR}")
+IF(NOT APPLE AND NOT WIN32)
+    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
+    if(WITH_OMP)
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
+    endif()
+ENDIF()
 
 IF (APPLE)
     EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION)
@@ -65,22 +95,63 @@ ELSE()
     set(CMAKE_SKIP_RPATH TRUE)
 ENDIF()
 
+# Windows treats a symbolic link as a regular file, unlike Unix.
+# We create a hidden copy and compile it instead of the original source file.
+function(windows_symbolic TARGET)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS PATH DEPS)
+    cmake_parse_arguments(windows_symbolic "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
+    foreach(src ${windows_symbolic_SRCS})
+        get_filename_component(src ${src} NAME_WE)
+        if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu)
+            message(FATAL_ERROR "${final_path}/${src}.cpp and ${final_path}/${src}.cu must exist, and ${final_path}/${src}.cu must be a symbolic link.")
+        endif()
+
+        # only copy ${src}.cpp to .${src}.cu when the content has changed
+        set(copy_flag 1)
+        if (EXISTS ${final_path}/.${src}.cu)
+            file(READ ${final_path}/${src}.cpp SOURCE_STR)
+            file(READ ${final_path}/.${src}.cu TARGET_STR)
+            if (SOURCE_STR STREQUAL TARGET_STR)
+                set(copy_flag 0)
+            endif()
+        endif()
+        if (copy_flag)
+            add_custom_command(OUTPUT ${final_path}/.${src}.cu
+                COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu
+                COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cpp" "${final_path}/.${src}.cu"
+                COMMENT "create hidden file of ${src}.cu")
+        endif(copy_flag)
+        add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu)
+    endforeach()
+endfunction()
 
 IF (WITH_GPU)
 
     MESSAGE(STATUS "Building shared library with GPU support")
+    MESSAGE(STATUS "NVCC_ARCH_FLAGS ${CUDA_NVCC_FLAGS}")
+
+    if (WIN32)
+        SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd4068 /wd4244 /wd4267 /wd4305 /wd4819\"")
+        windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src)
+        CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu)
+    else()
+        CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu)
+    endif(WIN32)
 
-    CUDA_ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cu src/reduce.cu)
-
-    IF (!Torch_FOUND)
+    IF (NOT WITH_TORCH)
         TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY})
     ENDIF()
 
-    add_executable(test_cpu tests/test_cpu.cpp )
-    TARGET_LINK_LIBRARIES(test_cpu warpctc)
-    SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+    if(BUILD_TESTS)
+        add_executable(test_cpu tests/test_cpu.cpp )
+        TARGET_LINK_LIBRARIES(test_cpu warpctc)
+        SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
 
-    cuda_add_executable(test_gpu tests/test_gpu.cu)
-    TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY})
+        cuda_add_executable(test_gpu tests/test_gpu.cu)
+        TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY})
+    endif(BUILD_TESTS)
 
     INSTALL(TARGETS warpctc
             RUNTIME DESTINATION "bin"
@@ -89,7 +160,7 @@ IF (WITH_GPU)
 
     INSTALL(FILES include/ctc.h DESTINATION "include")
 
-    IF (Torch_FOUND)
+    IF (WITH_TORCH)
         MESSAGE(STATUS "Building Torch Bindings with GPU support")
         INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc")
         INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC)
@@ -105,26 +176,26 @@ IF (WITH_GPU)
 
         ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
 
         IF (APPLE)
-
             TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
         ELSE()
             TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp)
         ENDIF()
     ENDIF()
-
 ELSE()
     MESSAGE(STATUS "Building shared library with no GPU support")
 
-    if (NOT APPLE)
+    if (NOT APPLE AND NOT WIN32)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
     ENDIF()
 
-    ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cpp)
+    ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp)
 
-    add_executable(test_cpu tests/test_cpu.cpp )
-    TARGET_LINK_LIBRARIES(test_cpu warpctc)
-    SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+    if(BUILD_TESTS)
+        add_executable(test_cpu tests/test_cpu.cpp )
+        TARGET_LINK_LIBRARIES(test_cpu warpctc)
+        SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+    endif(BUILD_TESTS)
 
     INSTALL(TARGETS warpctc
             RUNTIME DESTINATION "bin"
@@ -133,7 +204,7 @@ ELSE()
 
     INSTALL(FILES include/ctc.h DESTINATION "include")
 
-    IF (Torch_FOUND)
+    IF (WITH_TORCH)
         MESSAGE(STATUS "Building Torch Bindings with no GPU support")
         add_definitions(-DTORCH_NOGPU)
         INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH)
diff --git a/include/contrib/moderngpu/include/device/intrinsics.cuh b/include/contrib/moderngpu/include/device/intrinsics.cuh
index a601443..905565f 100644
--- a/include/contrib/moderngpu/include/device/intrinsics.cuh
+++ b/include/contrib/moderngpu/include/device/intrinsics.cuh
@@ -112,8 +112,12 @@ __device__ __forceinline__ float shfl_up(float var,
 	unsigned int delta, int width = 32) {
 
 #if __CUDA_ARCH__ >= 300
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+	var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
+#else
 	var = __shfl_up(var, delta, width);
 #endif
+#endif
 	return var;
 }
 
@@ -122,8 +126,13 @@ __device__ __forceinline__ double shfl_up(double var,
 
 #if __CUDA_ARCH__ >= 300
 	int2 p = mgpu::double_as_int2(var);
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+	p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
+	p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
+#else
 	p.x = __shfl_up(p.x, delta, width);
 	p.y = __shfl_up(p.y, delta, width);
+#endif
 	var = mgpu::int2_as_double(p);
 #endif
 
@@ -137,6 +146,15 @@ MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
 	int result = 0;
 #if __CUDA_ARCH__ >= 300
 	int mask = (WARP_SIZE - width)<< 8;
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+	asm(
+		"{.reg .s32 r0;"
+		".reg .pred p;"
+		"shfl.sync.up.b32 r0|p, %1, %2, %3, 0xffffffff;"
+		"@p add.s32 r0, r0, %4;"
+		"mov.s32 %0, r0; }"
+		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
+#else
 	asm(
 		"{.reg .s32 r0;"
 		".reg .pred p;"
@@ -145,6 +163,7 @@ MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
 		"mov.s32 %0, r0; }"
 		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
 #endif
+#endif
 	return result;
 }
 
@@ -152,6 +171,15 @@ MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
 	int result = 0;
 #if __CUDA_ARCH__ >= 300
 	int mask = (WARP_SIZE - width)<< 8;
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+	asm(
+		"{.reg .s32 r0;"
+		".reg .pred p;"
+		"shfl.sync.up.b32 r0|p, %1, %2, %3, 0xffffffff;"
+		"@p max.s32 r0, r0, %4;"
+		"mov.s32 %0, r0; }"
+		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
+#else
 	asm(
 		"{.reg .s32 r0;"
 		".reg .pred p;"
@@ -160,6 +188,7 @@ MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
 		"mov.s32 %0, r0; }"
 		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
 #endif
+#endif
 	return result;
 }
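CUDA 9 deprecated the implicit warp-synchronous `__shfl_*` intrinsics in favor of `*_sync` variants that take an explicit member mask, which is what the compiler-version guards above select between. A minimal standalone sketch of the same guard pattern (hypothetical file, not part of the patch; the full `0xFFFFFFFF` mask assumes all 32 lanes reach the shuffle):

```cuda
// shfl_demo.cu -- warp-wide inclusive scan, guarded like the patch above.
// Build: nvcc -arch=sm_30 shfl_demo.cu (or -arch=sm_70 on Volta)
#include <cstdio>

__global__ void inclusive_scan_warp(float* data) {
    int lane = threadIdx.x & 31;
    float x = data[lane];
    for (int delta = 1; delta < 32; delta <<= 1) {
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
        float y = __shfl_up_sync(0xFFFFFFFF, x, delta);  // explicit member mask
#else
        float y = __shfl_up(x, delta);                   // pre-CUDA-9 spelling
#endif
        if (lane >= delta) x += y;  // lanes below delta keep their own value
    }
    data[lane] = x;
}

int main() {
    float h[32], *d;
    for (int i = 0; i < 32; ++i) h[i] = 1.0f;
    cudaMalloc(&d, sizeof(h));
    cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);
    inclusive_scan_warp<<<1, 32>>>(d);
    cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
    printf("lane 31 = %g (expect 32)\n", h[31]);
    cudaFree(d);
    return 0;
}
```

The full mask is only valid because every lane participates; a divergent caller would need `__activemask()` or `__ballot_sync()` instead.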
diff --git a/include/ctc.h b/include/ctc.h
index a2d3d2d..a0a65e7 100644
--- a/include/ctc.h
+++ b/include/ctc.h
@@ -5,6 +5,16 @@
 
 #pragma once
 
+#ifdef _WIN32
+#ifdef warpctc_EXPORTS
+#define API_REFERENCE extern "C" __declspec(dllexport)
+#else
+#define API_REFERENCE extern "C" __declspec(dllimport)
+#endif
+#else
+#define API_REFERENCE
+#endif
+
 #ifdef __cplusplus
 #include <cstddef>
 extern "C" {
@@ -22,13 +32,13 @@ typedef enum {
 } ctcStatus_t;
 
 /** Returns a single integer which specifies the API version of the warpctc library */
-int get_warpctc_version();
+API_REFERENCE int get_warpctc_version();
 
 /** Returns a string containing a description of status that was passed in
 *  \param[in] status identifies which string should be returned
 *  \return C style string containing the text description
 *  */
-const char* ctcGetStatusString(ctcStatus_t status);
+API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status);
 
 typedef enum {
     CTC_CPU = 0,
@@ -91,7 +101,7 @@ struct ctcOptions {
 *  \return Status information
 *
 * */
-ctcStatus_t compute_ctc_loss(const float* const activations,
+API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations,
                              float* gradients,
                              const int* const flat_labels,
                              const int* const label_lengths,
@@ -120,7 +130,7 @@ ctcStatus_t compute_ctc_loss(const float* const activations,
 *
 *  \return Status information
 **/
-ctcStatus_t get_workspace_size(const int* const label_lengths,
+API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths,
                                const int* const input_lengths,
                                int alphabet_size, int minibatch,
                                ctcOptions info,
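On Windows, CMake automatically defines `warpctc_EXPORTS` while compiling the `warpctc` shared-library target, so the header exports the symbols when the DLL itself is built and imports them everywhere else; on other platforms `API_REFERENCE` expands to nothing. A hedged usage sketch of the exported API, mirroring `small_test` in tests/test_cpu.cpp (assumes the built library and `include/` are on the link and include paths):

```cuda
// api_demo.cu -- score a tiny batch through the C API; error handling trimmed.
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <ctc.h>

int main() {
    const int alphabet_size = 5, T = 2, minibatch = 1;
    // T x minibatch x alphabet_size activations, as in small_test()
    std::vector<float> activations = {0.1f, 0.6f, 0.1f, 0.1f, 0.1f,
                                      0.1f, 0.1f, 0.6f, 0.1f, 0.1f};
    std::vector<int> labels = {1, 2};       // flattened labels for the batch
    std::vector<int> label_lengths = {2};
    std::vector<int> input_lengths = {T};

    ctcOptions options{};
    options.loc = CTC_CPU;
    options.num_threads = 1;

    size_t workspace_bytes;
    get_workspace_size(label_lengths.data(), input_lengths.data(),
                       alphabet_size, minibatch, options, &workspace_bytes);
    void* workspace = malloc(workspace_bytes);

    float cost;
    ctcStatus_t status = compute_ctc_loss(activations.data(),
                                          nullptr,  // no gradients: score only
                                          labels.data(), label_lengths.data(),
                                          input_lengths.data(), alphabet_size,
                                          minibatch, &cost, workspace, options);
    printf("%s, cost = %f\n", ctcGetStatusString(status), cost);
    free(workspace);
    return status == CTC_STATUS_SUCCESS ? 0 : 1;
}
```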
diff --git a/include/detail/cpu_ctc.h b/include/detail/cpu_ctc.h
index 8aae3a6..08621d6 100644
--- a/include/detail/cpu_ctc.h
+++ b/include/detail/cpu_ctc.h
@@ -163,6 +163,8 @@ template<typename ProbT>
 void CpuCTC<ProbT>::softmax(const ProbT* const activations, ProbT* probs,
                             const int* const input_lengths) {
+    ProbT min_T = std::numeric_limits<ProbT>::min();
+
 #pragma omp parallel for
     for (int mb = 0; mb < minibatch_; ++mb) {
         for(int c = 0; c < input_lengths[mb]; ++c) {
@@ -179,6 +181,9 @@ CpuCTC<ProbT>::softmax(const ProbT* const activations, ProbT* probs,
 
             for(int r = 0; r < alphabet_size_; ++r) {
                 probs[r + col_offset] /= denom;
+                if (probs[r + col_offset] < min_T) {
+                    probs[r + col_offset] = min_T;
+                }
             }
         }
     }
@@ -226,7 +231,6 @@ ProbT CpuCTC<ProbT>::compute_alphas(const ProbT* probs, int repeats, int S, int T,
                                     const int* const s_inc,
                                     const int* const labels,
                                     ProbT* alphas) {
-    int start = (((S /2) + repeats - T) < 0) ? 0 : 1, end = S > 1 ? 2 : 1;
diff --git a/include/detail/gpu_ctc.h b/include/detail/gpu_ctc.h
index 0f1d239..2149d99 100644
--- a/include/detail/gpu_ctc.h
+++ b/include/detail/gpu_ctc.h
@@ -395,6 +395,9 @@ GpuCTC<ProbT>::compute_probs(const ProbT* const activations) {
         (ctc_helper::exponential<ProbT>(), probs_, denoms_, out_dim_, num_elements);
 
+    truncate_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>
+        (probs_, num_elements);
+
     return CTC_STATUS_SUCCESS;
 }
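The two hunks above are the same fix on the two back ends: after softmax normalization, any probability that underflowed to zero is raised to `std::numeric_limits<float>::min()`, the smallest positive normal float, so the log-space recursion that follows stays finite. A standalone illustration of the failure mode (values chosen just to force the underflow; not library code):

```cuda
// underflow_demo.cu -- why a zero probability poisons a log-space loss.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>

int main() {
    float p = std::exp(-110.0f);  // underflows to 0.0f in single precision
    printf("log(p)       = %f\n", std::log(p));  // -inf, and -inf + x stays -inf
    float clamped = std::max(p, std::numeric_limits<float>::min());
    printf("log(clamped) = %f\n", std::log(clamped));  // about -87.3, finite
    return 0;
}
```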
diff --git a/include/detail/gpu_ctc_kernels.h b/include/detail/gpu_ctc_kernels.h
index cf6dba9..07412d0 100644
--- a/include/detail/gpu_ctc_kernels.h
+++ b/include/detail/gpu_ctc_kernels.h
@@ -88,8 +88,8 @@ template <typename ProbT, int NT, int VT>
 __global__ void compute_alpha_kernel (const ProbT* probs, const int *label_sizes,
                                       const int *utt_length, const int *repeats_in_labels,
-                                      const int *labels_without_blanks, const int *label_offsets, 
-                                      int *labels_with_blanks, ProbT *alphas, 
+                                      const int *labels_without_blanks, const int *label_offsets,
+                                      int *labels_with_blanks, ProbT *alphas,
                                       ProbT* nll_forward, int stride, int out_dim,
                                       int S_memoffset, int T_memoffset, int blank_label) {
@@ -469,6 +469,23 @@ __global__ void compute_probs_kernel(Op f, ProbT* probs,
     }
 }
 
+template <typename ProbT, int VT>
+__global__ void truncate_probs_kernel(ProbT* probs, int count) {
+
+    int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+    ProbT min_T = numeric_limits<ProbT>::min();
+#pragma unroll
+    for(int i = 0; i < VT; i++) {
+        if (idx < count) {
+            if (min_T > probs[idx]) {
+                probs[idx] = min_T;
+            }
+        }
+        idx += stride;
+    }
+}
+
 template <typename ProbT, int VT, typename Op>
 __global__ void prepare_stable_SM_kernel(Op f, ProbT* probs,
                                          const ProbT* const col_max,
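`truncate_probs_kernel` follows the NT/VT launch shape used by the other kernels in this header: NT threads per block, each visiting up to VT elements through a grid-stride loop. A standalone sketch of that shape (the NT and VT values here are illustrative assumptions, not the library's configuration):

```cuda
// vt_clamp_demo.cu -- grid-stride loop where each thread handles VT elements.
#include <cstdio>

template <typename T, int VT>
__global__ void clamp_min_kernel(T* data, int count, T floor_val) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;  // total threads in the grid
#pragma unroll
    for (int i = 0; i < VT; ++i) {
        if (idx < count && data[idx] < floor_val)
            data[idx] = floor_val;
        idx += stride;
    }
}

int main() {
    const int count = 1 << 20, NT = 128, VT = 10;
    const int grid = (count + NT * VT - 1) / (NT * VT);  // covers count elements
    float* d;
    cudaMalloc(&d, count * sizeof(float));
    cudaMemset(d, 0, count * sizeof(float));  // all zeros, i.e. below the floor
    clamp_min_kernel<float, VT><<<grid, NT>>>(d, count, 1e-30f);
    float h;
    cudaMemcpy(&h, d, sizeof(float), cudaMemcpyDeviceToHost);
    printf("d[0] = %g (expect 1e-30)\n", h);
    cudaFree(d);
    return 0;
}
```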
diff --git a/include/detail/hostdevice.h b/include/detail/hostdevice.h
index 7bec1e0..3bc318c 100644
--- a/include/detail/hostdevice.h
+++ b/include/detail/hostdevice.h
@@ -5,3 +5,20 @@
 #else
 #define HOSTDEVICE
 #endif
+
+// NOTE(dzhwinter)
+// The warp-level shuffle primitives changed with CUDA 9 / Volta.
+// Add wrappers so the same call sites stay compatible from CUDA 7 through CUDA 9.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#define DEFAULT_MASK 0xFFFFFFFFu  // full warp; assumes no divergence at call sites
+template <typename T>
+__forceinline__ __device__ T __shfl_down(T input, int delta) {
+    return __shfl_down_sync(DEFAULT_MASK, input, delta);
+}
+
+template <typename T>
+__forceinline__ __device__ T __shfl_up(T input, int delta) {
+    return __shfl_up_sync(DEFAULT_MASK, input, delta);
+}
+
+#endif
diff --git a/src/ctc_entrypoint.cpp b/src/ctc_entrypoint.cpp
index a68ef84..e1476d8 100644
--- a/src/ctc_entrypoint.cpp
+++ b/src/ctc_entrypoint.cpp
@@ -46,7 +46,6 @@ ctcStatus_t compute_ctc_loss(const float* const activations,
                              float *costs,
                              void *workspace,
                              ctcOptions options) {
-
     if (activations == nullptr ||
         flat_labels == nullptr ||
         label_lengths == nullptr ||
diff --git a/src/reduce.cu b/src/reduce.cu
index df7b3af..0abcbb3 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -41,7 +41,11 @@ struct CTAReduce {
     T shuff;
     for (int offset = warp_size / 2; offset > 0; offset /= 2) {
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+        shuff = __shfl_down_sync(0xFFFFFFFF, x, offset);
+#else
         shuff = __shfl_down(x, offset);
+#endif
         if (tid + offset < count && tid < offset)
             x = g(x, shuff);
     }
diff --git a/tests/test.h b/tests/test.h
index 72c26ae..c495000 100644
--- a/tests/test.h
+++ b/tests/test.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <algorithm>
+#include <numeric>
 #include <stdexcept>
 #include <vector>
 #include <random>
diff --git a/tests/test_cpu.cpp b/tests/test_cpu.cpp
index 45a594f..e710fbc 100644
--- a/tests/test_cpu.cpp
+++ b/tests/test_cpu.cpp
@@ -13,8 +13,8 @@ bool small_test() {
     const int alphabet_size = 5;
     const int T = 2;
 
-    std::vector<float> activations = {0.1, 0.6, 0.1, 0.1, 0.1,
-                                      0.1, 0.1, 0.6, 0.1, 0.1};
+    std::vector<float> activations = {0.1f, 0.6f, 0.1f, 0.1f, 0.1f,
+                                      0.1f, 0.1f, 0.6f, 0.1f, 0.1f};
 
     // Calculate the score analytically
     float expected_score;
@@ -78,36 +78,36 @@ bool options_test() {
     const int minibatch = 2;
 
     std::vector<float> activations =
-        {0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553,
-         0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508,
+        {0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
+         0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
 
-         0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436,
-         0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549,
+         0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
+         0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
 
-         0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688,
-         0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456,
+         0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f,
+         0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f,
 
-         0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533,
-         0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345,
+         0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
+         0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
 
-         0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107,
-         0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046};
+         0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
+         0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
 
     std::vector<float> expected_grads = // from tensorflow
-        {-0.366234, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553,
-         -0.69824, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508,
+        {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
+         -0.69824f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
 
-         0.111121, -0.411608, 0.278779, 0.0055756, 0.00569609, 0.010436,
-         0.24082, -0.602467, 0.0557226, 0.0546814, 0.0557528, 0.19549,
+         0.111121f, -0.411608f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
+         0.24082f, -0.602467f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
 
-         0.0357786, 0.633813, -0.678582, 0.00249248, 0.00272882, 0.0037688,
-         0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, -0.797544,
+         0.0357786f, 0.633813f, -0.678582f, 0.00249248f, 0.00272882f, 0.0037688f,
+         0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, -0.797544f,
 
-         0.0663296, -0.356151, 0.280111, 0.00283995, 0.0035545, 0.00331533,
-         0.280884, -0.570478, 0.0326593, 0.0339046, 0.0326856, 0.190345,
+         0.0663296f, -0.356151f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
+         0.280884f, -0.570478f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
 
-         -0.541765, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107,
-         -0.576714, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046};
+         -0.541765f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
+         -0.576714f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
 
     // Calculate the expected scores analytically
@@ -116,7 +116,7 @@ bool options_test() {
     expected_scores[0] = -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)]
                                    * a[offset(2, 0, 2)] * a[offset(3, 0, 1)]
                                    * a[offset(4, 0, 0)]);
-    expected_scores[1] = 5.42262; // from tensorflow
+    expected_scores[1] = 5.42262f; // from tensorflow
 
     // now take the log to account for the softmax
     for (auto& a : activations) {
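The `f` suffixes added throughout tests/test_cpu.cpp above and tests/test_gpu.cu below address MSVC noise: initializing a `float` from a `double` literal emits truncation warnings (C4305/C4244) on every literal, the same codes the CMake changes suppress for nvcc's host compiler. A two-line illustration (hypothetical file, not library code):

```cuda
// narrowing_demo.cu -- double literals vs. exact float literals.
#include <vector>

int main() {
    std::vector<float> noisy = {0.1, 0.6};    // MSVC: warning C4305 per literal
    std::vector<float> clean = {0.1f, 0.6f};  // float literals, no conversion
    return static_cast<int>(noisy.size() + clean.size()) - 4;  // 0
}
```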
diff --git a/tests/test_gpu.cu b/tests/test_gpu.cu
index e7e66f1..15a1037 100644
--- a/tests/test_gpu.cu
+++ b/tests/test_gpu.cu
@@ -12,8 +12,8 @@ bool small_test() {
     const int alphabet_size = 5;
     const int T = 2;
 
-    std::vector<float> activations = {0.1, 0.6, 0.1, 0.1, 0.1,
-                                      0.1, 0.1, 0.6, 0.1, 0.1};
+    std::vector<float> activations = {0.1f, 0.6f, 0.1f, 0.1f, 0.1f,
+                                      0.1f, 0.1f, 0.6f, 0.1f, 0.1f};
 
     // Calculate the score analytically
     float expected_score;
@@ -98,36 +98,36 @@ bool options_test() {
     const int minibatch = 2;
 
     std::vector<float> activations =
-        {0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553,
-         0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508,
+        {0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
+         0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
 
-         0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436,
-         0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549,
+         0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
+         0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
 
-         0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688,
-         0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456,
+         0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f,
+         0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f,
 
-         0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533,
-         0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345,
+         0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
+         0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
 
-         0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107,
-         0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046};
+         0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
+         0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
 
     std::vector<float> expected_grads = // from tensorflow
-        {-0.366234, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553,
-         -0.69824, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508,
+        {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
+         -0.69824f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
 
-         0.111121, -0.411608, 0.278779, 0.0055756, 0.00569609, 0.010436,
-         0.24082, -0.602467, 0.0557226, 0.0546814, 0.0557528, 0.19549,
+         0.111121f, -0.411608f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
+         0.24082f, -0.602467f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
 
-         0.0357786, 0.633813, -0.678582, 0.00249248, 0.00272882, 0.0037688,
-         0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, -0.797544,
+         0.0357786f, 0.633813f, -0.678582f, 0.00249248f, 0.00272882f, 0.0037688f,
+         0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, -0.797544f,
 
-         0.0663296, -0.356151, 0.280111, 0.00283995, 0.0035545, 0.00331533,
-         0.280884, -0.570478, 0.0326593, 0.0339046, 0.0326856, 0.190345,
+         0.0663296f, -0.356151f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
+         0.280884f, -0.570478f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
 
-         -0.541765, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107,
-         -0.576714, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046};
+         -0.541765f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
+         -0.576714f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
 
     // Calculate the expected scores analytically
     auto& a = activations;
@@ -135,7 +135,7 @@ bool options_test() {
     expected_score[0] = -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)]
                                   * a[offset(2, 0, 2)] * a[offset(3, 0, 1)]
                                   * a[offset(4, 0, 0)]);
-    expected_score[1] = 5.42262; // from tensorflow
+    expected_score[1] = 5.42262f; // from tensorflow
 
     // now take the log to account for the softmax
     for (auto& a : activations) {