diff --git a/dnn/src/cuda/convolution/cudnn_heuristic.cpp b/dnn/src/cuda/convolution/cudnn_heuristic.cpp deleted file mode 100644 index 04065b76847e5e8a477bf020a7bc14409aab8713..0000000000000000000000000000000000000000 --- a/dnn/src/cuda/convolution/cudnn_heuristic.cpp +++ /dev/null @@ -1,235 +0,0 @@ -/** - * \file dnn/src/cuda/convolution/cudnn_heuristic.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#include "./cudnn_heuristic.h" -#include "megdnn.h" -#include "src/cuda/utils.h" - -using namespace megdnn; -using namespace cuda; -using namespace convolution; - -bool convolution::PerformanceModelBase::args_is_proper( - const TensorLayout* x_layout, - const ConvolutionBase::CanonizedFilterMeta& filter) { - bool available = (x_layout->dtype == dtype::Float32() && - filter.format == param::Convolution::Format::NCHW && - filter.should_flip == 0 && filter.stride[0] == 1 && - filter.stride[1] == 1 && filter.spatial_ndim == 2 && - filter.dilation[0] == 1 && filter.dilation[1] == 1); - return available; -} - -bool convolution::PerformanceModelBase::predict_time_success( - const TensorLayout* x_layout, const ConvolutionBase::CanonizedFilterMeta& filter, - const ConvolutionType& conv_type, float** mask_p, float** time_pred_p, - size_t* output_dim_p) { - size_t layer_num; - const size_t* layers_dim; - size_t input_params[9]; - const float* matrices; - const float* biases; - const float* alpha; - const float* beta; - float* hidden_units; - - if (!(args_is_proper(x_layout, filter))) { - return false; - } - - if (!convolution::heuristic_params_available( - cuda::current_device_prop().major, - cuda::current_device_prop().minor, &layer_num, &layers_dim, - &matrices, &biases, &alpha, &beta, conv_type, &hidden_units, - time_pred_p, mask_p)) { - return false; - } - - input_params[0] = x_layout->shape[0]; - input_params[1] = x_layout->shape[1]; - input_params[2] = x_layout->shape[2]; - input_params[3] = x_layout->shape[3]; - input_params[4] = filter.ocpg; - input_params[5] = filter.spatial[0]; - input_params[6] = filter.spatial[1]; - input_params[7] = filter.padding[0]; - input_params[8] = filter.padding[1]; - - predict_time(layer_num, layers_dim, input_params, matrices, biases, alpha, - beta, hidden_units, *time_pred_p); - - *output_dim_p = layers_dim[layer_num - 1]; - - return true; -} - -void convolution::PerformanceModelBase::predict_time( - const size_t layer_num, const size_t* layers_dim, - const size_t* input_params, const float* matrices, const float* biases, - const float* alpha, const float* beta, float* hidden_units, - float* time_pred) { - size_t layer_ind; - size_t i, j; - const float *matrix_entry = matrices, *bias_entry = biases; - float *prev_entry, *next_entry = hidden_units; - size_t shape; - - for (j = 0; j < layers_dim[1]; ++j) { - for (i = 0; i < layers_dim[0]; ++i) { - next_entry[j] += - matrix_entry[j * layers_dim[0] + i] * input_params[i]; - } - next_entry[j] += bias_entry[j]; - next_entry[j] = element_ReLU(next_entry[j]); - } - prev_entry = next_entry; - next_entry += layers_dim[1]; - matrix_entry += layers_dim[0] * layers_dim[1]; - bias_entry += layers_dim[1]; - - for (layer_ind = 1; layer_ind < layer_num - 2; ++layer_ind) { - for (j = 0; j < layers_dim[layer_ind + 1]; ++j) { - for (i = 0; i < layers_dim[layer_ind]; ++i) { - next_entry[j] += matrix_entry[j * layers_dim[layer_ind] + i] * - prev_entry[i]; - } - next_entry[j] += bias_entry[j]; - next_entry[j] = element_ReLU(next_entry[j]); - } - prev_entry = next_entry; - next_entry += layers_dim[layer_ind + 1]; - matrix_entry += layers_dim[layer_ind] * layers_dim[layer_ind + 1]; - bias_entry += layers_dim[layer_ind + 1]; - } - - for (j = 0; j < layers_dim[layer_num - 2]; ++j) { - for (i = 0; i < layers_dim[layer_num - 1]; ++i) { - time_pred[j] += matrix_entry[j * layers_dim[i]] * input_params[i]; - } - time_pred[j] += bias_entry[j]; - } - - shape = input_params[0] * input_params[1] * input_params[4] * - (input_params[2] + input_params[7] * 2 - input_params[5] + 1) * - (input_params[3] + input_params[8] * 2 - input_params[6] + 1) * - input_params[5] * input_params[6]; - for (i = 0; i < layers_dim[layer_num - 1]; ++i) { - time_pred[i] = std::exp2f(time_pred[i] * beta[i]) * (shape / alpha[i]); - } -} - -/* backward filter */ -void convolution::PerformanceModelBackwardFilter::gen_mask_backward_filter( - float* mask, const size_t output_dim, - const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args, - const CUDNNBwdFilterDescs& D, - const size_t workspace_size_limit_in_bytes) { - size_t i; - size_t workspace_size; - for (i = 0; i < output_dim; ++i) { - mask[i] = -1.0f; - auto cudnnStat = cudnnGetConvolutionBackwardFilterWorkspaceSize( - args.handle->cudnn_handle(), D.src_desc.desc, D.diff_desc.desc, - D.conv_desc.desc, D.grad_desc.desc, - static_cast(i), - &workspace_size); - if (cudnnStat == CUDNN_STATUS_SUCCESS && - workspace_size < workspace_size_limit_in_bytes) { - mask[i] = 1.0f; - } - } -} - -bool convolution::PerformanceModelBackwardFilter:: - get_algo_backward_filter_success( - const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args, - const CUDNNBwdFilterDescs& D, - const size_t workspace_limit_in_bytes, - cudnnConvolutionBwdFilterAlgo_t* algo) { - float* mask; - size_t output_dim; - float* time_pred; - - if (!predict_time_success(args.src_layout, args.grad_filter_meta, - ConvolutionType::BACKWARD_FILTER, &(mask), - &(time_pred), &(output_dim))) { - return false; - } - - gen_mask_backward_filter(mask, output_dim, args, D, - workspace_limit_in_bytes); - - size_t i, selected = 0; - for (i = 0; i < output_dim; ++i) { - if (mask[i] > 0 && time_pred[i] < time_pred[selected]) { - selected = i; - } - } - *algo = static_cast(selected); - - return mask[selected] > 0; -} - -/* backward data */ -void convolution::PerformanceModelBackwardData::gen_mask_backward_data( - float* mask, const size_t output_dim, - const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args, - const CUDNNBwdDataDescs& D, - const size_t workspace_size_limit_in_bytes) { - size_t i; - size_t workspace_size; - for (i = 0; i < output_dim; ++i) { - mask[i] = -1.0f; - auto cudnnStat = cudnnGetConvolutionBackwardDataWorkspaceSize( - args.handle->cudnn_handle(), D.filter_desc.desc, - D.diff_desc.desc, D.conv_desc.desc, D.grad_desc.desc, - static_cast(i), &workspace_size); - if (cudnnStat == CUDNN_STATUS_SUCCESS && - workspace_size < workspace_size_limit_in_bytes) { - mask[i] = 1.0f; - } - } -} - -bool convolution::PerformanceModelBackwardData::get_algo_backward_data_success( - const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args, - const CUDNNBwdDataDescs& D, const size_t workspace_limit_in_bytes, - cudnnConvolutionBwdDataAlgo_t* algo) { - float* mask; - size_t output_dim; - float* time_pred; - - if (!predict_time_success(args.grad_layout, args.filter_meta, - ConvolutionType::BACKWARD_DATA, &mask, &time_pred, - &output_dim)) { - return false; - } - - gen_mask_backward_data(mask, output_dim, args, D, workspace_limit_in_bytes); - - size_t i, selected = 0; - for (i = 0; i < output_dim; ++i) { - if (mask[i] > 0 && time_pred[i] < time_pred[selected]) { - selected = i; - } - } - - // special case: - // if the filter shape in cudnnConvolutionBackwardData is too asymmetric, - // the performance of algo1 is dramatically reduced, - // we temporarily choose algo0. - if (args.filter_meta.spatial[0] / args.filter_meta.spatial[1] > 32 || - args.filter_meta.spatial[1] / args.filter_meta.spatial[0] > 32) { - selected = 0; - } - *algo = static_cast(selected); - - return mask[selected] > 0; -} diff --git a/dnn/src/cuda/convolution/cudnn_heuristic.h b/dnn/src/cuda/convolution/cudnn_heuristic.h deleted file mode 100644 index 54cfc742590bf3c9d8ccd38fc6bda0044404d707..0000000000000000000000000000000000000000 --- a/dnn/src/cuda/convolution/cudnn_heuristic.h +++ /dev/null @@ -1,86 +0,0 @@ -/** - * \file dnn/src/cuda/convolution/cudnn_heuristic.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#pragma once - -#include "src/cuda/convolution/backward_data/algo.h" -#include "src/cuda/convolution/backward_filter/algo.h" - -namespace megdnn { -namespace cuda { -namespace convolution { - -enum class ConvolutionType { - FORWARD = 0, - BACKWARD_FILTER = 1, - BACKWARD_DATA = 2 -}; - -bool heuristic_params_available( - int cuda_major, int cuda_minor, size_t* layer_num_p, - const size_t** layers_dim_p, const float** matrices_p, - const float** biases_p, const float** alpha_p, const float** beta_p, - const ConvolutionType& conv_type, float** hidden_units_p, - float** time_pred_p, float** mask_p); - -class PerformanceModelBase { -public: - static float element_ReLU(float element) { - return element > 0.0 ? element : 0.0; - } - static bool predict_time_success(const TensorLayout* x_layout, - const ConvolutionBase::CanonizedFilterMeta& filter, - const ConvolutionType& conv_type, - float** mask_p, float** time_pred_p, - size_t* output_dim_p); - -private: - static bool args_is_proper( - const TensorLayout* x_layout, - const ConvolutionBase::CanonizedFilterMeta& filter); - static void predict_time(const size_t layer_num, const size_t* layers_dim, - const size_t* input_params, const float* matrices, - const float* biases, const float* alpha, - const float* beta, float* hidden_units, - float* time_pred); -}; - -class PerformanceModelBackwardFilter : public PerformanceModelBase { -public: - static bool get_algo_backward_filter_success( - const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args, - const CUDNNBwdFilterDescs& D, const size_t workspace_limit_in_bytes, - cudnnConvolutionBwdFilterAlgo_t* algo); - -private: - static void gen_mask_backward_filter( - float* mask, const size_t output_dim, - const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args, - const CUDNNBwdFilterDescs& D, - const size_t workspace_limit_in_bytes); -}; - -class PerformanceModelBackwardData : public PerformanceModelBase { -public: - static bool get_algo_backward_data_success( - const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args, - const CUDNNBwdDataDescs& D, const size_t workspace_limit_in_bytes, - cudnnConvolutionBwdDataAlgo_t* algo); - -private: - static void gen_mask_backward_data( - float* mask, const size_t output_dim, - const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args, - const CUDNNBwdDataDescs& D, const size_t workspace_limit_in_bytes); -}; - -} // namespace convolution -} // namespace cuda -} // namespace megdnn diff --git a/dnn/src/cuda/convolution/get_params.cpp b/dnn/src/cuda/convolution/get_params.cpp deleted file mode 100644 index 8697223e00af0b732ae82c6451b8ddac74755efb..0000000000000000000000000000000000000000 --- a/dnn/src/cuda/convolution/get_params.cpp +++ /dev/null @@ -1,754 +0,0 @@ -/** - * \file dnn/src/cuda/convolution/get_params.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#include "src/cuda/convolution/cudnn_heuristic.h" -#include "megdnn.h" - -using namespace megdnn; -using namespace cuda; -using namespace convolution; - -bool convolution::heuristic_params_available( - int cuda_major, int cuda_minor, size_t* layer_num_p, - const size_t** layers_dim_p, const float** matrices_p, - const float** biases_p, const float** alpha_p, const float** beta_p, - const ConvolutionType& conv_type, float** hidden_units_p, - float** time_pred_p, float** mask_p) { - MEGDNN_MARK_USED_VAR(cuda_major); - MEGDNN_MARK_USED_VAR(cuda_minor); - MEGDNN_MARK_USED_VAR(layer_num_p); - MEGDNN_MARK_USED_VAR(layers_dim_p); - MEGDNN_MARK_USED_VAR(matrices_p); - MEGDNN_MARK_USED_VAR(biases_p); - MEGDNN_MARK_USED_VAR(alpha_p); - MEGDNN_MARK_USED_VAR(beta_p); - MEGDNN_MARK_USED_VAR(conv_type); - MEGDNN_MARK_USED_VAR(hidden_units_p); - MEGDNN_MARK_USED_VAR(time_pred_p); - MEGDNN_MARK_USED_VAR(mask_p); - -#if CUDNN_MAJOR == 6 && CUDNN_MINOR == 0 - - float cuda5_2_BACKWARD_FILTER_time_pred[7] = {0.0f}; - float cuda5_2_BACKWARD_FILTER_mask[7] = {0.0f}; - float cuda5_2_BACKWARD_FILTER_hidden_units[24] = {0.0f}; - const static size_t cuda5_2_BACKWARD_FILTER_layers_dim[4] = {9, 12, 12, 7}; - const static float cuda5_2_BACKWARD_FILTER_matrices[336] = { - 3.499478e-03, 1.353932e-02, -1.316529e-01, 1.006798e-01, - 1.249662e-02, -3.591197e-01, -4.299506e-01, -3.613592e-01, - -3.783917e-01, -4.249511e-01, 6.287370e-03, -2.861480e-03, - 3.128614e-03, 8.496360e-03, 5.568272e-01, 1.965293e-01, - -6.205962e-02, -1.999864e-01, 9.333656e-03, -6.377945e-02, - 6.122595e-02, 1.122032e-01, -1.683744e-02, -9.395520e-02, - -2.953549e-02, -2.772853e-02, -2.892097e-02, 3.200796e-03, - 5.553298e-03, 6.707606e-01, 3.111190e-01, -5.293804e-01, - -8.127835e-02, -5.839296e-02, 9.633666e-02, 5.957389e-02, - -7.131222e-02, 4.057650e-02, 4.311656e-02, -1.456163e-02, - 5.683148e-02, 6.175192e-02, 9.331264e-02, 9.957494e-02, - 5.202487e-02, 0.0, 0.0, -7.725500e-14, - -8.058319e-17, 0.0, 0.0, 0.0, - 0.0, 0.0, 1.988015e-04, -1.530555e-01, - 3.629641e-03, -1.238047e-03, 1.692593e-02, 3.404703e-01, - 5.441420e-01, -3.275000e-01, -3.742920e-01, -1.714999e-01, - 1.979161e-02, 5.019676e-02, 1.406423e-02, -4.360787e-02, - -5.948093e-03, 1.522342e-01, 1.012455e-02, 5.666151e-02, - -7.033888e-05, 1.519375e-02, -2.360136e-02, -5.682724e-04, - -2.552732e-02, 2.329080e-01, 3.437024e-01, 4.054402e-01, - 3.379739e-01, 1.566344e-03, 3.172801e-02, -1.336258e-02, - 1.401075e-02, 2.876163e-02, 1.293039e+00, 7.118387e-01, - 2.966451e-01, 4.372724e-01, -2.286311e-02, -6.896693e-03, - 3.156468e-02, 3.829155e-02, -9.890525e-04, 1.836302e-02, - 2.394343e-02, 4.963258e-02, 4.368515e-02, 2.950634e-03, - 1.129842e-02, 7.078686e-01, 3.193808e-01, 9.759862e-03, - 2.906150e-01, 1.806232e-01, 1.396071e-01, 2.047469e-01, - -2.561368e-01, -3.322504e-01, 7.250011e-02, -3.389789e-02, - -1.372720e-02, 0.0, -1.690562e-01, -1.013354e-01, - -1.920926e-02, 1.018956e-01, 2.467915e-02, 4.451101e-02, - -4.139300e-02, -1.031867e-02, -5.686982e-03, 2.993172e-01, - 1.746564e-02, -3.393853e-20, 1.905611e-02, -5.220098e-02, - 4.550828e-02, 8.211702e-02, -2.850403e-03, -2.816908e-01, - 6.826700e-02, -1.102444e-02, 7.373374e-03, 9.173237e-03, - -6.144243e-03, 0.0, -1.675391e-02, 2.949211e-02, - -1.925736e-02, 2.259453e-02, 6.339108e-02, -1.233638e-01, - -1.239254e-02, -9.204817e-03, -6.979109e-02, -2.015045e-02, - -1.624232e-02, 0.0, 8.557694e-02, -2.066801e-02, - 2.876340e-01, -1.265177e-01, 7.225822e-03, 7.337274e-02, - -4.342360e-02, -1.974944e-01, -6.721890e-03, -4.495411e-02, - -3.655335e-02, 0.0, -4.551398e-01, 8.440251e-02, - -2.404170e-01, 1.250752e-01, 1.646416e-03, 9.063166e-02, - 2.506036e-02, 8.455078e-03, -1.908465e-02, 6.791655e-03, - 2.511951e-02, 0.0, 7.265597e-03, -1.285137e-03, - -3.404747e-04, 8.924944e-03, 4.234224e-03, -1.186513e-02, - 2.454471e-02, 9.120111e-04, 2.120904e-02, -5.555666e-03, - -1.493565e-02, 0.0, 2.764972e-03, -6.132948e-04, - 6.180623e-03, 3.238724e-03, -1.073131e-02, -1.342798e-04, - 8.969568e-02, 1.010931e-01, -1.038349e-02, -9.198243e-02, - 4.724314e-02, 0.0, 1.175188e-02, -6.051729e-02, - -2.525244e-03, -1.566657e-01, -1.447370e-02, 1.747005e-01, - 1.078679e-01, 2.556116e-01, 3.880575e-02, 9.777729e-03, - 1.078563e-01, 0.0, 4.525005e-01, 8.311278e-03, - 8.198996e-02, -2.884443e-01, -1.808732e-02, -3.114621e-02, - 1.732809e-02, 2.442103e-01, 3.329617e-02, 8.462872e-03, - 6.775563e-02, -7.453864e-19, 1.846050e-01, 2.739331e-02, - 1.029433e-01, -2.251960e-01, 3.331415e-02, -2.261097e-02, - 3.815529e-02, -5.755350e-02, -8.908589e-03, -4.526101e-02, - 1.555560e-02, 0.0, 2.347023e-02, -1.399980e-01, - -2.699343e-02, 2.168779e-02, 2.629133e-03, 3.232189e-02, - 3.693172e-02, -9.767429e-02, 2.461806e-02, 1.045579e-01, - 5.808600e-02, 0.0, -1.331031e-02, 3.555656e-03, - -9.530113e-02, -1.961061e-02, -1.579800e-02, -7.582582e-02, - -3.099381e-02, 9.698432e-01, 7.805698e-01, 1.542833e-01, - -1.025053e-01, -7.509316e-04, -1.675225e-02, -7.818724e-03, - -2.718012e-01, 8.506276e-01, 3.869322e-02, 2.732933e-02, - -4.932691e-02, 7.077541e-01, 4.385699e-01, 8.550947e-02, - -1.737943e-01, -1.007005e-02, 1.884576e-02, 6.328513e-02, - -2.711761e-01, 1.054725e+00, -1.001195e-02, 6.876359e-02, - -4.647969e-01, 7.618478e-01, 1.170148e+00, 5.507177e-02, - -1.284761e-01, 2.255174e-02, 5.041638e-03, 2.431494e-01, - -2.259419e-01, 6.318219e-01, 4.526694e-02, -1.068190e-01, - 9.181661e-05, 7.900977e-01, 5.499427e-01, 2.147153e-02, - -1.855706e-01, -6.816355e-03, 2.600182e-02, 7.784649e-02, - -2.902775e-01, 9.821153e-01, -1.705817e-02, 9.162355e-02, - 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, - 4.378970e-02, 7.106545e-01, 5.512720e-01, 1.076883e-01, - -3.036375e-01, 4.190212e-02, -1.192542e-02, 1.002918e-01, - -2.498885e-01, 6.789825e-01, -1.278644e-01, 8.962566e-02, - -4.231460e-02, 2.334089e-01, 3.083326e-03, 2.404322e-02, - -2.668908e-01, 3.057625e-03, -1.283901e-03, 1.349618e-02, - -4.993697e-02, 6.061308e-01, -9.689163e-02, 1.609056e-01}; - const static float cuda5_2_BACKWARD_FILTER_biases[31] = { - 3.927711e-02, 4.658543e-01, 3.737917e-02, -4.173907e-02, - 6.516264e-04, 0.0, 1.141180e+00, 5.656777e-03, - -1.466774e-01, -3.637813e-01, 3.348432e-02, -2.374912e-01, - 1.856181e-01, 1.458818e+00, 1.436140e+00, 1.708800e-01, - 3.663654e-02, 2.147604e-02, 5.249544e-02, 9.389526e-02, - -7.182905e-02, 2.513293e+00, -6.255527e-02, -1.452608e-01, - -7.379941e-01, -5.884537e-01, -6.324590e-01, -6.180407e-01, - 0.0, -1.712828e-01, -2.353933e-01}; - const static float cuda5_2_BACKWARD_FILTER_alpha[7] = { - 2.189385e+08, 1.987406e+08, 6.368552e+07, 2.164986e+08, - 2.000000e+08, 3.611623e+08, 8.509315e+06}; - const static float cuda5_2_BACKWARD_FILTER_beta[7] = { - 1.558573e+00, 1.825239e+00, 1.782366e+00, 1.772095e+00, - 2.000000e+00, 1.856787e+00, 1.625270e+00}; - - float cuda5_2_BACKWARD_DATA_time_pred[6] = {0.0f}; - float cuda5_2_BACKWARD_DATA_mask[6] = {0.0f}; - float cuda5_2_BACKWARD_DATA_hidden_units[24] = {0.0f}; - const static size_t cuda5_2_BACKWARD_DATA_layers_dim[4] = {9, 12, 12, 6}; - const static float cuda5_2_BACKWARD_DATA_matrices[324] = { - 1.090385e-03, -9.525486e-02, 2.116694e-02, 8.324536e-03, - 2.443915e-03, -1.486993e-03, 1.996945e-01, -3.490458e-02, - -2.909729e-01, -4.403929e-01, 3.302580e-03, 8.758115e-03, - 2.016278e-03, 5.139519e-03, 6.631530e-01, 4.163170e-01, - -2.275565e-01, -1.927734e-01, 4.901680e-02, 3.499708e-02, - 4.430823e-02, -6.245822e-01, 2.489910e-02, 2.943479e-01, - 3.011928e-01, -6.154800e-02, -6.945755e-02, 2.156114e-02, - -2.706529e-02, 2.254039e-02, -2.130969e-01, -1.711698e-03, - 3.185264e-01, 3.669779e-01, 2.366176e-01, 2.016553e-01, - 1.742197e-04, 8.993217e-04, -3.757331e-01, -1.517802e-01, - 1.150989e-03, 4.397022e-01, 2.472478e-01, -5.120142e-01, - -5.310764e-01, -2.185705e-02, -1.019608e-02, -1.484592e-01, - -1.720972e-01, 3.073631e-02, 1.679189e-02, 9.030435e-03, - -4.171251e-03, -7.412981e-03, 3.670006e-02, 2.704583e-02, - 1.162922e-01, 8.629673e-02, -1.661878e-01, -1.722751e-01, - -2.494859e-01, 6.303188e-02, 2.379866e-03, -9.154570e-02, - -8.703206e-02, 3.478937e-02, 2.733189e-02, -6.598901e-02, - -2.212522e-02, -3.853705e-02, 2.827537e-02, 2.944724e-02, - 1.588451e-02, 2.663488e-02, 1.933236e-02, 3.978135e-02, - 1.509624e-02, 1.144023e+00, 7.680039e-01, 4.072323e-01, - 3.243737e-01, 4.177893e-02, 4.054888e-02, 1.758260e-01, - 1.351026e-01, 2.773634e-02, 8.728213e-02, 1.938261e-01, - -1.641249e-02, 7.889663e-02, 4.266707e-04, 6.022587e-04, - 6.884130e-04, 2.244700e-04, -3.188357e-01, 1.903596e-01, - 3.979538e-01, -2.875198e-01, -5.881550e-01, -1.732513e-02, - 2.107770e-02, -2.415357e-02, 5.184836e-02, 2.633666e-03, - -4.351313e-01, -3.523280e-01, -1.124865e-01, -5.509025e-02, - -2.874137e-03, -2.260433e-03, 5.087418e-03, 2.825674e-03, - 4.565214e-03, 1.520132e-03, -1.722531e-03, -1.287867e-04, - 1.223576e-03, -5.230475e-04, -2.300250e-03, -6.684309e-03, - -7.956048e-03, -3.028432e-03, 2.238011e-02, -1.166453e-02, - 6.994838e-02, 5.585106e-03, -9.814836e-03, -4.010206e-03, - -3.232308e-03, -1.020571e-02, -1.587651e-02, 6.942352e-02, - 6.370817e-01, 5.906755e-02, -3.062441e-03, 9.914325e-02, - 2.335527e-01, -4.718621e-03, -2.132248e-02, 3.841487e-02, - 7.563891e-02, -7.599686e-02, 1.408871e-01, 5.740594e-02, - 1.902002e-01, 2.145507e-01, 3.427162e-02, 3.367433e-02, - 2.967569e-01, 2.863470e-02, 3.392174e-02, 3.514072e-02, - -1.441963e-01, -4.797359e-02, -5.965770e-03, 1.214167e-01, - 0.0, 0.0, -4.498340e-06, -1.828862e-07, - 0.0, 1.093948e-12, -2.601859e-06, 0.0, - -9.811162e-09, -2.785148e-06, 0.0, -2.360134e-27, - -1.110723e-01, -1.570218e-01, -4.062234e-02, -7.606770e-02, - 5.144730e-01, 9.398572e-02, 1.906881e-01, 1.747961e-02, - 1.106279e-01, -1.254419e-01, 6.205062e-01, -5.617496e-02, - -1.629532e-01, -1.042091e-01, -1.413646e-01, 1.433934e-01, - 1.425548e-01, 2.505819e-02, 5.484238e-04, -9.254320e-02, - 1.448994e-01, 3.132954e-02, -1.425708e-01, -1.685494e-02, - -3.513211e-01, -1.992232e-01, -1.081804e-01, 4.960524e-02, - -5.546688e-01, -1.675645e-02, -3.610602e-02, 2.780567e-02, - 2.227647e-01, 4.038066e-02, -6.002745e-01, -1.275032e-01, - -1.026016e-01, -2.635376e-01, 2.059869e-02, -8.100250e-02, - 8.695480e-02, -4.293829e-02, -1.870224e-02, 7.269356e-02, - 3.979762e-02, 3.270284e-02, 1.190808e-01, -1.059370e-01, - 1.286611e-02, 3.927987e-02, 7.228687e-03, 2.264480e-02, - -1.119717e-01, 8.701903e-02, 2.064170e-02, 5.297894e-02, - 9.965703e-03, 1.206108e-02, -5.411500e-02, -5.476563e-02, - -1.837980e-01, -7.351980e-01, -1.781217e-01, 1.473823e-01, - -4.530039e-01, -3.604104e-02, 2.418269e-02, 2.903621e-02, - 4.367216e-01, -5.112789e-02, -3.706729e-01, -2.049569e-01, - -9.153855e-02, -1.008104e-01, -1.009935e-02, -1.033947e-01, - 5.495172e-02, 1.323372e-02, -5.191914e-02, -1.545710e-02, - 3.271207e-02, 1.939050e-02, -3.092350e-02, 7.518642e-02, - -5.528467e-03, 8.568556e-02, 1.924936e-02, 1.007434e+00, - -6.850208e-07, 5.599304e-01, 3.076834e-01, -4.312680e-01, - 7.534813e-02, -3.293671e-02, 5.830373e-03, -2.450454e-02, - -3.698347e-04, -8.712796e-03, 4.009782e-01, 1.215293e+00, - -5.273760e-07, 2.344936e-01, 1.927198e-01, -3.006327e-01, - -2.927265e-02, -8.696410e-03, -2.446414e-02, 1.890189e-02, - 3.553152e-03, -1.651816e-02, 2.438239e-01, 6.245783e-01, - 1.809883e-07, 3.264363e-01, 7.772639e-01, -2.954962e-01, - 2.704587e-02, -3.836469e-02, -4.457633e-01, 1.726713e-02, - 5.172309e-03, 1.289187e-02, 5.472647e-01, 6.243305e-01, - -4.123602e-08, 4.334479e-01, 7.573158e-02, -2.572208e-01, - 5.492910e-02, -9.502222e-03, -2.104075e-01, -3.131663e-02, - 2.312713e-03, 3.963990e-02, 4.713630e-01, 8.256559e-01, - -2.583514e-08, 4.528451e-01, 7.318445e-02, -2.987004e-01, - 8.577114e-02, -2.907754e-02, -5.389895e-02, 8.495960e-02, - -1.558219e-04, 3.880079e-02, 4.180317e-01, 5.884213e-01, - 3.963620e-07, 4.769594e-01, 3.800152e-01, -3.191836e-01, - -1.669163e-01, 8.362461e-04, -1.668053e-01, -9.146041e-02}; - const static float cuda5_2_BACKWARD_DATA_biases[30] = { - 1.238052e+00, 7.745910e-01, 3.356679e-01, -7.175566e-02, - 1.497247e+00, 3.300638e-03, 2.789130e-01, -8.312362e-02, - -7.829870e-02, -3.456568e-01, 1.328189e+00, -2.689771e-01, - 9.444705e-03, -1.149580e-01, 4.422197e-01, 2.072980e+00, - 0.0, 4.782698e-01, -1.116326e+00, 7.193607e-01, - 2.938375e-02, 1.465170e-02, 8.513468e-02, 6.830001e-02, - 4.035618e-01, 1.607704e-01, 9.502214e-01, 6.022118e-01, - 2.584324e-01, 7.981322e-01}; - const static float cuda5_2_BACKWARD_DATA_alpha[6] = { - 1.997689e+08, 3.799992e+08, 6.843723e+07, 1.140762e+08, - 5.562133e+08, 3.324116e+08}; - const static float cuda5_2_BACKWARD_DATA_beta[6] = { - 1.537834e+00, 1.587649e+00, 1.844705e+00, 1.671656e+00, - 1.672516e+00, 1.705950e+00}; - - float cuda5_2_FORWARD_time_pred[8] = {0.0f}; - float cuda5_2_FORWARD_mask[8] = {0.0f}; - float cuda5_2_FORWARD_hidden_units[24] = {0.0f}; - const static size_t cuda5_2_FORWARD_layers_dim[4] = {9, 12, 12, 8}; - const static float cuda5_2_FORWARD_matrices[348] = { - -9.209032e-02, -1.659105e-01, -5.965192e-02, -2.153863e-02, - 8.719379e-02, -3.499233e-02, 7.201853e-03, -1.419160e-02, - -1.818457e-04, -3.145495e-01, 1.526620e-03, -3.928741e-03, - -2.569693e-03, 3.410484e-03, 2.167806e-01, 1.747067e-01, - -2.598841e-01, -3.055519e-01, 5.274500e-04, -9.025287e-03, - -2.483256e-02, 4.541647e-02, 7.308841e-02, -4.819591e-01, - -4.753071e-01, -1.471946e-02, 5.257137e-03, 2.392092e-03, - -1.222254e-02, 1.609546e-02, -3.770980e-03, 1.646060e-02, - 1.753314e-02, 1.508273e-02, 9.316003e-03, -5.777596e-04, - -2.694935e-05, 1.604315e-03, -1.762570e-02, -4.887820e-01, - 4.957791e-03, 2.363977e-01, 3.638881e-01, -4.731908e-01, - -5.269557e-01, -1.159047e-03, 1.838379e-02, -1.427773e-01, - -1.495254e-01, 1.330812e-01, 3.283872e-01, 3.582126e-01, - -1.175109e-01, -1.454948e-01, 2.369200e-02, 1.493328e-02, - 3.108240e-02, 3.270133e-02, -6.615507e-01, 3.380858e-01, - 3.704230e-01, 8.769190e-02, -6.377754e-02, 4.325379e-02, - -2.027540e-03, -1.402376e-01, -9.008316e-02, -2.559709e-03, - -8.711295e-02, -9.627704e-02, -1.539383e-01, -1.632525e-01, - 3.015039e-02, 3.144164e-02, 6.656437e-02, 5.488716e-02, - 1.877632e-01, 5.748791e-01, 3.917130e-01, 2.071713e-01, - 2.771358e-01, -5.960735e-02, 1.106716e-02, 5.781374e-02, - 6.840285e-03, 2.902341e-02, -3.347534e-01, -1.212164e-01, - -8.089989e-02, -1.384973e-01, 1.251527e-02, -2.644526e-01, - 6.949010e-02, 2.681785e-02, 1.081700e-01, -3.502952e-02, - 3.512865e-01, -9.033766e-02, 2.017496e-02, 2.095562e-02, - 1.330583e-02, 2.582395e-02, -2.550245e-03, -1.596605e-03, - -4.966798e-01, -5.384876e-01, -3.006902e-01, -2.735094e-01, - 2.044184e-02, 3.490414e-01, 1.717040e-02, 6.914880e-03, - 1.496788e-02, -7.078647e-02, 6.652176e-02, 6.768194e-03, - -3.086404e-02, 1.317981e-01, -5.902661e-02, -8.681632e-02, - -6.622906e-02, 1.597742e-01, 3.700355e-03, 1.707309e-02, - -5.229016e-02, 2.836531e-02, 9.072421e-03, -1.104825e-01, - 1.009224e-02, -1.915519e-02, -2.592222e-02, -9.112109e-02, - -2.824950e-02, 5.274639e-01, 1.052709e-01, 1.325189e-02, - 3.486569e-01, 1.155336e-01, 7.854062e-02, 1.637263e-02, - -1.599528e-01, 1.090762e-01, 2.625560e-02, 8.724683e-02, - 3.858089e-02, -5.696925e-01, -2.280933e-01, -3.096054e-02, - -5.547203e-01, -6.229282e-02, -1.009606e-01, 5.365341e-02, - 1.673071e-01, -1.734997e-01, -2.949879e-02, -2.640804e-01, - 4.783161e-02, -4.411741e-01, -1.495569e-01, -1.043236e-02, - -2.952088e-01, -2.866718e-02, 4.253592e-02, 3.828135e-02, - 7.448777e-02, -2.757399e-02, -6.067163e-02, -2.007495e-01, - -3.468005e-02, -1.678551e-01, -2.086982e-02, -2.114448e-02, - -2.844830e-02, 3.823385e-03, 8.453450e-03, 1.447659e-03, - 5.760803e-02, 7.803936e-02, -7.363023e-02, -1.894736e-03, - 6.325649e-02, 1.527100e-02, -4.378622e-02, 3.171223e-03, - 8.858634e-01, 7.191087e-02, 2.045580e-01, -3.890414e-03, - -7.661989e-02, 2.667563e-02, -2.549908e-02, -9.384236e-02, - -4.146666e-02, 2.281848e-01, 7.052436e-02, 1.180828e-03, - 1.976338e-01, 1.647339e-02, -2.741527e-02, 1.641885e-02, - -1.197201e-01, -3.670282e-02, 1.672286e-01, 5.267144e-02, - 8.803396e-02, 4.463083e-01, -8.939818e-03, 4.523633e-03, - -1.554685e-01, -1.392173e-02, 4.290194e-03, -9.498623e-03, - -2.200229e-02, -1.022839e-01, 1.553784e-02, 4.006403e-02, - -8.901481e-02, 1.353742e-01, -6.176645e-02, 2.818892e-03, - 4.842044e-02, 1.031219e-02, 4.689164e-03, 2.677023e-01, - -1.331718e-02, 2.130043e-01, 7.004514e-03, -5.422973e-01, - 7.450043e-03, 4.017003e-01, -9.216257e-03, -2.551504e-02, - -2.416791e-01, -1.451814e-01, -1.796521e-01, -1.749250e-02, - 9.023457e-02, 9.444007e-02, -5.293583e-03, -1.027239e-01, - 1.017421e-02, 1.213706e-01, -3.460192e-02, 8.999067e-03, - -1.110771e-01, 2.168397e-01, -4.417743e-02, 8.891370e-02, - -1.271863e-01, -7.239018e-02, -1.346174e-02, 5.777563e-02, - 7.088694e-02, 6.467304e-02, 7.867605e-01, -2.014701e-01, - 1.461604e-01, -6.571004e-02, 6.528026e-01, 6.720600e-01, - 4.151264e-01, -6.271046e-03, -1.568682e-02, 2.438027e-01, - 6.112317e-02, 8.257028e-02, 8.817917e-01, -1.995129e-01, - 9.260281e-02, -6.511735e-02, 6.132895e-01, 5.789503e-01, - 3.354024e-01, 1.621681e-04, -1.380093e-02, 2.710598e-01, - 1.104726e-01, 5.625401e-02, 7.417016e-01, -2.523506e-01, - 1.436054e-01, -7.903862e-02, 5.858161e-01, 4.287509e-01, - 5.370684e-01, -9.449220e-02, -9.393471e-03, 3.037375e-01, - 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, - 4.773019e-01, -2.101818e-02, 2.934896e-01, -4.207794e-01, - -2.892000e-01, -1.383682e-02, 3.842597e-01, 5.408122e-01, - -1.901669e-01, -5.255229e-02, 3.103573e-01, 7.447528e-01, - 1.010295e-01, 5.580491e-03, 4.166604e-01, -2.997382e-01, - -3.115629e-01, -2.585651e-02, 5.481771e-01, 6.307158e-01, - 4.869811e-01, 6.668988e-01, -8.661555e-02, 6.073793e-01, - 6.002924e-03, 1.855917e-02, 5.628079e-01, -1.967446e-01, - -1.365761e-01, -3.095432e-02, 6.461580e-01, 7.712716e-01, - 4.082011e-01, 8.834770e-02, -1.021050e-01, 4.353123e-01, - 2.292985e-01, -6.493770e-02, 2.730630e-01, -3.267927e-01, - -3.408634e-01, -6.609171e-02, 5.608538e-01, 7.108021e-01, - 3.760323e-01, 3.335001e-01, 8.168215e-02, 2.858790e-01}; - const static float cuda5_2_FORWARD_biases[32] = { - -1.021053e-02, 1.398318e+00, -2.447664e-01, 2.701163e-02, - 1.148165e+00, 6.030037e-01, 2.089586e-01, 5.609234e-02, - -4.842668e-01, 1.262153e-01, 2.643087e-01, 3.539835e-01, - 4.792117e-01, 4.310244e-02, 1.993983e+00, 2.597207e-01, - -2.811204e-01, 7.933383e-02, 1.056050e+00, 1.234862e+00, - 7.894841e-01, 2.019784e-01, -1.216166e-01, 8.840314e-01, - -3.542692e-01, -3.693904e-01, -2.181383e-01, 0.0, - -2.216420e-01, -1.602890e-01, 8.500483e-03, 2.072607e-01}; - const static float cuda5_2_FORWARD_alpha[8] = { - 2.549612e+08, 3.579459e+08, 1.927015e+08, 2.000000e+08, - 3.222185e+07, 8.748824e+07, 6.676129e+08, 2.775480e+08}; - const static float cuda5_2_FORWARD_beta[8] = { - 1.463412e+00, 1.553222e+00, 1.515109e+00, 2.000000e+00, - 2.117807e+00, 1.622262e+00, 1.626601e+00, 1.669380e+00}; - - if (conv_type == ConvolutionType::BACKWARD_FILTER && cuda_major == 5 && - cuda_minor == 2) { - *layer_num_p = 4; - *hidden_units_p = cuda5_2_BACKWARD_FILTER_hidden_units; - *layers_dim_p = cuda5_2_BACKWARD_FILTER_layers_dim; - *matrices_p = cuda5_2_BACKWARD_FILTER_matrices; - *biases_p = cuda5_2_BACKWARD_FILTER_biases; - *alpha_p = cuda5_2_BACKWARD_FILTER_alpha; - *beta_p = cuda5_2_BACKWARD_FILTER_beta; - *time_pred_p = cuda5_2_BACKWARD_FILTER_time_pred; - *mask_p = cuda5_2_BACKWARD_FILTER_mask; - } else if (conv_type == ConvolutionType::BACKWARD_DATA && cuda_major == 5 && - cuda_minor == 2) { - *layer_num_p = 4; - *hidden_units_p = cuda5_2_BACKWARD_DATA_hidden_units; - *layers_dim_p = cuda5_2_BACKWARD_DATA_layers_dim; - *matrices_p = cuda5_2_BACKWARD_DATA_matrices; - *biases_p = cuda5_2_BACKWARD_DATA_biases; - *alpha_p = cuda5_2_BACKWARD_DATA_alpha; - *beta_p = cuda5_2_BACKWARD_DATA_beta; - *time_pred_p = cuda5_2_BACKWARD_DATA_time_pred; - *mask_p = cuda5_2_BACKWARD_DATA_mask; - } else if (conv_type == ConvolutionType::FORWARD && cuda_major == 5 && - cuda_minor == 2) { - *layer_num_p = 4; - *hidden_units_p = cuda5_2_FORWARD_hidden_units; - *layers_dim_p = cuda5_2_FORWARD_layers_dim; - *matrices_p = cuda5_2_FORWARD_matrices; - *biases_p = cuda5_2_FORWARD_biases; - *alpha_p = cuda5_2_FORWARD_alpha; - *beta_p = cuda5_2_FORWARD_beta; - *time_pred_p = cuda5_2_FORWARD_time_pred; - *mask_p = cuda5_2_FORWARD_mask; - } else { - return false; - } - return true; -#endif -#if CUDNN_MAJOR == 5 && CUDNN_MINOR == 1 - - float cuda5_2_FORWARD_time_pred[9] = {0.0f}; - float cuda5_2_FORWARD_mask[9] = {0.0f}; - float cuda5_2_FORWARD_hidden_units[24] = {0.0f}; - const static size_t cuda5_2_FORWARD_layers_dim[4] = {9, 12, 12, 9}; - const static float cuda5_2_FORWARD_matrices[360] = { - 3.087359e-03, -2.629997e-01, 9.492566e-02, 4.831330e-02, - 4.493726e-02, -3.714851e-04, 8.981445e-02, -4.888808e-02, - -7.350665e-02, -7.113249e-01, 2.111573e-02, 6.259846e-02, - 2.931650e-02, 1.313162e-01, 1.926165e-02, 3.785147e-01, - 1.765169e-01, 6.096475e-02, 4.104461e-03, 8.656193e-03, - 1.102456e-02, 7.944959e-03, 4.644261e-02, -5.927094e-01, - -6.180425e-01, -4.314502e-01, -4.073743e-01, 3.077646e-02, - -1.029431e-01, 5.112506e-02, -8.541957e-02, 2.589677e-02, - -5.164597e-02, 1.186986e-01, -4.672555e-02, -6.755380e-02, - -2.806628e-04, 1.056535e-02, -1.438679e-01, -1.122842e-01, - 5.779694e-02, 1.705828e-01, 3.862250e-01, -1.106681e-01, - -5.471609e-02, -2.316525e-02, -4.610147e-02, 2.021985e-03, - -5.761939e-03, 1.209045e-01, -7.279532e-02, 9.754839e-02, - -6.032932e-02, -1.589997e-02, 1.985070e-03, 2.788936e-03, - -2.104690e-01, -2.731634e-01, 1.189841e-02, 2.144678e-01, - 1.771111e-01, -3.730702e-01, -3.886393e-01, -4.719765e-06, - -2.289832e-22, 0.0, 0.0, -7.619362e-33, - 0.0, 0.0, 0.0, 0.0, - 1.652513e-02, 2.785243e-02, 6.713332e-02, 3.292293e-02, - -7.087571e-01, 2.954406e-01, 2.942279e-01, 2.148153e-01, - 9.042904e-02, 3.337476e-02, 5.262762e-02, 1.355991e-01, - 6.802084e-02, 3.188081e-01, 1.053071e+00, 5.648708e-01, - 3.254285e-01, 3.829584e-01, -3.902937e-02, 8.569189e-04, - -6.860779e-03, -1.342737e-02, 9.002463e-04, 2.672171e-01, - 1.833601e-02, -4.791870e-02, -4.673452e-01, -5.951233e-04, - 1.327156e-02, 4.884608e-04, -6.395956e-04, -1.247312e-02, - 2.616015e-03, 2.045540e-02, 1.826517e-02, 2.752957e-02, - 4.864566e-03, 1.974226e-01, 8.022508e-02, 8.533795e-02, - 7.867660e-02, 1.206522e-02, 1.408663e-01, 8.814420e-29, - 2.803104e-02, -1.190598e-01, 4.397753e-01, 2.351956e-03, - 2.934275e-02, 1.909389e-02, -1.119068e-01, -5.117084e-02, - 6.178805e-03, -1.955722e-03, -4.881141e-02, 0.0, - -5.396824e-02, 1.768444e-02, -1.764243e-01, -1.029730e-02, - 3.943393e-02, -1.397969e-02, 9.628724e-02, -4.312754e-02, - -1.602866e-01, -1.405657e-02, 1.331697e-01, 0.0, - -2.396953e-02, 1.866630e-02, 3.267511e-02, -6.928004e-03, - 7.034376e-02, -6.569391e-02, -1.199368e-01, 2.414189e-02, - 3.878685e-02, 1.612695e-02, -9.410737e-02, 2.452490e-33, - -3.085373e-02, 1.452446e-02, 5.175281e-02, -2.379139e-02, - -5.039049e-02, 1.873454e-02, 9.242059e-02, -1.805802e-02, - -4.347714e-02, -3.853900e-02, 1.008241e-01, 0.0, - -9.480388e-03, 2.023331e-02, -6.792901e-03, -8.394149e-03, - -7.546303e-02, 6.270129e-03, -3.894017e-01, -4.973264e-02, - -1.555514e-01, -1.105092e-02, -1.950841e-01, -1.148950e-25, - -2.661943e-02, 9.485362e-02, -4.270326e-01, 7.918665e-03, - -1.816450e-01, -4.379404e-02, -3.889270e-02, -1.432468e-02, - 1.501353e-02, -3.272457e-02, -1.477906e-01, 0.0, - -1.104928e-01, 3.061369e-02, -1.783103e-01, -4.144012e-03, - -1.341517e-02, -8.905338e-02, -2.880624e-01, -4.843873e-02, - -8.718476e-02, -4.244976e-02, -5.811334e-02, 8.169911e-07, - 3.018601e-01, -6.359625e-02, -6.384093e-02, -2.376516e-03, - 1.381678e-01, 5.480919e-03, -1.754923e-02, 1.902135e-02, - 1.838670e-01, 1.829514e-02, 9.986089e-01, 0.0, - -3.820317e-02, -8.010733e-02, 2.023727e-01, -8.899641e-03, - -6.265503e-02, 2.848809e-01, -6.972601e-02, 9.673467e-02, - -6.779978e-02, -1.749464e-02, -1.618047e-01, 0.0, - 5.618134e-03, -7.931516e-02, -7.710180e-01, -5.023658e-03, - 2.721053e-02, 2.372581e-03, 1.131147e-01, 3.923619e-02, - 1.188756e-01, 6.569220e-02, 3.954504e-02, 4.407177e-06, - 3.772899e-02, -7.408679e-02, 2.722764e-01, 9.289873e-03, - -1.720112e-08, -1.111527e-10, -3.223340e-33, 0.0, - 0.0, 0.0, 0.0, 0.0, - -3.947499e-10, -1.125618e-07, 0.0, 0.0, - 7.252669e-01, -2.573835e-02, -3.086479e-03, 1.373577e-02, - -2.595616e-02, -1.071919e-01, -1.039699e-01, 4.686809e-01, - 6.939601e-01, 5.092673e-02, 8.983605e-01, 7.748492e-12, - 7.637465e-01, -5.160391e-02, 4.367014e-03, 5.456513e-03, - -1.755392e-02, -1.141231e-01, -9.624086e-02, 4.324957e-01, - 7.202701e-01, 5.805269e-02, 8.917692e-01, 5.552060e-13, - 6.970178e-01, -1.570065e-01, 3.382218e-02, -2.513156e-02, - -1.520863e-02, -1.164639e-01, -1.687423e-01, 4.522114e-01, - 5.808989e-01, 5.248518e-02, 8.544105e-01, 9.402750e-15, - 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, - 4.326442e-01, -5.917080e-02, 2.801385e-01, -2.795843e-02, - 1.264143e-02, -3.693263e-01, -1.749216e-01, 2.439530e-01, - 5.274415e-01, 6.522411e-01, 2.642505e-01, -1.186306e-22, - 4.592337e-01, -3.818674e-02, 1.983223e-02, -3.099717e-02, - 3.941813e-02, -5.257453e-01, -3.692166e-02, 2.670639e-01, - 6.403314e-01, 5.740828e-01, 2.307071e-01, -6.111520e-19, - 5.923415e-01, -1.620244e-01, -6.315269e-03, 1.360147e-01, - 3.776298e-02, -2.748910e-01, -9.679949e-02, 3.612375e-01, - 6.582589e-01, 1.544350e-01, 8.423274e-01, 0.0, - 4.770435e-01, -3.441220e-02, 7.110235e-02, 1.750984e-01, - -1.088923e-01, -3.269669e-01, -3.097497e-01, 3.498318e-01, - 6.162855e-01, 5.070065e-01, 4.478149e-01, 0.0, - 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0}; - const static float cuda5_2_FORWARD_biases[33] = { - 2.354680e-01, 4.575782e-01, 6.988282e-01, 2.040031e-01, - 8.584012e-01, 8.249553e-02, 1.267146e+00, 0.0, - 3.549752e-01, -4.857582e-01, 1.279055e+00, 6.212520e-03, - 1.735605e+00, 1.737882e-01, 9.513135e-02, 1.042232e-01, - 2.587379e-02, 1.125817e-01, 4.899196e-01, 8.571400e-01, - 1.188120e+00, 1.079335e+00, 1.945481e+00, 0.0, - -4.535237e-01, -4.646283e-01, -2.796752e-01, 0.0, - -1.881813e-01, 6.431429e-02, 1.600823e-01, 3.773381e-01, - 0.0}; - const static float cuda5_2_FORWARD_alpha[9] = { - 2.371974e+08, 3.625653e+08, 1.961586e+08, 2.000000e+08, - 2.259449e+07, 1.865459e+07, 6.657476e+08, 2.487226e+08, - 2.000000e+08}; - const static float cuda5_2_FORWARD_beta[9] = { - 1.575003e+00, 1.656241e+00, 1.577959e+00, 2.000000e+00, - 2.396584e+00, 2.221534e+00, 1.692119e+00, 1.879424e+00, - 2.000000e+00}; - - float cuda5_2_BACKWARD_DATA_time_pred[6] = {0.0f}; - float cuda5_2_BACKWARD_DATA_mask[6] = {0.0f}; - float cuda5_2_BACKWARD_DATA_hidden_units[24] = {0.0f}; - const static size_t cuda5_2_BACKWARD_DATA_layers_dim[4] = {9, 12, 12, 6}; - const static float cuda5_2_BACKWARD_DATA_matrices[324] = { - 8.340252e-04, -7.066309e-02, 6.012942e-03, -8.961015e-04, - 5.308781e-02, 8.890389e-03, -1.695608e-02, -2.008141e-01, - -2.327795e-01, 1.816323e-03, 1.741969e-03, -4.547063e-01, - -3.278293e-01, 3.194534e-03, 5.590135e-01, 5.038606e-01, - -6.899682e-01, -6.846661e-01, 1.296691e-02, 9.286657e-03, - 6.076815e-02, 9.537656e-03, -1.845960e-01, 2.334390e-01, - 6.584574e-02, -1.502425e-01, -1.464556e-01, 2.582188e-02, - -2.801069e-01, 2.606461e-01, 5.094615e-02, 9.973006e-03, - -2.273075e-01, 1.013311e-01, -2.977537e-01, -3.584019e-01, - 1.550467e-02, -2.365348e-02, -2.361028e-01, -4.535604e-01, - -1.099842e-01, 3.337491e-02, 3.386805e-02, 5.759778e-02, - 5.773445e-02, -6.057084e-03, -5.215100e-03, -2.488342e-02, - 4.550520e-01, -6.358563e-03, -4.111410e-01, -2.748287e-01, - 6.576765e-01, 6.735925e-01, 1.382121e-02, 1.599379e-02, - 2.175570e-01, 4.235858e-01, -4.743209e-03, 8.406488e-01, - 5.463328e-01, 5.315352e-01, 5.759005e-01, -3.956826e-01, - 1.770215e-03, 4.242290e-03, 5.961310e-03, 2.629623e-03, - 3.968062e-01, 2.857247e-01, -3.694852e-01, -4.826791e-01, - -1.361759e-01, 1.741970e-02, 2.067235e-01, -3.166322e-02, - 1.676094e-02, 1.222352e-01, 3.594849e-01, 5.646787e-02, - 9.237770e-02, 2.705673e-02, 3.022863e-02, 2.661669e-01, - 1.342065e-01, 9.685011e-02, -4.619106e-01, -4.885407e-01, - -1.207667e-01, -3.344076e-02, 1.247313e-03, 9.397045e-04, - 3.326222e-03, 2.384325e-03, -5.191239e-01, 3.588830e-01, - 5.642326e-01, -2.458584e-01, -6.050721e-01, -5.983715e-04, - -3.112906e-04, -8.002273e-02, 2.754113e-01, 1.347607e-01, - 2.869407e-01, 3.228108e-01, 2.589051e-01, 2.689373e-01, - 2.097373e-03, -1.213292e-03, 2.289704e-02, 2.260412e-02, - -4.001153e-03, -3.886382e-02, 1.744227e-02, 1.228004e-03, - 5.637321e-02, 5.326664e-03, 5.775909e-02, -7.129682e-02, - 2.957929e-02, -3.619472e-02, -7.687800e-02, 2.551496e-01, - 2.791522e-02, -1.290575e-01, 7.948833e-02, 9.349618e-02, - 4.568452e-03, -2.620651e-01, 9.037835e-03, 1.652229e-01, - -1.035363e-02, -4.924298e-01, -1.359403e-01, -2.509044e-02, - 6.072426e-02, -1.067680e-01, 9.075266e-02, -5.669300e-01, - -5.016208e-02, -4.982992e-03, -4.493951e-01, 2.403491e-02, - -5.795595e-03, 8.214971e-02, 1.994753e-03, 2.271867e-03, - 8.008438e-03, -1.517102e-01, -2.790549e-02, 7.735109e-02, - -1.794875e-02, 1.122736e-02, -4.320173e-02, -9.230874e-03, - -4.703557e-02, -3.043727e-02, -1.645634e-01, -6.124438e-02, - 2.416326e-01, -2.548371e-01, 2.711228e-01, 2.171408e-01, - -1.613229e-02, -1.133995e-01, -5.881115e-01, 1.196182e-01, - -1.574013e-02, -2.309249e-02, -9.163861e-02, -1.243609e-03, - 2.755058e-03, -8.981592e-02, 4.023712e-02, 1.447185e-01, - 1.773491e-02, -4.728686e-02, 4.132702e-02, 4.325303e-02, - 9.868489e-02, -2.594438e-01, 1.111406e-02, 5.278649e-02, - -5.842348e-02, 7.532353e-02, -3.890866e-02, 7.389170e-03, - -8.200553e-02, -2.977651e-04, 2.846818e-01, -2.641009e-02, - -3.923972e-06, 1.683590e-06, 4.231356e-06, -1.460619e-05, - 1.480699e-05, -4.800242e-05, -3.605007e-05, 4.642337e-06, - -1.237117e-05, -6.065346e-05, 1.122525e-07, -4.718931e-05, - -4.836941e-02, 2.925190e-02, 5.125062e-02, -8.673830e-02, - 4.049347e-02, -1.281789e-01, 4.054615e-02, -1.102404e-01, - 1.797214e-02, 8.068577e-03, 9.849558e-02, 2.462221e-02, - -3.952334e-02, 7.078841e-02, 5.095275e-03, -5.172743e-03, - 1.358633e-01, -4.528875e-01, 4.454420e-01, -5.941349e-01, - -8.203693e-02, -2.733144e-01, -4.668098e-01, 2.087940e-01, - 2.732850e-01, 1.967585e-01, -1.648116e-02, -4.675763e-02, - -2.471467e-02, -3.507713e-02, 1.268763e-01, -4.777270e-04, - -6.884494e-02, -4.142293e-02, 4.568305e-01, -1.171813e-01, - 4.104385e-02, 4.123072e-01, 1.201161e-01, 5.688429e-02, - -6.769225e-02, 1.879334e-01, -1.869847e-01, 2.116578e-01, - 1.023851e-01, -7.956885e-03, 3.125194e-02, -3.698255e-02, - -1.742767e-02, 8.019327e-02, -2.414790e-01, 1.692867e-01, - -1.363161e-01, -2.991336e-02, 1.571377e-01, -4.675832e-05, - 3.410926e-02, -2.423313e-02, 3.784683e-01, 8.980562e-01, - 1.445573e-02, 3.742977e-01, -1.449231e-01, 5.250753e-02, - -9.320556e-02, 1.881413e-01, 1.525415e-01, 1.516415e-05, - -2.865472e-02, -4.090607e-02, 1.368707e-01, 1.152067e+00, - 3.926153e-02, 3.892255e-01, -3.988812e-01, 2.768721e-01, - 1.682807e-01, -8.165011e-02, 2.984257e-01, -2.310482e-05, - -1.301168e-01, -3.295192e-01, 1.955211e-01, 6.782165e-01, - -1.859493e-02, 5.047321e-01, -3.545281e-01, 6.802614e-01, - -2.701511e-02, 5.938844e-02, 1.288360e-01, 6.412582e-05, - 6.354152e-02, -2.929806e-01, 1.172161e-01, 5.812020e-01, - -1.526828e-03, 4.311178e-01, -1.572772e-01, 3.847064e-01, - -1.406437e-01, -8.771673e-02, 1.723672e-01, -2.926565e-05, - 1.170990e-01, -1.168602e-01, 2.353766e-01, 8.977429e-01, - 1.029375e-02, 4.529134e-01, -3.884215e-01, 2.041353e-01, - -2.684749e-02, 9.474846e-02, 1.718571e-01, 9.999280e-06, - -9.272413e-02, -1.050809e-01, 2.637663e-01, 6.296775e-01}; - const static float cuda5_2_BACKWARD_DATA_biases[30] = { - 2.758991e-01, 9.040871e-01, 6.578859e-01, 3.464146e-01, - -1.074793e-01, -1.111640e+00, -4.436951e-03, 1.027522e+00, - 5.782945e-02, -6.986979e-02, 1.183250e+00, -9.289587e-02, - 2.339573e-03, 2.321955e-01, 6.579675e-01, 9.597613e-01, - 4.900812e-02, 1.206250e-01, 1.320550e-01, 1.839768e-17, - 1.678722e-01, -3.203184e-03, 7.736452e-01, 2.727852e+00, - 1.589646e-01, -3.824490e-02, 5.180550e-01, 7.756407e-01, - 4.521459e-01, 4.122442e-01}; - const static float cuda5_2_BACKWARD_DATA_alpha[6] = { - 1.933176e+08, 4.558126e+08, 6.040167e+07, 4.608431e+07, - 6.338093e+08, 3.281159e+08}; - const static float cuda5_2_BACKWARD_DATA_beta[6] = { - 1.608048e+00, 1.659768e+00, 1.943038e+00, 1.953083e+00, - 1.738348e+00, 1.891296e+00}; - - float cuda5_2_BACKWARD_FILTER_time_pred[6] = {0.0f}; - float cuda5_2_BACKWARD_FILTER_mask[6] = {0.0f}; - float cuda5_2_BACKWARD_FILTER_hidden_units[24] = {0.0f}; - const static size_t cuda5_2_BACKWARD_FILTER_layers_dim[4] = {9, 12, 12, 6}; - const static float cuda5_2_BACKWARD_FILTER_matrices[324] = { - 4.047185e-03, 3.388695e-04, 1.210363e-04, -6.148457e-06, - -3.252271e-03, 8.122424e-04, 1.075851e-03, 3.066259e-03, - 1.921126e-03, -1.042791e-04, -3.275821e-01, 4.278608e-03, - -2.106100e-01, 8.295547e-02, 2.430674e-01, -2.748593e-02, - -2.065240e-02, -1.395731e-02, -3.491511e-02, 3.520847e-03, - 1.790237e-02, 1.188376e-02, 5.372314e-02, 1.494784e-02, - 5.035055e-02, 6.581915e-02, 6.861494e-02, -2.199881e-03, - -2.281682e-02, -9.687833e-02, 3.909182e-03, 1.024575e-01, - 3.948949e-02, -4.566963e-02, -1.375550e-01, -6.794923e-02, - 6.135985e-04, -4.608163e-01, 2.404660e-01, 6.274750e-03, - 1.059302e-01, 1.676516e-01, -5.104349e-02, 9.925397e-02, - -1.470984e-02, 1.031084e-04, 4.374801e-02, -5.167035e-01, - -3.632444e-01, 8.170792e-02, 3.783056e-01, 3.212413e-01, - -4.803373e-01, -4.874209e-01, 2.615676e-04, 3.406848e-02, - 8.674651e-02, 3.508870e-03, -6.156053e-01, 3.270718e-01, - 3.457363e-01, 1.898023e-01, -1.473479e-01, -2.987293e-01, - 1.315816e-03, -5.991638e-03, 1.428707e-03, 1.580944e-03, - 6.320467e-01, 2.342933e-01, -7.387988e-01, -4.437208e-01, - -7.261886e-02, 5.008508e-03, 4.693171e-02, -5.879956e-02, - 1.677305e-02, 1.845511e-01, 3.830231e-01, 4.003870e-02, - 9.888364e-02, 7.434040e-04, 7.895462e-02, 2.310843e-01, - 1.044731e-02, 1.716935e-01, 1.390186e-01, -3.862206e-01, - -1.001334e-01, 1.338546e-02, -1.354914e-02, 5.464492e-02, - 3.437773e-03, -2.069449e-03, -3.513253e-02, 1.837639e-02, - -1.552736e-01, -1.349904e-02, -1.025307e-01, -4.804826e-06, - 3.284197e-02, 5.086832e-02, 5.690669e-03, 7.154379e-02, - 1.094594e+00, 1.068281e+00, 3.653902e-01, 3.107198e-01, - -7.299128e-03, -3.042033e-04, 5.593516e-03, 3.541658e-03, - 5.810616e-04, 8.030201e-03, -1.622678e-02, 1.400076e-04, - 2.819623e-03, 4.108455e-03, 5.561182e-03, 2.512096e-03, - -8.622734e-04, 5.333219e-02, 3.076694e-02, 1.795766e-01, - -2.318845e-02, -3.202521e-02, 3.119619e-01, -1.606582e-01, - -1.085588e-01, -9.067213e-02, -1.422861e-02, -3.444208e-02, - -1.635176e-04, -2.596654e-01, 1.995525e-02, 2.055750e-02, - 2.022944e-01, 4.327365e-01, -1.619481e-02, 1.125397e-01, - 7.984060e-03, -2.073076e-01, -1.761664e-02, -4.832107e-02, - 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, - 1.638518e-03, -1.793951e-02, 5.772194e-02, 2.851987e-02, - 6.163087e-02, 2.867437e-02, 5.545961e-02, -1.660824e-01, - 9.789789e-02, -1.159482e-01, 5.385513e-02, 6.836513e-02, - 5.594874e-04, -2.741018e-02, 4.838353e-02, 4.298405e-02, - 1.854298e-02, 3.633871e-02, 9.942706e-03, 3.490340e-01, - 8.440907e-02, 2.376168e-02, 4.866724e-02, -2.214078e-01, - -5.650432e-03, -8.008064e-04, 1.477945e-03, 9.983850e-04, - 2.346494e-04, 2.069148e-03, -4.035380e-03, -5.895875e-03, - -2.146410e-04, 8.988932e-04, 7.378523e-05, -3.107871e-05, - 3.014900e-03, -2.577113e-01, 8.653076e-03, -2.681585e-02, - -5.089819e-02, -2.550743e-02, -3.467115e-02, 3.631677e-01, - -5.167207e-02, 9.202915e-02, -2.041105e-02, -1.355488e-01, - -4.411176e-03, 1.459578e-01, -1.287185e-02, -5.766148e-03, - -1.725510e-01, 1.716040e-01, -1.324064e-01, -1.831788e-01, - -4.434610e-02, -7.823753e-02, -2.463202e-02, 2.183346e-02, - 5.483676e-04, -7.481821e-02, -8.179377e-03, -3.340281e-02, - -2.679154e-03, -3.484565e-02, -4.761697e-02, -7.778479e-01, - -9.353197e-02, -1.011887e-01, -3.653892e-02, 3.624209e-01, - -2.063141e-03, -1.785554e-03, 5.357111e-02, -4.105966e-02, - 4.269572e-02, -1.532830e-01, 2.175374e-02, 1.304753e-01, - 5.400207e-02, -4.020891e-02, -2.284152e-02, 1.153921e-01, - 2.909448e-03, -1.312913e-02, -1.562593e-01, -1.018874e-01, - 7.121818e-03, -1.468466e-01, 3.900497e-03, -2.249627e-02, - -5.684932e-02, 2.612863e-02, -1.410081e-01, 2.298795e-02, - 6.794739e-04, 7.064358e-01, 7.429705e-01, 0.0, - 3.578874e-01, 7.372183e-01, -2.632545e-04, -1.001730e-01, - 4.224807e-01, -1.673518e-01, 9.987204e-04, -7.437595e-02, - 4.765817e-05, 5.283366e-01, 5.804700e-01, 0.0, - 5.304079e-01, 8.826514e-01, 1.191588e-04, -2.403303e-02, - 8.384521e-02, -1.913135e-01, -2.046290e-04, -4.905949e-02, - -5.695952e-03, 4.907159e-01, 8.468218e-01, 0.0, - 3.835697e-01, 4.161280e-01, -1.292199e-03, 2.594048e-01, - 4.049456e-01, -4.400651e-01, 4.166223e-01, -1.978285e-01, - 2.546945e-04, 5.691357e-01, 7.418897e-01, 0.0, - 5.059269e-01, 8.695604e-01, -4.737849e-05, -1.666739e-02, - 1.190503e-01, -1.528916e-01, -1.769190e-04, -4.045478e-02, - 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, - 3.728615e-02, 3.964641e-01, 6.608990e-01, 0.0, - 6.230336e-01, 5.074117e-01, 8.405939e-03, -1.422498e-01, - 4.705996e-02, -2.407855e-01, -2.056813e-02, 2.624028e-01}; - const static float cuda5_2_BACKWARD_FILTER_biases[30] = { - 2.526327e-03, 1.731556e-02, 7.836947e-02, 6.594411e-02, - 1.693102e-01, 7.814206e-01, 6.354987e-01, 9.766987e-01, - 2.128775e-01, -4.894586e-01, -3.741650e-02, -1.046441e-01, - -2.802074e-02, 1.076976e+00, 1.484343e+00, 0.0, - 1.488592e+00, 2.316875e+00, -5.133961e-03, 3.100583e-01, - 6.346995e-01, 3.810246e-01, -2.523698e-01, 3.231826e-01, - -4.245956e-01, -4.564983e-01, 4.500998e-03, -5.841292e-01, - 0.0, -1.728347e-01}; - const static float cuda5_2_BACKWARD_FILTER_alpha[6] = { - 2.066506e+08, 2.177061e+08, 5.654493e+07, 2.368001e+08, - 2.000000e+08, 2.537848e+08}; - const static float cuda5_2_BACKWARD_FILTER_beta[6] = { - 1.610186e+00, 1.844894e+00, 1.895551e+00, 1.816587e+00, - 2.000000e+00, 2.252824e+00}; - - if (conv_type == ConvolutionType::FORWARD && cuda_major == 5 && - cuda_minor == 2) { - *layer_num_p = 4; - *hidden_units_p = cuda5_2_FORWARD_hidden_units; - *layers_dim_p = cuda5_2_FORWARD_layers_dim; - *matrices_p = cuda5_2_FORWARD_matrices; - *biases_p = cuda5_2_FORWARD_biases; - *alpha_p = cuda5_2_FORWARD_alpha; - *beta_p = cuda5_2_FORWARD_beta; - *time_pred_p = cuda5_2_FORWARD_time_pred; - *mask_p = cuda5_2_FORWARD_mask; - } else if (conv_type == ConvolutionType::BACKWARD_DATA && cuda_major == 5 && - cuda_minor == 2) { - *layer_num_p = 4; - *hidden_units_p = cuda5_2_BACKWARD_DATA_hidden_units; - *layers_dim_p = cuda5_2_BACKWARD_DATA_layers_dim; - *matrices_p = cuda5_2_BACKWARD_DATA_matrices; - *biases_p = cuda5_2_BACKWARD_DATA_biases; - *alpha_p = cuda5_2_BACKWARD_DATA_alpha; - *beta_p = cuda5_2_BACKWARD_DATA_beta; - *time_pred_p = cuda5_2_BACKWARD_DATA_time_pred; - *mask_p = cuda5_2_BACKWARD_DATA_mask; - } else if (conv_type == ConvolutionType::BACKWARD_FILTER && cuda_major == 5 && - cuda_minor == 2) { - *layer_num_p = 4; - *hidden_units_p = cuda5_2_BACKWARD_FILTER_hidden_units; - *layers_dim_p = cuda5_2_BACKWARD_FILTER_layers_dim; - *matrices_p = cuda5_2_BACKWARD_FILTER_matrices; - *biases_p = cuda5_2_BACKWARD_FILTER_biases; - *alpha_p = cuda5_2_BACKWARD_FILTER_alpha; - *beta_p = cuda5_2_BACKWARD_FILTER_beta; - *time_pred_p = cuda5_2_BACKWARD_FILTER_time_pred; - *mask_p = cuda5_2_BACKWARD_FILTER_mask; - } else { - return false; - } - return true; -#endif - - return false; -} - -// vim: syntax=cpp.doxygen diff --git a/dnn/src/x86/conv_bias/f32/winograd_algo.cpp b/dnn/src/x86/conv_bias/f32/winograd_algo.cpp index e8c358c0538699ced171af443158a3f8aca87868..95dd34a55f0950335482eb882adaf43396c9ee8f 100644 --- a/dnn/src/x86/conv_bias/f32/winograd_algo.cpp +++ b/dnn/src/x86/conv_bias/f32/winograd_algo.cpp @@ -16,7 +16,6 @@ #include "src/x86/conv_bias/opr_impl.h" #include "src/x86/conv_bias/postprocess_helper.h" #include "src/x86/handle.h" -#include "src/x86/profile.h" #include "midout.h"