/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/matrix_solve.h"

#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/solve_op.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"

// Forward declaration of platform::CUDADeviceContext, the device-context type
// used to select the MatrixSolveFunctor specialization defined in this file.
namespace paddle {
namespace platform {
class CUDADeviceContext;
}  // namespace platform
}  // namespace paddle

namespace paddle {
namespace operators {
namespace math {

// Primary template declaration. It is parameterized on the device context
// type and the element type T; this file supplies the
// platform::CUDADeviceContext partial specialization.
template <typename DeviceContext, typename T>
class MatrixSolveFunctor;

// CUDA specialization of MatrixSolveFunctor: solves the (batched) linear
// system A * X = B and writes X into `out`.
//
// Parameters:
//   context - CUDA device context; supplies the place, the stream and the
//             BLAS handle used for every allocation, copy and cuBLAS call.
//   a       - coefficient tensor. Its last dimension is read as n and every
//             matrix is treated as n x n; when the rank is greater than 2
//             the batch size is numel / (n * n), otherwise it is 1.
//   b       - right-hand-side tensor; its last dimension is read as nrhs and
//             its second-to-last dimension as ldb.
//   out     - result tensor; resized to b's dims and allocated on the
//             context's place.
//
// Errors:
//   PreconditionNotMet - BatchedGETRF reported a non-zero info value for some
//                        batch entry (singular matrix).
//   InvalidArgument    - BatchedGETRS reported a non-zero host info value.
//
// When PADDLE_WITH_HIP is defined the work is delegated to
// compute_solve_eigen instead.
template <typename T>
class MatrixSolveFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& a,
                  const framework::Tensor& b,
                  framework::Tensor* out) {
#ifndef PADDLE_WITH_HIP

    // Solve the equation A * X = B:
    //   1. cublas<S/D>getrfBatched performs the LU factorization of each
    //      matrix A;
    //   2. cublas<S/D>getrsBatched solves the system using that
    //      factorization.
    // ref:
    // https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrfbatched
    const auto& a_dims = a.dims();
    const int a_rank = a_dims.size();
    int n = a_dims[a_rank - 1];
    int lda = n;
    int batch_size = a_rank > 2 ? a.numel() / (n * n) : 1;

    const auto& b_dims = b.dims();
    const int b_rank = b_dims.size();
    int nrhs = b_dims[b_rank - 1];
    int ldb = b_dims[b_rank - 2];

    // Make sure the output has the same dims as B and is allocated.
    out->Resize(b_dims);
    out->mutable_data<T>(context.GetPlace());

    // The LU factorization is written back over its input, so factorize a
    // temporary copy (tmp_a) rather than the caller's tensor.
    Tensor tmp_a(a.dtype());
    tmp_a.Resize(a.dims());
    tmp_a.mutable_data<T>(context.GetPlace());
    framework::TensorCopy(a, context.GetPlace(), &tmp_a);

    // Copy input B into a temporary tensor tmp_b in transposed form, because
    // cuBLAS assumes column-major storage while Paddle uses row-major.
    // NOTE(review): getNewDimsVec / getNewAxis are defined elsewhere;
    // presumably they swap the last two axes -- confirm against their
    // definitions.
    Tensor tmp_b(b.type());
    const auto& new_dims_vec = getNewDimsVec(b_dims);
    tmp_b.Resize(phi::make_ddim(new_dims_vec));
    tmp_b.mutable_data<T>(context.GetPlace());
    phi::funcs::TransposeNormal<platform::CUDADeviceContext, T> trans;
    std::vector<int> new_axis = getNewAxis(b_rank);
    trans(context, b, &tmp_b, new_axis);

    const T* a_data_in_gpu = tmp_a.data<T>();
    const T* b_data_in_gpu = tmp_b.data<T>();

    // Host-side table of per-batch device pointers: the first batch_size
    // entries address the matrices in tmp_a, the next batch_size entries
    // address the right-hand sides in tmp_b.
    std::vector<const T*> cpu_ptrs(batch_size * 2);
    for (int i = 0; i < batch_size; ++i) {
      cpu_ptrs[i] = a_data_in_gpu + i * n * n;
      cpu_ptrs[i + batch_size] = b_data_in_gpu + i * n * nrhs;
    }

    // Copy the addresses of tmp_a and tmp_b from host to device.
    memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
        memory::Alloc(context, cpu_ptrs.size() * sizeof(T*));
    memory::Copy(context.GetPlace(),
                 tmp_gpu_ptrs_data->ptr(),
                 platform::CPUPlace(),
                 static_cast<void*>(cpu_ptrs.data()),
                 cpu_ptrs.size() * sizeof(T*),
                 context.stream());

    // Second half of the device pointer table: the right-hand sides.
    T** gpu_tmp_b_ptrs =
        reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;

    // Allocate device memory for BatchedGETRF's info and pivots in a single
    // buffer laid out as [info: batch_size ints][pivots: batch_size * n ints].
    // The pivot region is always passed to BatchedGETRF below, so it must be
    // allocated for every n; sizing the buffer to only batch_size ints for
    // small n would leave gpu_pivot_ptr pointing past the allocation.
    int num_ints = batch_size * (n + 1);
    memory::allocation::AllocationPtr tmp_gpu_info_data =
        memory::Alloc(context, num_ints * sizeof(int));
    int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());

    auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(context);

    // Host copy of the per-batch info values; only used for the singularity
    // check.
    std::vector<int> info;
    info.resize(batch_size);

    // Pivots start right after the batch_size info entries.
    int* gpu_pivot_ptr =
        reinterpret_cast<int*>(tmp_gpu_info_data->ptr()) + batch_size;

    // This function performs the LU factorization of each matrix A by the
    // equation A = L * U. L and U are written back over the input matrix
    // (tmp_a here), and the diagonal elements of L are discarded.
    blas.BatchedGETRF(n,
                      reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()),
                      gpu_pivot_ptr,
                      gpu_info_ptr,
                      batch_size);

    // Check whether BatchedGETRF executed successfully.
    // NOTE(review): the values are read on the host right after this copy is
    // issued on context.stream(); this relies on memory::Copy having
    // completed the device-to-host transfer on return -- confirm.
    memory::Copy(platform::CPUPlace(),
                 info.data(),
                 context.GetPlace(),
                 gpu_info_ptr,
                 sizeof(int) * batch_size,
                 context.stream());
    for (int i = 0; i < batch_size; ++i) {
      PADDLE_ENFORCE_EQ(info[i],
                        0,
                        platform::errors::PreconditionNotMet(
                            "For batch [%d]: U(%d, %d) is zero, singular U. "
                            "Please check the matrix value and change it to a "
                            "non-singular matrix",
                            i,
                            info[i],
                            info[i]));
    }

    // Holds the result code from BatchedGETRS.
    int host_info = 0;

    // Solve the equation using the LU factorization. The factorized data is
    // the row-major matrix as seen by column-major cuBLAS, hence the
    // transposed solve.
    CBLAS_TRANSPOSE transA = CblasTrans;
    blas.BatchedGETRS(transA,
                      n,
                      nrhs,
                      reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr()),
                      lda,
                      gpu_pivot_ptr,
                      gpu_tmp_b_ptrs,
                      ldb,
                      &host_info,
                      batch_size);

    // Check whether BatchedGETRS executed successfully.
    PADDLE_ENFORCE_EQ(host_info,
                      0,
                      platform::errors::InvalidArgument(
                          "The [%d]'th argument to cublas*getrsBatched had "
                          "an illegal value.",
                          -host_info));

    // Transpose tmp_b (which now holds the solution) back into `out` to get
    // the final result in row-major form.
    phi::funcs::TransposeNormal<platform::CUDADeviceContext, T> trans2;
    trans2(context, tmp_b, out, new_axis);

#else
    compute_solve_eigen<platform::CUDADeviceContext, T>(context, a, b, out);
#endif
  }
};

// Explicit instantiations of the CUDA specialization for float and double.
template class MatrixSolveFunctor<platform::CUDADeviceContext, float>;
template class MatrixSolveFunctor<platform::CUDADeviceContext, double>;

}  // namespace math
}  // namespace operators
}  // namespace paddle