// lstsq_op.cu
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef PADDLE_WITH_HIP
// HIP does not support cuSOLVER, so this file is compiled for CUDA builds only.

#include <algorithm>
#include <string>
#include <vector>

#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/operators/lstsq_op.h"
#include "paddle/fluid/operators/qr_op.h"
#include "paddle/fluid/platform/dynload/cusolver.h"
#include "paddle/phi/kernels/triangular_solve_kernel.h"

namespace paddle {
namespace operators {

using paddle::framework::Tensor;

// GPU kernel for the `lstsq` operator.
//
// Reads the inputs "X" and "Y" and writes the output "Solution". The work is
// done on private copies of X and Y, so the inputs themselves are not
// modified. Two code paths are used depending on the trailing two dimensions
// (m, n) of X:
//   * m >= n : geqrf on X, Y <- Q^H Y via ormqr, then a triangular solve
//              R X = Y.
//   * m <  n : geqrf on the n x m view of X, a triangular solve R^H Z = Y,
//              then X <- Q Z using orgqr followed by a matmul.
//
// NOTE(review): the number of right-hand sides is read from Y at index
// `dim_size - 1`, where `dim_size` is the rank of X — this assumes Y has the
// same rank as X; confirm against the op's shape inference.
template <typename DeviceContext, typename T>
class LstsqCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor& x = *context.Input<Tensor>("X");
    const Tensor& y = *context.Input<Tensor>("Y");
    auto* solution = context.Output<Tensor>("Solution");

    auto dito =
        math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
                                                 T>(context);
    auto& dev_ctx =
        context.template device_context<platform::CUDADeviceContext>();

    auto x_dims = x.dims();
    auto y_dims = y.dims();
    int dim_size = x_dims.size();
    int m = x_dims[dim_size - 2];
    int n = x_dims[dim_size - 1];
    int nrhs = y_dims[dim_size - 1];
    int min_mn = std::min(m, n);
    // Number of elementary reflectors handed to ormqr.
    int k = min_mn;

    int x_stride = MatrixStride(x);
    int y_stride = MatrixStride(y);
    int tau_stride = min_mn;
    int batch_count = BatchCount(x);

    // Byte sizes are accumulated in size_t from the very first factor so the
    // product cannot overflow `int` before it is widened.
    const size_t batch = static_cast<size_t>(batch_count);
    const size_t x_bytes = batch * m * n * sizeof(T);
    const size_t y_bytes = batch * m * nrhs * sizeof(T);

    // Work on copies: the factorization routines below overwrite their
    // input buffers.
    Tensor new_x, new_y;
    new_x.mutable_data<T>(context.GetPlace(), x_bytes);
    new_y.mutable_data<T>(context.GetPlace(), y_bytes);
    framework::TensorCopy(x, context.GetPlace(), &new_x);
    framework::TensorCopy(y, context.GetPlace(), &new_y);

    // Prepare tau: same leading dims as X with the last two dims replaced by
    // a single dim of size min(m, n), filled with 0.
    auto tau_dims_vec = phi::vectorize<int>(x_dims);
    tau_dims_vec.pop_back();
    tau_dims_vec[tau_dims_vec.size() - 1] = min_mn;
    Tensor tau = dito.Fill(tau_dims_vec, 0);
    auto tau_data = tau.mutable_data<T>(context.GetPlace());

    // The phi triangular-solve kernel expects the phi flavour of the context.
    using Context =
        typename framework::ConvertToPhiContext<DeviceContext>::TYPE;
    auto& phi_dev_ctx = static_cast<const Context&>(dev_ctx);

    if (m >= n) {
      // NOTE(review): the transposes presumably put each matrix into the
      // column-major layout cuSOLVER expects — confirm against
      // DeviceIndependenceTensorOperations::Transpose.
      Tensor tmp_x = dito.Transpose(new_x);
      Tensor tmp_y = dito.Transpose(new_y);
      auto x_data = tmp_x.mutable_data<T>(context.GetPlace());
      auto y_data = tmp_y.mutable_data<T>(context.GetPlace());

      // Step 1, compute QR factorization using geqrf
      BatchedGeqrf<DeviceContext, T>(dev_ctx,
                                     batch_count,
                                     m,
                                     n,
                                     x_data,
                                     m,
                                     tau_data,
                                     x_stride,
                                     tau_stride);

      // Step 2, Y <- Q^H Y
      BatchedOrmqr<DeviceContext, T>(dev_ctx,
                                     true,
                                     true,
                                     batch_count,
                                     m,
                                     nrhs,
                                     k,
                                     x_data,
                                     x_stride,
                                     tau_data,
                                     tau_stride,
                                     y_data,
                                     y_stride);

      // Transpose back, keep the first min(m, n) rows and apply TrilTriu to
      // the factor before using it as R.
      Tensor trans_r = dito.Transpose(tmp_x);
      Tensor slice_r = dito.Slice(trans_r, {-2}, {0}, {min_mn});
      Tensor res_r = dito.TrilTriu(slice_r, 0, false);

      Tensor trans_y = dito.Transpose(tmp_y);
      Tensor slice_y = dito.Slice(trans_y, {-2}, {0}, {min_mn});

      // Step 3, solve R X = Y
      phi::TriangularSolveKernel<T, Context>(
          phi_dev_ctx, res_r, slice_y, true, false, false, solution);

    } else {
      auto x_data = new_x.mutable_data<T>(context.GetPlace());

      // Step 1, compute QR factorization using geqrf. The copy of X is
      // passed untransposed with the roles of m and n swapped.
      BatchedGeqrf<DeviceContext, T>(dev_ctx,
                                     batch_count,
                                     n,
                                     m,
                                     x_data,
                                     n,
                                     tau_data,
                                     x_stride,
                                     tau_stride);

      // Step 2, solve R^H Z = Y
      Tensor trans_r = dito.Transpose(new_x);
      Tensor slice_r = dito.Slice(trans_r, {-2}, {0}, {min_mn});
      Tensor res_r = dito.TrilTriu(slice_r, 0, false);

      phi::TriangularSolveKernel<T, Context>(
          phi_dev_ctx, res_r, new_y, true, true, false, solution);

      // Step 3, X <- Q Z
      // orgqr overwrites the buffer behind new_x; the result is then
      // transposed, sliced to m columns and multiplied with Z (currently
      // held in `solution`).
      BatchedOrgqr<DeviceContext, T>(dev_ctx,
                                     batch_count,
                                     n,
                                     m,
                                     min_mn,
                                     x_data,
                                     n,
                                     tau_data,
                                     x_stride,
                                     tau_stride);
      Tensor trans_q = dito.Transpose(new_x);
      Tensor slice_q = dito.Slice(trans_q, {-1}, {0}, {m});
      Tensor solu_tensor = dito.Matmul(slice_q, *solution, false, false);
      framework::TensorCopy(solu_tensor, solution->place(), solution);
    }
  }
};

// Single-precision specialization: calls cusolverDnSormqr once per batch
// item, updating `other` in place.
//
// Parameters:
//   dev_ctx       - supplies the cuSOLVER handle, the allocator and the
//                   stream used for the status copy.
//   left          - true selects CUBLAS_SIDE_LEFT, false CUBLAS_SIDE_RIGHT.
//   transpose     - true selects CUBLAS_OP_T, false CUBLAS_OP_N.
//   batch_size    - number of matrices to process.
//   m, n, k       - forwarded unchanged to cusolverDnSormqr.
//   a, a_stride   - base pointer of the first operand and the element
//                   offset between consecutive batch items.
//   tau, tau_stride     - same layout convention for the tau buffer.
//   other, other_stride - same layout convention for the matrix that is
//                         overwritten with the result.
//
// Raises PreconditionNotMet if the status word written by cuSOLVER is
// non-zero for any batch item.
template <>
void BatchedOrmqr<platform::CUDADeviceContext, float>(
    const platform::CUDADeviceContext& dev_ctx,
    bool left,
    bool transpose,
    int batch_size,
    int m,
    int n,
    int k,
    float* a,
    int a_stride,
    float* tau,
    int tau_stride,
    float* other,
    int other_stride) {
  int lwork = 0;
  auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
  auto trans = transpose ? CUBLAS_OP_T : CUBLAS_OP_N;
  // Leading dimension of `a` follows the side the factor is applied from.
  int lda = std::max<int>(1, left ? m : n);
  int ldc = std::max<int>(1, m);

  auto handle = dev_ctx.cusolver_dn_handle();
  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSormqr_bufferSize(
      handle, side, trans, m, n, k, a, lda, tau, other, ldc, &lwork));
  auto info = memory::Alloc(dev_ctx, sizeof(int));
  int* info_d = reinterpret_cast<int*>(info->ptr());

  // Nothing to compute for an empty batch; skip the workspace allocation.
  if (batch_size <= 0) {
    return;
  }

  // `lwork` does not change between batch items, so one workspace is
  // allocated up front and reused by every call instead of being
  // re-allocated on each iteration.
  auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float));
  float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());

  for (int i = 0; i < batch_size; ++i) {
    float* a_working_ptr = &a[i * a_stride];
    float* tau_working_ptr = &tau[i * tau_stride];
    float* other_working_ptr = &other[i * other_stride];

    // compute ormqr
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cusolverDnSormqr(handle,
                                            side,
                                            trans,
                                            m,
                                            n,
                                            k,
                                            a_working_ptr,
                                            lda,
                                            tau_working_ptr,
                                            other_working_ptr,
                                            ldc,
                                            workspace_ptr,
                                            lwork,
                                            info_d));

    // check the error info
    // NOTE(review): info_h is read right after a copy issued on
    // dev_ctx.stream() — confirm memory::Copy has completed by then.
    int info_h;
    memory::Copy(platform::CPUPlace(),
                 &info_h,
                 dev_ctx.GetPlace(),
                 info_d,
                 sizeof(int),
                 dev_ctx.stream());
    PADDLE_ENFORCE_EQ(
        info_h,
        0,
        platform::errors::PreconditionNotMet(
            "For batch [%d]: CUSolver info is not zero but [%d]", i, info_h));
  }
}

// Double-precision specialization: calls cusolverDnDormqr once per batch
// item, updating `other` in place.
//
// Parameters:
//   dev_ctx       - supplies the cuSOLVER handle, the allocator and the
//                   stream used for the status copy.
//   left          - true selects CUBLAS_SIDE_LEFT, false CUBLAS_SIDE_RIGHT.
//   transpose     - true selects CUBLAS_OP_T, false CUBLAS_OP_N.
//   batch_size    - number of matrices to process.
//   m, n, k       - forwarded unchanged to cusolverDnDormqr.
//   a, a_stride   - base pointer of the first operand and the element
//                   offset between consecutive batch items.
//   tau, tau_stride     - same layout convention for the tau buffer.
//   other, other_stride - same layout convention for the matrix that is
//                         overwritten with the result.
//
// Raises PreconditionNotMet if the status word written by cuSOLVER is
// non-zero for any batch item.
template <>
void BatchedOrmqr<platform::CUDADeviceContext, double>(
    const platform::CUDADeviceContext& dev_ctx,
    bool left,
    bool transpose,
    int batch_size,
    int m,
    int n,
    int k,
    double* a,
    int a_stride,
    double* tau,
    int tau_stride,
    double* other,
    int other_stride) {
  int lwork = 0;
  auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
  auto trans = transpose ? CUBLAS_OP_T : CUBLAS_OP_N;
  // Leading dimension of `a` follows the side the factor is applied from.
  int lda = std::max<int>(1, left ? m : n);
  int ldc = std::max<int>(1, m);

  auto handle = dev_ctx.cusolver_dn_handle();
  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDormqr_bufferSize(
      handle, side, trans, m, n, k, a, lda, tau, other, ldc, &lwork));
  auto info = memory::Alloc(dev_ctx, sizeof(int));
  int* info_d = reinterpret_cast<int*>(info->ptr());

  // Nothing to compute for an empty batch; skip the workspace allocation.
  if (batch_size <= 0) {
    return;
  }

  // `lwork` does not change between batch items, so one workspace is
  // allocated up front and reused by every call instead of being
  // re-allocated on each iteration.
  auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double));
  double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());

  for (int i = 0; i < batch_size; ++i) {
    double* a_working_ptr = &a[i * a_stride];
    double* tau_working_ptr = &tau[i * tau_stride];
    double* other_working_ptr = &other[i * other_stride];

    // compute ormqr
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cusolverDnDormqr(handle,
                                            side,
                                            trans,
                                            m,
                                            n,
                                            k,
                                            a_working_ptr,
                                            lda,
                                            tau_working_ptr,
                                            other_working_ptr,
                                            ldc,
                                            workspace_ptr,
                                            lwork,
                                            info_d));

    // check the error info
    // NOTE(review): info_h is read right after a copy issued on
    // dev_ctx.stream() — confirm memory::Copy has completed by then.
    int info_h;
    memory::Copy(platform::CPUPlace(),
                 &info_h,
                 dev_ctx.GetPlace(),
                 info_d,
                 sizeof(int),
                 dev_ctx.stream());
    PADDLE_ENFORCE_EQ(
        info_h,
        0,
        platform::errors::PreconditionNotMet(
            "For batch [%d]: CUSolver info is not zero but [%d]", i, info_h));
  }
}

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

// Register the CUDA implementation of `lstsq` for float and double.
REGISTER_OP_CUDA_KERNEL(
    lstsq,
    ops::LstsqCUDAKernel<paddle::platform::CUDADeviceContext, float>,
    ops::LstsqCUDAKernel<paddle::platform::CUDADeviceContext, double>);

#endif  // not PADDLE_WITH_HIP