Commit f746caf5 authored by mindspore-ci-bot, committed by Gitee

!316 Edit GPU ops

Merge pull request !316 from VectorSL/edit
......@@ -53,6 +53,13 @@ __global__ void ReciprocalKernel(T *input, T *output, size_t count) {
return;
}
template <typename T>
__global__ void SquareKernel(T *input, T *output, size_t count) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = input[i] * input[i];
}
return;
}
template <typename T>
void Exponential(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
ExponentialKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
return;
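
The new SquareKernel follows the same grid-stride loop idiom as the other unary kernels above: each thread starts at its global index and strides by the total number of launched threads, so a fixed-size grid covers any element count. Below is a minimal, self-contained CUDA sketch of the pattern; the 256-thread block size is an assumption standing in for the GET_BLOCKS/GET_THREADS helpers used in the real launches.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void SquareSketch(const float *input, float *output, size_t count) {
  // Grid-stride loop: i advances by the total thread count per iteration.
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count;
       i += blockDim.x * gridDim.x) {
    output[i] = input[i] * input[i];
  }
}

int main() {
  const size_t n = 1000;
  float *in = nullptr, *out = nullptr;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, n * sizeof(float));
  for (size_t i = 0; i < n; ++i) in[i] = static_cast<float>(i);
  SquareSketch<<<(n + 255) / 256, 256>>>(in, out, n);  // assumed launch shape
  cudaDeviceSynchronize();
  printf("out[3] = %g\n", out[3]);  // expect 9
  cudaFree(in);
  cudaFree(out);
  return 0;
}
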
......@@ -72,12 +79,18 @@ void Reciprocal(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
ReciprocalKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
return;
}
template <typename T>
void Square(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
SquareKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
return;
}
template void Exponential<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Logarithm<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Negative<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Reciprocal<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Square<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Exponential<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Logarithm<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Negative<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Reciprocal<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Square<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
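
The template void Square<float>(...) and template void Square<half>(...) lines are explicit instantiations: the template bodies live in a .cu file compiled by nvcc, so every supported element type must be instantiated there, and other translation units link against those symbols through the bare declarations in the header below. A toy illustration of the mechanism, with all names hypothetical:

// impl.cu: the definition plus explicit instantiations emit the symbols here.
template <typename T>
T TwiceSketch(T x) { return x + x; }
template float TwiceSketch<float>(float);
template double TwiceSketch<double>(double);

// caller.cc sees only a declaration and links against the emitted symbols:
//   template <typename T> T TwiceSketch(T x);
//   double y = TwiceSketch(2.0);  // fine
//   int z = TwiceSketch(2);       // would fail at link time: no <int> symbol
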
......@@ -26,5 +26,6 @@ template <typename T>
void Negative(T *input, T *output, size_t count, cudaStream_t cuda_stream);
template <typename T>
void Reciprocal(T *input, T *output, size_t count, cudaStream_t cuda_stream);
template <typename T>
void Square(T *input, T *output, size_t count, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_
......@@ -41,8 +41,9 @@ void GpuKernelFactory::CheckIOParam(const std::string &kernel_name, const Kernel
size_t attr_index) {
if (kernel_info->GetInputNum() != iter_second->at(attr_index).first.GetInputSize()) {
if (iter_second->at(attr_index).first.GetAllSame()) {
auto dtype = iter_second->at(attr_index).first.GetInputAttr(0).first;
for (size_t attr = 1; attr < kernel_info->GetInputNum(); ++attr) {
(void)iter_second->at(attr_index).first.AddInputAttr(kernel_info->GetInputDeviceType(0));
(void)iter_second->at(attr_index).first.AddInputAttr(dtype);
}
} else {
MS_LOG(EXCEPTION) << "op[" << kernel_name << "] Input size is mismatching!";
......@@ -50,8 +51,9 @@ void GpuKernelFactory::CheckIOParam(const std::string &kernel_name, const Kernel
}
if (kernel_info->GetOutputNum() != iter_second->at(attr_index).first.GetOutputSize()) {
if (iter_second->at(attr_index).first.GetAllSame()) {
auto dtype = iter_second->at(attr_index).first.GetOutputAttr(0).first;
for (size_t attr = 1; attr < kernel_info->GetOutputNum(); ++attr) {
(void)iter_second->at(attr_index).first.AddOutputAttr(kernel_info->GetOutputDeviceType(0));
(void)iter_second->at(attr_index).first.AddOutputAttr(dtype);
}
} else {
MS_LOG(EXCEPTION) << "op[" << kernel_name << "] Output size is mismatching!";
......
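
Both CheckIOParam hunks fix the same padding bug in the AllSame path. When a kernel is registered with GetAllSame() and a single attribute, the factory pads the attribute list up to the node's real input/output count; the hunks swap the runtime device type of slot 0 (kernel_info->GetInputDeviceType(0)) for the dtype that was actually registered for slot 0, keeping the padded attributes consistent with the registration. A simplified sketch of the corrected input padding; the container and type names are stand-ins, not MindSpore internals.

#include <cstddef>
#include <vector>

enum DTypeSketch { kFloat32Sketch, kFloat16Sketch };

struct KernelAttrSketch {
  bool all_same = true;
  std::vector<DTypeSketch> inputs{kFloat32Sketch};  // registered once
};

// Pad the registered attr to the real input count, replicating the dtype
// registered for slot 0 (the fix) rather than a runtime device type.
void PadInputs(KernelAttrSketch *attr, size_t real_input_num) {
  DTypeSketch dtype = attr->inputs[0];
  for (size_t i = attr->inputs.size(); i < real_input_num; ++i) {
    attr->inputs.push_back(dtype);
  }
}
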
......@@ -38,5 +38,13 @@ MS_REG_GPU_KERNEL_ONE(
MS_REG_GPU_KERNEL_ONE(
Sub, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
BinaryOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
Maximum,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
BinaryOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
Maximum,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
BinaryOpGpuKernel, half)
} // namespace kernel
} // namespace mindspore
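
These registrations route the Maximum primitive to BinaryOpGpuKernel for float32 and float16, with matching input and output dtypes. Conceptually, MS_REG_GPU_KERNEL_ONE adds an (op name, dtype signature) -> kernel-creator entry to a global factory; the sketch below shows that idea only, with every name invented for illustration.

#include <functional>
#include <map>
#include <memory>
#include <string>

struct GpuKernelSketch {
  virtual ~GpuKernelSketch() = default;
};
struct MaximumFloatSketch : GpuKernelSketch {};

using CreatorSketch = std::function<std::unique_ptr<GpuKernelSketch>()>;

// One entry per (op, signature) pair, as the macros above conceptually do.
static std::map<std::string, CreatorSketch> g_registry = {
    {"Maximum:f32", [] { return std::make_unique<MaximumFloatSketch>(); }},
};

std::unique_ptr<GpuKernelSketch> CreateSketch(const std::string &key) {
  auto it = g_registry.find(key);
  return it == g_registry.end() ? nullptr : it->second();
}
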
......@@ -27,12 +27,16 @@
#include "kernel/gpu/kernel_constants.h"
namespace mindspore {
namespace kernel {
enum BinaryOpType { BINARY_OP_ADD = 0, BINARY_OP_SUB, BINARY_OP_MUL, BINARY_OP_DIV, BINARY_OP_INVALID_TYPE = 255 };
const std::map<std::string, BinaryOpType> kBinaryOpTypeMap = {
{"Sub", BINARY_OP_SUB},
{"Mul", BINARY_OP_MUL},
{"RealDiv", BINARY_OP_DIV},
enum BinaryOpType {
BINARY_OP_ADD = 0,
BINARY_OP_SUB,
BINARY_OP_MUL,
BINARY_OP_DIV,
BINARY_OP_MAX,
BINARY_OP_INVALID_TYPE = 255
};
static const std::map<std::string, BinaryOpType> kBinaryOpTypeMap = {
{"Sub", BINARY_OP_SUB}, {"Mul", BINARY_OP_MUL}, {"RealDiv", BINARY_OP_DIV}, {"Maximum", BINARY_OP_MAX}};
template <typename T>
class BinaryOpGpuKernel : public GpuKernel {
public:
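
Besides adding BINARY_OP_MAX, this hunk reflows the enum one enumerator per line and makes the header-defined lookup map static const, which spells out its per-translation-unit internal linkage (namespace-scope const already implies it in C++). The map drives dispatch in the kernel's Init; a reduced sketch of that lookup with the invalid-type fallback:

#include <map>
#include <string>

enum BinaryOpTypeSketch { BINARY_OP_MAX_SKETCH = 4, BINARY_OP_INVALID_SKETCH = 255 };

static const std::map<std::string, BinaryOpTypeSketch> kOpMapSketch = {
    {"Maximum", BINARY_OP_MAX_SKETCH}};

BinaryOpTypeSketch LookupSketch(const std::string &name) {
  auto it = kOpMapSketch.find(name);
  return it == kOpMapSketch.end() ? BINARY_OP_INVALID_SKETCH : it->second;
}
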
......@@ -84,6 +88,10 @@ class BinaryOpGpuKernel : public GpuKernel {
inputB_addr = workspace_addr;
break;
}
case BINARY_OP_MAX: {
inputB_addr = input_addr2;
break;
}
default: {
MS_LOG(EXCEPTION) << "Binary operation " << binary_op_type_ << " is not supported.";
}
......@@ -201,6 +209,10 @@ class BinaryOpGpuKernel : public GpuKernel {
tensor_op_ = CUDNN_OP_TENSOR_ADD;
break;
}
case BINARY_OP_MAX: {
tensor_op_ = CUDNN_OP_TENSOR_MAX;
break;
}
default: {
MS_LOG(EXCEPTION) << "Binary operation " << binary_op_type_ << " is not supported.";
}
......
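
BinaryOpGpuKernel ultimately calls cudnnOpTensor, which computes C = op(alpha1*A, alpha2*B) + beta*C. Maximum needs no workspace transform of the second operand, hence inputB_addr = input_addr2, and the op descriptor is set to CUDNN_OP_TENSOR_MAX. A self-contained sketch of an elementwise max through the same cuDNN primitive (error checking omitted, the 1x1x1x8 tensor shape is arbitrary, link with -lcudnn):

#include <cstdio>
#include <cuda_runtime.h>
#include <cudnn.h>

int main() {
  cudnnHandle_t handle;
  cudnnCreate(&handle);

  const int n = 8;  // treat the data as a 1x1x1x8 NCHW tensor
  cudnnTensorDescriptor_t desc;
  cudnnCreateTensorDescriptor(&desc);
  cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 1, n);

  cudnnOpTensorDescriptor_t op_desc;
  cudnnCreateOpTensorDescriptor(&op_desc);
  cudnnSetOpTensorDescriptor(op_desc, CUDNN_OP_TENSOR_MAX, CUDNN_DATA_FLOAT,
                             CUDNN_NOT_PROPAGATE_NAN);

  float *a, *b, *c;
  cudaMallocManaged(&a, n * sizeof(float));
  cudaMallocManaged(&b, n * sizeof(float));
  cudaMallocManaged(&c, n * sizeof(float));
  for (int i = 0; i < n; ++i) { a[i] = i; b[i] = n - i; }

  const float alpha1 = 1.0f, alpha2 = 1.0f, beta = 0.0f;
  // c = max(alpha1 * a, alpha2 * b) + beta * c, elementwise
  cudnnOpTensor(handle, op_desc, &alpha1, desc, a, &alpha2, desc, b, &beta, desc, c);
  cudaDeviceSynchronize();
  printf("c[0] = %f, c[7] = %f\n", c[0], c[7]);  // expect 8.0 and 7.0

  cudaFree(a); cudaFree(b); cudaFree(c);
  cudnnDestroyOpTensorDescriptor(op_desc);
  cudnnDestroyTensorDescriptor(desc);
  cudnnDestroy(handle);
  return 0;
}
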
......@@ -38,5 +38,9 @@ MS_REG_GPU_KERNEL_ONE(ZerosLike, KernelAttr().AddInputAttr(kNumberTypeFloat32).A
UnaryOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(ZerosLike, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
UnaryOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(Square, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
UnaryOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(Square, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
UnaryOpGpuKernel, half)
} // namespace kernel
} // namespace mindspore
......@@ -33,13 +33,15 @@ enum UnaryOptype {
UNARY_OP_NEG,
UNARY_OP_RECIPROCAL,
UNARY_OP_ZEROSLIKE,
UNARY_OP_SQUARE,
UNARY_OP_INVALID_TYPE = 255
};
const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY_OP_EXP},
{"Log", UNARY_OP_LOG},
{"Neg", UNARY_OP_NEG},
{"Reciprocal", UNARY_OP_RECIPROCAL},
{"ZerosLike", UNARY_OP_ZEROSLIKE}};
static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY_OP_EXP},
{"Log", UNARY_OP_LOG},
{"Neg", UNARY_OP_NEG},
{"Reciprocal", UNARY_OP_RECIPROCAL},
{"ZerosLike", UNARY_OP_ZEROSLIKE},
{"Square", UNARY_OP_SQUARE}};
template <typename T>
class UnaryOpGpuKernel : public GpuKernel {
public:
......@@ -74,6 +76,10 @@ class UnaryOpGpuKernel : public GpuKernel {
Reciprocal(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
break;
}
case UNARY_OP_SQUARE: {
Square(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
break;
}
case UNARY_OP_ZEROSLIKE: {
return true;
}
......@@ -93,12 +99,12 @@ class UnaryOpGpuKernel : public GpuKernel {
}
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but negative op needs 1 inputs.";
MS_LOG(ERROR) << "Input number is " << input_num << ", but unary op needs 1 inputs.";
return false;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(ERROR) << "Output number is " << output_num << ", but negative op needs 1 output.";
MS_LOG(ERROR) << "Output number is " << output_num << ", but unary op needs 1 output.";
return false;
}
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
......
......@@ -48,14 +48,10 @@ class FlattenGpuFwdKernel : public GpuKernel {
}
bool Init(const CNodePtr &kernel_node) override {
auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
input_size_ = sizeof(T);
for (size_t i = 0; i < shape.size(); ++i) {
if (input_size_ == 0) {
input_size_ = 1;
}
input_size_ *= shape[i];
}
input_size_ = input_size_ * sizeof(T);
InitSizeLists();
return true;
}
......
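
The Flatten change is a size fix: input_size_ already starts at sizeof(T), so the removed trailing multiplication by sizeof(T) appears to have counted the element size twice, over-allocating by that factor, and the removed in-loop reset to 1 only mattered when a dimension was 0. After the change the byte size is simply the element size times every dimension; a standalone sketch with hypothetical shape values:

#include <cstddef>
#include <vector>

// Byte size of a flattened tensor: element size times all dimensions.
template <typename T>
size_t FlattenBytes(const std::vector<size_t> &shape) {
  size_t bytes = sizeof(T);
  for (size_t dim : shape) {
    bytes *= dim;
  }
  return bytes;
}

// FlattenBytes<float>({2, 3}) == 24; the pre-fix arithmetic would have
// produced 24 * sizeof(float) == 96 for the same shape.
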
......@@ -189,6 +189,12 @@ VectorRef MsBackend::MsRunGraph(const GraphId &g, const VectorRef &args) {
} else if (utils::isa<PyObjectRef>(arg)) {
auto value = utils::cast<PyObjectRef>(arg).object_;
inputs.push_back(py::cast<tensor::TensorPtr>(value));
} else if (utils::isa<VectorRefPtr>(arg)) {
auto args_new = utils::cast<VectorRef>(arg);
(void)std::transform(args_new.begin(), args_new.end(), std::back_inserter(inputs),
[](const BaseRef &v) { return utils::cast<tensor::TensorPtr>(v); });
} else {
MS_LOG(WARNING) << "Invalid input type.";
}
}
......
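
The new branch in MsRunGraph accepts an argument that is itself a sequence: rather than pushing the VectorRef as one opaque input, it flattens the nested tensors into inputs via std::transform with a back_inserter. A reduced sketch of that flattening step, where std::vector<int> stands in for VectorRef and an identity lambda for the tensor cast:

#include <algorithm>
#include <iterator>
#include <vector>

int main() {
  std::vector<int> inputs = {1, 2};     // inputs collected so far
  std::vector<int> nested = {3, 4, 5};  // a sequence argument
  // Append each element of the sequence, converting as we go (the real
  // code casts each BaseRef to a tensor::TensorPtr here).
  std::transform(nested.begin(), nested.end(), std::back_inserter(inputs),
                 [](int v) { return v; });
  return inputs.size() == 5 ? 0 : 1;  // inputs is now {1, 2, 3, 4, 5}
}
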