magicwindyyd / mindspore (forked from MindSpore / mindspore)
Commit f679568d, authored on Jul 21, 2020 by linqingke
gpu ops code and test case.
Parent: ca6756b5
Showing 30 changed files with 1908 additions and 12 deletions (+1908 -12)
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gathernd_gpu_kernel.cc  +33 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gathernd_gpu_kernel.h  +162 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_gpu_kernel.cc  +33 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_gpu_kernel.h  +175 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/boundingbox_decode_impl.cu  +81 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/boundingbox_decode_impl.cuh  +27 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/boundingbox_encode_impl.cu  +62 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/boundingbox_encode_impl.cuh  +26 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu  +32 -8
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh  +1 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/gathernd.cu  +65 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/gathernd.cuh  +26 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/scatter_nd.cu  +68 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/scatter_nd.cuh  +26 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sgd_impl.cu  +57 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sgd_impl.cuh  +25 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.cc  +8 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h  +4 -4
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sgd_gpu_kernel.cc  +32 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sgd_gpu_kernel.h  +88 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/other/boundingbox_decode_gpu_kernel.cc  +26 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/other/boundingbox_decode_gpu_kernel.h  +152 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/other/boundingbox_encode_gpu_kernel.cc  +26 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/other/boundingbox_encode_gpu_kernel.h  +143 -0
tests/st/ops/gpu/test_boundingbox_decode_op.py  +60 -0
tests/st/ops/gpu/test_boundingbox_encode_op.py  +80 -0
tests/st/ops/gpu/test_floordiv_op.py  +116 -0
tests/st/ops/gpu/test_gathernd_op.py  +151 -0
tests/st/ops/gpu/test_scatter_nd.py  +50 -0
tests/st/ops/gpu/test_sgd_op.py  +73 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gathernd_gpu_kernel.cc
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/arrays/gathernd_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(
  GatherNd,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
  GatherNdGpuFwdKernel, float, int)
MS_REG_GPU_KERNEL_TWO(
  GatherNd,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
  GatherNdGpuFwdKernel, half, int)
MS_REG_GPU_KERNEL_TWO(
  GatherNd,
  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  GatherNdGpuFwdKernel, int, int)
}  // namespace kernel
}  // namespace mindspore
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gathernd_gpu_kernel.h
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_GATHERND_GPU_KERNEL_H
#define MINDSPORE_GATHERND_GPU_KERNEL_H
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/gathernd.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename S>
class GatherNdGpuFwdKernel : public GpuKernel {
 public:
  GatherNdGpuFwdKernel() : dev_batch_strides_(nullptr), dev_batch_indices_(nullptr) {}
  ~GatherNdGpuFwdKernel() {
    if (dev_batch_strides_ != nullptr) {
      device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(static_cast<void *>(dev_batch_strides_));
    }
    if (dev_batch_indices_ != nullptr) {
      device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(static_cast<void *>(dev_batch_indices_));
    }
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    VARIABLE_NOT_USED(workspace);
    T *input_addr = GetDeviceAddress<T>(inputs, 0);
    S *indices_addr = GetDeviceAddress<S>(inputs, 1);
    T *output_addr = GetDeviceAddress<T>(outputs, 0);

    GatherNd(input_addr, indices_addr, output_addr, dims_[0], dims_[1], dims_[2], dev_batch_strides_,
             dev_batch_indices_, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    InitResource();
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but GatherNdGpuFwdKernel needs 2.";
    }
    input_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    indices_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    output_shapes_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);

    Reshape();

    size_t dim_indices_last = dims_[dims_.size() - 1];
    batch_strides_.resize(dim_indices_last, 0);
    batch_indices_.resize(dim_indices_last, 0);

    if (dim_indices_last > 0) {
      batch_strides_[dim_indices_last - 1] = input_shapes_[dim_indices_last - 1];
      batch_indices_[dim_indices_last - 1] = dims_[1];
    }
    for (size_t i = dim_indices_last - 1; i > 0; --i) {
      batch_strides_[i - 1] = input_shapes_[i - 1];
      batch_indices_[i - 1] = batch_indices_[i] * input_shapes_[i];
    }

    size_t strides_len = sizeof(S) * batch_strides_.size();
    void *dev_batch_strides_work = device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(strides_len);
    if (dev_batch_strides_work == nullptr) {
      MS_LOG(EXCEPTION) << "Failed to alloc dev_batch_strides_work, size: " << strides_len;
    }
    dev_batch_strides_ = static_cast<S *>(dev_batch_strides_work);

    size_t indices_len = sizeof(S) * batch_indices_.size();
    void *dev_batch_indices_work = device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(indices_len);
    if (dev_batch_indices_work == nullptr) {
      MS_LOG(EXCEPTION) << "Failed to alloc dev_batch_indices_work, size: " << indices_len;
    }
    dev_batch_indices_ = static_cast<S *>(dev_batch_indices_work);

    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpy(dev_batch_strides_, &batch_strides_[0], strides_len, cudaMemcpyHostToDevice),
                               "cudaMemcpy failed in GatherNdGpuFwdKernel::Init.");
    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpy(dev_batch_indices_, &batch_indices_[0], indices_len, cudaMemcpyHostToDevice),
                               "cudaMemcpy failed in GatherNdGpuFwdKernel::Init.");

    InitSizeLists();
    return true;
  }

 protected:
  void InitSizeLists() override {
    size_t size = GetSize(input_shapes_);
    input_size_list_.push_back(size);

    size = GetSize(indices_shapes_);
    input_size_list_.push_back(size);

    size = GetSize(output_shapes_);
    output_size_list_.push_back(size);
  }

 private:
  void Reshape() {
    size_t dim_of_indices = 1;
    for (size_t i = 0; i < indices_shapes_.size() - IntToSize(1); i++) {
      dim_of_indices *= indices_shapes_[i];
    }

    size_t dim_after_indices = 1;
    size_t dim_indices_last = indices_shapes_[indices_shapes_.size() - IntToSize(1)];
    for (size_t i = dim_indices_last; i < input_shapes_.size(); i++) {
      dim_after_indices *= input_shapes_[i];
    }
    dims_.emplace_back(dim_of_indices);
    dims_.emplace_back(dim_after_indices);
    dims_.emplace_back(dim_indices_last);
    return;
  }

  size_t GetSize(const std::vector<size_t> &shape) const {
    if (shape.size() == 0) {
      return 0;
    }
    size_t result = sizeof(T);
    for (size_t i = 0; i < shape.size(); i++) {
      result *= shape[i];
    }
    return result;
  }

  std::vector<size_t> input_shapes_;
  std::vector<size_t> indices_shapes_;
  std::vector<size_t> output_shapes_;
  std::vector<size_t> dims_;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;

  std::vector<S> batch_strides_;
  std::vector<S> batch_indices_;

  S *dev_batch_strides_;
  S *dev_batch_indices_;
};
}  // namespace kernel
}  // namespace mindspore
#endif // MINDSPORE_GATHERND_GPU_KERNEL_H
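Note (not part of the diff): Init() above precomputes two small device-side tables: batch_strides_ holds the size of each indexed input dimension (used by the kernel's out-of-bound check) and batch_indices_ holds that dimension's row-major element stride. A minimal host-side sketch of the same computation, assuming an input shape {5, 6, 7} and indices shape {4, 2} chosen purely for illustration:

// Standalone check of the GatherNd stride/bound computation (illustrative shapes assumed).
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<size_t> input_shapes = {5, 6, 7};
  std::vector<size_t> indices_shapes = {4, 2};

  // Reshape(): dims_ = {number of index rows, slice size after the indexed dims, index depth}.
  size_t dim_of_indices = 1;
  for (size_t i = 0; i < indices_shapes.size() - 1; i++) dim_of_indices *= indices_shapes[i];
  size_t dim_indices_last = indices_shapes.back();
  size_t dim_after_indices = 1;
  for (size_t i = dim_indices_last; i < input_shapes.size(); i++) dim_after_indices *= input_shapes[i];
  std::vector<size_t> dims = {dim_of_indices, dim_after_indices, dim_indices_last};  // {4, 7, 2}

  // Init(): per-dimension bounds (batch_strides_) and row-major element strides (batch_indices_).
  std::vector<int> bounds(dim_indices_last, 0), strides(dim_indices_last, 0);
  bounds[dim_indices_last - 1] = input_shapes[dim_indices_last - 1];  // 6
  strides[dim_indices_last - 1] = dims[1];                            // 7
  for (size_t i = dim_indices_last - 1; i > 0; --i) {
    bounds[i - 1] = input_shapes[i - 1];
    strides[i - 1] = strides[i] * input_shapes[i];
  }
  assert(bounds[0] == 5 && bounds[1] == 6);
  assert(strides[0] == 42 && strides[1] == 7);  // element (a, b, c) of a {5, 6, 7} tensor lives at a*42 + b*7 + c
  return 0;
}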
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_gpu_kernel.cc
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/arrays/scatter_nd_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(
  ScatterNd,
  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  ScatterNdGpuFwdKernel, float, int)
MS_REG_GPU_KERNEL_TWO(
  ScatterNd,
  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  ScatterNdGpuFwdKernel, half, int)
MS_REG_GPU_KERNEL_TWO(
  ScatterNd,
  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  ScatterNdGpuFwdKernel, int, int)
}  // namespace kernel
}  // namespace mindspore
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_gpu_kernel.h
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_SCATTER_ND_GPU_KERNEL_H
#define MINDSPORE_CCSRC_KERNEL_GPU_SCATTER_ND_GPU_KERNEL_H
#include <vector>
#include "backend/kernel_compiler/gpu/cuda_impl/scatter_nd.cuh"
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
template <typename T, typename S>
class ScatterNdGpuFwdKernel : public GpuKernel {
 public:
  ScatterNdGpuFwdKernel()
      : input_size_(1),
        indices_size_(1),
        output_size_(1),
        block_size_(1),
        indices_stride_(nullptr),
        work_shape_(nullptr),
        indices_dim_0_(0),
        indices_dim_1_(0) {}
  ~ScatterNdGpuFwdKernel() {
    if (indices_stride_ != nullptr) {
      device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(static_cast<void *>(indices_stride_));
    }
    if (work_shape_ != nullptr) {
      device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(static_cast<void *>(work_shape_));
    }
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    VARIABLE_NOT_USED(workspace);
    S *indices = GetDeviceAddress<S>(inputs, 0);
    T *update = GetDeviceAddress<T>(inputs, 1);
    T *output = GetDeviceAddress<T>(outputs, 0);

    ScatterNd(indices, update, output, block_size_, input_size_, output_size_, indices_dim_0_, indices_dim_1_,
              indices_stride_, work_shape_, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but transpose needs 2 input.";
      return false;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but transpose needs 1 output.";
      return false;
    }

    input_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    indices_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    output_shapes_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);

    vec_work_shape_ = GetAttr<std::vector<S>>(kernel_node, "shape");

    GetSize();

    size_t indices_len = sizeof(S) * vec_indices_stride_.size();
    void *indices_stride_work = device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(indices_len);
    if (indices_stride_work == nullptr) {
      MS_LOG(EXCEPTION) << "Failed to alloc indices_stride_work, size: " << indices_len;
    }
    indices_stride_ = static_cast<S *>(indices_stride_work);

    size_t vec_work_len = sizeof(S) * vec_work_shape_.size();
    void *work_shape_work = device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(vec_work_len);
    if (work_shape_work == nullptr) {
      MS_LOG(EXCEPTION) << "Failed to alloc work_shape_work, size: " << vec_work_len;
    }
    work_shape_ = static_cast<S *>(work_shape_work);

    CHECK_CUDA_RET_WITH_EXCEPT(
      cudaMemcpy(indices_stride_, &vec_indices_stride_[0], indices_len, cudaMemcpyHostToDevice),
      "cudaMemcpy failed in ScatterNdGpuFwdKernel::Init.");
    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpy(work_shape_, &vec_work_shape_[0], vec_work_len, cudaMemcpyHostToDevice),
                               "cudaMemcpy failed in ScatterNdGpuFwdKernel::Init.");

    InitSizeLists();
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(indices_size_);
    input_size_list_.push_back(input_size_);
    output_size_list_.push_back(output_size_);
    return;
  }

  void GetSize() {
    indices_size_ = sizeof(S);
    for (size_t i = 0; i < indices_shapes_.size(); i++) {
      indices_size_ *= indices_shapes_[i];
    }
    input_size_ = sizeof(T);
    for (size_t i = 0; i < input_shapes_.size(); i++) {
      input_size_ *= input_shapes_[i];
    }
    output_size_ = sizeof(T);
    for (size_t i = 0; i < output_shapes_.size(); i++) {
      output_size_ *= output_shapes_[i];
    }

    // calculate indices dim 0/1
    indices_dim_0_ = indices_shapes_[0];
    indices_dim_1_ = indices_shapes_[1];

    // calculate block_size
    for (size_t i = indices_dim_1_; i < output_shapes_.size(); i++) {
      block_size_ *= output_shapes_[i];
    }

    // calculate indices_stride
    for (size_t i = 0; i < indices_dim_1_; i++) {
      vec_indices_stride_.push_back(0);
    }
    vec_indices_stride_[indices_dim_1_ - 1] = block_size_;

    for (size_t i = indices_dim_1_ - 1; i > 0; --i) {
      vec_indices_stride_[i - 1] = vec_indices_stride_[i] * output_shapes_[i];
    }
  }

 private:
  std::vector<size_t> input_shapes_;
  std::vector<size_t> indices_shapes_;
  std::vector<size_t> output_shapes_;
  std::vector<S> vec_indices_stride_;
  std::vector<S> vec_work_shape_;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;

  size_t input_size_;
  size_t indices_size_;
  size_t output_size_;
  size_t block_size_;

  S *indices_stride_;
  S *work_shape_;

  size_t indices_dim_0_;
  size_t indices_dim_1_;
};
}  // namespace kernel
}  // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_SCATTER_ND_GPU_KERNEL_H
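Note (not part of the diff): GetSize() above derives block_size_ (the number of contiguous output elements written per index row) and vec_indices_stride_ (the row-major strides of the indexed output dimensions). A minimal host-side sketch, assuming an output shape {4, 5, 6} and index rows of length 2 chosen purely for illustration:

// Standalone check of the ScatterNd block-size/stride computation (illustrative shapes assumed).
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<size_t> output_shapes = {4, 5, 6};
  size_t indices_dim_1 = 2;  // each index row addresses the first two output dimensions

  size_t block_size = 1;
  for (size_t i = indices_dim_1; i < output_shapes.size(); i++) block_size *= output_shapes[i];  // 6

  std::vector<int> indices_stride(indices_dim_1, 0);
  indices_stride[indices_dim_1 - 1] = block_size;  // 6
  for (size_t i = indices_dim_1 - 1; i > 0; --i) {
    indices_stride[i - 1] = indices_stride[i] * output_shapes[i];  // 6 * 5 = 30
  }

  assert(block_size == 6 && indices_stride[0] == 30 && indices_stride[1] == 6);
  // An index row (a, b) scatters a contiguous block of 6 update values starting at offset a*30 + b*6.
  return 0;
}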
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/boundingbox_decode_impl.cu
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/boundingbox_decode_impl.cuh"
template <typename T>
__global__ void BoundingBoxDecodeKernel(const size_t size, const T *rois, const T *deltas, T *bboxes, const float m1,
                                        const float m2, const float m3, const float m4, const float s1, const float s2,
                                        const float s3, const float s4, const int max_height, const int max_width,
                                        const float ratio_clip) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
    const size_t left_x = i * 4;
    const size_t left_y = i * 4 + 1;
    const size_t right_x = i * 4 + 2;
    const size_t right_y = i * 4 + 3;

    T dx = deltas[left_x] * s1 + m1;
    T dy = deltas[left_y] * s2 + m2;
    T dw = deltas[right_x] * s3 + m3;
    T dh = deltas[right_y] * s4 + m4;

    T max_ratio = abs(log(ratio_clip));

    dw = dw > max_ratio ? max_ratio : (dw < (-max_ratio) ? (-max_ratio) : dw);
    dh = dh > max_ratio ? max_ratio : (dh < (-max_ratio) ? (-max_ratio) : dh);

    T px = (rois[left_x] + rois[right_x]) * 0.5f;
    T py = (rois[left_y] + rois[right_y]) * 0.5f;
    T pw = rois[right_x] - rois[left_x] + 1.0f;
    T ph = rois[right_y] - rois[left_y] + 1.0f;

    T gx = px + pw * dx;
    T gy = py + ph * dy;
    T gw = pw * exp(dw);
    T gh = ph * exp(dh);

    T x1 = gx - gw * 0.5f + 0.5f;
    T y1 = gy - gh * 0.5f + 0.5f;
    T x2 = gx + gw * 0.5f - 0.5f;
    T y2 = gy + gh * 0.5f - 0.5f;

    x1 = x1 > max_width ? max_width : (x1 < 0 ? 0 : x1);
    y1 = y1 > max_height ? max_height : (y1 < 0 ? 0 : y1);
    x2 = x2 > max_width ? max_width : (x2 < 0 ? 0 : x2);
    y2 = y2 > max_height ? max_height : (y2 < 0 ? 0 : y2);

    bboxes[left_x] = x1;
    bboxes[left_y] = y1;
    bboxes[right_x] = x2;
    bboxes[right_y] = y2;
  }
}

template <typename T>
void BoundingBoxDecode(const size_t size, const T *rois, const T *deltas, T *bboxes, const float &m1, const float &m2,
                       const float &m3, const float &m4, const float &s1, const float &s2, const float &s3,
                       const float &s4, const int &max_height, const int &max_width, const float &ratio_clip,
                       cudaStream_t cuda_stream) {
  BoundingBoxDecodeKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, rois, deltas, bboxes, m1, m2, m3,
                                                                             m4, s1, s2, s3, s4, max_height, max_width,
                                                                             ratio_clip);
}

template void BoundingBoxDecode<float>(const size_t size, const float *rois, const float *deltas, float *bboxes,
                                       const float &m1, const float &m2, const float &m3, const float &m4,
                                       const float &s1, const float &s2, const float &s3, const float &s4,
                                       const int &max_height, const int &max_width, const float &ratio_clip,
                                       cudaStream_t cuda_stream);
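Note (not part of the diff): BoundingBoxDecodeKernel is the usual delta-to-box transform. With deltas \(\delta\), per-coordinate means \(m_k\), stds \(s_k\), and ratio clip \(r\), it computes (x/w shown; y/h analogous):

\[
\begin{aligned}
d_x &= \delta_x s_1 + m_1, \qquad d_w = \operatorname{clamp}(\delta_w s_3 + m_3,\; -|\ln r|,\; |\ln r|),\\
p_x &= \tfrac{1}{2}(x_1 + x_2), \qquad p_w = x_2 - x_1 + 1 \quad (\text{from rois}),\\
g_x &= p_x + p_w d_x, \qquad g_w = p_w e^{d_w},\\
x_1' &= g_x - \tfrac{1}{2}g_w + \tfrac{1}{2}, \qquad x_2' = g_x + \tfrac{1}{2}g_w - \tfrac{1}{2},
\end{aligned}
\]

with the result clamped to \([0, \text{max\_width}]\) for x and \([0, \text{max\_height}]\) for y.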
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/boundingbox_decode_impl.cuh
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
void BoundingBoxDecode(const size_t size, const T *rois, const T *deltas, T *bboxes, const float &m1, const float &m2,
                       const float &m3, const float &m4, const float &s1, const float &s2, const float &s3,
                       const float &s4, const int &max_height, const int &max_width, const float &ratio_clip,
                       cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_DECODE_IMPL_H_
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/boundingbox_encode_impl.cu
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/boundingbox_encode_impl.cuh"
template <typename T>
__global__ void BoundingBoxEncodeKernel(const size_t size, const T *anchor_box, const T *groundtruth_box, T *deltas,
                                        const float m1, const float m2, const float m3, const float m4, const float s1,
                                        const float s2, const float s3, const float s4) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
    const size_t left_x = i * 4;
    const size_t left_y = i * 4 + 1;
    const size_t right_x = i * 4 + 2;
    const size_t right_y = i * 4 + 3;

    T px = (anchor_box[left_x] + anchor_box[right_x]) * 0.5f;
    T py = (anchor_box[left_y] + anchor_box[right_y]) * 0.5f;
    T pw = anchor_box[right_x] - anchor_box[left_x] + 1.0f;
    T ph = anchor_box[right_y] - anchor_box[left_y] + 1.0f;

    T gx = (groundtruth_box[left_x] + groundtruth_box[right_x]) * 0.5f;
    T gy = (groundtruth_box[left_y] + groundtruth_box[right_y]) * 0.5f;
    T gw = groundtruth_box[right_x] - groundtruth_box[left_x] + 1.0f;
    T gh = groundtruth_box[right_y] - groundtruth_box[left_y] + 1.0f;

    T dx = (gx - px) / pw;
    T dy = (gy - py) / ph;
    T dw = log(gw / pw);
    T dh = log(gh / ph);

    deltas[left_x] = (dx - m1) / s1;
    deltas[left_y] = (dy - m2) / s2;
    deltas[right_x] = (dw - m3) / s3;
    deltas[right_y] = (dh - m4) / s4;
  }
}

template <typename T>
void BoundingBoxEncode(const size_t size, const T *anchor_box, const T *groundtruth_box, T *deltas, const float &m1,
                       const float &m2, const float &m3, const float &m4, const float &s1, const float &s2,
                       const float &s3, const float &s4, cudaStream_t cuda_stream) {
  BoundingBoxEncodeKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, anchor_box, groundtruth_box, deltas,
                                                                             m1, m2, m3, m4, s1, s2, s3, s4);
}

template void BoundingBoxEncode<float>(const size_t size, const float *anchor_box, const float *groundtruth_box,
                                       float *deltas, const float &m1, const float &m2, const float &m3,
                                       const float &m4, const float &s1, const float &s2, const float &s3,
                                       const float &s4, cudaStream_t cuda_stream);
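Note (not part of the diff): BoundingBoxEncodeKernel is the inverse transform, turning an anchor / ground-truth pair into normalized deltas:

\[
\begin{aligned}
p_x &= \tfrac{1}{2}(x_1^{a} + x_2^{a}), \quad p_w = x_2^{a} - x_1^{a} + 1, \qquad
g_x = \tfrac{1}{2}(x_1^{gt} + x_2^{gt}), \quad g_w = x_2^{gt} - x_1^{gt} + 1,\\
\delta_x &= \frac{(g_x - p_x)/p_w - m_1}{s_1}, \qquad
\delta_w = \frac{\ln(g_w / p_w) - m_3}{s_3} \qquad (\text{and analogously } \delta_y, \delta_h).
\end{aligned}
\]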
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/boundingbox_encode_impl.cuh
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
void BoundingBoxEncode(const size_t size, const T *anchor_box, const T *groundtruth_box, T *deltas, const float &m1,
                       const float &m2, const float &m3, const float &m4, const float &s1, const float &s2,
                       const float &s3, const float &s4, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_BOUNDINGBOX_ENCODE_IMPL_H_
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
@@ -69,6 +69,25 @@ struct AddFunc {
  __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return (lhs + rhs); }
};

template <typename T, typename S>
struct FloorDivFunc {
  __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) { return floor(static_cast<float>(lhs / rhs)); }
};

template <>
struct FloorDivFunc<half, half> {
  __device__ __forceinline__ half operator()(const half &lhs, const half &rhs) {
    return __float2half(floor(__half2float(lhs) / __half2float(rhs)));
  }
};

template <>
struct FloorDivFunc<half, bool> {
  // invalid branch
  __device__ __forceinline__ half operator()(const half &lhs, const half &rhs) { return false; }
};

template <>
struct PowerFunc<half, bool> {
  // invalid branch
@@ -77,6 +96,7 @@ struct PowerFunc<half, bool> {
__device__ __forceinline__ int Index(const int &index, const int &dim) { return dim == 1 ? 0 : index; }

template <typename T, typename S, typename Func>
__device__ __forceinline__ void BroadcastOperator(const int &l0, const int &l1, const int &l2, const int &l3,
                                                  const int &r0, const int &r1, const int &r2, const int &r3,
@@ -116,16 +136,19 @@ __global__ void BroadcastKernel(const int l0, const int l1, const int l2, const
                                                       output);
    case BROADCAST_TYPE_REALDIV:
      return BroadcastOperator<T, S, RealDivFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
                                                        output);
    case BROADCAST_TYPE_MUL:
      return BroadcastOperator<T, S, MulFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
                                                    output);
    case BROADCAST_TYPE_SUB:
      return BroadcastOperator<T, S, SubFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
                                                    output);
    case BROADCAST_TYPE_ADD:
      return BroadcastOperator<T, S, AddFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
                                                    output);
    case BROADCAST_TYPE_FLOORDIV:
      return BroadcastOperator<T, S, FloorDivFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0,
                                                         input1, output);
  }
}
@@ -167,6 +190,8 @@ __global__ void NoBroadcastKernel(const int nums, enum BroadcastOpType op, const
      return NoBroadcastOperator<T, S, SubFunc<T, S>>(nums, input0, input1, output);
    case BROADCAST_TYPE_ADD:
      return NoBroadcastOperator<T, S, AddFunc<T, S>>(nums, input0, input1, output);
    case BROADCAST_TYPE_FLOORDIV:
      return NoBroadcastOperator<T, S, FloorDivFunc<T, S>>(nums, input0, input1, output);
  }
}
@@ -195,7 +220,7 @@ void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, con
                 const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream) {
  int nums = o0 * o1 * o2 * o3;
  BroadcastToKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(i0, i1, i2, i3, o0, o1, o2, o3, input_addr,
                                                                  output_addr);
}

template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
@@ -226,9 +251,8 @@ template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *
                          bool *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1,
                          half *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, int *output,
                          cudaStream_t stream);
template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
                          const int &o2, const int &o3, const float *input_addr, float *output_addr,
                          cudaStream_t stream);
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh
@@ -29,6 +29,7 @@ enum BroadcastOpType {
  BROADCAST_TYPE_MUL = 6,
  BROADCAST_TYPE_SUB = 7,
  BROADCAST_TYPE_ADD = 8,
  BROADCAST_TYPE_FLOORDIV = 9,
  BROADCAST_TYPE_INVALID = 0xffffffff,
};
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/gathernd.cu
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/gathernd.cuh"
#include "runtime/device/gpu/cuda_common.h"
template <typename T, typename S>
__global__ void GatherNdKernel(T *input, S *indices, T *output, const size_t output_dim0, const size_t output_dim1,
                               const size_t indices_dim1, S *batch_indices, S *batch_strides) {
  int num = output_dim0 * output_dim1;
  int i, j;
  for (int write_index = blockIdx.x * blockDim.x + threadIdx.x; write_index < num;
       write_index += blockDim.x * gridDim.x) {
    i = write_index / output_dim1 % output_dim0;
    j = write_index % output_dim1;

    bool out_of_bound = false;
    int read_index = 0;
    int indices_i = 0;
    for (size_t k = 0; k < indices_dim1; k++) {
      size_t ind = indices_dim1 * i + k;
      indices_i = indices[ind];
      out_of_bound |= !(indices_i < batch_indices[k]);
      read_index += indices_i * batch_strides[k];
    }
    read_index += j;

    if (!out_of_bound) {
      output[write_index] = input[read_index];
    } else {
      output[write_index] = 0;
    }
  }
  return;
}

template <typename T, typename S>
void GatherNd(T *input, S *indices, T *output, const size_t &output_dim0, const size_t &output_dim1,
              const size_t &indices_dim1, S *batch_indices, S *batch_strides, cudaStream_t stream) {
  int size = output_dim0 * output_dim1;
  GatherNdKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, indices, output, output_dim0, output_dim1,
                                                               indices_dim1, batch_indices, batch_strides);
  return;
}

template void GatherNd<float, int>(float *input, int *indices, float *output, const size_t &output_dim0,
                                   const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
                                   int *batch_strides, cudaStream_t stream);
template void GatherNd<half, int>(half *input, int *indices, half *output, const size_t &output_dim0,
                                  const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
                                  int *batch_strides, cudaStream_t stream);
template void GatherNd<int, int>(int *input, int *indices, int *output, const size_t &output_dim0,
                                 const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
                                 int *batch_strides, cudaStream_t stream);
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/gathernd.cuh
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_GATHERND_GPU_CU_H
#define MINDSPORE_GATHERND_GPU_CU_H
#include "runtime/device/gpu/cuda_common.h"
template <typename T, typename S>
void GatherNd(T *input, S *indices, T *output, const size_t &output_dim0, const size_t &output_dim1,
              const size_t &indices_dim1, S *batch_indices, S *batch_strides, cudaStream_t stream);
#endif // MINDSPORE_GATHERND_GPU_CU_H
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/scatter_nd.cu
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/scatter_nd.cuh"
#include "runtime/device/gpu/cuda_common.h"
template <typename T, typename S>
__global__ void ScatterNdKernel(S *indices, T *update, T *output, const size_t block_size, const size_t input_size,
                                const size_t output_size, const size_t indices_dim_0, const size_t indices_dim_1,
                                S *indices_stride, S *work_shape) {
  int i, j;
  for (int read_index = blockIdx.x * blockDim.x + threadIdx.x; read_index < input_size;
       read_index += blockDim.x * gridDim.x) {
    int write_index = 0;
    bool out_bound = false;

    i = read_index / block_size;
    j = read_index % block_size;

    for (size_t k = 0; k < indices_dim_1; k++) {
      S indices_i = indices[i * indices_dim_1 + k];
      out_bound |= indices_i >= work_shape[k];
      write_index += indices_i * indices_stride[k];
    }

    write_index += j;
    out_bound |= write_index >= output_size;

    if (!out_bound) {
      output[write_index] = update[read_index];
    }
  }
}

template <typename T, typename S>
void ScatterNd(S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
               const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, S *indices_stride,
               S *work_shape, cudaStream_t stream) {
  ScatterNdKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(indices, update, output, block_size, input_size,
                                                                      output_size, indices_dim_0, indices_dim_1,
                                                                      indices_stride, work_shape);
  return;
}

template void ScatterNd<float, int>(int *indices, float *update, float *output, const size_t &block_size,
                                    const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
                                    const size_t &indices_dim_1, int *indices_stride, int *work_shape,
                                    cudaStream_t stream);
template void ScatterNd<half, int>(int *indices, half *update, half *output, const size_t &block_size,
                                   const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
                                   const size_t &indices_dim_1, int *indices_stride, int *work_shape,
                                   cudaStream_t stream);
template void ScatterNd<int, int>(int *indices, int *update, int *output, const size_t &block_size,
                                  const size_t &input_size, const size_t &output_size, const size_t &indices_dim_0,
                                  const size_t &indices_dim_1, int *indices_stride, int *work_shape,
                                  cudaStream_t stream);
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/scatter_nd.cuh
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_SCATTER_ND_GPU_CU_H
#define MINDSPORE_SCATTER_ND_GPU_CU_H
#include "runtime/device/gpu/cuda_common.h"
template <typename T, typename S>
void ScatterNd(S *indices, T *update, T *output, const size_t &block_size, const size_t &input_size,
               const size_t &output_size, const size_t &indices_dim_0, const size_t &indices_dim_1, S *indices_stride,
               S *work_shape, cudaStream_t stream);
#endif // MINDSPORE_SCATTER_ND_GPU_CU_H
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sgd_impl.cu
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include "backend/kernel_compiler/gpu/cuda_impl/sgd_impl.cuh"
template <typename T>
__global__ void SGDKernel(const int size, const T dampening, const T weight_decay, const bool nesterov, const T *grad,
                          const T *momentum, const T *lr, T *param, T *accum, T *stat) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
    T grad_new = grad[i];
    if (weight_decay != static_cast<T>(0)) {
      grad_new += param[i] * weight_decay;
    }

    if (momentum[0] != static_cast<T>(0)) {
      if (stat[i] == static_cast<T>(0)) {
        accum[i] = grad_new;
        stat[i] = 0;
      } else {
        accum[i] = accum[i] * momentum[0] + (1.0 - dampening) * grad_new;
      }

      if (nesterov) {
        grad_new += accum[i] * momentum[0];
      } else {
        grad_new = accum[i];
      }
    }

    param[i] -= lr[0] * grad_new;
  }
}

template <typename T>
void SGD(const int size, const T dampening, const T weight_decay, const bool nesterov, const T *lr, const T *momentum,
         const T *grad, T *param, T *accum, T *stat, cudaStream_t cuda_stream) {
  SGDKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, dampening, weight_decay, nesterov, grad, momentum,
                                                               lr, param, accum, stat);
}

template void SGD(const int size, const float dampening, const float weight_decay, const bool nesterov,
                  const float *lr, const float *momentum, const float *grad, float *param, float *accum, float *stat,
                  cudaStream_t cuda_stream);
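Note (not part of the diff): in update-rule form, SGDKernel computes per element, with \(w\) = param, \(g\) = grad, \(v\) = accum, \(\lambda\) = weight_decay, \(\mu\) = momentum[0], and \(\tau\) = dampening:

\[
\begin{aligned}
g &\leftarrow g + \lambda\,w \quad (\text{if } \lambda \neq 0),\\
v &\leftarrow \begin{cases} g & \text{if } stat = 0\\ \mu\,v + (1-\tau)\,g & \text{otherwise}\end{cases} \quad (\text{only if } \mu \neq 0),\\
g &\leftarrow \begin{cases} g + \mu\,v & \text{nesterov}\\ v & \text{otherwise,}\end{cases}\\
w &\leftarrow w - \text{lr}\cdot g.
\end{aligned}
\]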
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sgd_impl.cuh
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SGD_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SGD_IMPL_H_
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
void SGD(const int size, const T dampening, const T weight_decay, const bool nesterov, const T *lr, const T *momentum,
         const T *grad, T *param, T *accum, T *stat, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SGD_IMPL_H_
mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.cc
@@ -51,6 +51,10 @@ MS_REG_GPU_KERNEL_TWO(
  TensorAdd,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float, float)
MS_REG_GPU_KERNEL_TWO(
  FloorDiv,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float, float)

// fp16
MS_REG_GPU_KERNEL_TWO(
@@ -85,6 +89,10 @@ MS_REG_GPU_KERNEL_TWO(
  TensorAdd,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half, half)
MS_REG_GPU_KERNEL_TWO(
  FloorDiv,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half, half)

// int32
MS_REG_GPU_KERNEL_TWO(
mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h
@@ -96,10 +96,10 @@ class BroadcastOpGpuKernel : public GpuKernel {
     std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
     static std::map<std::string, BroadcastOpType> kBroadcastTypeMap = {
-      {"Greater", BROADCAST_TYPE_GREATER},  {"Less", BROADCAST_TYPE_LESS},  {"Maximum", BROADCAST_TYPE_MAXIMUM},
-      {"Minimum", BROADCAST_TYPE_MINIMUM},  {"Pow", BROADCAST_TYPE_POWER},  {"RealDiv", BROADCAST_TYPE_REALDIV},
-      {"FloorDiv", BROADCAST_TYPE_REALDIV}, {"Mul", BROADCAST_TYPE_MUL},    {"Sub", BROADCAST_TYPE_SUB},
-      {"TensorAdd", BROADCAST_TYPE_ADD},
+      {"Greater", BROADCAST_TYPE_GREATER}, {"Less", BROADCAST_TYPE_LESS},  {"Maximum", BROADCAST_TYPE_MAXIMUM},
+      {"Minimum", BROADCAST_TYPE_MINIMUM}, {"Pow", BROADCAST_TYPE_POWER},  {"RealDiv", BROADCAST_TYPE_REALDIV},
+      {"Mul", BROADCAST_TYPE_MUL},         {"Sub", BROADCAST_TYPE_SUB},    {"TensorAdd", BROADCAST_TYPE_ADD},
+      {"FloorDiv", BROADCAST_TYPE_FLOORDIV},
     };
     auto iter = kBroadcastTypeMap.find(kernel_name);
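Note (not part of the diff): together with the broadcast_impl.cu/.cuh changes above, FloorDiv follows a fixed wiring pattern: a new BroadcastOpType value, a FloorDivFunc device functor, a dispatch case in the broadcast kernels, the name-to-type map entry here, and the kernel registrations in broadcast_gpu_kernel.cc. A compressed, host-side sketch of that pattern (illustrative only; the names mirror the diff but the surrounding scaffolding is simplified):

// Illustrative sketch of the op-type wiring; not MindSpore code.
#include <cmath>
#include <iostream>
#include <map>
#include <string>

enum BroadcastOpType { BROADCAST_TYPE_ADD = 8, BROADCAST_TYPE_FLOORDIV = 9 };

template <typename T, typename S>
struct FloorDivFunc {
  S operator()(const T &lhs, const T &rhs) const { return std::floor(static_cast<float>(lhs / rhs)); }
};

template <typename T, typename S>
S Dispatch(BroadcastOpType op, const T &lhs, const T &rhs) {
  switch (op) {
    case BROADCAST_TYPE_FLOORDIV:
      return FloorDivFunc<T, S>()(lhs, rhs);
    default:
      return lhs + rhs;  // stand-in for the other op types
  }
}

int main() {
  // Kernel name -> op type, as in kBroadcastTypeMap.
  std::map<std::string, BroadcastOpType> kMap = {{"TensorAdd", BROADCAST_TYPE_ADD},
                                                 {"FloorDiv", BROADCAST_TYPE_FLOORDIV}};
  std::cout << Dispatch<float, float>(kMap["FloorDiv"], 7.0f, 2.0f) << std::endl;  // prints 3
  return 0;
}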
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sgd_gpu_kernel.cc
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/nn/sgd_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(SGD,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      SGDGpuKernel, float)
}  // namespace kernel
}  // namespace mindspore
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sgd_gpu_kernel.h
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_NN_SGD_KERNEL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_NN_SGD_KERNEL_H_
#include <vector>
#include "backend/kernel_compiler/gpu/cuda_impl/sgd_impl.cuh"
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
template <typename T>
class SGDGpuKernel : public GpuKernel {
 public:
  SGDGpuKernel() : size_(1), dampening_(0.0), weight_decay_(0.0), nesterov_(false) {}
  ~SGDGpuKernel() override = default;

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream) override {
    T *param = GetDeviceAddress<T>(inputs, 0);
    T *grad = GetDeviceAddress<T>(inputs, 1);
    T *lr = GetDeviceAddress<T>(inputs, 2);
    T *accum = GetDeviceAddress<T>(inputs, 3);
    T *momentum = GetDeviceAddress<T>(inputs, 4);
    T *stat = GetDeviceAddress<T>(inputs, 5);

    SGD(size_, dampening_, weight_decay_, nesterov_, lr, momentum, grad, param, accum, stat,
        reinterpret_cast<cudaStream_t>(stream));
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    dampening_ = GetAttr<float>(kernel_node, "dampening");
    weight_decay_ = GetAttr<float>(kernel_node, "weight_decay");
    nesterov_ = GetAttr<bool>(kernel_node, "nesterov");

    auto input_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    for (auto &dim : input_shape) {
      size_ *= dim;
    }
    InitSizeLists();
    return true;
  }

 protected:
  void InitSizeLists() override {
    size_t input_size = size_ * sizeof(T);
    input_size_list_.push_back(input_size);  // parameter
    input_size_list_.push_back(input_size);  // gradient
    input_size_list_.push_back(sizeof(T));   // lr
    input_size_list_.push_back(input_size);  // accum
    input_size_list_.push_back(sizeof(T));   // momentum
    input_size_list_.push_back(input_size);  // stat
    output_size_list_.push_back(input_size);
  }

 private:
  size_t size_;
  float dampening_;
  float weight_decay_;
  bool nesterov_;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
};
}  // namespace kernel
}  // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_NN_SGD_KERNEL_H_
mindspore/ccsrc/backend/kernel_compiler/gpu/other/boundingbox_decode_gpu_kernel.cc
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/other/boundingbox_decode_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(
  BoundingBoxDecode,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BoundingBoxDecodeGpuKernel, float)
}  // namespace kernel
}  // namespace mindspore
mindspore/ccsrc/backend/kernel_compiler/gpu/other/boundingbox_decode_gpu_kernel.h
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_OTHER_BOUNDINGBOX_DECODE_GPU_KERNEL_H
#define MINDSPORE_CCSRC_KERNEL_GPU_OTHER_BOUNDINGBOX_DECODE_GPU_KERNEL_H
#include <vector>
#include "backend/kernel_compiler/gpu/cuda_impl/boundingbox_decode_impl.cuh"
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
template <typename T>
class BoundingBoxDecodeGpuKernel : public GpuKernel {
 public:
  BoundingBoxDecodeGpuKernel() : rois_size_(0), deltas_size_(0), bboxes_size_(0), wh_ratio_clip_(0.016) {}
  ~BoundingBoxDecodeGpuKernel() override = default;

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    T *rois_addr = GetDeviceAddress<T>(inputs, 0);
    T *deltas_addr = GetDeviceAddress<T>(inputs, 1);
    T *bboxes_addr = GetDeviceAddress<T>(outputs, 0);

    if (inputs[0]->size != inputs[1]->size) {
      MS_LOG(ERROR) << "Rois box size must equal with deltas box size -" << inputs[1]->size << ", but got"
                    << inputs[0]->size;
      return false;
    }

    const size_t coordinate = 4;
    const size_t block_size = inputs[0]->size / sizeof(T);
    if ((block_size % coordinate) != 0) {
      MS_LOG(ERROR) << "The size of the box must be a multiple of 4.";
      return false;
    }

    BoundingBoxDecode(block_size / coordinate, rois_addr, deltas_addr, bboxes_addr, means_[0], means_[1], means_[2],
                      means_[3], stds_[0], stds_[1], stds_[2], stds_[3], max_shape_[0], max_shape_[1], wh_ratio_clip_,
                      reinterpret_cast<cudaStream_t>(stream_ptr));

    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but BoundingBoxDecode needs 2 inputs.";
      return false;
    }
    rois_size_ = sizeof(T);
    deltas_size_ = sizeof(T);
    bboxes_size_ = sizeof(T);

    auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    for (size_t i = 0; i < logits_shape.size(); i++) {
      rois_size_ *= logits_shape[i];
    }

    auto labels_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    for (size_t i = 0; i < labels_shape.size(); i++) {
      deltas_size_ *= labels_shape[i];
    }

    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    for (size_t i = 0; i < output_shape.size(); i++) {
      bboxes_size_ *= output_shape[i];
    }

    InitSizeLists();

    const size_t coordinate_size = 4;
    if (AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("means")->isa<ValueTuple>() ||
        AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("means")->isa<ValueList>()) {
      means_ = GetAttr<std::vector<float>>(kernel_node, "means");
    } else if (AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("means")->isa<FloatImm>()) {
      float mean = GetAttr<int>(kernel_node, "means");
      for (size_t i = 0; i < coordinate_size; i++) {
        means_.emplace_back(mean);
      }
    } else {
      MS_LOG(EXCEPTION) << "Attribute means type is invalid.";
    }

    if (AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("stds")->isa<ValueTuple>() ||
        AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("stds")->isa<ValueList>()) {
      stds_ = GetAttr<std::vector<float>>(kernel_node, "stds");
    } else if (AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("stds")->isa<FloatImm>()) {
      float std = GetAttr<int>(kernel_node, "stds");
      for (size_t i = 0; i < coordinate_size; i++) {
        stds_.emplace_back(std);
      }
    } else {
      MS_LOG(EXCEPTION) << "Attribute stds type is invalid.";
    }

    max_shape_ = GetAttr<std::vector<int>>(kernel_node, "max_shape");
    wh_ratio_clip_ = GetAttr<float>(kernel_node, "wh_ratio_clip");

    if (means_.size() < coordinate_size || stds_.size() < coordinate_size) {
      MS_LOG(EXCEPTION) << "The size of means or stds is less than 4.";
    }
    if (max_shape_.size() < 2) {
      MS_LOG(EXCEPTION) << "The size of max_shape is less than 2.";
    }

    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(rois_size_);
    input_size_list_.push_back(deltas_size_);
    output_size_list_.push_back(bboxes_size_);
  }

 private:
  size_t rois_size_;
  size_t deltas_size_;
  size_t bboxes_size_;

  std::vector<float> means_;
  std::vector<float> stds_;
  std::vector<int> max_shape_;
  float wh_ratio_clip_;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
};
}  // namespace kernel
}  // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_OTHER_BOUNDINGBOX_DECODE_GPU_KERNEL_H
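
For reference, the delta-to-box transform that this kernel launches (BoundingBoxDecode in boundingbox_decode_impl.cu) follows the usual anchor-decoding convention: denormalize the deltas with means/stds, clamp the log-space width/height terms by wh_ratio_clip, shift and scale the roi centers, and clip the result to max_shape. Below is a minimal NumPy sketch of that convention; decode_reference is a hypothetical helper written only for illustration. It reproduces the expected values used in tests/st/ops/gpu/test_boundingbox_decode_op.py, but it is not the CUDA implementation itself.

import numpy as np

def decode_reference(rois, deltas, means, stds, max_shape, wh_ratio_clip=0.016):
    # Denormalize deltas with the per-coordinate means and stds.
    dx = deltas[:, 0] * stds[0] + means[0]
    dy = deltas[:, 1] * stds[1] + means[1]
    dw = deltas[:, 2] * stds[2] + means[2]
    dh = deltas[:, 3] * stds[3] + means[3]

    # Clamp the log-space width/height deltas by wh_ratio_clip.
    max_ratio = abs(np.log(wh_ratio_clip))
    dw = np.clip(dw, -max_ratio, max_ratio)
    dh = np.clip(dh, -max_ratio, max_ratio)

    # Center and size of the input rois ((x1, y1, x2, y2) convention).
    px = (rois[:, 0] + rois[:, 2]) * 0.5
    py = (rois[:, 1] + rois[:, 3]) * 0.5
    pw = rois[:, 2] - rois[:, 0] + 1.0
    ph = rois[:, 3] - rois[:, 1] + 1.0

    # Shift the centers and rescale the sizes.
    gx = px + pw * dx
    gy = py + ph * dy
    gw = pw * np.exp(dw)
    gh = ph * np.exp(dh)

    # Back to corners, clipped into the image given by max_shape = (h, w).
    x1 = np.clip(gx - gw * 0.5 + 0.5, 0, max_shape[1] - 1)
    y1 = np.clip(gy - gh * 0.5 + 0.5, 0, max_shape[0] - 1)
    x2 = np.clip(gx + gw * 0.5 - 0.5, 0, max_shape[1] - 1)
    y2 = np.clip(gy + gh * 0.5 - 0.5, 0, max_shape[0] - 1)
    return np.stack([x1, y1, x2, y2], axis=-1)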
mindspore/ccsrc/backend/kernel_compiler/gpu/other/boundingbox_encode_gpu_kernel.cc
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/other/boundingbox_encode_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(
  BoundingBoxEncode,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BoundingBoxEncodeGpuKernel, float)
}  // namespace kernel
}  // namespace mindspore
mindspore/ccsrc/backend/kernel_compiler/gpu/other/boundingbox_encode_gpu_kernel.h
0 → 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_OTHER_BOUNDINGBOX_ENCODE_GPU_KERNEL_H
#define MINDSPORE_CCSRC_KERNEL_GPU_OTHER_BOUNDINGBOX_ENCODE_GPU_KERNEL_H
#include <vector>
#include "backend/kernel_compiler/gpu/cuda_impl/boundingbox_encode_impl.cuh"
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
template <typename T>
class BoundingBoxEncodeGpuKernel : public GpuKernel {
 public:
  BoundingBoxEncodeGpuKernel() : anchor_size_(0), groundtruth_size_(0), deltas_size_(0) {}
  ~BoundingBoxEncodeGpuKernel() override = default;

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    T *anchor_addr = GetDeviceAddress<T>(inputs, 0);
    T *groundtruth_addr = GetDeviceAddress<T>(inputs, 1);
    T *deltas_addr = GetDeviceAddress<T>(outputs, 0);

    if (inputs[0]->size != inputs[1]->size) {
      MS_LOG(ERROR) << "Anchor box size must be equal to groundtruth box size: " << inputs[1]->size << ", but got "
                    << inputs[0]->size;
      return false;
    }

    const size_t coordinate = 4;
    const size_t block_size = inputs[0]->size / sizeof(T);
    if ((block_size % coordinate) != 0) {
      MS_LOG(ERROR) << "The size of the box must be a multiple of 4.";
      return false;
    }

    BoundingBoxEncode(block_size / coordinate, anchor_addr, groundtruth_addr, deltas_addr, means_[0], means_[1],
                      means_[2], means_[3], stds_[0], stds_[1], stds_[2], stds_[3],
                      reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but BoundingBoxEncode needs 2 inputs.";
      return false;
    }

    anchor_size_ = sizeof(T);
    groundtruth_size_ = sizeof(T);
    deltas_size_ = sizeof(T);

    auto anchor_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    for (size_t i = 0; i < anchor_shape.size(); i++) {
      anchor_size_ *= anchor_shape[i];
    }

    auto groundtruth_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    for (size_t i = 0; i < groundtruth_shape.size(); i++) {
      groundtruth_size_ *= groundtruth_shape[i];
    }

    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    for (size_t i = 0; i < output_shape.size(); i++) {
      deltas_size_ *= output_shape[i];
    }

    InitSizeLists();

    const size_t coordinate_size = 4;
    if (AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("means")->isa<ValueTuple>() ||
        AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("means")->isa<ValueList>()) {
      means_ = GetAttr<std::vector<float>>(kernel_node, "means");
    } else if (AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("means")->isa<FloatImm>()) {
      float mean = GetAttr<float>(kernel_node, "means");
      for (size_t i = 0; i < coordinate_size; i++) {
        means_.emplace_back(mean);
      }
    } else {
      MS_LOG(EXCEPTION) << "Attribute means type is invalid.";
    }

    if (AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("stds")->isa<ValueTuple>() ||
        AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("stds")->isa<ValueList>()) {
      stds_ = GetAttr<std::vector<float>>(kernel_node, "stds");
    } else if (AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("stds")->isa<FloatImm>()) {
      float std = GetAttr<float>(kernel_node, "stds");
      for (size_t i = 0; i < coordinate_size; i++) {
        stds_.emplace_back(std);
      }
    } else {
      MS_LOG(EXCEPTION) << "Attribute stds type is invalid.";
    }

    if (means_.size() < coordinate_size || stds_.size() < coordinate_size) {
      MS_LOG(EXCEPTION) << "The size of means or stds is less than 4.";
    }

    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(anchor_size_);
    input_size_list_.push_back(groundtruth_size_);
    output_size_list_.push_back(deltas_size_);
  }

 private:
  size_t anchor_size_;
  size_t groundtruth_size_;
  size_t deltas_size_;
  std::vector<float> means_;
  std::vector<float> stds_;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
};
}  // namespace kernel
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_KERNEL_GPU_OTHER_BOUNDINGBOX_ENCODE_GPU_KERNEL_H
tests/st/ops/gpu/test_boundingbox_decode_op.py
0 → 100644
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest

import mindspore
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P


class NetBoundingBoxDecode(nn.Cell):
    def __init__(self, means=(0.0, 0.0, 0.0, 0.0), stds=(1.0, 1.0, 1.0, 1.0)):
        super(NetBoundingBoxDecode, self).__init__()
        self.decode = P.BoundingBoxDecode(max_shape=(768, 1280), means=means, stds=stds,
                                          wh_ratio_clip=0.016)

    def construct(self, anchor, groundtruth):
        return self.decode(anchor, groundtruth)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_boundingbox_decode():
    anchor = np.array([[4, 1, 2, 1], [2, 2, 2, 3]], np.float32)
    deltas = np.array([[3, 1, 2, 2], [1, 2, 1, 4]], np.float32)
    means = (0.1, 0.1, 0.2, 0.2)
    stds = (2.0, 2.0, 3.0, 3.0)
    anchor_box = Tensor(anchor, mindspore.float32)
    deltas_box = Tensor(deltas, mindspore.float32)
    expect_deltas = np.array([[28.6500, 0.0000, 0.0000, 33.8500],
                              [0.0000, 0.0000, 15.8663, 72.7000]], np.float32)

    error = np.ones(shape=[2, 4]) * 1.0e-4

    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
    boundingbox_decode = NetBoundingBoxDecode(means, stds)
    output = boundingbox_decode(anchor_box, deltas_box)
    diff = output.asnumpy() - expect_deltas
    assert np.all(abs(diff) < error)

    context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')
    boundingbox_decode = NetBoundingBoxDecode(means, stds)
    output = boundingbox_decode(anchor_box, deltas_box)
    diff = output.asnumpy() - expect_deltas
    assert np.all(abs(diff) < error)
tests/st/ops/gpu/test_boundingbox_encode_op.py
0 → 100644
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest

import mindspore
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P


class NetBoundingBoxEncode(nn.Cell):
    def __init__(self, means=(0.0, 0.0, 0.0, 0.0), stds=(1.0, 1.0, 1.0, 1.0)):
        super(NetBoundingBoxEncode, self).__init__()
        self.encode = P.BoundingBoxEncode(means=means, stds=stds)

    def construct(self, anchor, groundtruth):
        return self.encode(anchor, groundtruth)


def bbox2delta(proposals, gt, means, stds):
    px = (proposals[..., 0] + proposals[..., 2]) * 0.5
    py = (proposals[..., 1] + proposals[..., 3]) * 0.5
    pw = proposals[..., 2] - proposals[..., 0] + 1.0
    ph = proposals[..., 3] - proposals[..., 1] + 1.0

    gx = (gt[..., 0] + gt[..., 2]) * 0.5
    gy = (gt[..., 1] + gt[..., 3]) * 0.5
    gw = gt[..., 2] - gt[..., 0] + 1.0
    gh = gt[..., 3] - gt[..., 1] + 1.0

    dx = (gx - px) / pw
    dy = (gy - py) / ph
    dw = np.log(gw / pw)
    dh = np.log(gh / ph)

    means = np.array(means, np.float32)
    stds = np.array(stds, np.float32)
    deltas = np.stack([(dx - means[0]) / stds[0],
                       (dy - means[1]) / stds[1],
                       (dw - means[2]) / stds[2],
                       (dh - means[3]) / stds[3]], axis=-1)
    return deltas


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_boundingbox_encode():
    anchor = np.array([[4, 1, 6, 9], [2, 5, 5, 9]]).astype(np.float32)
    gt = np.array([[3, 2, 7, 7], [1, 5, 5, 8]]).astype(np.float32)
    means = (0.1, 0.1, 0.2, 0.2)
    stds = (2.0, 2.0, 3.0, 3.0)
    anchor_box = Tensor(anchor, mindspore.float32)
    groundtruth_box = Tensor(gt, mindspore.float32)
    expect_deltas = bbox2delta(anchor, gt, means, stds)

    error = np.ones(shape=[2, 4]) * 1.0e-6

    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
    boundingbox_encode = NetBoundingBoxEncode(means, stds)
    output = boundingbox_encode(anchor_box, groundtruth_box)
    diff = output.asnumpy() - expect_deltas
    assert np.all(abs(diff) < error)

    context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')
    boundingbox_encode = NetBoundingBoxEncode(means, stds)
    output = boundingbox_encode(anchor_box, groundtruth_box)
    diff = output.asnumpy() - expect_deltas
    assert np.all(abs(diff) < error)
tests/st/ops/gpu/test_floordiv_op.py
0 → 100644
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest

import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P


class NetFloorDiv(nn.Cell):
    def __init__(self):
        super(NetFloorDiv, self).__init__()
        self.floordiv = P.FloorDiv()

    def construct(self, x, y):
        return self.floordiv(x, y)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_floor_div():
    x0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
    y0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
    x1_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
    y1_np = np.random.randint(1, 5, (2, 1, 4, 4)).astype(np.float32)
    x2_np = np.random.randint(1, 5, (2, 1, 1, 4)).astype(np.float32)
    y2_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
    x3_np = np.random.randint(1, 5, 1).astype(np.float32)
    y3_np = np.random.randint(1, 5, 1).astype(np.float32)
    x4_np = np.array(768).astype(np.float32)
    y4_np = np.array(3072.5).astype(np.float32)
    x5_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float16)
    y5_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float16)
    x6_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.int32)
    y6_np = np.random.randint(1, 5, (2, 1, 4, 4)).astype(np.int32)

    x0 = Tensor(x0_np)
    y0 = Tensor(y0_np)
    x1 = Tensor(x1_np)
    y1 = Tensor(y1_np)
    x2 = Tensor(x2_np)
    y2 = Tensor(y2_np)
    x3 = Tensor(x3_np)
    y3 = Tensor(y3_np)
    x4 = Tensor(x4_np)
    y4 = Tensor(y4_np)
    x5 = Tensor(x5_np)
    y5 = Tensor(y5_np)
    x6 = Tensor(x6_np)
    y6 = Tensor(y6_np)

    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
    floor_div = NetFloorDiv()

    output0 = floor_div(x0, y0)
    expect0 = np.floor_divide(x0_np, y0_np)
    diff0 = output0.asnumpy() - expect0
    error0 = np.ones(shape=expect0.shape) * 1.0e-5
    assert np.all(diff0 < error0)
    assert output0.shape == expect0.shape

    output1 = floor_div(x1, y1)
    expect1 = np.floor_divide(x1_np, y1_np)
    diff1 = output1.asnumpy() - expect1
    error1 = np.ones(shape=expect1.shape) * 1.0e-5
    assert np.all(diff1 < error1)
    assert output1.shape == expect1.shape

    output2 = floor_div(x2, y2)
    expect2 = np.floor_divide(x2_np, y2_np)
    diff2 = output2.asnumpy() - expect2
    error2 = np.ones(shape=expect2.shape) * 1.0e-5
    assert np.all(diff2 < error2)
    assert output2.shape == expect2.shape

    context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')

    output3 = floor_div(x3, y3)
    expect3 = np.floor_divide(x3_np, y3_np)
    diff3 = output3.asnumpy() - expect3
    error3 = np.ones(shape=expect3.shape) * 1.0e-5
    assert np.all(diff3 < error3)
    assert output3.shape == expect3.shape

    output4 = floor_div(x4, y4)
    expect4 = np.floor_divide(x4_np, y4_np)
    diff4 = output4.asnumpy() - expect4
    error4 = np.ones(shape=expect4.shape) * 1.0e-5
    assert np.all(diff4 < error4)
    assert output4.shape == expect4.shape

    output5 = floor_div(x5, y5)
    expect5 = np.floor_divide(x5_np, y5_np)
    diff5 = output5.asnumpy() - expect5
    error5 = np.ones(shape=expect5.shape) * 1.0e-5
    assert np.all(diff5 < error5)
    assert output5.shape == expect5.shape

    output6 = floor_div(x6, y6)
    expect6 = np.floor_divide(x6_np, y6_np)
    diff6 = output6.asnumpy() - expect6
    error6 = np.ones(shape=expect6.shape) * 1.0e-5
    assert np.all(diff6 < error6)
    assert output6.shape == expect6.shape
tests/st/ops/gpu/test_gathernd_op.py
0 → 100644
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest

from mindspore import Tensor
from mindspore.ops import operations as P
import mindspore.nn as nn
import mindspore.context as context


class GatherNdNet(nn.Cell):
    def __init__(self):
        super(GatherNdNet, self).__init__()
        self.gathernd = P.GatherNd()

    def construct(self, x, indices):
        return self.gathernd(x, indices)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_gathernd0():
    x = Tensor(np.arange(3 * 2, dtype=np.float32).reshape(3, 2))
    indices = Tensor(np.array([[1, 1], [0, 1]]).astype(np.int32))
    expect = np.array([3., 1.])

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    gathernd = GatherNdNet()
    output = gathernd(x, indices)

    error = np.ones(shape=output.asnumpy().shape) * 1.0e-6
    diff = output.asnumpy() - expect
    assert np.all(diff < error)
    assert np.all(-diff < error)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_gathernd1():
    x = Tensor(np.arange(2 * 3 * 4 * 5, dtype=np.float32).reshape(2, 3, 4, 5))
    indices = Tensor(np.array([[[[[l, k, j, i] for i in [1, 3, 4]] for j in range(4)]
                                for k in range(3)] for l in range(2)], dtype='i4'))
    expect = np.array([[[[1., 3., 4.], [6., 8., 9.], [11., 13., 14.], [16., 18., 19.]],
                        [[21., 23., 24.], [26., 28., 29.], [31., 33., 34.], [36., 38., 39.]],
                        [[41., 43., 44.], [46., 48., 49.], [51., 53., 54.], [56., 58., 59.]]],
                       [[[61., 63., 64.], [66., 68., 69.], [71., 73., 74.], [76., 78., 79.]],
                        [[81., 83., 84.], [86., 88., 89.], [91., 93., 94.], [96., 98., 99.]],
                        [[101., 103., 104.], [106., 108., 109.], [111., 113., 114.], [116., 118., 119.]]]])

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    gather = GatherNdNet()
    output = gather(x, indices)

    error = np.ones(shape=output.asnumpy().shape) * 1.0e-6
    diff = output.asnumpy() - expect
    assert np.all(diff < error)
    assert np.all(-diff < error)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_gathernd2():
    x = Tensor(np.array([[4., 5., 4., 1., 5.],
                         [4., 9., 5., 6., 4.],
                         [9., 8., 4., 3., 6.],
                         [0., 4., 2., 2., 8.],
                         [1., 8., 6., 2., 8.],
                         [8., 1., 9., 7., 3.],
                         [7., 9., 2., 5., 7.],
                         [9., 8., 6., 8., 5.],
                         [3., 7., 2., 7., 4.],
                         [4., 2., 8., 2., 9.]]).astype(np.float16))
    indices = Tensor(np.array([[4000], [1], [300000]]).astype(np.int32))
    expect = np.array([[0., 0., 0., 0., 0.],
                       [4., 9., 5., 6., 4.],
                       [0., 0., 0., 0., 0.]])

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    gathernd = GatherNdNet()
    output = gathernd(x, indices)

    error = np.ones(shape=output.asnumpy().shape) * 1.0e-6
    diff = output.asnumpy() - expect
    assert np.all(diff < error)
    assert np.all(-diff < error)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_gathernd3():
    x = Tensor(np.array([[4, 5, 4, 1, 5],
                         [4, 9, 5, 6, 4],
                         [9, 8, 4, 3, 6],
                         [0, 4, 2, 2, 8],
                         [1, 8, 6, 2, 8],
                         [8, 1, 9, 7, 3],
                         [7, 9, 2, 5, 7],
                         [9, 8, 6, 8, 5],
                         [3, 7, 2, 7, 4],
                         [4, 2, 8, 2, 9]]).astype(np.int32))
    indices = Tensor(np.array([[4000], [1], [300000]]).astype(np.int32))
    expect = np.array([[0, 0, 0, 0, 0],
                       [4, 9, 5, 6, 4],
                       [0, 0, 0, 0, 0]])

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    gathernd = GatherNdNet()
    output = gathernd(x, indices)

    error = np.ones(shape=output.asnumpy().shape) * 1.0e-6
    diff = output.asnumpy() - expect
    assert np.all(diff < error)
    assert np.all(-diff < error)
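
In NumPy terms, GatherNd picks the element or slice x[i0, ..., ik-1] for each index vector along the last axis of indices. Below is a minimal sketch of that in-range behaviour; gathernd_reference is a hypothetical helper written only for illustration, and unlike the GPU kernel exercised above it does not return zeros for out-of-range index vectors.

import numpy as np

def gathernd_reference(x, indices):
    # Split the last axis of `indices` into one index array per dimension of x,
    # then use NumPy advanced indexing to gather the addressed elements/slices.
    idx = tuple(np.moveaxis(indices, -1, 0))
    return x[idx]

# Example matching test_gathernd0: gathers x[1, 1] and x[0, 1].
# x = np.arange(6, dtype=np.float32).reshape(3, 2)
# gathernd_reference(x, np.array([[1, 1], [0, 1]]))  -> array([3., 1.])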
tests/st/ops/gpu/test_scatter_nd.py
0 → 100644
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest

import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P

context.set_context(mode=context.GRAPH_MODE, device_target="GPU")


class Net(nn.Cell):
    def __init__(self, _shape):
        super(Net, self).__init__()
        self.shape = _shape
        self.scatternd = P.ScatterNd()

    def construct(self, indices, update):
        return self.scatternd(indices, update, self.shape)


def scatternd_net(indices, update, _shape, expect):
    scatternd = Net(_shape)
    output = scatternd(Tensor(indices), Tensor(update))

    error = np.ones(shape=output.asnumpy().shape) * 1.0e-6
    diff = output.asnumpy() - expect
    assert np.all(diff < error)
    assert np.all(-diff < error)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_scatternd():
    arr_indices = np.array([[0, 1], [1, 1]]).astype(np.int32)
    arr_update = np.array([3.2, 1.1]).astype(np.float32)
    shape = (2, 2)
    expect = np.array([[0., 3.2],
                       [0., 1.1]])
    scatternd_net(arr_indices, arr_update, shape, expect)
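
ScatterNd is the scatter counterpart of GatherNd: it starts from an all-zero tensor of the requested shape and writes each update at the position named by its index vector, accumulating duplicates. A minimal NumPy sketch follows; scatternd_reference is a hypothetical helper for illustration only, not the kernel in scatter_nd.cu.

import numpy as np

def scatternd_reference(indices, updates, shape):
    # Build a zero tensor and accumulate `updates` at the scattered positions;
    # np.add.at sums contributions when the same index appears more than once.
    out = np.zeros(shape, dtype=updates.dtype)
    np.add.at(out, tuple(np.moveaxis(indices, -1, 0)), updates)
    return out

# Example matching test_scatternd:
# scatternd_reference(np.array([[0, 1], [1, 1]]), np.array([3.2, 1.1], np.float32), (2, 2))
# -> array([[0. , 3.2], [0. , 1.1]], dtype=float32)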
tests/st/ops/gpu/test_sgd_op.py
0 → 100644
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest

import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.nn import Dense
from mindspore.nn import TrainOneStepCell, WithLossCell
from mindspore.nn.optim import SGD
from mindspore.ops import operations as P

context.set_context(mode=context.GRAPH_MODE, device_target="GPU")


class NetSGD(nn.Cell):
    def __init__(self):
        super(NetSGD, self).__init__()
        self.batch_size = 1
        self.reshape = P.Reshape()
        weight = Tensor(np.ones([10, 16]).astype(np.float32) * 0.01)
        self.fc1 = Dense(16, 10, weight_init=weight)

    def construct(self, input_x):
        output = self.reshape(input_x, (self.batch_size, -1))
        output = self.fc1(output)
        return output


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_SGD():
    epoch = 3
    net = NetSGD()
    learning_rate = 0.1
    momentum = 0.9
    dampening = 0.0
    weight_decay = 0.0
    nesterov = True
    loss_scale = 1.0

    optimizer = SGD(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum, dampening,
                    weight_decay, nesterov, loss_scale)
    criterion = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    net_with_criterion = WithLossCell(net, criterion)
    train_network = TrainOneStepCell(net_with_criterion, optimizer)  # optimizer
    train_network.set_train()

    losses = []
    for _ in range(epoch):
        data = Tensor(np.arange(0, 16).reshape(1, 1, 4, 4).astype(np.float32) * 0.01)
        label = Tensor(np.array([0]).astype(np.int32))
        loss = train_network(data, label)
        losses.append(loss.asnumpy())

    last_loss = 100.0
    for loss in losses:
        assert last_loss > loss
        last_loss = loss
    return losses
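
For orientation, the SGD optimizer exercised above applies a momentum update with optional dampening and a Nesterov correction. The sketch below is only the conventional textbook formula, assuming PyTorch-style semantics; the exact per-element behaviour of the new GPU kernel lives in sgd_impl.cu and may differ in details such as first-step dampening or loss-scale handling, so treat sgd_step as an illustrative assumption rather than the kernel's definition.

import numpy as np

def sgd_step(param, grad, accum, lr=0.1, momentum=0.9, dampening=0.0, nesterov=True):
    # Conventional SGD-with-momentum step (illustrative only):
    # update the momentum buffer, optionally apply the Nesterov look-ahead,
    # then move the parameter against the gradient direction.
    accum = momentum * accum + (1.0 - dampening) * grad
    update = grad + momentum * accum if nesterov else accum
    return param - lr * update, accum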