Commit cb8f3c03 authored by Zhang Ting, committed by hong

resize Ops support data_layout:channel_last, test=develop, test=document_preview (#19914)

Parent 9901f696
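A minimal usage sketch of the new argument, based on the updated Python API spec below (the tensor name and shapes here are hypothetical, not part of the patch):

    import paddle.fluid as fluid

    # channel-last 4-D input: (num_batches, in_h, in_w, channels)
    x = fluid.layers.data(name='img', shape=[6, 8, 3], dtype='float32')
    # resize to 12 x 16 while keeping the NHWC layout
    y = fluid.layers.resize_bilinear(x, out_shape=[12, 16], data_format='NHWC')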
......@@ -194,11 +194,11 @@ paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon'
paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', '49368d724023a66b41b0071be41c0ba5'))
paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '9a7a3b88a4fae41d58d3ca9b10ba0591'))
paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '7e8e4bf1f0f8612961ed113e8af8f0c5'))
paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', '0e8567334d72a214c2e3ce0ce19e4d37'))
paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1, 'NCHW')), ('document', 'd29d829607b5ff12924197a3ba296c89'))
paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', 'bd97ebfe4bdf5110a5fcb8ecb626a447'))
paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', '0a7b98e57eb74bab6e3c2a95e41298a7'))
paddle.fluid.layers.resize_trilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', '6baf2ddf375d3059e5aa74d7fde76517'))
paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', '699bf1de6af91235367e9c7a9a6e252c'))
paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1, 'NCHW')), ('document', '44da7890c8a362a83a1c0902a1dc1e4d'))
paddle.fluid.layers.resize_trilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1, 'NCDHW')), ('document', '5b4d0f823f94c260fe5e6f7eec60a797'))
paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 'NCHW')), ('document', '0107a5cbae1aef3f381d3d769a6068eb'))
paddle.fluid.layers.gather (ArgSpec(args=['input', 'index', 'overwrite'], varargs=None, keywords=None, defaults=(True,)), ('document', 'f985c9b66e3aec96fa753a8eb44c991c'))
paddle.fluid.layers.gather_nd (ArgSpec(args=['input', 'index', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3cc24f9cf135770aa6263dba25b457f9'))
paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name', 'overwrite'], varargs=None, keywords=None, defaults=(None, True)), ('document', '69b22affd4a6326502af166f04c095ab'))
......
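The spec above also adds data_format to the 3-D resize variant; a hedged sketch of the 5-D (trilinear) case under the same assumptions:

    import paddle.fluid as fluid

    # channel-last 5-D input: (num_batches, in_d, in_h, in_w, channels)
    vol = fluid.layers.data(name='vol', shape=[4, 6, 8, 3], dtype='float32')
    out = fluid.layers.resize_trilinear(vol, out_shape=[8, 12, 16],
                                        data_format='NDHWC')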
......@@ -19,6 +19,7 @@ namespace paddle {
namespace operators {
using framework::Tensor;
using DataLayout = framework::DataLayout;
static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
auto dim_x = ctx->GetInputDim("X");
......@@ -28,6 +29,8 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
"bilinear" == interp_method || "nearest" == interp_method,
"Interpolation method can only be \"bilinear\" or \"nearest\" when "
"Input(X) dimension is 4");
const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
if (ctx->HasInputs("SizeTensor")) {
// top priority size
......@@ -38,8 +41,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
"Attr(out_shape)'s length must be 2 for 4-D input tensor.");
int out_h = ctx->Attrs().Get<int>("out_h");
int out_w = ctx->Attrs().Get<int>("out_w");
std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
framework::DDim dim_out;
if (data_layout == DataLayout::kNCHW) {
dim_out = {dim_x[0], dim_x[1], out_h, out_w};
} else {
dim_out = {dim_x[0], out_h, out_w, dim_x[3]};
}
ctx->SetOutputDim("Out", dim_out);
return;
}
......@@ -55,8 +63,12 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
float scale = ctx->Attrs().Get<float>("scale");
if (scale > 0) {
// round down
out_h = static_cast<int>(dim_x[2] * scale);
out_w = static_cast<int>(dim_x[3] * scale);
out_h = (data_layout == DataLayout::kNCHW
? static_cast<int>(dim_x[2] * scale)
: static_cast<int>(dim_x[1] * scale));
out_w = (data_layout == DataLayout::kNCHW
? static_cast<int>(dim_x[3] * scale)
: static_cast<int>(dim_x[2] * scale));
// protect when input shape is -1
out_h = out_h > 0 ? out_h : -1;
out_w = out_w > 0 ? out_w : -1;
......@@ -75,8 +87,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
return;
}
std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
framework::DDim dim_out;
if (data_layout == DataLayout::kNCHW) {
dim_out = {dim_x[0], dim_x[1], out_h, out_w};
} else {
dim_out = {dim_x[0], out_h, out_w, dim_x[3]};
}
ctx->SetOutputDim("Out", dim_out);
}
static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
......@@ -86,6 +103,8 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
PADDLE_ENFORCE("trilinear" == interp_method,
"Interpolation method can only be \"trilinear\" when Input(X) "
"dimension is 5");
const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
if (ctx->HasInputs("SizeTensor")) {
// top priority size
......@@ -97,8 +116,13 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
int out_d = ctx->Attrs().Get<int>("out_d");
int out_h = ctx->Attrs().Get<int>("out_h");
int out_w = ctx->Attrs().Get<int>("out_w");
std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_d, out_h, out_w});
ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
framework::DDim dim_out;
if (data_layout == DataLayout::kNCHW) {
dim_out = {dim_x[0], dim_x[1], out_d, out_h, out_w};
} else {
dim_out = {dim_x[0], out_d, out_h, out_w, dim_x[4]};
}
ctx->SetOutputDim("Out", dim_out);
return;
}
......@@ -115,9 +139,15 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
float scale = ctx->Attrs().Get<float>("scale");
if (scale > 0) {
// round down
out_d = static_cast<int>(dim_x[2] * scale);
out_h = static_cast<int>(dim_x[3] * scale);
out_w = static_cast<int>(dim_x[4] * scale);
out_d = (data_layout == DataLayout::kNCHW
? static_cast<int>(dim_x[2] * scale)
: static_cast<int>(dim_x[1] * scale));
out_h = (data_layout == DataLayout::kNCHW
? static_cast<int>(dim_x[3] * scale)
: static_cast<int>(dim_x[2] * scale));
out_w = (data_layout == DataLayout::kNCHW
? static_cast<int>(dim_x[4] * scale)
: static_cast<int>(dim_x[3] * scale));
// protect when input shape is -1
out_d = out_d > 0 ? out_d : -1;
out_h = out_h > 0 ? out_h : -1;
......@@ -138,8 +168,13 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
return;
}
std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_d, out_h, out_w});
ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
framework::DDim dim_out;
if (data_layout == DataLayout::kNCHW) {
dim_out = {dim_x[0], dim_x[1], out_d, out_h, out_w};
} else {
dim_out = {dim_x[0], out_d, out_h, out_w, dim_x[4]};
}
ctx->SetOutputDim("Out", dim_out);
}
class InterpolateOp : public framework::OperatorWithKernel {
......@@ -213,6 +248,13 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
"The output tensor of interpolate operator, "
"This is a tensor in same rank with Input(X).");
AddAttr<std::string>(
"data_layout",
"(string, default NCHW) Only used in "
"an optional string from: \"NHWC\", \"NCHW\". "
"Specify that the data format of the input and output data is "
"channel_first or channel_last.")
.SetDefault("NCHW");
AddAttr<int>("out_d", "output depth of interpolate op.").SetDefault(0);
AddAttr<int>("out_h", "output height of interpolate op.").SetDefault(0);
AddAttr<int>("out_w", "output width of interpolate op.").SetDefault(0);
......
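The shape-inference change above only moves where the new spatial sizes are written: batch and channel dimensions are carried through unchanged, and out_h/out_w (plus out_d in the 3-D case) replace the layout-dependent positions. A small Python sketch of the 4-D branch (a hypothetical helper mirroring Interpolate2DInferShapeCheck, not part of the patch):

    def infer_out_shape_2d(dim_x, out_h, out_w, data_layout='NCHW'):
        # keep N and C, replace H and W according to the layout
        if data_layout == 'NCHW':
            return [dim_x[0], dim_x[1], out_h, out_w]
        return [dim_x[0], out_h, out_w, dim_x[3]]

    assert infer_out_shape_2d([2, 3, 4, 5], 8, 10) == [2, 3, 8, 10]
    assert infer_out_shape_2d([2, 4, 5, 3], 8, 10, 'NHWC') == [2, 8, 10, 3]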
......@@ -17,6 +17,7 @@ namespace paddle {
namespace operators {
using framework::Tensor;
using DataLayout = framework::DataLayout;
template <typename T>
__global__ void KeNearestNeighborInterpFw(
......@@ -24,7 +25,7 @@ __global__ void KeNearestNeighborInterpFw(
const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
const size_t out_img_w, const size_t output_h, const size_t output_w,
const size_t num_channels, const float ratio_h, const float ratio_w,
const bool align_corners) {
const bool align_corners, const DataLayout data_layout) {
int nthreads = output_h * output_w;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
......@@ -33,20 +34,32 @@ __global__ void KeNearestNeighborInterpFw(
int out_id_w = tid % output_w;
int in_img_size = input_w / num_channels;
int out_img_size = output_w / num_channels;
int channel_id = out_id_w / out_img_size;
int out_img_idy = (out_id_w % out_img_size) / out_img_w;
int channel_id, out_img_idy, out_img_idx;
if (data_layout == DataLayout::kNCHW) {
channel_id = out_id_w / out_img_size;
out_img_idy = (out_id_w % out_img_size) / out_img_w;
out_img_idx = tid % out_img_w;
} else {
out_img_idy = out_id_w / (out_img_w * num_channels);
out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
channel_id = tid % num_channels;
}
int in_img_idy = (align_corners)
? static_cast<int>(ratio_h * out_img_idy + 0.5)
: static_cast<int>(ratio_h * out_img_idy);
int out_img_idx = tid % out_img_w;
int in_img_idx = (align_corners)
? static_cast<int>(ratio_w * out_img_idx + 0.5)
: static_cast<int>(ratio_w * out_img_idx);
out[tid] = in[out_id_h * input_w + channel_id * in_img_size +
in_img_idy * in_img_w + in_img_idx];
if (data_layout == DataLayout::kNCHW) {
out[tid] = in[out_id_h * input_w + channel_id * in_img_size +
in_img_idy * in_img_w + in_img_idx];
} else {
out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels +
in_img_idx * num_channels + channel_id];
}
}
}
......@@ -56,7 +69,7 @@ __global__ void KeNearestNeighborInterpBw(
const size_t input_w, const T* out, const size_t out_img_h,
const size_t out_img_w, const size_t output_h, const size_t output_w,
const size_t num_channels, const float ratio_h, const float ratio_w,
const bool align_corners) {
const bool align_corners, const DataLayout data_layout) {
int nthreads = output_h * output_w;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
......@@ -65,20 +78,33 @@ __global__ void KeNearestNeighborInterpBw(
int out_id_w = tid % output_w;
int in_img_size = input_w / num_channels;
int out_img_size = output_w / num_channels;
int channel_id = out_id_w / out_img_size;
int out_img_idy = (out_id_w % out_img_size) / out_img_w;
int channel_id, out_img_idy, out_img_idx;
if (data_layout == DataLayout::kNCHW) {
channel_id = out_id_w / out_img_size;
out_img_idy = (out_id_w % out_img_size) / out_img_w;
out_img_idx = tid % out_img_w;
} else {
out_img_idy = out_id_w / (out_img_w * num_channels);
out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
channel_id = tid % num_channels;
}
int in_img_idy = (align_corners)
? static_cast<int>(ratio_h * out_img_idy + 0.5)
: static_cast<int>(ratio_h * out_img_idy);
int out_img_idx = tid % out_img_w;
int in_img_idx = (align_corners)
? static_cast<int>(ratio_w * out_img_idx + 0.5)
: static_cast<int>(ratio_w * out_img_idx);
T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
in_img_idy * in_img_w + in_img_idx];
T* in_pos;
if (data_layout == DataLayout::kNCHW) {
in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
in_img_idy * in_img_w + in_img_idx];
} else {
in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels +
in_img_idx * num_channels + channel_id];
}
const T out_pos = out[out_id_h * output_w + out_id_w];
platform::CudaAtomicAdd(in_pos, out_pos);
}
......@@ -90,7 +116,8 @@ __global__ void KeBilinearInterpFw(
const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
const size_t out_img_w, const size_t output_h, const size_t output_w,
const size_t num_channels, const float ratio_h, const float ratio_w,
const bool align_corners, const int align_mode) {
const bool align_corners, const int align_mode,
const DataLayout data_layout) {
int nthreads = output_h * output_w;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
......@@ -100,9 +127,18 @@ __global__ void KeBilinearInterpFw(
int out_id_w = tid % output_w;
int in_img_size = input_w / num_channels;
int out_img_size = output_w / num_channels;
int channel_id = out_id_w / out_img_size;
int out_img_idy = (out_id_w % out_img_size) / out_img_w;
int channel_id, out_img_idy, out_img_idx;
if (data_layout == DataLayout::kNCHW) {
channel_id = out_id_w / out_img_size;
out_img_idy = (out_id_w % out_img_size) / out_img_w;
out_img_idx = tid % out_img_w;
} else {
out_img_idy = out_id_w / (out_img_w * num_channels);
out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
channel_id = tid % num_channels;
}
int in_img_idy = align_flag
? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
: static_cast<int>(ratio_h * out_img_idy);
......@@ -114,7 +150,6 @@ __global__ void KeBilinearInterpFw(
align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
T h2lambda = 1.f - h1lambda;
int out_img_idx = tid % out_img_w;
int in_img_idx = align_flag
? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
: static_cast<int>(ratio_w * out_img_idx);
......@@ -126,14 +161,28 @@ __global__ void KeBilinearInterpFw(
align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
T w2lambda = 1.f - w1lambda;
const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
in_img_idy * in_img_w + in_img_idx];
if (data_layout == DataLayout::kNCHW) {
const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
in_img_idy * in_img_w + in_img_idx];
// bilinear interpolation
out[out_id_h * output_w + out_id_w] =
h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) +
h1lambda * (w2lambda * in_pos[h_id * in_img_w] +
w1lambda * in_pos[h_id * in_img_w + w_id]);
// bilinear interpolation
out[out_id_h * output_w + out_id_w] =
h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) +
h1lambda * (w2lambda * in_pos[h_id * in_img_w] +
w1lambda * in_pos[h_id * in_img_w + w_id]);
} else {
const T* in_pos =
&in[out_id_h * input_w + in_img_idy * in_img_w * num_channels +
in_img_idx * num_channels + channel_id];
// bilinear interpolation
out[out_id_h * output_w + out_id_w] =
h2lambda *
(w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]) +
h1lambda * (w2lambda * in_pos[h_id * in_img_w * num_channels] +
w1lambda * in_pos[h_id * in_img_w * num_channels +
w_id * num_channels]);
}
}
}
......@@ -143,7 +192,8 @@ __global__ void KeBilinearInterpBw(
const size_t input_w, const T* out, const size_t out_img_h,
const size_t out_img_w, const size_t output_h, const size_t output_w,
const size_t num_channels, const T ratio_h, const T ratio_w,
const bool align_corners, const int align_mode) {
const bool align_corners, const int align_mode,
const DataLayout data_layout) {
int nthreads = output_h * output_w;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
......@@ -153,9 +203,18 @@ __global__ void KeBilinearInterpBw(
int out_id_w = tid % output_w;
int in_img_size = input_w / num_channels;
int out_img_size = output_w / num_channels;
int channel_id = out_id_w / out_img_size;
int out_img_idy = (out_id_w % out_img_size) / out_img_w;
int channel_id, out_img_idy, out_img_idx;
if (data_layout == DataLayout::kNCHW) {
channel_id = out_id_w / out_img_size;
out_img_idy = (out_id_w % out_img_size) / out_img_w;
out_img_idx = tid % out_img_w;
} else {
out_img_idy = out_id_w / (out_img_w * num_channels);
out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
channel_id = tid % num_channels;
}
int in_img_idy = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5
: ratio_h * out_img_idy;
in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
......@@ -164,10 +223,8 @@ __global__ void KeBilinearInterpBw(
src_h = (src_h > 0) ? src_h : 0;
T h1lambda =
align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
T h2lambda = 1.f - h1lambda;
int out_img_idx = tid % out_img_w;
int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5
: ratio_w * out_img_idx;
in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
......@@ -178,15 +235,34 @@ __global__ void KeBilinearInterpBw(
align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
T w2lambda = 1.f - w1lambda;
T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
in_img_idy * in_img_w + in_img_idx];
T* in_pos;
if (data_layout == DataLayout::kNCHW) {
in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
in_img_idy * in_img_w + in_img_idx];
} else {
in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels +
in_img_idx * num_channels + channel_id];
}
const T* out_pos = &out[out_id_h * output_w + out_id_w];
platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[h_id * in_img_w],
h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
h1lambda * w1lambda * out_pos[0]);
if (data_layout == DataLayout::kNCHW) {
platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[h_id * in_img_w],
h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
h1lambda * w1lambda * out_pos[0]);
} else {
platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels],
h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(
&in_pos[h_id * in_img_w * num_channels + w_id * num_channels],
h1lambda * w1lambda * out_pos[0]);
}
}
}
......@@ -197,7 +273,8 @@ __global__ void KeTrilinearInterpFw(
const size_t out_img_d, const size_t out_img_h, const size_t out_img_w,
const size_t output_h, const size_t output_w, const size_t num_channels,
const float ratio_d, const float ratio_h, const float ratio_w,
const bool align_corners, const int align_mode) {
const bool align_corners, const int align_mode,
const DataLayout data_layout) {
int nthreads = output_h * output_w;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
......@@ -207,9 +284,21 @@ __global__ void KeTrilinearInterpFw(
int out_id_w = tid % output_w;
int in_img_size = input_w / num_channels;
int out_img_size = output_w / num_channels;
int channel_id = out_id_w / out_img_size;
int out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w;
int channel_id, out_img_idt, out_img_idy, out_img_idx;
if (data_layout == DataLayout::kNCHW) {
channel_id = out_id_w / out_img_size;
out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w;
out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h;
out_img_idx = tid % out_img_w;
} else {
out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels);
out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) /
(out_img_w * num_channels);
out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
channel_id = tid % num_channels;
}
int in_img_idt = align_flag
? static_cast<int>(ratio_d * (out_img_idt + 0.5) - 0.5)
: static_cast<int>(ratio_d * out_img_idt);
......@@ -221,7 +310,6 @@ __global__ void KeTrilinearInterpFw(
align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt;
T d2lambda = 1.f - d1lambda;
int out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h;
int in_img_idy = align_flag
? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
: static_cast<int>(ratio_h * out_img_idy);
......@@ -233,7 +321,6 @@ __global__ void KeTrilinearInterpFw(
align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
T h2lambda = 1.f - h1lambda;
int out_img_idx = tid % out_img_w;
int in_img_idx = align_flag
? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
: static_cast<int>(ratio_w * out_img_idx);
......@@ -245,23 +332,49 @@ __global__ void KeTrilinearInterpFw(
align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
T w2lambda = 1.f - w1lambda;
int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size +
(in_img_idt * in_img_h + in_img_idy) * in_img_w +
in_img_idx;
const T* in_pos1 = &in[in_pos1_idx];
int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w;
const T* in_pos2 = &in[in_pos2_idx];
// trilinear interpolation
out[out_id_h * output_w + out_id_w] =
d2lambda *
(h2lambda * (w2lambda * in_pos1[0] + w1lambda * in_pos1[w_id]) +
h1lambda * (w2lambda * in_pos1[h_id * in_img_w] +
w1lambda * in_pos1[h_id * in_img_w + w_id])) +
d1lambda *
(h2lambda * (w2lambda * in_pos2[0] + w1lambda * in_pos2[w_id]) +
h1lambda * (w2lambda * in_pos2[h_id * in_img_w] +
w1lambda * in_pos2[h_id * in_img_w + w_id]));
if (data_layout == DataLayout::kNCHW) {
int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size +
(in_img_idt * in_img_h + in_img_idy) * in_img_w +
in_img_idx;
const T* in_pos1 = &in[in_pos1_idx];
int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w;
const T* in_pos2 = &in[in_pos2_idx];
// trilinear interpolation
out[out_id_h * output_w + out_id_w] =
d2lambda *
(h2lambda * (w2lambda * in_pos1[0] + w1lambda * in_pos1[w_id]) +
h1lambda * (w2lambda * in_pos1[h_id * in_img_w] +
w1lambda * in_pos1[h_id * in_img_w + w_id])) +
d1lambda *
(h2lambda * (w2lambda * in_pos2[0] + w1lambda * in_pos2[w_id]) +
h1lambda * (w2lambda * in_pos2[h_id * in_img_w] +
w1lambda * in_pos2[h_id * in_img_w + w_id]));
} else {
int in_pos1_idx = out_id_h * input_w +
in_img_idt * in_img_h * in_img_w * num_channels +
in_img_idy * in_img_w * num_channels +
in_img_idx * num_channels + channel_id;
const T* in_pos1 = &in[in_pos1_idx];
int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels;
const T* in_pos2 = &in[in_pos2_idx];
// trilinear interpolation
out[out_id_h * output_w + out_id_w] =
d2lambda *
(h2lambda * (w2lambda * in_pos1[0] +
w1lambda * in_pos1[w_id * num_channels]) +
h1lambda * (w2lambda * in_pos1[h_id * in_img_w * num_channels] +
w1lambda * in_pos1[h_id * in_img_w * num_channels +
w_id * num_channels])) +
d1lambda *
(h2lambda * (w2lambda * in_pos2[0] +
w1lambda * in_pos2[w_id * num_channels]) +
h1lambda * (w2lambda * in_pos2[h_id * in_img_w * num_channels] +
w1lambda * in_pos2[h_id * in_img_w * num_channels +
w_id * num_channels]));
}
}
}
......@@ -272,7 +385,7 @@ __global__ void KeTrilinearInterpBw(
const size_t out_img_d, const size_t out_img_h, const size_t out_img_w,
const size_t output_h, const size_t output_w, const size_t num_channels,
const T ratio_d, const T ratio_h, const T ratio_w, const bool align_corners,
const int align_mode) {
const int align_mode, const DataLayout data_layout) {
int nthreads = output_h * output_w;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
......@@ -282,9 +395,21 @@ __global__ void KeTrilinearInterpBw(
int out_id_w = tid % output_w;
int in_img_size = input_w / num_channels;
int out_img_size = output_w / num_channels;
int channel_id = out_id_w / out_img_size;
int out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w;
int channel_id, out_img_idt, out_img_idy, out_img_idx;
if (data_layout == DataLayout::kNCHW) {
channel_id = out_id_w / out_img_size;
out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w;
out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h;
out_img_idx = tid % out_img_w;
} else {
out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels);
out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) /
(out_img_w * num_channels);
out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels;
channel_id = tid % num_channels;
}
int in_img_idt = align_flag
? static_cast<int>(ratio_d * (out_img_idt + 0.5) - 0.5)
: static_cast<int>(ratio_d * out_img_idt);
......@@ -296,7 +421,6 @@ __global__ void KeTrilinearInterpBw(
align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt;
T d2lambda = 1.f - d1lambda;
int out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h;
int in_img_idy = align_flag
? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
: static_cast<int>(ratio_h * out_img_idy);
......@@ -308,7 +432,6 @@ __global__ void KeTrilinearInterpBw(
align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
T h2lambda = 1.f - h1lambda;
int out_img_idx = tid % out_img_w;
int in_img_idx = align_flag
? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
: static_cast<int>(ratio_w * out_img_idx);
......@@ -320,32 +443,64 @@ __global__ void KeTrilinearInterpBw(
align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
T w2lambda = 1.f - w1lambda;
int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size +
(in_img_idt * in_img_h + in_img_idy) * in_img_w +
in_img_idx;
T* in_pos1 = &in[in_pos1_idx];
int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w;
T* in_pos2 = &in[in_pos2_idx];
const T* out_pos = &out[out_id_h * output_w + out_id_w];
// trilinear interpolation grad
platform::CudaAtomicAdd(&in_pos1[0],
d2lambda * h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos1[w_id],
d2lambda * h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w],
d2lambda * h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id],
d2lambda * h1lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[0],
d1lambda * h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[w_id],
d1lambda * h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w],
d1lambda * h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id],
d1lambda * h1lambda * w1lambda * out_pos[0]);
if (data_layout == DataLayout::kNCHW) {
int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size +
(in_img_idt * in_img_h + in_img_idy) * in_img_w +
in_img_idx;
T* in_pos1 = &in[in_pos1_idx];
int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w;
T* in_pos2 = &in[in_pos2_idx];
const T* out_pos = &out[out_id_h * output_w + out_id_w];
// trilinear interpolation grad
platform::CudaAtomicAdd(&in_pos1[0],
d2lambda * h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos1[w_id],
d2lambda * h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w],
d2lambda * h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id],
d2lambda * h1lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[0],
d1lambda * h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[w_id],
d1lambda * h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w],
d1lambda * h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id],
d1lambda * h1lambda * w1lambda * out_pos[0]);
} else {
int in_pos1_idx = out_id_h * input_w +
in_img_idt * in_img_h * in_img_w * num_channels +
in_img_idy * in_img_w * num_channels +
in_img_idx * num_channels + channel_id;
T* in_pos1 = &in[in_pos1_idx];
int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels;
T* in_pos2 = &in[in_pos2_idx];
const T* out_pos = &out[out_id_h * output_w + out_id_w];
// trilinear interpolation grad
platform::CudaAtomicAdd(&in_pos1[0],
d2lambda * h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos1[w_id * num_channels],
d2lambda * h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels],
d2lambda * h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(
&in_pos1[h_id * in_img_w * num_channels + w_id * num_channels],
d2lambda * h1lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[0],
d1lambda * h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[w_id * num_channels],
d1lambda * h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels],
d1lambda * h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(
&in_pos2[h_id * in_img_w * num_channels + w_id * num_channels],
d1lambda * h1lambda * w1lambda * out_pos[0]);
}
}
}
......@@ -354,10 +509,10 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
const Tensor& input, Tensor* output) {
auto* input_data = input.data<T>();
const int n = input.dims()[0];
const int c = input.dims()[1];
const int in_h = input.dims()[2];
const int in_w = input.dims()[3];
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
int n, c, in_d, in_h, in_w;
ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
auto interp_method = ctx.Attr<std::string>("interp_method");
bool align_corners = ctx.Attr<bool>("align_corners");
......@@ -401,8 +556,13 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
out_w, 0,
"out_w in Attr(out_shape) of Op(interpolate) should be greater than 0.");
auto output_data =
output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
framework::DDim dim_out;
if (data_layout == DataLayout::kNCHW) {
dim_out = {n, c, out_h, out_w};
} else {
dim_out = {n, out_h, out_w, c};
}
auto output_data = output->mutable_data<T>(dim_out, ctx.GetPlace());
if (in_h == out_h && in_w == out_w) {
framework::TensorCopy(input, ctx.GetPlace(), output);
......@@ -433,12 +593,12 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
KeNearestNeighborInterpFw<
T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
out_chw, c, ratio_h, ratio_w, align_corners);
out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
} else if ("bilinear" == interp_method) {
KeBilinearInterpFw<
T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
out_chw, c, ratio_h, ratio_w, align_corners, align_mode);
out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout);
}
}
......@@ -447,11 +607,10 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
const Tensor& input, Tensor* output) {
auto* input_data = input.data<T>();
const int n = input.dims()[0];
const int c = input.dims()[1];
const int in_d = input.dims()[2];
const int in_h = input.dims()[3];
const int in_w = input.dims()[4];
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
int n, c, in_d, in_h, in_w;
ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
auto interp_method = ctx.Attr<std::string>("interp_method");
bool align_corners = ctx.Attr<bool>("align_corners");
......@@ -502,8 +661,13 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
out_w, 0,
"out_w in Attr(out_shape) of Op(interpolate) should be greater than 0.");
auto output_data =
output->mutable_data<T>({n, c, out_d, out_h, out_w}, ctx.GetPlace());
framework::DDim dim_out;
if (data_layout == DataLayout::kNCHW) {
dim_out = {n, c, out_d, out_h, out_w};
} else {
dim_out = {n, out_d, out_h, out_w, c};
}
auto output_data = output->mutable_data<T>(dim_out, ctx.GetPlace());
if (in_d == out_d && in_h == out_h && in_w == out_w) {
framework::TensorCopy(input, ctx.GetPlace(), output);
......@@ -540,7 +704,7 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h,
out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
align_mode);
align_mode, data_layout);
}
}
......@@ -548,10 +712,10 @@ template <typename T>
static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
Tensor* input_grad, const Tensor output_grad) {
auto* input = ctx.Input<Tensor>("X");
const int n = input->dims()[0];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
int n, c, in_d, in_h, in_w;
ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
auto interp_method = ctx.Attr<std::string>("interp_method");
bool align_corners = ctx.Attr<bool>("align_corners");
......@@ -589,8 +753,14 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
}
auto* output_grad_data = output_grad.data<T>();
auto* input_grad_data =
input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
framework::DDim dim_grad;
if (data_layout == DataLayout::kNCHW) {
dim_grad = {n, c, in_h, in_w};
} else {
dim_grad = {n, in_h, in_w, c};
}
auto* input_grad_data = input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
auto& device_ctx = ctx.template device_context<platform::CUDADeviceContext>();
math::SetConstant<platform::CUDADeviceContext, T> zero;
zero(device_ctx, input_grad, static_cast<T>(0.0));
......@@ -624,12 +794,13 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
KeNearestNeighborInterpBw<
T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
n, out_chw, c, ratio_h, ratio_w, align_corners);
n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
} else if ("bilinear" == interp_method) {
KeBilinearInterpBw<
T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode);
n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode,
data_layout);
}
}
......@@ -638,11 +809,10 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx,
Tensor* input_grad,
const Tensor& output_grad) {
auto* input = ctx.Input<Tensor>("X");
const int n = input->dims()[0];
const int c = input->dims()[1];
const int in_d = input->dims()[2];
const int in_h = input->dims()[3];
const int in_w = input->dims()[4];
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
int n, c, in_d, in_h, in_w;
ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
auto interp_method = ctx.Attr<std::string>("interp_method");
bool align_corners = ctx.Attr<bool>("align_corners");
......@@ -684,8 +854,13 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx,
}
auto* output_grad_data = output_grad.data<T>();
auto* input_grad_data =
input_grad->mutable_data<T>({n, c, in_d, in_h, in_w}, ctx.GetPlace());
framework::DDim dim_grad;
if (data_layout == DataLayout::kNCHW) {
dim_grad = {n, c, in_d, in_h, in_w};
} else {
dim_grad = {n, in_d, in_h, in_w, c};
}
auto* input_grad_data = input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
auto& device_ctx = ctx.template device_context<platform::CUDADeviceContext>();
math::SetConstant<platform::CUDADeviceContext, T> zero;
zero(device_ctx, input_grad, static_cast<T>(0.0));
......@@ -725,7 +900,7 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx,
T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d,
out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
align_mode);
align_mode, data_layout);
}
}
......
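In the CUDA kernels above, each thread still walks a flattened per-batch offset (out_id_w); only the way that offset is decomposed into (h, w, c) changes for channel-last data. A stand-alone Python check of the NHWC decomposition used in KeNearestNeighborInterpFw (purely illustrative, not part of the patch):

    def decompose_nhwc(out_id_w, out_img_w, num_channels):
        # NHWC branch: the channel is the fastest-varying index
        out_img_idy = out_id_w // (out_img_w * num_channels)
        out_img_idx = out_id_w % (out_img_w * num_channels) // num_channels
        # the kernel uses tid % num_channels; that is equivalent here because
        # output_w (= out_img_h * out_img_w * num_channels) is a multiple of C
        channel_id = out_id_w % num_channels
        return out_img_idy, out_img_idx, channel_id

    H, W, C = 4, 5, 3
    for h in range(H):
        for w in range(W):
            for c in range(C):
                flat = (h * W + w) * C + c
                assert decompose_nhwc(flat, W, C) == (h, w, c)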
......@@ -22,6 +22,7 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using Tensor = framework::Tensor;
using DataLayout = framework::DataLayout;
inline std::vector<int> get_new_shape(
const std::vector<const Tensor*>& list_new_shape_tensor) {
......@@ -57,12 +58,30 @@ inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
return vec_new_data;
}
inline void ExtractNCDWH(const framework::DDim& dims,
const DataLayout& data_layout, int* N, int* C, int* D,
int* H, int* W) {
*N = dims[0];
if (dims.size() == 4) {
*C = data_layout == DataLayout::kNCHW ? dims[1] : dims[3];
*D = 1;
*H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
*W = data_layout == DataLayout::kNCHW ? dims[3] : dims[2];
} else {
*C = data_layout == DataLayout::kNCHW ? dims[1] : dims[4];
*D = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
*H = data_layout == DataLayout::kNCHW ? dims[3] : dims[2];
*W = data_layout == DataLayout::kNCHW ? dims[4] : dims[3];
}
}
template <typename T>
static void NearestNeighborInterpolate(const Tensor& input, Tensor* output,
const float ratio_h, const float ratio_w,
const int n, const int c,
const int out_h, const int out_w,
const bool align_corners) {
const bool align_corners,
const DataLayout& data_layout) {
auto input_t = EigenTensor<T, 4>::From(input);
auto output_t = EigenTensor<T, 4>::From(*output);
for (int k = 0; k < out_h; k++) { // loop for images
......@@ -75,7 +94,11 @@ static void NearestNeighborInterpolate(const Tensor& input, Tensor* output,
for (int i = 0; i < n; i++) { // loop for batches
for (int j = 0; j < c; j++) { // loop for channels
output_t(i, j, k, l) = input_t(i, j, in_k, in_l);
if (data_layout == DataLayout::kNCHW) {
output_t(i, j, k, l) = input_t(i, j, in_k, in_l);
} else {
output_t(i, k, l, j) = input_t(i, in_k, in_l, j);
}
}
}
}
......@@ -88,7 +111,8 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output,
const int in_h, const int in_w, const int n,
const int c, const int out_h, const int out_w,
const bool align_corners,
const bool align_mode) {
const bool align_mode,
const DataLayout data_layout) {
auto input_t = EigenTensor<T, 4>::From(input);
auto output_t = EigenTensor<T, 4>::From(*output);
bool align_flag = (align_mode == 0 && !align_corners);
......@@ -154,11 +178,21 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output,
for (int k = 0; k < out_h; k++) { // loop for images
for (int l = 0; l < out_w; l++) {
// bilinear interpolation
T out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] +
T out_t;
if (data_layout == DataLayout::kNCHW) {
out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] +
input_t(i, j, vy_s[k], vx_w[l]) * vd_n[k] * vd_e[l] +
input_t(i, j, vy_n[k], vx_e[l]) * vd_s[k] * vd_w[l] +
input_t(i, j, vy_s[k], vx_e[l]) * vd_n[k] * vd_w[l];
output_t(i, j, k, l) = out_t;
output_t(i, j, k, l) = out_t;
} else {
out_t = input_t(i, vy_n[k], vx_w[l], j) * vd_s[k] * vd_e[l] +
input_t(i, vy_s[k], vx_w[l], j) * vd_n[k] * vd_e[l] +
input_t(i, vy_n[k], vx_e[l], j) * vd_s[k] * vd_w[l] +
input_t(i, vy_s[k], vx_e[l], j) * vd_n[k] * vd_w[l];
output_t(i, k, l, j) = out_t;
}
}
}
}
......@@ -170,7 +204,8 @@ static void TrilinearInterpolation(
const Tensor& input, Tensor* output, const float ratio_d,
const float ratio_h, const float ratio_w, const int in_d, const int in_h,
const int in_w, const int n, const int c, const int out_d, const int out_h,
const int out_w, const bool align_corners, const bool align_mode) {
const int out_w, const bool align_corners, const bool align_mode,
const DataLayout& data_layout) {
auto input_t = EigenTensor<T, 5>::From(input);
auto output_t = EigenTensor<T, 5>::From(*output);
bool align_flag = (align_mode == 0 && !align_corners);
......@@ -263,23 +298,43 @@ static void TrilinearInterpolation(
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
// trilinear interpolation
T out_t = input_t(b, i, vt_f[j], vy_n[k], vx_w[l]) * vd_b[j] *
vd_s[k] * vd_e[l] +
input_t(b, i, vt_f[j], vy_n[k], vx_e[l]) * vd_b[j] *
vd_s[k] * vd_w[l] +
input_t(b, i, vt_f[j], vy_s[k], vx_w[l]) * vd_b[j] *
vd_n[k] * vd_e[l] +
input_t(b, i, vt_f[j], vy_s[k], vx_e[l]) * vd_b[j] *
vd_n[k] * vd_w[l] +
input_t(b, i, vt_b[j], vy_n[k], vx_w[l]) * vd_f[j] *
vd_s[k] * vd_e[l] +
input_t(b, i, vt_b[j], vy_n[k], vx_e[l]) * vd_f[j] *
vd_s[k] * vd_w[l] +
input_t(b, i, vt_b[j], vy_s[k], vx_w[l]) * vd_f[j] *
vd_n[k] * vd_e[l] +
input_t(b, i, vt_b[j], vy_s[k], vx_e[l]) * vd_f[j] *
vd_n[k] * vd_w[l];
output_t(b, i, j, k, l) = out_t;
if (data_layout == DataLayout::kNCHW) {
T out_t = input_t(b, i, vt_f[j], vy_n[k], vx_w[l]) * vd_b[j] *
vd_s[k] * vd_e[l] +
input_t(b, i, vt_f[j], vy_n[k], vx_e[l]) * vd_b[j] *
vd_s[k] * vd_w[l] +
input_t(b, i, vt_f[j], vy_s[k], vx_w[l]) * vd_b[j] *
vd_n[k] * vd_e[l] +
input_t(b, i, vt_f[j], vy_s[k], vx_e[l]) * vd_b[j] *
vd_n[k] * vd_w[l] +
input_t(b, i, vt_b[j], vy_n[k], vx_w[l]) * vd_f[j] *
vd_s[k] * vd_e[l] +
input_t(b, i, vt_b[j], vy_n[k], vx_e[l]) * vd_f[j] *
vd_s[k] * vd_w[l] +
input_t(b, i, vt_b[j], vy_s[k], vx_w[l]) * vd_f[j] *
vd_n[k] * vd_e[l] +
input_t(b, i, vt_b[j], vy_s[k], vx_e[l]) * vd_f[j] *
vd_n[k] * vd_w[l];
output_t(b, i, j, k, l) = out_t;
} else {
T out_t = input_t(b, vt_f[j], vy_n[k], vx_w[l], i) * vd_b[j] *
vd_s[k] * vd_e[l] +
input_t(b, vt_f[j], vy_n[k], vx_e[l], i) * vd_b[j] *
vd_s[k] * vd_w[l] +
input_t(b, vt_f[j], vy_s[k], vx_w[l], i) * vd_b[j] *
vd_n[k] * vd_e[l] +
input_t(b, vt_f[j], vy_s[k], vx_e[l], i) * vd_b[j] *
vd_n[k] * vd_w[l] +
input_t(b, vt_b[j], vy_n[k], vx_w[l], i) * vd_f[j] *
vd_s[k] * vd_e[l] +
input_t(b, vt_b[j], vy_n[k], vx_e[l], i) * vd_f[j] *
vd_s[k] * vd_w[l] +
input_t(b, vt_b[j], vy_s[k], vx_w[l], i) * vd_f[j] *
vd_n[k] * vd_e[l] +
input_t(b, vt_b[j], vy_s[k], vx_e[l], i) * vd_f[j] *
vd_n[k] * vd_w[l];
output_t(b, j, k, l, i) = out_t;
}
}
}
}
......@@ -291,7 +346,7 @@ template <typename T>
static void NearestNeighborInterpolateGrad(
const Tensor& output_grad, Tensor* input_grad, const float ratio_h,
const float ratio_w, const int n, const int c, const int out_h,
const int out_w, const bool align_corners) {
const int out_w, const bool align_corners, const DataLayout data_layout) {
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
......@@ -305,7 +360,11 @@ static void NearestNeighborInterpolateGrad(
for (int i = 0; i < n; i++) { // loop for batches
for (int j = 0; j < c; j++) { // loop for channels
input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l);
if (data_layout == DataLayout::kNCHW) {
input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l);
} else {
input_grad_t(i, in_k, in_l, j) += output_grad_t(i, k, l, j);
}
}
}
}
......@@ -313,13 +372,11 @@ static void NearestNeighborInterpolateGrad(
}
template <typename T>
static void BilinearInterpolationGrad(const Tensor& output_grad,
Tensor* input_grad, const float ratio_h,
const float ratio_w, const int in_h,
const int in_w, const int n, const int c,
const int out_h, const int out_w,
const bool align_corners,
const int align_mode) {
static void BilinearInterpolationGrad(
const Tensor& output_grad, Tensor* input_grad, const float ratio_h,
const float ratio_w, const int in_h, const int in_w, const int n,
const int c, const int out_h, const int out_w, const bool align_corners,
const int align_mode, const DataLayout data_layout) {
auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
bool align_flag = (align_mode == 0 && !align_corners);
......@@ -346,11 +403,19 @@ static void BilinearInterpolationGrad(const Tensor& output_grad,
for (int i = 0; i < n; i++) { // loop for batches
for (int j = 0; j < c; j++) { // loop for channels
// bilinear interpolation grad
const T grad = output_grad_t(i, j, k, l);
input_grad_t(i, j, y_n, x_w) += static_cast<T>(grad * d_s * d_e);
input_grad_t(i, j, y_s, x_w) += static_cast<T>(grad * d_n * d_e);
input_grad_t(i, j, y_n, x_e) += static_cast<T>(grad * d_s * d_w);
input_grad_t(i, j, y_s, x_e) += static_cast<T>(grad * d_n * d_w);
if (data_layout == DataLayout::kNCHW) {
const T grad = output_grad_t(i, j, k, l);
input_grad_t(i, j, y_n, x_w) += static_cast<T>(grad * d_s * d_e);
input_grad_t(i, j, y_s, x_w) += static_cast<T>(grad * d_n * d_e);
input_grad_t(i, j, y_n, x_e) += static_cast<T>(grad * d_s * d_w);
input_grad_t(i, j, y_s, x_e) += static_cast<T>(grad * d_n * d_w);
} else {
const T grad = output_grad_t(i, k, l, j);
input_grad_t(i, y_n, x_w, j) += static_cast<T>(grad * d_s * d_e);
input_grad_t(i, y_s, x_w, j) += static_cast<T>(grad * d_n * d_e);
input_grad_t(i, y_n, x_e, j) += static_cast<T>(grad * d_s * d_w);
input_grad_t(i, y_s, x_e, j) += static_cast<T>(grad * d_n * d_w);
}
}
}
}
......@@ -362,7 +427,8 @@ static void TrilinearInterpolationGrad(
const Tensor& output_grad, Tensor* input_grad, const float ratio_d,
const float ratio_h, const float ratio_w, const int in_d, const int in_h,
const int in_w, const int n, const int c, const int out_d, const int out_h,
const int out_w, const bool align_corners, const int align_mode) {
const int out_w, const bool align_corners, const int align_mode,
const DataLayout data_layout) {
auto input_grad_t = EigenTensor<T, 5>::From(*input_grad);
auto output_grad_t = EigenTensor<T, 5>::From(output_grad);
bool align_flag = (align_mode == 0 && !align_corners);
......@@ -399,23 +465,43 @@ static void TrilinearInterpolationGrad(
for (int b = 0; b < n; b++) { // loop for batches
for (int i = 0; i < c; i++) { // loop for channels
// trilinear interpolation grad
const T grad = output_grad_t(b, i, j, k, l);
input_grad_t(b, i, t_f, y_n, x_w) +=
static_cast<T>(grad * d_b * d_s * d_e);
input_grad_t(b, i, t_f, y_n, x_e) +=
static_cast<T>(grad * d_b * d_s * d_w);
input_grad_t(b, i, t_f, y_s, x_w) +=
static_cast<T>(grad * d_b * d_n * d_e);
input_grad_t(b, i, t_f, y_s, x_e) +=
static_cast<T>(grad * d_b * d_n * d_w);
input_grad_t(b, i, t_b, y_n, x_w) +=
static_cast<T>(grad * d_f * d_s * d_e);
input_grad_t(b, i, t_b, y_n, x_e) +=
static_cast<T>(grad * d_f * d_s * d_w);
input_grad_t(b, i, t_b, y_s, x_w) +=
static_cast<T>(grad * d_f * d_n * d_e);
input_grad_t(b, i, t_b, y_s, x_e) +=
static_cast<T>(grad * d_f * d_n * d_w);
if (data_layout == DataLayout::kNCHW) {
const T grad = output_grad_t(b, i, j, k, l);
input_grad_t(b, i, t_f, y_n, x_w) +=
static_cast<T>(grad * d_b * d_s * d_e);
input_grad_t(b, i, t_f, y_n, x_e) +=
static_cast<T>(grad * d_b * d_s * d_w);
input_grad_t(b, i, t_f, y_s, x_w) +=
static_cast<T>(grad * d_b * d_n * d_e);
input_grad_t(b, i, t_f, y_s, x_e) +=
static_cast<T>(grad * d_b * d_n * d_w);
input_grad_t(b, i, t_b, y_n, x_w) +=
static_cast<T>(grad * d_f * d_s * d_e);
input_grad_t(b, i, t_b, y_n, x_e) +=
static_cast<T>(grad * d_f * d_s * d_w);
input_grad_t(b, i, t_b, y_s, x_w) +=
static_cast<T>(grad * d_f * d_n * d_e);
input_grad_t(b, i, t_b, y_s, x_e) +=
static_cast<T>(grad * d_f * d_n * d_w);
} else {
const T grad = output_grad_t(b, j, k, l, i);
input_grad_t(b, t_f, y_n, x_w, i) +=
static_cast<T>(grad * d_b * d_s * d_e);
input_grad_t(b, t_f, y_n, x_e, i) +=
static_cast<T>(grad * d_b * d_s * d_w);
input_grad_t(b, t_f, y_s, x_w, i) +=
static_cast<T>(grad * d_b * d_n * d_e);
input_grad_t(b, t_f, y_s, x_e, i) +=
static_cast<T>(grad * d_b * d_n * d_w);
input_grad_t(b, t_b, y_n, x_w, i) +=
static_cast<T>(grad * d_f * d_s * d_e);
input_grad_t(b, t_b, y_n, x_e, i) +=
static_cast<T>(grad * d_f * d_s * d_w);
input_grad_t(b, t_b, y_s, x_w, i) +=
static_cast<T>(grad * d_f * d_n * d_e);
input_grad_t(b, t_b, y_s, x_e, i) +=
static_cast<T>(grad * d_f * d_n * d_w);
}
}
}
}
......@@ -426,10 +512,10 @@ static void TrilinearInterpolationGrad(
template <typename T>
static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx,
const Tensor& input, Tensor* output) {
const int n = input.dims()[0];
const int c = input.dims()[1];
const int in_h = input.dims()[2];
const int in_w = input.dims()[3];
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
int n, c, in_d, in_h, in_w;
ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
auto interp_method = ctx.Attr<std::string>("interp_method");
bool align_corners = ctx.Attr<bool>("align_corners");
......@@ -470,7 +556,13 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx,
PADDLE_ENFORCE_GT(
out_w, 0,
"out_w in Attr(out_shape) of Op(interpolate) should be greater than 0.");
output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
framework::DDim dim_out;
if (data_layout == DataLayout::kNCHW) {
dim_out = {n, c, out_h, out_w};
} else {
dim_out = {n, out_h, out_w, c};
}
output->mutable_data<T>(dim_out, ctx.GetPlace());
if (in_h == out_h && in_w == out_w) {
framework::TensorCopy(input, ctx.GetPlace(), output);
......@@ -490,21 +582,21 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx,
if ("bilinear" == interp_method) {
BilinearInterpolation<T>(input, output, ratio_h, ratio_w, in_h, in_w, n, c,
out_h, out_w, align_corners, align_mode);
out_h, out_w, align_corners, align_mode,
data_layout);
} else if ("nearest" == interp_method) {
NearestNeighborInterpolate<T>(input, output, ratio_h, ratio_w, n, c, out_h,
out_w, align_corners);
out_w, align_corners, data_layout);
}
}
template <typename T>
static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx,
const Tensor& input, Tensor* output) {
const int n = input.dims()[0];
const int c = input.dims()[1];
const int in_d = input.dims()[2];
const int in_h = input.dims()[3];
const int in_w = input.dims()[4];
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
int n, c, in_d, in_h, in_w;
ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
auto interp_method = ctx.Attr<std::string>("interp_method");
bool align_corners = ctx.Attr<bool>("align_corners");
......@@ -552,7 +644,15 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx,
PADDLE_ENFORCE_GT(
out_w, 0,
"out_w in Attr(out_shape) of Op(interpolate) should be greater than 0.");
output->mutable_data<T>({n, c, out_d, out_h, out_w}, ctx.GetPlace());
framework::DDim dim_out;
if (data_layout == DataLayout::kNCHW) {
dim_out = {n, c, out_d, out_h, out_w};
} else {
dim_out = {n, out_d, out_h, out_w, c};
}
output->mutable_data<T>(dim_out, ctx.GetPlace());
if (in_d == out_d && in_h == out_h && in_w == out_w) {
framework::TensorCopy(input, ctx.GetPlace(), output);
......@@ -578,7 +678,7 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx,
if ("trilinear" == interp_method) {
TrilinearInterpolation<T>(input, output, ratio_d, ratio_h, ratio_w, in_d,
in_h, in_w, n, c, out_d, out_h, out_w,
align_corners, align_mode);
align_corners, align_mode, data_layout);
}
}
......@@ -586,10 +686,10 @@ template <typename T>
static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx,
Tensor* input_grad, const Tensor& output_grad) {
auto* input = ctx.Input<Tensor>("X");
const int n = input->dims()[0];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
int n, c, in_d, in_h, in_w;
ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
auto interp_method = ctx.Attr<std::string>("interp_method");
bool align_corners = ctx.Attr<bool>("align_corners");
......@@ -623,7 +723,14 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx,
out_w = new_size[1];
}
input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
framework::DDim dim_grad;
if (data_layout == DataLayout::kNCHW) {
dim_grad = {n, c, in_h, in_w};
} else {
dim_grad = {n, in_h, in_w, c};
}
input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
auto& device_ctx = ctx.template device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, T> zero;
zero(device_ctx, input_grad, static_cast<T>(0.0));
......@@ -647,10 +754,11 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx,
if ("bilinear" == interp_method) {
BilinearInterpolationGrad<T>(output_grad, input_grad, ratio_h, ratio_w,
in_h, in_w, n, c, out_h, out_w, align_corners,
align_mode);
align_mode, data_layout);
} else if ("nearest" == interp_method) {
NearestNeighborInterpolateGrad<T>(output_grad, input_grad, ratio_h, ratio_w,
n, c, out_h, out_w, align_corners);
n, c, out_h, out_w, align_corners,
data_layout);
}
}
......@@ -658,11 +766,10 @@ template <typename T>
static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx,
Tensor* input_grad, const Tensor output_grad) {
auto* input = ctx.Input<Tensor>("X");
const int n = input->dims()[0];
const int c = input->dims()[1];
const int in_d = input->dims()[2];
const int in_h = input->dims()[3];
const int in_w = input->dims()[4];
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
int n, c, in_d, in_h, in_w;
ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
auto interp_method = ctx.Attr<std::string>("interp_method");
bool align_corners = ctx.Attr<bool>("align_corners");
......@@ -700,7 +807,13 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx,
out_w = new_size[2];
}
input_grad->mutable_data<T>({n, c, in_d, in_h, in_w}, ctx.GetPlace());
framework::DDim dim_grad;
if (data_layout == DataLayout::kNCHW) {
dim_grad = {n, c, in_d, in_h, in_w};
} else {
dim_grad = {n, in_d, in_h, in_w, c};
}
input_grad->mutable_data<T>(dim_grad, ctx.GetPlace());
auto& device_ctx = ctx.template device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, T> zero;
zero(device_ctx, input_grad, static_cast<T>(0.0));
......@@ -727,9 +840,9 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx,
}
if ("trilinear" == interp_method) {
TrilinearInterpolationGrad<T>(output_grad, input_grad, ratio_d, ratio_h,
ratio_w, in_d, in_h, in_w, n, c, out_d, out_h,
out_w, align_corners, align_mode);
TrilinearInterpolationGrad<T>(
output_grad, input_grad, ratio_d, ratio_h, ratio_w, in_d, in_h, in_w, n,
c, out_d, out_h, out_w, align_corners, align_mode, data_layout);
}
}
......
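Both the CPU and CUDA paths above read (N, C, D, H, W) through the new ExtractNCDWH helper instead of indexing input.dims() directly. A Python rendering of the same logic, for reference only:

    def extract_ncdwh(dims, data_layout='NCHW'):
        # mirrors ExtractNCDWH: D defaults to 1 for 4-D inputs
        n = dims[0]
        if len(dims) == 4:
            if data_layout == 'NCHW':
                c, d, h, w = dims[1], 1, dims[2], dims[3]
            else:  # NHWC
                c, d, h, w = dims[3], 1, dims[1], dims[2]
        else:
            if data_layout == 'NCHW':
                c, d, h, w = dims[1], dims[2], dims[3], dims[4]
            else:  # NDHWC
                c, d, h, w = dims[4], dims[1], dims[2], dims[3]
        return n, c, d, h, w

    assert extract_ncdwh([2, 6, 8, 3], 'NHWC') == (2, 3, 1, 6, 8)
    assert extract_ncdwh([2, 3, 4, 6, 8]) == (2, 3, 4, 6, 8)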
......@@ -8019,15 +8019,17 @@ def image_resize(input,
resample='BILINEAR',
actual_shape=None,
align_corners=True,
align_mode=1):
align_mode=1,
data_format='NCHW'):
"""
**Resize a Batch of Images**
The input must be a tensor of the shape (num_batches, channels, in_h, in_w)
or (num_batches, channels, in_d, in_h, in_w), and the resizing only applies
on the last two/three dimensions(depth, hight and width).
The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w)
or (num_batches, in_h, in_w, channels), or a 5-D Tensor of the shape
(num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels),
and the resizing only applies to the three dimensions (depth, height and width).
**Warning:** the parameter :attr:`actual_shape` will be deprecated in the
**Warning:** the parameter :attr:`actual_shape` will be deprecated in the
future and only use :attr:`out_shape` instead.
Supporting resample methods:
......@@ -8144,16 +8146,13 @@ def image_resize(input,
Args:
input (Variable): The input tensor of image resize layer,
This is a 4-D tensor of the shape
(num_batches, channels, in_h, in_w) or a
5-D tensor of the shape
(num_batches, channls, in_d, in_h, in_w).
input (Variable): 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
its data format is specified by :attr:`data_format`.
out_shape(list|tuple|Variable|None): Output shape of image resize
layer, the shape is (out_h, out_w) when input is a 4-D tensor and is
(out_d, out_h, out_w) when input is a 5-D tensor. Default: None. If
a list, each element can be an integer or a tensor Variable of shape: [1].
If a tesnosr Variable, its dimensions size should be a 1.
layer, the shape is (out_h, out_w) when input is a 4-D Tensor and is
(out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If
a list, each element can be an integer or a Tensor Variable of shape: [1].
If a Tensor Variable, its dimensions size should be a 1.
scale(float|Variable|None): The multiplier for the input height or width. At
least one of :attr:`out_shape` or :attr:`scale` must be set.
And :attr:`out_shape` has a higher priority than :attr:`scale`.
......@@ -8181,12 +8180,16 @@ def image_resize(input,
Default: True
align_mode(int): An optional input for bilinear interpolation. It can be \'0\'
for src_idx = scale*(dst_index+0.5)-0.5, or \'1\' for
src_idx = scale*dst_index.
data_format(str, optional): NCHW(num_batches, channels, height, width) or
NHWC(num_batches, height, width, channels) for 4-D Tensor,
NCDHW(num_batches, channels, depth, height, width) or
NDHWC(num_batches, depth, height, width, channels) for 5-D Tensor.
Default: 'NCHW'.
Returns:
Variable: The output is a 4-D tensor of the shape
(num_batches, channls, out_h, out_w) or a 5-D tensor of the shape
(num_batches, channels, out_d, out_h, out_w).
A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels).
Raises:
TypeError: out_shape should be a list or tuple or Variable.
......@@ -8201,6 +8204,7 @@ def image_resize(input,
ValueError: scale should be greater than zero.
TypeError: align_corners should be a bool value
ValueError: align_mode can only be '0' or '1'
ValueError: data_format can only be 'NCHW', 'NHWC', 'NCDHW' or 'NDHWC'.
Examples:
.. code-block:: python
......@@ -8259,9 +8263,23 @@ def image_resize(input,
helper = LayerHelper('{}_interp'.format(resample_type), **locals())
dtype = helper.input_dtype()
if len(input.shape) == 4 and data_format not in ['NCHW', 'NHWC']:
raise ValueError(
"Got wrong value for param `data_format`: " + data_format +
" received but only `NCHW` or `NHWC` supported for 4-D input.")
elif len(input.shape) == 5 and data_format not in ['NCDHW', 'NDHWC']:
raise ValueError(
"Got wrong value for param `data_format`: " + data_format +
" received but only `NCDHW` or `NDHWC` supported for 5-D input.")
def _is_list_or_turple_(data):
return (isinstance(data, list) or isinstance(data, tuple))
if data_format == 'NCHW' or data_format == 'NCDHW':
data_layout = 'NCHW'
if data_format == 'NHWC' or data_format == 'NDHWC':
data_layout = 'NHWC'
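    # Note: the op attribute only distinguishes channel-first from
    # channel-last, so 'NCDHW' is recorded as 'NCHW' and 'NDHWC' as 'NHWC';
    # the C++ kernels treat NCDHW the same way as NCHW.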
inputs = {"X": input}
attrs = {
"out_d": -1,
......@@ -8269,7 +8287,8 @@ def image_resize(input,
"out_w": -1,
"interp_method": resample_type,
"align_corners": align_corners,
"align_mode": align_mode
"align_mode": align_mode,
"data_layout": data_layout
}
if out_shape is not None:
......@@ -8368,7 +8387,8 @@ def resize_bilinear(input,
name=None,
actual_shape=None,
align_corners=True,
align_mode=1):
align_mode=1,
data_format='NCHW'):
"""
Resize input by performing bilinear interpolation based on given
output shape which is specified by actual_shape, out_shape and scale
......@@ -8401,7 +8421,7 @@ def resize_bilinear(input,
else:
scale_factor = float(in_size/out_size)
scale_factor = float(in_size/out_size)
Bilinear interpolation:
......@@ -8414,31 +8434,24 @@ def resize_bilinear(input,
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
Args:
input(${x_type}): input should be a 4-D tensor of shape
(num_batches, channels, in_h, in_w).
input(${x_type}): 4-D Tensor, its data type is float32, float64, or uint8,
its data format is specified by :attr:`data_format`.
out_shape(list|tuple|Variable|None): Output shape of resize bilinear
layer, the shape is (out_h, out_w).Default: None. If a list, each
element can be an integer or a tensor Variable with shape: [1]. If a
tensor Variable, its dimension size should be 1.
element can be an integer or a Tensor Variable with shape: [1]. If a
Tensor Variable, its dimension size should be 1.
scale(float|Variable|None): The multiplier for the input height or width. At
least one of :attr:`out_shape` or :attr:`scale` must be set.
And :attr:`out_shape` has a higher priority than :attr:`scale`.
Default: None.
name(str|None): The output variable name.
actual_shape(Variable): An optional input to specify output shape
dynamically. If provided, image resize
......@@ -8455,9 +8468,12 @@ def resize_bilinear(input,
Default: None
align_corners(bool): ${align_corners_comment}
align_mode(int): ${align_mode_comment}
data_format(str, optional): NCHW(num_batches, channels, height, width) or
NHWC(num_batches, height, width, channels). Default: 'NCHW'.
Returns:
A 4-D tensor in shape of (num_batches, channels, out_h, out_w)
A 4-D Tensor in shape of (num_batches, channels, out_h, out_w) or
(num_batches, out_h, out_w, channels).
Examples:
.. code-block:: python
......@@ -8491,7 +8507,7 @@ def resize_bilinear(input,
"""
return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape,
align_corners, align_mode)
align_corners, align_mode, data_format)
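For example, a minimal sketch (assuming the fluid.layers API of this branch) of resizing a channel-last batch with resize_bilinear; with align_corners=False the scale path gives H_out = H_in * scale:

```python
import paddle.fluid as fluid

# Channel-last input: each sample is 32 x 32 x 3 (H, W, C).
img = fluid.layers.data(name="img", shape=[32, 32, 3], dtype="float32")

# out_shape path: output is (num_batches, 64, 64, 3) in NHWC.
out1 = fluid.layers.resize_bilinear(img, out_shape=[64, 64], data_format='NHWC')

# scale path, align_corners=False: H_out = 32 * 2.0 = 64, W_out = 32 * 2.0 = 64.
out2 = fluid.layers.resize_bilinear(
    img, scale=2.0, align_corners=False, data_format='NHWC')
```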
@templatedoc(op_type="trilinear_interp")
......@@ -8501,7 +8517,8 @@ def resize_trilinear(input,
name=None,
actual_shape=None,
align_corners=True,
align_mode=1):
align_mode=1,
data_format='NCDHW'):
"""
Resize input by performing trilinear interpolation based on given
output shape which is specified by actual_shape, out_shape and scale
......@@ -8538,6 +8555,7 @@ def resize_trilinear(input,
Trilinear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,D_in,H_in,W_in)
......@@ -8547,7 +8565,6 @@ def resize_trilinear(input,
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,D_in,H_in,W_in)
......@@ -8557,22 +8574,17 @@ def resize_trilinear(input,
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
Args:
input(${x_type}): input should be a 5-D tensor of shape
(num_batches, channls, in_d, in_h, in_w).
input(${x_type}): 5-D Tensor, its data type is float32, float64, or uint8,
its data format is specified by :attr:`data_format`.
out_shape(list|tuple|Variable|None): Output shape of resize trilinear
layer, the shape is (out_d, out_h, out_w). Default: None. If a list,
each element can be an integer or a tensor Variable with shape: [1]. If
a tensor Variable, its dimension size should be 1.
each element can be an integer or a Tensor Variable with shape: [1]. If
a Tensor Variable, its dimension size should be 1.
scale(float|Variable|None): The multiplier for the input depth, height or width.
At least one of :attr:`out_shape` or :attr:`scale` must be set.
And :attr:`out_shape` has a higher priority than :attr:`scale`.
Default: None.
name(str|None): The output variable name.
actual_shape(Variable): An optional input to specify output shape
dynamically. If provided, image resize
......@@ -8589,9 +8601,13 @@ def resize_trilinear(input,
Default: None
align_corners(bool): ${align_corners_comment}
align_mode(int): ${align_mode_comment}
data_format(str, optional): NCDHW(num_batches, channels, depth, height, width) or
NDHWC(num_batches, depth, height, width, channels).
Default: 'NCDHW'.
Returns:
A 5-D tensor in shape (num_batches, channels, out_d, out_h, out_w)
A 5-D Tensor in shape of (num_batches, channels, out_d, out_h, out_w) or
(num_batches, out_d, out_h, out_w, channels).
Examples:
.. code-block:: python
......@@ -8622,11 +8638,10 @@ def resize_trilinear(input,
scale_tensor = fluid.layers.data(name="scale", shape=[1], dtype="float32", append_batch_size=False)
out4 = fluid.layers.resize_trilinear(input, scale=scale_tensor)
# out4.shape = [-1, 3, -1, -1, -1]
"""
return image_resize(input, out_shape, scale, name, 'TRILINEAR',
actual_shape, align_corners, align_mode)
actual_shape, align_corners, align_mode, data_format)
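A corresponding sketch for the 5-D case, mirroring the NDHWC case exercised in the updated API test further below:

```python
import paddle.fluid as fluid

# Channel-last 5-D input: each sample is 6 x 9 x 4 x 3 (D, H, W, C).
vol = fluid.layers.data(name="vol", shape=[6, 9, 4, 3], dtype="float32")

# Output is (num_batches, 12, 18, 8, 3) in NDHWC.
out = fluid.layers.resize_trilinear(
    vol, out_shape=[12, 18, 8], data_format='NDHWC')
```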
@templatedoc(op_type="nearest_interp")
......@@ -8635,12 +8650,12 @@ def resize_nearest(input,
scale=None,
name=None,
actual_shape=None,
align_corners=True):
align_corners=True,
data_format='NCHW'):
"""
Resize input by performing nearest neighbor interpolation in both the
3rd dimension(in height direction) and the 4th dimension(in width
direction) based on given output shape which is specified by actual_shape,
out_shape and scale in priority order.
height direction and the width direction based on given output shape
which is specified by actual_shape, out_shape and scale in priority order.
**Warning:** the parameter :attr:`actual_shape` will be deprecated in the
future and only use :attr:`out_shape` instead.
......@@ -8652,13 +8667,11 @@ def resize_nearest(input,
For scale:
if align_corners = True && out_size > 1 :
scale_factor = (in_size-1.0)/(out_size-1.0)
else:
scale_factor = float(in_size/out_size)
Nearest neighbor interpolation:
......@@ -8685,19 +8698,16 @@ def resize_nearest(input,
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
Args:
input(${x_type}): input should be a 4-D tensor of shape
(num_batches, channls, in_h, in_w).
input(${x_type}): 4-D Tensor, its data type is float32, float64, or uint8,
its data format is specified by :attr:`data_format`.
out_shape(list|tuple|Variable|None): Output shape of resize nearest
layer, the shape is (out_h, out_w). Default: None. If a list, each
element can be an integer or a Tensor Variable with shape: [1]. If a
Tensor Variable, its dimension size should be 1.
scale(float|Variable|None): The multiplier for the input height or width. At
least one of :attr:`out_shape` or :attr:`scale` must be set.
And :attr:`out_shape` has a higher priority than :attr:`scale`.
Default: None.
name(str|None): The output variable name.
actual_shape(Variable): An optional input to specify output shape
dynamically. If provided, image resize
......@@ -8713,9 +8723,13 @@ def resize_nearest(input,
errors would occur in the graph constructing stage.
Default: None
align_corners(bool): ${align_corners_comment}
data_format(str, optional): NCHW(num_batches, channels, height, width) or
NHWC(num_batches, height, width, channels).
Default: 'NCHW'.
Returns:
A 4-D tensor in shape of (num_batches, channels, out_h, out_w)
A 4-D Tensor in shape of (num_batches, channels, out_h, out_w) or
(num_batches, out_h, out_w, channels).
Examples:
.. code-block:: python
......@@ -8746,11 +8760,18 @@ def resize_nearest(input,
scale_tensor = fluid.layers.data(name="scale", shape=[1], dtype="float32", append_batch_size=False)
out4 = fluid.layers.resize_nearest(input, scale=scale_tensor)
# out4.shape = [-1, 3, -1, -1]
"""
return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape,
align_corners)
return image_resize(
input,
out_shape,
scale,
name,
'NEAREST',
actual_shape,
align_corners,
align_mode=1,
data_format=data_format)
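And for nearest-neighbor resizing of a channel-last image, a sketch matching the NHWC case in TestNearestAPI below:

```python
import paddle.fluid as fluid

# Channel-last input: each sample is 6 x 6 x 3 (H, W, C).
img = fluid.layers.data(name="img", shape=[6, 6, 3], dtype="float32")

# Output is (num_batches, 12, 12, 3) in NHWC.
out = fluid.layers.resize_nearest(img, out_shape=[12, 12], data_format='NHWC')
```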
def image_resize_short(input, out_short_len, resample='BILINEAR'):
......
......@@ -27,8 +27,11 @@ def bilinear_interp_np(input,
out_size=None,
actual_shape=None,
align_corners=True,
align_mode=0):
align_mode=0,
data_layout='NCHW'):
"""bilinear interpolation implement in shape [N, C, H, W]"""
if data_layout == "NHWC":
input = np.transpose(input, (0, 3, 1, 2)) # NHWC => NCHW
if out_size is not None:
out_h = out_size[0]
out_w = out_size[1]
......@@ -83,6 +86,10 @@ def bilinear_interp_np(input,
w1lambda*input[:, :, h, w+wid]) + \
h1lambda*(w2lambda*input[:, :, h+hid, w] +
w1lambda*input[:, :, h+hid, w+wid])
if data_layout == "NHWC":
out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC
return out.astype(input.dtype)
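As a quick sanity check of the new data_layout branch (a sketch assuming only NumPy and the reference helper above), the NHWC path should agree with the NCHW path up to a transpose:

```python
import numpy as np

x_nchw = np.random.rand(2, 3, 4, 4).astype("float32")
x_nhwc = np.transpose(x_nchw, (0, 2, 3, 1))

ref = bilinear_interp_np(x_nchw, 8, 8)                      # NCHW reference
out = bilinear_interp_np(x_nhwc, 8, 8, data_layout="NHWC")  # channel-last path
assert np.allclose(np.transpose(out, (0, 3, 1, 2)), ref)
```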
......@@ -90,20 +97,28 @@ class TestBilinearInterpOp(OpTest):
def setUp(self):
self.out_size = None
self.actual_shape = None
self.data_layout = 'NCHW'
self.init_test_case()
self.op_type = "bilinear_interp"
input_np = np.random.random(self.input_shape).astype("float32")
if self.data_layout == "NCHW":
in_h = self.input_shape[2]
in_w = self.input_shape[3]
else:
in_h = self.input_shape[1]
in_w = self.input_shape[2]
if self.scale > 0:
out_h = int(self.input_shape[2] * self.scale)
out_w = int(self.input_shape[3] * self.scale)
out_h = int(in_h * self.scale)
out_w = int(in_w * self.scale)
else:
out_h = self.out_h
out_w = self.out_w
output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size,
self.actual_shape, self.align_corners,
self.align_mode)
self.align_mode, self.data_layout)
self.inputs = {'X': input_np}
if self.out_size is not None:
self.inputs['OutSize'] = self.out_size
......@@ -116,7 +131,8 @@ class TestBilinearInterpOp(OpTest):
'scale': self.scale,
'interp_method': self.interp_method,
'align_corners': self.align_corners,
'align_mode': self.align_mode
'align_mode': self.align_mode,
'data_layout': self.data_layout
}
self.outputs = {'Out': output_np}
......@@ -229,6 +245,19 @@ class TestBilinearInterpActualShape(TestBilinearInterpOp):
self.align_mode = 1
class TestBilinearInterpDataLayout(TestBilinearInterpOp):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [2, 4, 4, 3]
self.out_h = 2
self.out_w = 2
self.scale = 0.
self.out_size = np.array([3, 3]).astype("int32")
self.align_corners = True
self.align_mode = 1
self.data_layout = "NHWC"
class TestBilinearInterpOpUint8(OpTest):
def setUp(self):
self.out_size = None
......
......@@ -26,8 +26,11 @@ def nearest_neighbor_interp_np(X,
out_w,
out_size=None,
actual_shape=None,
align_corners=True):
align_corners=True,
data_layout='NCHW'):
"""nearest neighbor interpolation implement in shape [N, C, H, W]"""
if data_layout == "NHWC":
X = np.transpose(X, (0, 3, 1, 2)) # NHWC => NCHW
if out_size is not None:
out_h = out_size[0]
out_w = out_size[1]
......@@ -63,6 +66,9 @@ def nearest_neighbor_interp_np(X,
in_j = int(ratio_w * j)
out[:, :, i, j] = X[:, :, in_i, in_j]
if data_layout == "NHWC":
out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC
return out.astype(X.dtype)
......@@ -70,20 +76,28 @@ class TestNearestInterpOp(OpTest):
def setUp(self):
self.out_size = None
self.actual_shape = None
self.data_layout = 'NCHW'
self.init_test_case()
self.op_type = "nearest_interp"
input_np = np.random.random(self.input_shape).astype("float32")
if self.data_layout == "NCHW":
in_h = self.input_shape[2]
in_w = self.input_shape[3]
else:
in_h = self.input_shape[1]
in_w = self.input_shape[2]
if self.scale > 0:
out_h = int(self.input_shape[2] * self.scale)
out_w = int(self.input_shape[3] * self.scale)
out_h = int(in_h * self.scale)
out_w = int(in_w * self.scale)
else:
out_h = self.out_h
out_w = self.out_w
output_np = nearest_neighbor_interp_np(input_np, out_h, out_w,
self.out_size, self.actual_shape,
self.align_corners)
output_np = nearest_neighbor_interp_np(
input_np, out_h, out_w, self.out_size, self.actual_shape,
self.align_corners, self.data_layout)
self.inputs = {'X': input_np}
if self.out_size is not None:
self.inputs['OutSize'] = self.out_size
......@@ -95,6 +109,7 @@ class TestNearestInterpOp(OpTest):
'scale': self.scale,
'interp_method': self.interp_method,
'align_corners': self.align_corners,
'data_layout': self.data_layout
}
self.outputs = {'Out': output_np}
......@@ -198,6 +213,18 @@ class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
self.align_corners = True
class TestNearestNeighborInterpDataLayout(TestNearestInterpOp):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [2, 4, 4, 5]
self.out_h = 2
self.out_w = 2
self.scale = 0.
self.out_size = np.array([3, 8]).astype("int32")
self.align_corners = True
self.data_layout = "NHWC"
class TestNearestInterpOpUint8(OpTest):
def setUp(self):
self.out_size = None
......@@ -399,6 +426,7 @@ class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor):
class TestNearestAPI(OpTest):
def test_case(self):
x = fluid.layers.data(name="x", shape=[3, 6, 6], dtype="float32")
y = fluid.layers.data(name="y", shape=[6, 6, 3], dtype="float32")
dim = fluid.layers.data(
name="dim", shape=[1], dtype="int32", append_batch_size=False)
......@@ -418,7 +446,8 @@ class TestNearestAPI(OpTest):
dtype="float32",
append_batch_size=False)
out1 = fluid.layers.resize_nearest(x, out_shape=[12, 12])
out1 = fluid.layers.resize_nearest(
y, out_shape=[12, 12], data_format='NHWC')
out2 = fluid.layers.resize_nearest(x, out_shape=[12, dim])
out3 = fluid.layers.resize_nearest(x, out_shape=shape_tensor)
out4 = fluid.layers.resize_nearest(
......@@ -436,6 +465,7 @@ class TestNearestAPI(OpTest):
results = exe.run(fluid.default_main_program(),
feed={
"x": x_data,
"y": np.transpose(x_data, (0, 2, 3, 1)),
"dim": dim_data,
"shape_tensor": shape_data,
"actual_size": actual_size_data,
......@@ -446,8 +476,20 @@ class TestNearestAPI(OpTest):
expect_res = nearest_neighbor_interp_np(
x_data, out_h=12, out_w=12, align_corners=True)
for res in results:
self.assertTrue(np.allclose(res, expect_res))
self.assertTrue(
np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 1))))
for i in range(len(results) - 1):
self.assertTrue(np.allclose(results[i + 1], expect_res))
def test_exception(self):
# for 4-D input, data_format can only be NCHW or NHWC
input = fluid.layers.data(
name="input", shape=[3, 6, 6], dtype="float32")
try:
out = fluid.layers.resize_nearest(
input, out_shape=[4, 8], data_format='NDHWC')
except:
pass
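A stricter variant of this exception test (a sketch, not part of the patch) would assert the ValueError instead of swallowing it:

```python
import unittest
import paddle.fluid as fluid

class TestResizeNearestBadLayout(unittest.TestCase):
    def test_bad_data_format(self):
        # For 4-D input, data_format must be NCHW or NHWC, so NDHWC should raise.
        x = fluid.layers.data(name="x_bad_layout", shape=[3, 6, 6], dtype="float32")
        with self.assertRaises(ValueError):
            fluid.layers.resize_nearest(x, out_shape=[4, 8], data_format='NDHWC')
```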
if __name__ == "__main__":
......
......@@ -28,8 +28,11 @@ def trilinear_interp_np(input,
out_size=None,
actual_shape=None,
align_corners=True,
align_mode=0):
align_mode=0,
data_layout='NCDHW'):
"""trilinear interpolation implement in shape [N, C, D, H, W]"""
if data_layout == "NDHWC":
input = np.transpose(input, (0, 4, 1, 2, 3)) # NDHWC => NCDHW
if out_size is not None:
out_d = out_size[0]
out_h = out_size[1]
......@@ -114,6 +117,9 @@ def trilinear_interp_np(input,
w1lambda * input[:, :, d+did, h, w+wid]) + \
h1lambda * (w2lambda * input[:, :, d+did, h+hid, w] + \
w1lambda * input[:, :, d+did, h+hid, w+wid]))
if data_layout == "NDHWC":
out = np.transpose(out, (0, 2, 3, 4, 1)) # NCDHW => NDHWC
return out.astype(input.dtype)
......@@ -121,28 +127,42 @@ class TestTrilinearInterpOp(OpTest):
def setUp(self):
self.out_size = None
self.actual_shape = None
self.data_layout = 'NCDHW'
self.init_test_case()
self.op_type = "trilinear_interp"
input_np = np.random.random(self.input_shape).astype("float32")
if self.data_layout == "NCDHW":
in_d = self.input_shape[2]
in_h = self.input_shape[3]
in_w = self.input_shape[4]
else:
in_d = self.input_shape[1]
in_h = self.input_shape[2]
in_w = self.input_shape[3]
if self.scale > 0:
out_d = int(self.input_shape[2] * self.scale)
out_h = int(self.input_shape[3] * self.scale)
out_w = int(self.input_shape[4] * self.scale)
out_d = int(in_d * self.scale)
out_h = int(in_h * self.scale)
out_w = int(in_w * self.scale)
else:
out_d = self.out_d
out_h = self.out_h
out_w = self.out_w
output_np = trilinear_interp_np(input_np, out_d, out_h, out_w,
self.out_size, self.actual_shape,
self.align_corners, self.align_mode)
output_np = trilinear_interp_np(
input_np, out_d, out_h, out_w, self.out_size, self.actual_shape,
self.align_corners, self.align_mode, self.data_layout)
self.inputs = {'X': input_np}
if self.out_size is not None:
self.inputs['OutSize'] = self.out_size
if self.actual_shape is not None:
self.inputs['OutSize'] = self.actual_shape
# c++ end treats NCDHW the same way as NCHW
if self.data_layout == 'NCDHW':
data_layout = 'NCHW'
else:
data_layout = 'NHWC'
self.attrs = {
'out_d': self.out_d,
'out_h': self.out_h,
......@@ -150,7 +170,8 @@ class TestTrilinearInterpOp(OpTest):
'scale': self.scale,
'interp_method': self.interp_method,
'align_corners': self.align_corners,
'align_mode': self.align_mode
'align_mode': self.align_mode,
'data_layout': data_layout
}
self.outputs = {'Out': output_np}
......@@ -284,6 +305,20 @@ class TestTrilinearInterpActualShape(TestTrilinearInterpOp):
self.align_mode = 1
class TestTrilinearInterpDatalayout(TestTrilinearInterpOp):
def init_test_case(self):
self.interp_method = 'trilinear'
self.input_shape = [2, 4, 4, 4, 3]
self.out_d = 2
self.out_h = 2
self.out_w = 2
self.scale = 0.
self.out_size = np.array([3, 3, 3]).astype("int32")
self.align_corners = True
self.align_mode = 1
self.data_layout = "NDHWC"
class TestTrilinearInterpOpUint8(OpTest):
def setUp(self):
self.out_size = None
......@@ -536,6 +571,7 @@ class TestTrilinearInterp_attr_tensor_Case3(TestTrilinearInterpOp_attr_tensor):
class TestTrilinearInterpAPI(OpTest):
def test_case(self):
x = fluid.layers.data(name="x", shape=[3, 6, 9, 4], dtype="float32")
y = fluid.layers.data(name="y", shape=[6, 9, 4, 3], dtype="float32")
dim = fluid.layers.data(name="dim", shape=[1], dtype="int32")
shape_tensor = fluid.layers.data(
......@@ -554,7 +590,8 @@ class TestTrilinearInterpAPI(OpTest):
dtype="float32",
append_batch_size=False)
out1 = fluid.layers.resize_trilinear(x, out_shape=[12, 18, 8])
out1 = fluid.layers.resize_trilinear(
y, out_shape=[12, 18, 8], data_format='NDHWC')
out2 = fluid.layers.resize_trilinear(x, out_shape=[12, dim, 8])
out3 = fluid.layers.resize_trilinear(x, out_shape=shape_tensor)
out4 = fluid.layers.resize_trilinear(
......@@ -572,6 +609,7 @@ class TestTrilinearInterpAPI(OpTest):
results = exe.run(fluid.default_main_program(),
feed={
"x": x_data,
"y": np.transpose(x_data, (0, 2, 3, 4, 1)),
"dim": dim_data,
"shape_tensor": shape_data,
"actual_size": actual_size_data,
......@@ -582,8 +620,20 @@ class TestTrilinearInterpAPI(OpTest):
expect_res = trilinear_interp_np(
x_data, out_d=12, out_h=18, out_w=8, align_mode=1)
for res in results:
self.assertTrue(np.allclose(res, expect_res))
self.assertTrue(
np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 4, 1))))
for i in range(len(results) - 1):
self.assertTrue(np.allclose(results[i + 1], expect_res))
def test_exception(self):
input = fluid.layers.data(
name="input", shape=[3, 6, 9, 4], dtype="float32")
try:
# for 5-D input, data_format can only be NCDHW or NDHWC
out = fluid.layers.resize_trilinear(
input, out_shape=[4, 8, 4], data_format='NHWC')
except:
pass
if __name__ == "__main__":
......