Commit 08589650 authored by: G guo-ran

Merge branch 'master' of https://github.com/Oneflow-Inc/oneflow into dev_partial_fc_mirror


Former-commit-id: f2fcb711d12da1b42cfee85c18c8acafd3583c17
......@@ -18,6 +18,7 @@ runs:
set -x
src_dir=${PWD}
tmp_dir="${{ inputs.tmp_dir }}"
mkdir -p ${tmp_dir}
cd ${tmp_dir}
docker run --rm -v $PWD:/p -w /p busybox rm -rf /p/wheelhouse
python3 ${src_dir}/docker/package/manylinux/build_wheel.py \
......
......@@ -187,41 +187,48 @@ inline bool MaybeIsOk(Maybe<void>&& maybe) {
#if defined(__GNUC__) || defined(__CUDACC__) || defined(__clang__)
// fix CUDA 11.1 compiler crashes
#if defined(__CUDACC__)
#define MAYBE_CONST_AUTO_REF const auto
#else
#define MAYBE_CONST_AUTO_REF const auto&
#endif // defined(__CUDACC__)
#define TRY(...) __MaybeErrorStackCheckWrapper__(__VA_ARGS__)
#define JUST(...) \
({ \
const auto& maybe = __MaybeErrorStackCheckWrapper__(__VA_ARGS__); \
if (!maybe.IsOk()) { \
auto* stack_frame = maybe.error()->add_stack_frame(); \
stack_frame->set_location(MAYBE_FAILED_LOC); \
stack_frame->set_function(__FUNCTION__); \
return maybe.error(); \
} \
maybe; \
}) \
#define JUST(...) \
({ \
MAYBE_CONST_AUTO_REF maybe = __MaybeErrorStackCheckWrapper__(__VA_ARGS__); \
if (!maybe.IsOk()) { \
auto* stack_frame = maybe.error()->add_stack_frame(); \
stack_frame->set_location(MAYBE_FAILED_LOC); \
stack_frame->set_function(__FUNCTION__); \
return maybe.error(); \
} \
maybe; \
}) \
.Data_YouAreNotAllowedToCallThisFuncOutsideThisFile()
#define CHECK_JUST(...) \
({ \
const auto& maybe = __MaybeErrorStackCheckWrapper__(__VA_ARGS__); \
if (!maybe.IsOk()) { \
auto* stack_frame = maybe.error()->add_stack_frame(); \
stack_frame->set_location(MAYBE_FAILED_LOC); \
stack_frame->set_function(__FUNCTION__); \
LOG(FATAL) << maybe.GetSerializedError(); \
} \
maybe; \
}) \
#define CHECK_JUST(...) \
({ \
MAYBE_CONST_AUTO_REF maybe = __MaybeErrorStackCheckWrapper__(__VA_ARGS__); \
if (!maybe.IsOk()) { \
auto* stack_frame = maybe.error()->add_stack_frame(); \
stack_frame->set_location(MAYBE_FAILED_LOC); \
stack_frame->set_function(__FUNCTION__); \
LOG(FATAL) << maybe.GetSerializedError(); \
} \
maybe; \
}) \
.Data_YouAreNotAllowedToCallThisFuncOutsideThisFile()
#define CHECK_OK(...) CHECK(MaybeIsOk(std::move(__VA_ARGS__)))
#define OF_RETURN_IF_ERROR(...) \
const auto& maybe_##__LINE__ = __MaybeErrorStackCheckWrapper__(__VA_ARGS__); \
if (!maybe_##__LINE__.IsOk()) { \
auto* stack_frame = maybe_##__LINE__.error()->add_stack_frame(); \
stack_frame->set_location(MAYBE_FAILED_LOC); \
stack_frame->set_function(__FUNCTION__); \
return maybe_##__LINE__.error(); \
#define OF_RETURN_IF_ERROR(...) \
MAYBE_CONST_AUTO_REF maybe_##__LINE__ = __MaybeErrorStackCheckWrapper__(__VA_ARGS__); \
if (!maybe_##__LINE__.IsOk()) { \
auto* stack_frame = maybe_##__LINE__.error()->add_stack_frame(); \
stack_frame->set_location(MAYBE_FAILED_LOC); \
stack_frame->set_function(__FUNCTION__); \
return maybe_##__LINE__.error(); \
}
#else
......
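For reference, a minimal usage sketch of these macros (not part of this diff; `Step` and `Pipeline` are hypothetical): `JUST` propagates a failed `Maybe` to the caller, appending a stack frame at each hop, while `CHECK_JUST` aborts with the serialized error. Under nvcc the statement-expression temporary is now held by value (`const auto`) rather than by reference, which is the workaround for the CUDA 11.1 compiler crash.

Maybe<void> Step();  // hypothetical callee that may fail

Maybe<void> Pipeline() {
  JUST(Step());  // on failure: record a stack frame and return the error upward
  JUST(Step());
  return Maybe<void>::Ok();
}

void Run() {
  CHECK_JUST(Pipeline());  // on failure: LOG(FATAL) with the serialized error stack
}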
......@@ -143,7 +143,8 @@ inline uint32_t NewRandomSeed() {
#define DIM_SEQ \
OF_PP_MAKE_TUPLE_SEQ(1) \
OF_PP_MAKE_TUPLE_SEQ(2) OF_PP_MAKE_TUPLE_SEQ(3) OF_PP_MAKE_TUPLE_SEQ(4) OF_PP_MAKE_TUPLE_SEQ(5)
OF_PP_MAKE_TUPLE_SEQ(2) \
OF_PP_MAKE_TUPLE_SEQ(3) OF_PP_MAKE_TUPLE_SEQ(4) OF_PP_MAKE_TUPLE_SEQ(5) OF_PP_MAKE_TUPLE_SEQ(6)
#define BOOL_SEQ (true)(false)
......
......@@ -56,7 +56,8 @@ class ArgWhereKernel : public KernelIf<DeviceType::kCPU> {
REGISTER_ARG_WHERE_KERNEL(device_type_v, dtype, itype, 2) \
REGISTER_ARG_WHERE_KERNEL(device_type_v, dtype, itype, 3) \
REGISTER_ARG_WHERE_KERNEL(device_type_v, dtype, itype, 4) \
REGISTER_ARG_WHERE_KERNEL(device_type_v, dtype, itype, 5)
REGISTER_ARG_WHERE_KERNEL(device_type_v, dtype, itype, 5) \
REGISTER_ARG_WHERE_KERNEL(device_type_v, dtype, itype, 6)
#define REGISTER_ARG_WHERE_KERNELS(device_type_v, dtype_pair, itype_pair) \
REGISTER_ARG_WHERE_KERNELS_AT_NDIMS(device_type_v, OF_PP_PAIR_FIRST(dtype_pair), \
......
......@@ -41,7 +41,9 @@ struct ArgWhereKernelUtil {
INSTANTIATE_ARG_WHERE_KERNEL_UTIL_INTERNAL(device_type_v, OF_PP_PAIR_FIRST(dtype_pair), \
OF_PP_PAIR_FIRST(itype_pair), 4) \
INSTANTIATE_ARG_WHERE_KERNEL_UTIL_INTERNAL(device_type_v, OF_PP_PAIR_FIRST(dtype_pair), \
OF_PP_PAIR_FIRST(itype_pair), 5)
OF_PP_PAIR_FIRST(itype_pair), 5) \
INSTANTIATE_ARG_WHERE_KERNEL_UTIL_INTERNAL(device_type_v, OF_PP_PAIR_FIRST(dtype_pair), \
OF_PP_PAIR_FIRST(itype_pair), 6)
} // namespace oneflow
......
......@@ -270,7 +270,7 @@ struct BinaryFuncFloorMod<half> final {
#if __CUDA_ARCH__ >= 530
const half trunc_mod = __float2half(fmodf(__half2float(x), __half2float(y)));
return __hne(trunc_mod, GetZeroVal<half>())
&& __hne(__hlt(y, GetZeroVal<half>()), __hlt(trunc_mod, half(0)))
&& __hlt(y, GetZeroVal<half>()) != __hlt(trunc_mod, half(0))
? trunc_mod + y
: trunc_mod;
#else
......
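The fixed condition compares the two sign tests as booleans with `!=` instead of passing them to `__hne`, which expects half operands. As an illustration (not code from this diff), the same trunc-mod-to-floor-mod correction in plain float:

#include <cmath>

// Floor-mod from trunc-mod: fmod keeps the sign of x, while floor-mod must take
// the sign of y, so add y once when the signs differ and the remainder is non-zero.
inline float FloorModSketch(float x, float y) {
  const float trunc_mod = std::fmod(x, y);
  return (trunc_mod != 0.f) && ((y < 0.f) != (trunc_mod < 0.f)) ? trunc_mod + y : trunc_mod;
}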
......@@ -60,6 +60,7 @@ SPECIALIZE_XPU_BROADCAST_NDARRAY_UTIL(1);
SPECIALIZE_XPU_BROADCAST_NDARRAY_UTIL(2);
SPECIALIZE_XPU_BROADCAST_NDARRAY_UTIL(3);
SPECIALIZE_XPU_BROADCAST_NDARRAY_UTIL(4);
SPECIALIZE_XPU_BROADCAST_NDARRAY_UTIL(5);
#undef SPECIALIZE_XPU_BROADCAST_NDARRAY_UTIL
#undef IMPLACE_SET_SRC_COORD
......
......@@ -110,6 +110,7 @@ SPECIALIZE_XPU_SHAPE_UTIL(0);
SPECIALIZE_XPU_SHAPE_UTIL(1);
SPECIALIZE_XPU_SHAPE_UTIL(2);
SPECIALIZE_XPU_SHAPE_UTIL(3);
SPECIALIZE_XPU_SHAPE_UTIL(4);
#undef SPECIALIZE_XPU_SHAPE_UTIL
#undef EXTRACT_COORD
#undef COORD_MUL_STRIDE
......
......@@ -156,6 +156,59 @@ def gather(
)
@oneflow_export("flatten")
def flatten(
input: remote_blob_util.BlobDef,
start_dim: int = 0,
end_dim: int = -1,
name: Optional[str] = None,
) -> remote_blob_util.BlobDef:
r"""Flattens a contiguous range of dims in a Blob.
Args:
input: A `Blob`.
start_dim: The first dim to flatten.
end_dim: The last dim to flatten.
name: A name for the operation (optional).
Returns:
A `Blob` with the same type as `input`.
For example:
.. code-block:: python
import oneflow as flow
import numpy as np
import oneflow.typing as tp
@flow.global_function()
def flatten_Job(input: tp.Numpy.Placeholder(shape=(4, 4, 3, 2), dtype=flow.float32)
) -> tp.Numpy:
flatten_blob = flow.flatten(input, start_dim=1, end_dim=-1)
return flatten_blob
input = np.zeros((4, 4, 3, 2)).astype(np.float32)
out = flatten_Job(input)
# out.shape (4, 24)
"""
if name is None:
name = id_util.UniqueStr("Flatten_")
return (
flow.user_op_builder(name)
.Op("flatten")
.Input("in", [input])
.Output("out")
.Attr("start_dim", start_dim)
.Attr("end_dim", end_dim)
.Build()
.InferAndTryRun()
.RemoteBlobList()[0]
)
def infer_shape(x, shape):
dim_index_need_infer = shape.index(-1) if shape.count(-1) == 1 else None
in_elem_cnt = reduce(operator.mul, x.shape, 1)
......
"""
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import unittest
from collections import OrderedDict
import numpy as np
import oneflow as flow
from test_util import GenArgList
import test_global_storage
def compare_with_numpy(test_case, device_type, input_shape, start_end_dim):
assert device_type in ["gpu", "cpu"]
flow.clear_default_session()
func_config = flow.FunctionConfig()
func_config.default_data_type(flow.float)
start_dim = start_end_dim[0]
end_dim = start_end_dim[1]
@flow.global_function(type="train", function_config=func_config)
def FlattenJob() -> flow.typing.Numpy:
with flow.scope.placement(device_type, "0:0"):
x = flow.get_variable(
"in",
shape=input_shape,
dtype=flow.float,
initializer=flow.random_uniform_initializer(minval=2, maxval=5),
trainable=True,
)
loss = flow.flatten(x, start_dim=start_dim, end_dim=end_dim)
flow.optimizer.SGD(
flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
).minimize(loss)
flow.watch(x, test_global_storage.Setter("x"))
flow.watch_diff(x, test_global_storage.Setter("x_diff"))
return loss
# OneFlow
check_point = flow.train.CheckPoint()
check_point.init()
of_out = FlattenJob()
# Numpy
of_x = test_global_storage.Get("x")
of_x_shape = of_x.shape
of_x_diff = test_global_storage.Get("x_diff")
true_end_dim = end_dim + len(of_x_shape) if end_dim < 0 else end_dim
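# Reference shape: keep dims before start_dim, collapse [start_dim, true_end_dim]
# into a single dim, and keep the remaining dims unchanged.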
new_shape = []
for i in range(0, start_dim):
new_shape.append(of_x_shape[i])
flatten_dim = 1
for i in range(start_dim, true_end_dim + 1):
flatten_dim *= of_x_shape[i]
new_shape.append(flatten_dim)
for i in range(true_end_dim + 1, len(of_x_shape)):
new_shape.append(of_x_shape[i])
np_out = np.reshape(of_x, tuple(new_shape))
test_case.assertTrue(of_out.shape == np_out.shape)
test_case.assertTrue(np.allclose(of_out, np_out, rtol=1e-5, atol=1e-5))
test_case.assertTrue(
np.allclose(of_x_diff, np.ones(of_x_diff.shape), rtol=1e-5, atol=1e-5)
)
@flow.unittest.skip_unless_1n1d()
class TestFlatten(flow.unittest.TestCase):
def test_flatten(test_case):
arg_dict = OrderedDict()
arg_dict["test_case"] = [test_case]
arg_dict["device_type"] = ["gpu", "cpu"]
arg_dict["input_shape"] = [(2, 3, 4, 5)]
arg_dict["start_end_dim"] = [(0, -1), (1, 3), (2, -2)]
for arg in GenArgList(arg_dict):
compare_with_numpy(*arg)
if __name__ == "__main__":
unittest.main()
......@@ -113,6 +113,14 @@ class TestTranspose(flow.unittest.TestCase):
x = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]]).astype(np.float32)
transpose_batchaxis_non_change_job(x)
def test_transpose_dim6(test_case):
arg_dict = OrderedDict()
arg_dict["device_type"] = ["gpu", "cpu"]
arg_dict["input_shape"] = [(2, 3, 4, 5, 6, 7)]
arg_dict["perm"] = [(2, 0, 1, 3, 5, 4)]
for arg in GenArgList(arg_dict):
compare_with_tensorflow(*arg)
if __name__ == "__main__":
unittest.main()
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/user/kernels/copy_data_content_kernel.h"
namespace oneflow {
#define REGISTER_FLATTEN_KERNEL(device) \
REGISTER_USER_KERNEL("flatten") \
.SetCreateFn<CopyDataContentKernel<device>>() \
.SetIsMatchedHob(user_op::HobDeviceTag() == device) \
.SetInplaceProposalFn([](const user_op::InferContext&, \
user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false)); \
return Maybe<void>::Ok(); \
});
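// The flatten kernel only copies data contents, so the proposal above lets "out"
// be computed inplace on "in".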
REGISTER_FLATTEN_KERNEL(DeviceType::kCPU)
#ifdef WITH_CUDA
REGISTER_FLATTEN_KERNEL(DeviceType::kGPU)
#endif
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
namespace oneflow {
namespace {
Maybe<void> GetSbpFn(user_op::SbpContext* ctx) {
const auto& in_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("in", 0).shape();
const int32_t start_dim = ctx->Attr<int32_t>("start_dim");
const int32_t end_dim = ctx->Attr<int32_t>("end_dim");
CHECK_GE_OR_RETURN(start_dim, 0);
CHECK_LT_OR_RETURN(start_dim, in_shape.NumAxes());
const int32_t true_end_dim = end_dim < 0 ? end_dim + in_shape.NumAxes() : end_dim;
CHECK_GE_OR_RETURN(true_end_dim, 0);
CHECK_LT_OR_RETURN(true_end_dim, in_shape.NumAxes());
CHECK_LE_OR_RETURN(start_dim, true_end_dim);
for (int i = 0; i <= start_dim; ++i) {
ctx->NewBuilder().Split(user_op::OpArg("in", 0), i).Split(user_op::OpArg("out", 0), i).Build();
}
const int32_t diff = true_end_dim - start_dim;
for (int i = true_end_dim + 1; i < in_shape.NumAxes(); ++i) {
ctx->NewBuilder()
.Split(user_op::OpArg("in", 0), i)
.Split(user_op::OpArg("out", 0), i - diff)
.Build();
}
ctx->NewBuilder().PartialSum(ctx->inputs()).PartialSum(ctx->outputs()).Build();
return Maybe<void>::Ok();
}
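// Illustrative example: for in_shape (2, 3, 4, 5) with start_dim=1 and end_dim=2,
// the output shape is (2, 12, 5) and diff is 1. The builders above allow
// Split(in, 0) <-> Split(out, 0), Split(in, 1) <-> Split(out, 1) (the leading axis
// of the flattened group), Split(in, 3) <-> Split(out, 2), and PartialSum <-> PartialSum.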
Maybe<void> TensorDescInferFn(user_op::InferContext* ctx) {
const int32_t start_dim = ctx->Attr<int32_t>("start_dim");
const int32_t end_dim = ctx->Attr<int32_t>("end_dim");
const user_op::TensorDesc* in_tensor_desc = ctx->TensorDesc4ArgNameAndIndex("in", 0);
user_op::TensorDesc* out_tensor_desc = ctx->TensorDesc4ArgNameAndIndex("out", 0);
const Shape& in_shape = in_tensor_desc->shape();
CHECK_GE_OR_RETURN(start_dim, 0);
CHECK_LT_OR_RETURN(start_dim, in_shape.NumAxes());
const int32_t true_end_dim = end_dim < 0 ? end_dim + in_shape.NumAxes() : end_dim;
CHECK_GE_OR_RETURN(true_end_dim, 0);
CHECK_LT_OR_RETURN(true_end_dim, in_shape.NumAxes());
CHECK_LE_OR_RETURN(start_dim, true_end_dim);
*out_tensor_desc = *in_tensor_desc;
Shape* out_shape = out_tensor_desc->mut_shape();
DimVector dim_vec;
for (int i = 0; i < start_dim; ++i) { dim_vec.push_back(in_shape.At(i)); }
int64_t flatten_dim = 1;
for (int i = start_dim; i <= true_end_dim; ++i) { flatten_dim *= in_shape.At(i); }
dim_vec.push_back(flatten_dim);
for (int i = true_end_dim + 1; i < in_shape.NumAxes(); ++i) { dim_vec.push_back(in_shape.At(i)); }
*out_shape = Shape(dim_vec);
CHECK_EQ_OR_RETURN(out_shape->elem_cnt(), in_shape.elem_cnt());
return Maybe<void>::Ok();
}
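// Example: for in_shape (4, 4, 3, 2) with start_dim=1 and end_dim=-1, true_end_dim
// is 3, flatten_dim is 4 * 3 * 2 = 24, and the output shape is (4, 24).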
Maybe<void> GetBatchAxisInferFn(user_op::BatchAxisContext* ctx) {
const int32_t start_dim = ctx->Attr<int32_t>("start_dim");
const int32_t end_dim = ctx->Attr<int32_t>("end_dim");
const auto& in_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("in", 0).shape();
CHECK_GE_OR_RETURN(start_dim, 0);
CHECK_LT_OR_RETURN(start_dim, in_shape.NumAxes());
const int32_t true_end_dim = end_dim < 0 ? end_dim + in_shape.NumAxes() : end_dim;
CHECK_GE_OR_RETURN(true_end_dim, 0);
CHECK_LT_OR_RETURN(true_end_dim, in_shape.NumAxes());
CHECK_LE_OR_RETURN(start_dim, true_end_dim);
const int64_t input_batch_axis = (*ctx->BatchAxis4ArgNameAndIndex("in", 0)).value();
OptInt64 output_batch_axis;
if (input_batch_axis < start_dim) {
output_batch_axis.set_value(input_batch_axis);
} else if (input_batch_axis >= start_dim && input_batch_axis <= true_end_dim) {
output_batch_axis.set_value(start_dim);
} else if (input_batch_axis > true_end_dim) {
output_batch_axis.set_value(input_batch_axis - (true_end_dim - start_dim));
}
*ctx->BatchAxis4ArgNameAndIndex("out", 0) = output_batch_axis;
return Maybe<void>::Ok();
}
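// Example: with start_dim=1 and end_dim=-1, a batch axis of 0 stays 0; a batch axis
// inside [start_dim, true_end_dim] collapses to start_dim; a batch axis past the
// flattened range shifts left by (true_end_dim - start_dim).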
REGISTER_USER_OP("flatten")
.Input("in")
.Output("out")
.Attr<int32_t>("start_dim", 0)
.Attr<int32_t>("end_dim", -1)
.SetTensorDescInferFn(TensorDescInferFn)
.SetGetSbpFn(GetSbpFn)
.SetBatchAxisInferFn(GetBatchAxisInferFn);
REGISTER_USER_OP_GRAD("flatten").SetGenBackwardOpConfFn([](const user_op::UserOpWrapper& op,
user_op::AddOpFn AddOp) {
if (op.NeedGenGradTensor4OpInput("in", 0)) {
user_op::UserOpConfWrapperBuilder builder(op.op_name() + "_grad");
user_op::UserOpConfWrapper reshape_grad_op =
builder.Op("reshape_like")
.Input("in", op.GetGradTensorWithOpOutput("out", 0))
.Input("like", op.input("in", 0))
.Output("out")
.Build();
op.BindGradTensorWithOpInput(reshape_grad_op.output("out", 0), "in", 0);
AddOp(reshape_grad_op);
}
});
} // namespace
} // namespace oneflow