Merge pull request #419 from dolphin8/develop

#418 a faster transpose implementation

Merge pull request #419 from dolphin8/develop
#418 a faster transpose implementation
e83a5963 · eclipsycn · GitHub · cc07fff2 · b3781d28 · e83a5963
显示空白变更内容
内联并排

Showing with 44 addition and 35 deletion

src/operators/kernel/arm/transpose_kernel.cpp src/operators/kernel/arm/transpose_kernel.cpp +44 -35

未找到文件。
--- a/src/operators/kernel/arm/transpose_kernel.cpp
+++ b/src/operators/kernel/arm/transpose_kernel.cpp
@@ -11,29 +11,28 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #ifdef TRANSPOSE_OP

 #include "operators/kernel/transpose_kernel.h"
-
 namespace paddle_mobile {
 namespace operators {

-template <typename T>
-void TransposeFunc(const int numel, const T* input, const vector<int> axis,
-                   const vector<int> old_strides, const vector<int> new_strides,
-                   T* output) {
-  for (int i = 0; i < numel; ++i) {
-    int old_idx = 0;
-    int idx = i;
-    for (int j = 0; j < axis.size(); ++j) {
-      int order = axis[j];
-      old_idx += (idx / new_strides[j]) * old_strides[order];
-      idx %= new_strides[j];
-    }
-    output[i] = input[old_idx];
-  }
-}
+// vector<int> pos;
+// template <typename T>
+// void TransposeFunc(const int numel, const T* input, const vector<int> axis,
+//                    const vector<int> old_strides, const vector<int>
+//                    new_strides, T* output) {
+//   for (int i = 0; i < numel; ++i) {
+//     int old_idx = 0;
+//     int idx = i;
+//     for (int j = 0; j < axis.size(); ++j) {
+//       int order = axis[j];
+//       old_idx += (idx / new_strides[j]) * old_strides[order];
+//       idx %= new_strides[j];
+//     }
+//     output[i] = input[old_idx];
+//   }
+// }

 template <>
 void TransposeKernel<CPU, float>::Compute(const TransposeParam& param) const {
@@ -44,28 +43,38 @@ void TransposeKernel<CPU, float>::Compute(const TransposeParam& param) const {
  const auto* input_x_data = input_x->data<float>();
  auto* out_data = out->mutable_data<float>();

-  size_t axis_size = axis.size();
-  std::vector<int> new_dims;
-  new_dims.reserve(axis_size);
-  for (auto c : axis) {
-    new_dims.push_back(input_x_dims[c]);
+  size_t ndim = axis.size();
+  std::vector<int> xdim(ndim);
+  std::vector<int> xstride(ndim);
+  std::vector<int> xout(ndim);
+  for (int i = 0; i < ndim; i++) {
+    int j = ndim - 1 - i;
+    xdim[j] = input_x_dims[axis[i]];
+    xstride[j] = 1;
+    for (int k = axis[i] + 1; k < ndim; k++) {
+      xstride[j] *= input_x_dims[k];
+    }
+    xout[j] = xstride[j] * xdim[j];
  }

-  std::vector<int> old_strides;
-  std::vector<int> new_strides;
-  for (int i = 0; i < axis.size(); i++) {
-    int temp_old = 1;
-    int temp_new = 1;
-    for (int j = i + 1; j < axis.size(); j++) {
-      temp_old *= input_x_dims[j];
-      temp_new *= new_dims[j];
+  auto numel = input_x->numel();
+  size_t pind = 0;
+  std::vector<int> ind(ndim);
+  for (int i = 0; i < numel; i++) {
+    out_data[i] = input_x_data[pind];
+    ind[0]++;
+    pind += xstride[0];
+    for (int j = 0; j < ndim - 1; j++) {
+      if (ind[j] == xdim[j]) {
+        ind[j + 1]++;
+        ind[j] = 0;
+        pind += xstride[j + 1];
+        pind -= xout[j];
+      } else {
+        break;
+      }
    }
-    old_strides.push_back(temp_old);
-    new_strides.push_back(temp_new);
  }
-
-  TransposeFunc<float>(input_x->numel(), input_x_data, axis, old_strides,
-                       new_strides, out_data);
 }

 }  // namespace operators