diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index 1b607922eda1d854567a338b51121e47064915e4..0987118ba39b6ec6893ea3914a30ff477c42d6a6 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -205,35 +205,25 @@ struct DotGradFunction> { } } #else - const auto* data_dout = tensor_dout->data(); + auto const *x = tensor_x->data(), *y = tensor_y->data(), + *dz = tensor_dout->data(); + auto&& d = tensor_x->dims(); + auto const N = tensor_x->numel(); + auto const B = d[d.size() - 1]; if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); - const auto* data_y = tensor_y->data(); - const framework::DDim& dim = tensor_x->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dx[i] = data_y[i] * data_dout[s]; + auto* dx = tensor_dx->mutable_data(ctx.GetPlace()); + for (auto j = 0; j < N / B; ++j) { + auto const ss = dz[j]; + for (auto i = 0; i < B; ++i) *dx++ = *y++ * ss; } } if (tensor_dy) { - auto* data_dy = tensor_dy->mutable_data(ctx.GetPlace()); - const auto* data_x = tensor_x->data(); - const framework::DDim& dim = tensor_y->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dy[i] = data_x[i] * data_dout[s]; + auto* dy = tensor_dy->mutable_data(ctx.GetPlace()); + for (auto j = 0; j < N / B; ++j) { + auto const ss = dz[j]; + for (auto i = 0; i < B; i++) *dy++ = *x++ * ss; } } #endif @@ -266,21 +256,20 @@ class DotKernel : public framework::OpKernel { out.device(dev) = (x * y).sum(Eigen::DSizes(1)); } #else - const auto* data_x = tensor_x->data(); - const auto* data_y = tensor_y->data(); - auto* data_out = tensor_out->data(); - - auto x_dims = tensor_x->dims(); - auto step = x_dims[x_dims.size() - 1]; - int size = static_cast(framework::product(x_dims)); - - for (int ind = -1, j = 0; j < size; ++j) { - if (j % step == 0) { - ++ind; - data_out[ind] = data_x[j] * data_y[j]; - } else { - data_out[ind] += data_x[j] * data_y[j]; - } + auto const *x = tensor_x->data(), *x_ = &x[0]; + auto const *y = tensor_y->data(), *y_ = &y[0]; + auto* z = tensor_out->data(); + + // Loop over the total N elements of both operands while sum-reducing every + // B pairs along the way where B is the dimension of the least ordered axis + auto&& d = tensor_x->dims(); + auto const N = tensor_x->numel(); + auto const B = d[d.size() - 1]; + + for (int j = 0; j < N / B; j++) { + T ss = 0; + for (int i = 0; i < B; i++) ss += (*x_++) * (*y_++); + z[j] = ss; } #endif }