提交 a4e7d9c2 作者：eclipsycn 提交者：GitHub

Merge pull request #415 from smilejames/develop

optimize gemm
...@@ -28,7 +28,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const { ...@@ -28,7 +28,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
Tensor filter = *param.Filter(); Tensor filter = *param.Filter();
Tensor *output = param.Output(); Tensor *output = param.Output();
output->mutable_data<float>(); output->mutable_data<float>();
int groups = param.Groups(); int groups = param.Groups();
std::vector<int> strides = param.Strides(); std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings(); std::vector<int> paddings = param.Paddings();
...@@ -40,7 +39,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const { ...@@ -40,7 +39,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims())); std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims())); std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2; size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim); std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups; col_shape_vec[0] = input->dims()[1] / groups;
...@@ -61,18 +59,13 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const { ...@@ -61,18 +59,13 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
col_matrix.ShareDataWith(col); col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape); col_matrix.Resize(col_matrix_shape);
} }
// DLOG << " col_shape = " << col_shape;
// DLOG << " col_matrix_shape = " << col_matrix_shape;
framework::DDim input_shape = framework::slice_ddim( framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size())); input->dims(), 1, static_cast<int>(input->dims().size()));
// DLOG << " input_shape = " << input_shape;
framework::DDim filter_matrix_shape = {filter.dims()[0], framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]}; filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape); filter.Resize(filter_matrix_shape);
// DLOG << " filter.dims() = " << filter.dims();
framework::DDim output_matrix_shape = { framework::DDim output_matrix_shape = {
output->dims()[1], output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])}; output->numel() / (output->dims()[0] * output->dims()[1])};
...@@ -87,8 +80,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const { ...@@ -87,8 +80,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
// DLOG << " in_batch.dims() = " << in_batch.dims();
// DLOG << " out_batch.dims() = " << out_batch.dims();
for (int g = 0; g < groups; g++) { for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
...@@ -111,13 +102,9 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const { ...@@ -111,13 +102,9 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
// DLOG << " out_slice " << out_slice.dims();
// DLOG << " filter_slice " << filter_slice.dims();
// DLOG << " col_matrix " << col_matrix.dims();
math::matmul<float>(filter_slice, false, col_matrix, false, math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(1), &out_slice,
static_cast<float>(0)); static_cast<float>(0));
auto filter_ptr = filter_slice.data<float>();
} }
} }
} }
......
...@@ -114,10 +114,12 @@ void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb, ...@@ -114,10 +114,12 @@ void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
for (j = 0; j < n - paddingN; j += NR) { for (j = 0; j < n - paddingN; j += NR) {
for (i = 0; i < k; ++i) { for (i = 0; i < k; ++i) {
Bij = &B(i, j); Bij = &B(i, j);
*buffer++ = *Bij; asm volatile(
*buffer++ = *(Bij + 1); "vld1.32 {q0}, [%[Bij]] \n\t"
*buffer++ = *(Bij + 2); "vst1.32 {q0}, [%[buffer]]! \n\t"
*buffer++ = *(Bij + 3); : [buffer] "+r"(buffer)
: [Bij] "r"(Bij)
: "memory", "q0");
} }
} }
if (paddingN != 0) { if (paddingN != 0) {
......
...@@ -20,9 +20,9 @@ limitations under the License. */ ...@@ -20,9 +20,9 @@ limitations under the License. */
#define C(i, j) C[(i)*ldc + (j)] #define C(i, j) C[(i)*ldc + (j)]
// 分块计算的块大小,mc 与 kc 分别对应分块计算时的 m 与 k // 分块计算的块大小,mc 与 kc 分别对应分块计算时的 m 与 k
#define MC 384 #define MC 128
#define KC 384 #define KC 128
#define NC 4096 #define NC 1024
#define MR 4 #define MR 4
#define NR 4 #define NR 4
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请注册