未验证 提交 cc07fff2 编写于 作者: E eclipsycn 提交者: GitHub

Merge pull request #415 from smilejames/develop

optimize gemm
......@@ -28,7 +28,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
......@@ -40,7 +39,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
......@@ -61,18 +59,13 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
// DLOG << " col_shape = " << col_shape;
// DLOG << " col_matrix_shape = " << col_matrix_shape;
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
// DLOG << " input_shape = " << input_shape;
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
// DLOG << " filter.dims() = " << filter.dims();
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
......@@ -87,8 +80,6 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
// DLOG << " in_batch.dims() = " << in_batch.dims();
// DLOG << " out_batch.dims() = " << out_batch.dims();
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
......@@ -111,13 +102,9 @@ void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
// DLOG << " out_slice " << out_slice.dims();
// DLOG << " filter_slice " << filter_slice.dims();
// DLOG << " col_matrix " << col_matrix.dims();
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
auto filter_ptr = filter_slice.data<float>();
}
}
}
......
......@@ -114,10 +114,12 @@ void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
for (j = 0; j < n - paddingN; j += NR) {
for (i = 0; i < k; ++i) {
Bij = &B(i, j);
*buffer++ = *Bij;
*buffer++ = *(Bij + 1);
*buffer++ = *(Bij + 2);
*buffer++ = *(Bij + 3);
asm volatile(
"vld1.32 {q0}, [%[Bij]] \n\t"
"vst1.32 {q0}, [%[buffer]]! \n\t"
: [buffer] "+r"(buffer)
: [Bij] "r"(Bij)
: "memory", "q0");
}
}
if (paddingN != 0) {
......
......@@ -20,9 +20,9 @@ limitations under the License. */
// Accesses C at linear offset i * ldc + j; `ldc` must be in scope at the use
// site.
#define C(i, j) C[(i)*ldc + (j)]
// Block sizes for the blocked computation; MC and KC correspond to the m and k
// dimensions of a block.
// NOTE(review): MC, KC and NC are each defined twice below with different
// values (384/384/4096, then 128/128/1024). This looks like the removed and
// added lines of a diff shown together -- confirm which set is intended and
// keep only that one.
#define MC 384
#define KC 384
#define NC 4096
#define MC 128
#define KC 128
#define NC 1024
// NOTE(review): MR and NR are presumably the row/column sizes of the inner
// tile -- confirm against the kernels that use them.
#define MR 4
#define NR 4
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册