diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 1cb42fbe43d750acb2c9f4911ddec0ffde4815ac..ed8128c33078906f9ba6616660ac4966525096d7 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -89,6 +89,35 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { std::string activation_type = with_relu ? "relu" : ""; desc.SetAttr("activation_type", activation_type); + // This is to add padding for dimension 128 on concern of MKL performance + auto* scope = param_scope(); + auto* weight = scope->FindVar(w->Name())->GetMutable(); + auto place = weight->place(); + bool use_gpu = Get("use_gpu"); + auto weight_data = weight->data(); + auto weight_dims = weight->dims(); + int weight_num = product(weight_dims); + int w_h = weight_dims[0]; + int w_w = weight_dims[1]; + if (!use_gpu) { + if (w_h % 128 == 0 && w_w % 128 == 0) { + float* weight_data_tmp = new float[weight_num]; + for (int i = 0; i < w_h; i++) { + memcpy(weight_data_tmp + i * w_w, weight_data + i * w_w, + w_w * sizeof(float)); + } + weight->Resize(DDim{weight_dims[0] + 4, weight_dims[1] + 4}); + auto weight_data_new = + weight->mutable_data(platform::CPUPlace()); + for (int i = 0; i < w_h; i++) { + memcpy(weight_data_new + i * (w_w + 4), weight_data_tmp + i * w_w, + w_w * sizeof(float)); + } + delete[] weight_data_tmp; + desc.SetAttr("padding_weights", true); + } + } + // For anakin subgraph int8 // When in anakin subgraph int8 mode, the pattern like "fake_quant + mul + // fake_dequant" can be detected by the quant_dequant_fuse_pass. This pass diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index 320d28f131f03ed118614b5f97baa4397db0fcaa..dfae572d4634e43fb288f5cc21bf53efc3834f5e 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -21,6 +21,24 @@ namespace paddle { namespace framework { namespace ir { +void AddVarToScope(Scope* param_scope, const std::string& name, + const DDim& dims) { + auto* tensor = param_scope->Var(name)->GetMutable(); + tensor->Resize(dims); + tensor->mutable_data(platform::CPUPlace()); +} + +Scope* CreateParamScope() { + auto param_scope = new Scope(); + AddVarToScope(param_scope, "conv2d_filters_0", {}); + AddVarToScope(param_scope, "conv2d_bias_0", {}); + AddVarToScope(param_scope, "weights_0", {}); + AddVarToScope(param_scope, "weights_1", {}); + AddVarToScope(param_scope, "bias_1", {}); + AddVarToScope(param_scope, "bias_2", {}); + return param_scope; +} + TEST(FCFusePass, basic) { // inputs operator output // -------------------------------------------------------- @@ -50,6 +68,8 @@ TEST(FCFusePass, basic) { std::unique_ptr graph(new ir::Graph(layers.main_program())); auto pass = PassRegistry::Instance().Get("fc_fuse_pass"); + pass->Set("use_gpu", new bool(true)); + graph->Set("__param_scope__", CreateParamScope()); int num_nodes_before = graph->Nodes().size(); int num_mul_nodes_before = GetNumOpNodes(graph, "mul"); VLOG(3) << DebugString(graph); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 3fa907b418cfc6982ac6eb6c5c7077b32c050676..80b68ec155b7d603b12cffb42dccdaa8d6303a9c 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -147,6 +147,9 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("auto_config_layout", new bool(argument->anakin_auto_config_layout())); } + if (pass_name == "fc_fuse_pass") { + pass->Set("use_gpu", new bool(argument->use_gpu())); + } pre_pass = pass_name; diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc index f679e1221821a3ef32989127e01e6af67240fab8..5035f9b358718c4b3da445f82863c5d66e2dfbe6 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -153,7 +153,6 @@ void profile(bool use_mkldnn = false, bool use_ngraph = false) { if (use_mkldnn) { config.EnableMKLDNN(); - config.pass_builder()->AppendPass("fc_mkldnn_pass"); } if (use_ngraph) { @@ -193,7 +192,6 @@ void compare(bool use_mkldnn = false, bool use_ngraph = false) { SetConfig(&cfg); if (use_mkldnn) { cfg.EnableMKLDNN(); - cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); } if (use_ngraph) { diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 5a3e1bb7fdabc6995e547264bd11f5f8cf0b7c25..46ea4b6bb84c3cdf97c609230139b7dae98c7873 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -32,17 +32,33 @@ class FCOp : public framework::OperatorWithKernel { auto in_dims = ctx->GetInputDim("Input"); auto w_dims = ctx->GetInputDim("W"); + bool padding_weights = ctx->Attrs().Get("padding_weights"); if (ctx->HasInput("Bias")) { auto bias_dims = ctx->GetInputDim("Bias"); + auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1]; if (bias_dims.size() == 2) { PADDLE_ENFORCE_EQ(bias_dims[0], 1, - "The shape of Bias must be [1, dim]."); - PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1], - "The shape of Bias must be [1, dim]."); + platform::errors::InvalidArgument( + "The shape of Bias is invalid." + "The height of Bias should be 1." + "But received height of Bias is %d.", + bias_dims[0])); + PADDLE_ENFORCE_EQ( + bias_dims[1], w_dims1, + platform::errors::InvalidArgument( + "The shape of Bias is invalid." + "The width of Bias should be equal to width of Weight." + "But received width of Bias is %d and width of Weight is %d.", + bias_dims[1], w_dims1)); } else if (bias_dims.size() == 1) { - PADDLE_ENFORCE_EQ(bias_dims[0], w_dims[1], - "The shape of Bias must be [1, dim]."); + PADDLE_ENFORCE_EQ( + bias_dims[0], w_dims1, + platform::errors::InvalidArgument( + "The shape of Bias is invalid." + "The height of Bias should be equal to the width of weight." + "But received height of Bias is %d and width of Weight is %d.", + bias_dims[0], w_dims1)); } } @@ -65,7 +81,8 @@ class FCOp : public framework::OperatorWithKernel { "in_num_col_dims."); std::vector output_dims; - FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims); + FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims, + padding_weights); ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); ctx->ShareLoD("Input", "Out"); @@ -107,6 +124,11 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr( + "padding_weights", + "(bool, default false) When padding weights in the fc fuse pass, " + "the 'padding_weights' attribute is set as true.") + .SetDefault(false); AddAttr(framework::kAllKernelsMustComputeRuntimeShape, "Skip calling InferShape() function in the runtime.") .SetDefault(true); diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index cc133af7315eafaa2fa75d2cbfddb2def736a3f2..54a81812c25707393619f6aae8e4b26ab6b0b5ef 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -27,17 +27,21 @@ using Tensor = framework::Tensor; inline void FCOutputSize(const framework::DDim& in_dims, const framework::DDim& w_dims, std::vector& out_dims, // NOLINT - int in_num_col_dims) { + int in_num_col_dims, bool padding_weights) { auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims); - PADDLE_ENFORCE_EQ( - in_mat_dims[1], w_dims[0], - "Fully Connected input and weigth size do not match. %s, %s"); + auto w_dims0 = padding_weights ? w_dims[0] - 4 : w_dims[0]; + auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1]; + PADDLE_ENFORCE_EQ(in_mat_dims[1], w_dims0, + platform::errors::InvalidArgument( + "Fully Connected input and weigth size do not match. " + "input width: %d,weight height: %d", + in_mat_dims[1], w_dims0)); out_dims.reserve(static_cast(in_num_col_dims + 1)); for (int i = 0; i < in_num_col_dims; ++i) { out_dims.push_back(in_dims[i]); } - out_dims.push_back(w_dims[1]); + out_dims.push_back(w_dims1); } template @@ -53,14 +57,18 @@ class FCOpKernel : public framework::OpKernel { (ctx.Attr("activation_type") == "relu") ? true : false; auto w_dims = w->dims(); + bool padding_weights = ctx.Attr("padding_weights"); std::vector output_dims; - FCOutputSize(input->dims(), w_dims, output_dims, in_num_col_dims); + FCOutputSize(input->dims(), w_dims, output_dims, in_num_col_dims, + padding_weights); output->Resize(framework::make_ddim(output_dims)); output->set_lod(input->lod()); auto out_dims = output->dims(); - int M = framework::product(out_dims) / w_dims[1]; + auto w_dims0 = padding_weights ? w_dims[0] - 4 : w_dims[0]; + auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1]; + int M = framework::product(out_dims) / w_dims1; const T* input_data = input->data(); const T* w_data = w->data(); @@ -68,8 +76,8 @@ class FCOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); math::FCFunctor fc; - fc(dev_ctx, M, w_dims[1], w_dims[0], input_data, w_data, output_data, - bias ? bias->data() : NULL, with_relu); + fc(dev_ctx, M, w_dims1, w_dims0, input_data, w_data, output_data, + bias ? bias->data() : NULL, with_relu, padding_weights); } }; diff --git a/paddle/fluid/operators/math/fc.cc b/paddle/fluid/operators/math/fc.cc index b5479a1b435682384e555c6607a097c9e0c82bd8..38acd7ba94817932ed4ea628a36703dbb8495de2 100644 --- a/paddle/fluid/operators/math/fc.cc +++ b/paddle/fluid/operators/math/fc.cc @@ -25,10 +25,53 @@ class FCFunctor { public: void operator()(const platform::CPUDeviceContext& context, const int M, const int N, const int K, const T* X, const T* W, T* Y, - const T* B = nullptr, bool relu = false) { + const T* B = nullptr, bool relu = false, + bool padding_weights = false) { auto blas = math::GetBlas(context); - blas.MatMul(M, N, K, X, W, Y); + framework::Tensor Y1; + T* Y1_data = nullptr; + if (N % 128 == 0 && K % 128 == 0) { + const int NN = N + 4; + const int KK = K + 4; + framework::Tensor X1; + T* X1_data = X1.Resize({M * KK}).mutable_data(platform::CPUPlace()); + Y1_data = Y1.Resize({M * (N + 4)}).mutable_data(platform::CPUPlace()); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int i = 0; i < M; i++) { + memcpy(X1_data + i * KK, X + i * K, K * sizeof(X[0])); + } + framework::Tensor W1; + T* W1_data = nullptr; + if (!padding_weights) { + W1_data = W1.Resize({(K + 4) * (N + 4)}) + .mutable_data(platform::CPUPlace()); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int i = 0; i < K; i++) { + memcpy(W1_data + i * NN, W + i * N, N * sizeof(W[0])); + } + } + blas.GEMM(false, false, M, N, K, static_cast(1.0), X1_data, KK, + (padding_weights ? W : W1_data), NN, static_cast(0.0), + Y1_data, NN); + } else { + blas.MatMul(M, N, K, X, W, Y); + } if (B == NULL) { + if (N % 128 == 0 && K % 128 == 0) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int i = 0; i < M; i++) { + memcpy(Y + i * N, Y1_data + i * (N + 4), N * sizeof(Y[0])); + } + } + PADDLE_ENFORCE_EQ(relu, false, + platform::errors::PermissionDenied( + "When bias is NULL, relu can not be true.")); return; } if (relu) { @@ -37,7 +80,8 @@ class FCFunctor { .At(N); for (int i = 0; i < M; i++) { T* dst = Y + i * N; - compute(B, dst, dst, N); + T* src = (N % 128 == 0 && K % 128 == 0) ? Y1_data + i * (N + 4) : dst; + compute(B, src, dst, N); } } else { auto compute = @@ -48,7 +92,8 @@ class FCFunctor { #endif for (int i = 0; i < M; i++) { T* dst = Y + i * N; - compute(B, dst, dst, N); + T* src = (N % 128 == 0 && K % 128 == 0) ? Y1_data + i * (N + 4) : dst; + compute(B, src, dst, N); } } } diff --git a/paddle/fluid/operators/math/fc.cu b/paddle/fluid/operators/math/fc.cu index 1b22b81039954bfcf8ea0f6819d778d3fa126cab..82da2dd805aef99d603944ef0a3acc5ab0e64c19 100644 --- a/paddle/fluid/operators/math/fc.cu +++ b/paddle/fluid/operators/math/fc.cu @@ -41,7 +41,12 @@ class FCFunctor { public: void operator()(const platform::CUDADeviceContext& context, const int M, const int N, const int K, const T* X, const T* W, T* Y, - const T* B = nullptr, bool relu = false) { + const T* B = nullptr, bool relu = false, + bool padding_weights = false) { + PADDLE_ENFORCE_EQ( + padding_weights, false, + platform::errors::PermissionDenied( + "Weight padding in fc can not be used in GPU scope.")); auto blas = math::GetBlas(context); blas.GEMM(false, false, M, N, K, static_cast(1.0), X, K, W, N, static_cast(0.0), Y, N); diff --git a/paddle/fluid/operators/math/fc.h b/paddle/fluid/operators/math/fc.h index 9bef496fb9d3977b286338a79f641fde514d8303..02f81587c739f2b47ef70a92f01d083c932deae3 100644 --- a/paddle/fluid/operators/math/fc.h +++ b/paddle/fluid/operators/math/fc.h @@ -26,7 +26,8 @@ class FCFunctor { public: void operator()(const DeviceContext& context, const int M, const int N, const int K, const T* X, const T* W, T* Y, - const T* B = nullptr, bool relu = false); + const T* B = nullptr, bool relu = false, + bool weight_pass = false); }; } // namespace math diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 349dbffb386f84662b0ce8e6668126263d95b88d..dfaf47653fac51c1aa7d2150b80efe0726ef36eb 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -207,8 +207,13 @@ class FCPrimitiveFactory { void RecomputeOutputDims(const ExecutionContext& ctx, const LoDTensor* input, const Tensor* w, LoDTensor* output) { int in_num_col_dims = ctx.Attr("in_num_col_dims"); + bool padding_weights = ctx.Attr("padding_weights"); + PADDLE_ENFORCE_EQ(padding_weights, false, + platform::errors::PermissionDenied( + "Weight padding in fc can not be used in MKLDNN.")); std::vector output_dims; - FCOutputSize(input->dims(), w->dims(), output_dims, in_num_col_dims); + FCOutputSize(input->dims(), w->dims(), output_dims, in_num_col_dims, + padding_weights); output->Resize(framework::make_ddim(output_dims)); output->set_lod(input->lod()); } diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py index 0da0fd0789a770db20dd219273d5c21ec7572e29..9028210b8fe9cc7ee4b6af3598ea0bc79fa2cde0 100644 --- a/python/paddle/fluid/tests/unittests/test_fc_op.py +++ b/python/paddle/fluid/tests/unittests/test_fc_op.py @@ -124,6 +124,13 @@ class TestFCOpWithBias3(TestFCOp): self.matrix = MatrixGenerate(1, 64, 32, 3, 3, 1) +class TestFCOpWithPadding(TestFCOp): + def config(self): + self.with_bias = True + self.with_relu = True + self.matrix = MatrixGenerate(1, 4, 3, 128, 128, 2) + + class TestFCOpError(OpTest): def test_errors(self): with program_guard(Program(), Program()):