diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
index 71a69bd0d01f4f6fcd579a408008ad4e00b5fd4d..a9b5b916a1f0d22ff46dc6795053f44e3e3af09e 100644
--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
@@ -145,7 +145,7 @@ void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image,
   real *expInData = expandInput_->getData();
   for (int g = 0; g < groups_[inIdx]; ++g) {
     MatrixPtr A =
-        Matrix::create(wgtData, subK, subM, true, useGpu_);  // mark transpose
+        Matrix::create(wgtData, subM, subK, false, useGpu_);
     MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_);
     MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_);
     C->mul(A, B, 1, 1);
@@ -182,7 +182,7 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out,
       // create temporary matrix
       MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_);
       MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_);
-      MatrixPtr A = Matrix::create(wgtData, subK, subM, false, useGpu_);
+      MatrixPtr A = Matrix::create(wgtData, subM, subK, true, useGpu_);
       C->mul(A, B);  // mul
 
       // clear the temporary matrix
@@ -247,10 +247,10 @@ void ExpandConvBaseLayer::bpropWeights(MatrixPtr image,
 
     // expand-mul one-group by one
     for (int g = 0; g < groups_[inpIdx]; g++) {
-      MatrixPtr A = Matrix::create(expandInData, subK, subN, false, useGpu_);
-      MatrixPtr B = Matrix::create(gradData, subM, subN, true, useGpu_);
-      MatrixPtr C = Matrix::create(wGradData, subK, subM, false, useGpu_);
-      C->mul(A, B, 1, 1);
+      MatrixPtr A = Matrix::create(expandInData, subK, subN, true, useGpu_);
+      MatrixPtr B = Matrix::create(gradData, subM, subN, false, useGpu_);
+      MatrixPtr C = Matrix::create(wGradData, subM, subK, false, useGpu_);
+      C->mul(B, A, 1, 1);
 
       A->clear();
       B->clear();
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
index f1442ca7b832ad449a910208d9f27257bbe7ffaf..795641143e31c51b5bd91bd0029d85ccd4c29d94 100644
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -86,13 +86,14 @@ MatrixPtr doOneConvTest(size_t imgSize, size_t output_x, size_t stride,
   initTestLayer(config, &layerMap, &parameters, &convLayer);
   convLayer->getBiasParameter()->zeroMem();
   convLayer->getParameters()[0]->zeroMem();
-  convLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->copyFrom(param, 18);
+  convLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->copyFrom(param,
+      channel * filter_size * filter_size * config.layerConfig.num_filters());
   convLayer->forward(PASS_GC);
 
   return convLayer->getOutputValue();
 }
 
-TEST(Layer, convTransLayerFwd2) {
+TEST(Layer, convParaUnified) {
   MatrixPtr input, resultCpu, resultGpu;
   input = Matrix::create(1, 4 * 4, false, false);
   float inputData[] = {1, 2, 3, 4,
@@ -122,6 +123,38 @@ TEST(Layer, convTransLayerFwd2) {
                             /*numfilters*/ 2,
                             input, param, true);
   checkMatrixEqual(resultCpu, resultGpu);
+
+  input = Matrix::create(1, 3 * 3 * 2, false, false);
+  float inputData2[] = {1, 2, 3,
+                        4, 5, 6,
+                        7, 8, 9,
+
+                        10, 11, 12,
+                        13, 14, 15,
+                        16, 17, 18};
+  float param2[] = {1, 2, 3, 4, 5, 6, 7, 8,
+                    8, 7, 6, 5, 4, 3, 2, 1};
+
+  input->setData(inputData2);
+
+  resultCpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            input, param2, false);
+
+  resultGpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            input, param2, true);
+  checkMatrixEqual(resultCpu, resultGpu);
 }
 
 int main(int argc, char** argv) {
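
Note on the GEMM changes: after this patch each group's weight buffer is read as a subM x subK row-major matrix (roughly, filters-per-group rows by input-patch-size columns) instead of the transposed subK x subM shape, so the CPU expand-conv path and the GPU path can share one parameter layout, which is what the extended test exercises by feeding the same param buffer to both. The forward call C->mul(A, B, 1, 1) is then an accumulating GEMM, output += W * expandedInput. Below is a minimal plain-loop sketch of that forward product under the new layout; the function and buffer names (expandFwdOnceRef, W, X, Y) are hypothetical stand-ins for illustration, not Paddle APIs.

#include <vector>

// Reference loops for C->mul(A, B, 1, 1) with A = weights (subM x subK,
// trans = false), B = expanded input (subK x subN), C = output (subM x subN).
// Names are illustrative; this is not the Paddle implementation.
void expandFwdOnceRef(const std::vector<float> &W,  // subM * subK, row-major
                      const std::vector<float> &X,  // subK * subN, row-major
                      std::vector<float> &Y,        // subM * subN, accumulated
                      int subM, int subK, int subN) {
  for (int m = 0; m < subM; ++m) {
    for (int n = 0; n < subN; ++n) {
      float acc = 0;
      for (int k = 0; k < subK; ++k) {
        acc += W[m * subK + k] * X[k * subN + n];  // row m of W dot col n of X
      }
      Y[m * subN + n] += acc;  // accumulate, matching the beta = 1 GEMM
    }
  }
}

The weight-gradient hunk follows the same convention: with A now marked transposed, C->mul(B, A, 1, 1) accumulates dW += dY * X^T into a subM x subK buffer, i.e. wGradData[m * subK + k] += sum over n of gradData[m * subN + n] * expandInData[k * subN + n], so the gradient lands in the same unified layout as the parameter itself.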