Merge branch 'develop' of https://github.com/paddlepaddle/paddle into eigen_warning

5ceb7d12 · Yi Wang · b36f3ae7 · 9eeb8fde · 5ceb7d12 · 5ceb7d12
16 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,8 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -74,8 +74,6 @@ if(WITH_MKLDNN)
        set(OPENMP_FLAGS "-fopenmp")
        set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
        set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
-        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
    else()

--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
 ExternalProject_Add(
    extern_gflags
    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
+    # TODO(yiwang): The annoying warnings mentioned in
+    # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by
+    # gflags.  I filed a PR, https://github.com/gflags/gflags/pull/230,
+    # to fix them.  Until it is accepted by the gflags team, we
+    # temporarily use my personal fork, which contains that fix.  Let's
+    # change this back to the official GitHub repo once my PR is
+    # merged.
+    GIT_REPOSITORY  "https://github.com/wangkuiyi/gflags.git"
    PREFIX          ${GFLAGS_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,14 +22,14 @@ namespace framework {
 template <>
 Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
    platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
+  return *device_context_->get_eigen_device<Eigen::DefaultDevice>();
 }

 #ifndef PADDLE_ONLY_CPU
 template <>
 Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return *device_context_.get_eigen_device<Eigen::GpuDevice>();
+  return *device_context_->get_eigen_device<Eigen::GpuDevice>();
 }
 #endif


--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -252,7 +252,7 @@ struct EigenDeviceConverter<platform::GPUPlace> {
 class ExecutionContext : public OperatorContext {
 public:
  ExecutionContext(const OperatorBase* op, const Scope& scope,
-                   const platform::DeviceContext& device_context)
+                   const platform::DeviceContext* device_context)
      : OperatorContext(op, scope), device_context_(device_context) {}

  template <typename PlaceType,
@@ -260,9 +260,9 @@ class ExecutionContext : public OperatorContext {
                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
  DeviceType& GetEigenDevice() const;

-  platform::Place GetPlace() const { return device_context_.GetPlace(); }
+  platform::Place GetPlace() const { return device_context_->GetPlace(); }

-  const platform::DeviceContext& device_context_;
+  const platform::DeviceContext* device_context_;
 };

 class OpKernel {
@@ -311,7 +311,7 @@ class OperatorWithKernel : public OperatorBase {
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const final {
    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
-    opKernel->Compute(ExecutionContext(this, scope, dev_ctx));
+    opKernel->Compute(ExecutionContext(this, scope, &dev_ctx));
  }

  static std::unordered_map<std::string /* op_type */, OpKernelMap>&

--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -49,9 +49,7 @@ class NNPACKConvFunction : public ConvFunctionBase {
 public:
  void init(const FuncConfig& config) override {
    ConvFunctionBase::init(config);
-    CHECK_EQ(groups_, (size_t)1);
    algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo"));
-    // algorithm_ = nnp_convolution_algorithm_auto;
    transform_strategy_ = nnp_convolution_transform_strategy_compute;
    nnp_status status = nnp_initialize();
    CHECK_EQ(status, nnp_status_success);
@@ -67,8 +65,7 @@ public:
    }
  }

-  virtual void check(const BufferArgs& inputs,
-                     const BufferArgs& outputs) override {
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
    const TensorShape& input = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& output = outputs[0].shape();
@@ -91,8 +88,8 @@ public:
    size_t filterHeight = getFilterHeight(filter);
    size_t filterWidth = getFilterWidth(filter);
    size_t outputChannels = output[1];
-    // size_t outputHeight = output[2];
-    // size_t outputWidth = output[3];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];

    nnp_size inputSize = {.width = inputWidth, .height = inputHeight};
    nnp_padding padding = {.top = (size_t)paddingH(),
@@ -171,20 +168,25 @@ public:
      }
    }

+    size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth;
+    size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
    if (batchSize == 1) {
+      for (size_t g = 0; g < groups_; g++) {
        nnp_status status =
            nnp_convolution_inference(algorithm_,
                                      transform_strategy_,
-                                    inputChannels,
-                                    outputChannels,
+                                      inputChannels / groups_,
+                                      outputChannels / groups_,
                                      inputSize,
                                      padding,
                                      kernelSize,
                                      outputSubsampling,
-                                    inputData,
-                                    filterData,
+                                      inputData + inputOffset * g,
+                                      filterData + filterOffset * g,
                                      nullptr, /* bias */
-                                    outputData,
+                                      outputData + outputOffset * g,
                                      bufferPtr,
                                      sizePtr,
                                      nnp_activation_identity,
@@ -192,21 +194,24 @@ public:
                                      threadpool_, /* threadpool */
                                      nullptr);
        CHECK_EQ(status, nnp_status_success);
+      }
    } else {
+      for (size_t g = 0; g < groups_; g++) {
        // only supports stride = 1
        CHECK_EQ(strideH(), 1);
        CHECK_EQ(strideW(), 1);
-      nnp_status status = nnp_convolution_output(algorithm_,
+        nnp_status status =
+            nnp_convolution_output(algorithm_,
                                   batchSize,
-                                                 inputChannels,
-                                                 outputChannels,
+                                   inputChannels / groups_,
+                                   outputChannels / groups_,
                                   inputSize,
                                   padding,
                                   kernelSize,
-                                                 inputData,
-                                                 filterData,
+                                   inputData + inputOffset * g,
+                                   filterData + filterOffset * g,
                                   nullptr, /* bias */
-                                                 outputData,
+                                   outputData + outputOffset * g,
                                   bufferPtr,
                                   sizePtr,
                                   nnp_activation_identity,
@@ -216,6 +221,7 @@ public:
        CHECK_EQ(status, nnp_status_success);
      }
    }
+  }

  static void create_nnpack_threadpool() {
    if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) {

--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -57,8 +57,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
      convGradFilterType = "GemmConvGradFilter";
    }

-    if (FLAGS_use_nnpack) {
-      CHECK_EQ(isDeconv_, false);
+    if (FLAGS_use_nnpack && !isDeconv_) {
      createFunction(forward_,
                     "NNPACKConv",
                     FuncConfig()

--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -33,23 +33,28 @@ class OpTestMeta(type):

            for place in places:
                for in_name in func.all_input_args:
-                    if hasattr(self, in_name):
+                    if hasattr(self, "inputs") and in_name in self.inputs:
                        kwargs[in_name] = in_name
                        var = scope.new_var(in_name).get_tensor()
-                        arr = getattr(self, in_name)
+                        arr = self.inputs[in_name]
                        var.set_dims(arr.shape)
                        var.set(arr, place)
                    else:
                        kwargs[in_name] = "@EMPTY@"

                for out_name in func.all_output_args:
-                    if hasattr(self, out_name):
+                    if not hasattr(self, "outputs"):
+                        raise ValueError(
+                            "The test op must set self.outputs dict.")
+                    if out_name not in self.outputs:
+                        raise ValueError("The %s is not in self.outputs dict." %
+                                         (out_name))
                    kwargs[out_name] = out_name
                    scope.new_var(out_name).get_tensor()

                for attr_name in func.all_attr_args:
-                    if hasattr(self, attr_name):
-                        kwargs[attr_name] = getattr(self, attr_name)
+                    if hasattr(self, "attrs") and attr_name in self.attrs:
+                        kwargs[attr_name] = self.attrs[attr_name]

                op = func(**kwargs)

@@ -60,7 +65,7 @@ class OpTestMeta(type):

                for out_name in func.all_output_args:
                    actual = numpy.array(scope.find_var(out_name).get_tensor())
-                    expect = getattr(self, out_name)
+                    expect = self.outputs[out_name]
                    numpy.isclose(actual, expect)

        obj.test_all = test_all

--- a/python/paddle/v2/framework/tests/test_add_two_op.py
+++ b/python/paddle/v2/framework/tests/test_add_two_op.py
@@ -12,9 +12,11 @@ class TestAddOp(unittest.TestCase):

    def setUp(self):
        self.type = "add_two"
-        self.X = numpy.random.random((102, 105)).astype("float32")
-        self.Y = numpy.random.random((102, 105)).astype("float32")
-        self.Out = self.X + self.Y
+        self.inputs = {
+            'X': numpy.random.random((102, 105)).astype("float32"),
+            'Y': numpy.random.random((102, 105)).astype("float32")
+        }
+        self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}


 class TestAddGradOp(unittest.TestCase):

--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -7,15 +7,17 @@ class TestSGD(unittest.TestCase):
    __metaclass__ = OpTestMeta

    def setUp(self):
+        # TODO: this unit test currently does not pass
        self.type = "onehot_cross_entropy"
        batch_size = 100
        class_num = 10
-        self.X = numpy.random.random((batch_size, class_num)).astype("float32")
-        self.label = 5 * numpy.ones(batch_size).astype("int32")
+        X = numpy.random.random((batch_size, class_num)).astype("float32")
+        label = 5 * numpy.ones(batch_size).astype("int32")
+        self.inputs = {'X': X, 'label': label}
        Y = []
        for i in range(0, batch_size):
-            Y.append(-numpy.log(self.X[i][self.label[i]]))
-        self.Y = numpy.array(Y).astype("float32")
+            Y.append(-numpy.log(X[i][label[i]]))
+        self.outputs = {'Y': numpy.array(Y).astype("float32")}


 # TODO(superjom) add gradient check

--- a/python/paddle/v2/framework/tests/test_mean_op.py
+++ b/python/paddle/v2/framework/tests/test_mean_op.py
@@ -8,8 +8,8 @@ class TestMeanOp(unittest.TestCase):

    def setUp(self):
        self.type = "mean"
-        self.X = np.random.random((32, 784)).astype("float32")
-        self.Out = np.mean(self.X)
+        self.inputs = {'X': np.random.random((32, 784)).astype("float32")}
+        self.outputs = {'Out': np.mean(self.inputs['X'])}


 if __name__ == '__main__':

--- a/python/paddle/v2/framework/tests/test_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_mul_op.py
@@ -8,9 +8,11 @@ class TestMulOp(unittest.TestCase):

    def setUp(self):
        self.type = "mul"
-        self.X = np.random.random((32, 84)).astype("float32")
-        self.Y = np.random.random((84, 100)).astype("float32")
-        self.Out = np.dot(self.X, self.Y)
+        self.inputs = {
+            'X': np.random.random((32, 84)).astype("float32"),
+            'Y': np.random.random((84, 100)).astype("float32")
+        }
+        self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}


 if __name__ == '__main__':

--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
@@ -8,9 +8,11 @@ class TestRowwiseAddOp(unittest.TestCase):

    def setUp(self):
        self.type = "rowwise_add"
-        self.X = np.random.random((32, 84)).astype("float32")
-        self.b = np.random.random(84).astype("float32")
-        self.Out = np.add(self.X, self.b)
+        self.inputs = {
+            'X': np.random.random((32, 84)).astype("float32"),
+            'b': np.random.random(84).astype("float32")
+        }
+        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])}


 if __name__ == '__main__':

--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
@@ -8,10 +8,13 @@ class TestSGD(unittest.TestCase):

    def setUp(self):
        self.type = "sgd"
-        self.param = numpy.random.random((102, 105)).astype("float32")
-        self.grad = numpy.random.random((102, 105)).astype("float32")
-        self.learning_rate = 0.1
-        self.param_out = self.param - self.learning_rate * self.grad
+        w = numpy.random.random((102, 105)).astype("float32")
+        g = numpy.random.random((102, 105)).astype("float32")
+        lr = 0.1
+
+        self.inputs = {'param': w, 'grad': g}
+        self.attrs = {'learning_rate': lr}
+        self.outputs = {'param_out': w - lr * g}


 if __name__ == "__main__":

--- a/python/paddle/v2/framework/tests/test_sigmoid_op.py
+++ b/python/paddle/v2/framework/tests/test_sigmoid_op.py
@@ -8,8 +8,8 @@ class TestSigmoidOp(unittest.TestCase):

    def setUp(self):
        self.type = "sigmoid"
-        self.X = np.random.random((32, 100)).astype("float32")
-        self.Y = 1 / (1 + np.exp(-self.X))
+        self.inputs = {'X': np.random.random((32, 100)).astype("float32")}
+        self.outputs = {'Y': 1 / (1 + np.exp(-self.inputs['X']))}


 if __name__ == '__main__':

--- a/python/paddle/v2/framework/tests/test_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_op.py
@@ -19,8 +19,10 @@ class TestSoftmaxOp(unittest.TestCase):

    def setUp(self):
        self.type = "softmax"
-        self.X = np.random.random((32, 100)).astype("float32")
-        self.Y = np.apply_along_axis(stable_softmax, 1, self.X)
+        self.inputs = {'X': np.random.random((32, 100)).astype("float32")}
+        self.outputs = {
+            'Y': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
+        }


 class TestSoftmaxGradOp(unittest.TestCase):