Merge remote-tracking branch 'baidu/develop' into feature/sppnet

3553576e · qijun · e2c07135 · 65af9f94 · 3553576e · 3553576e
24 changed file
--- a/.travis.yml
+++ b/.travis.yml
@@ -38,10 +38,21 @@ addons:
      - curl
      - lcov
      - graphviz
+      - swig
 before_install:
+  - |
+    if [ ${JOB} == "BUILD_AND_TEST" ]; then
+      if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then
+        TRAVIS_COMMIT_RANGE="FETCH_HEAD...$TRAVIS_BRANCH"
+      fi
+      git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)' || {
+        echo "Only markdown docs were updated, stopping build process."
+        exit
+      }
+    fi
  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
-  - pip install wheel protobuf sphinx breathe recommonmark
+  - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy
 script:
  - paddle/scripts/travis/main.sh
 notifications:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,11 +95,26 @@ if(NOT WITH_GPU)
    add_definitions(-DHPPL_STUB_FUNC)
    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
+    if(${CUDA_VERSION_MAJOR} GREATER 6)
+        if(COMPILER_SUPPORT_CXX11)
+            LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
+        endif()
+    endif()
+
    # TODO(yuyang18): Change it to remove std=c++11 in cuda compile.
    set(CUDA_PROPAGATE_HOST_FLAGS OFF)
    if(NOT CUDNN_FOUND)
        message(FATAL_ERROR "Paddle need cudnn to compile")
    endif()
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -O3 --use_fast_math")
+
+    if(WITH_AVX)
+        if(AVX_FOUND)
+            set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler -mavx")
+        endif(AVX_FOUND)
+    else(WITH_AVX)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler -msse3")
+    endif(WITH_AVX)

    if(WITH_DSO)
        set(CUDA_LIBRARIES "")

--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ developed by Baidu scientists and engineers for the purpose of applying deep
 learning to many products at Baidu.

 Our vision is to enable deep learning for everyone via PaddlePaddle.
-Please refer to our [release log](https://github.com/baidu/Paddle/releases) to track the latest feature of PaddlePaddle. 
+Please refer to our [release announcement](https://github.com/baidu/Paddle/releases) to track the latest features of PaddlePaddle.

 ## Features


--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -188,14 +188,6 @@ macro(add_simple_unittest TARGET_NAME)
    add_unittest(${TARGET_NAME} ${TARGET_NAME}.cpp)
 endmacro()

-macro(add_paddle_culib TARGET_NAME)
-    set(NVCC_FLAG ${CUDA_NVCC_FLAGS})
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--use_fast_math)
-    cuda_add_library(${TARGET_NAME} STATIC ${ARGN})
-    set(CUDA_NVCC_FLAGS ${NVCC_FLAG})
-endmacro()
-
-
 # Creates C resources file from files in given resource file
 function(create_resources res_file output)
    # Create empty output file

--- a/demo/quick_start/preprocess.sh
+++ b/demo/quick_start/preprocess.sh
@@ -21,14 +21,21 @@
 set -e

 export LC_ALL=C
+UNAME_STR=`uname`
+
+if [[ ${UNAME_STR} == 'Linux' ]]; then
+  SHUF_PROG='shuf'
+else
+  SHUF_PROG='gshuf'
+fi

 mkdir -p data/tmp
 python preprocess.py -i data/reviews_Electronics_5.json.gz
 # uniq and shuffle
 cd data/tmp
 echo 'uniq and shuffle...'
-cat pos_*|sort|uniq|shuf> pos.shuffed
-cat neg_*|sort|uniq|shuf> neg.shuffed
+cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
+cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed

 min_len=`sed -n '$=' neg.shuffed`
 test_num=$((min_len/10))
@@ -42,8 +49,8 @@ head -n$train_num neg.shuffed >train.neg
 tail -n$test_num pos.shuffed >test.pos
 tail -n$test_num neg.shuffed >test.neg

-cat train.pos train.neg|shuf>../train.txt
-cat test.pos test.neg|shuf>../test.txt
+cat train.pos train.neg | ${SHUF_PROG} >../train.txt
+cat test.pos test.neg | ${SHUF_PROG} >../test.txt

 cd -
 echo 'data/train.txt' > data/train.list

--- a/doc_cn/demo/quick_start/index.md
+++ b/doc_cn/demo/quick_start/index.md
@@ -134,9 +134,8 @@ define_py_data_sources2(train_list='data/train.list',
 * obj="process": 指定生成数据的函数
 * args={"dictionary": word_dict}: 额外的参数，这里指定词典

-更详细用例请参考文档<a href = "../../../doc/ui/data_provider/python_case.html">Python Use Case</a>，
-数据格式和详细文档请参考<a href = "../../../doc/ui/data_provider/pydataprovider2.html">
-PyDataProviderWrapper</a>。
+更详细数据格式和用例请参考<a href = "../../ui/data_provider/pydataprovider2.html">
+PyDataProvider2</a>。

 ## 网络结构(Network Architecture)
 本节我们将专注于网络结构的介绍。

--- a/doc_cn/faq/index.rst
+++ b/doc_cn/faq/index.rst
@@ -177,3 +177,40 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字

    pip install --upgrade pip

+8.  python相关的单元测试都过不了
+--------------------------------
+
+如果出现以下python相关的单元测试都过不了的情况：
+
+..  code-block:: bash
+
+    24 - test_PyDataProvider (Failed)
+    26 - test_RecurrentGradientMachine (Failed)
+    27 - test_NetworkCompare (Failed)
+    28 - test_PyDataProvider2 (Failed)
+    32 - test_Prediction (Failed)
+    33 - test_Compare (Failed)
+    34 - test_Trainer (Failed)
+    35 - test_TrainerOnePass (Failed)
+    36 - test_CompareTwoNets (Failed)
+    37 - test_CompareTwoOpts (Failed)
+    38 - test_CompareSparse (Failed)
+    39 - test_recurrent_machine_generation (Failed)
+    40 - test_PyDataProviderWrapper (Failed)
+    41 - test_config_parser (Failed)
+    42 - test_swig_api (Failed)
+    43 - layers_test (Failed)
+    
+并且查询PaddlePaddle单元测试的日志，提示：
+
+..  code-block:: bash
+    
+    paddle package is already in your PYTHONPATH. But unittest need a clean environment.
+    Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
+    
+解决办法是：卸载paddle包 :code:`pip uninstall paddle`。
+
+原因是：单元测试使用了一个旧版本的python包，而没有测试到代码中实际修改的python包。即单元测试需要一个干净的环境：
+
+* 如果paddle包已经在python的site-packages里面了，那么单元测试时使用的paddle包，就是site-packages里面的python包，而不是源码目录里 :code:`/python` 目录下的python包。
+* 即便设置了 :code:`PYTHONPATH` 到 :code:`/python` 也没用，因为python的搜索路径是优先已经安装的python包。
\ No newline at end of file
--- a/paddle/api/paddle_api_config.py.in
+++ b/paddle/api/paddle_api_config.py.in
 PADDLE_BUILD_DIR="@CMAKE_CURRENT_BINARY_DIR@/../"
 WITH_GPU="@WITH_GPU@"
 PROTOBUF_LIB="@PROTOBUF_LIBRARY@"
+ZLIB_LIB="@ZLIB_LIBRARIES@"
 CMAKE_THREAD_LIB="@CMAKE_THREAD_LIBS_INIT@"
 CMAKE_DL_LIBS="@CMAKE_DL_LIBS@"

@@ -15,3 +16,4 @@ GFLAGS_LOCATION="@GFLAGS_LOCATION@"
 CBLAS_LIBRARIES="@CBLAS_LIBS@"

 CUDA_LIBRARIES="@CUDA_LIBRARIES@"
+WITH_COVERALLS="@ON_COVERALLS@"
--- a/paddle/api/paddle_ld_flags.py
+++ b/paddle/api/paddle_ld_flags.py
@@ -38,6 +38,7 @@ try:
            self.paddle_build_dir = os.path.abspath(self.paddle_build_dir)
            self.with_gpu = PaddleLDFlag.cmake_bool(WITH_GPU)
            self.protolib = PROTOBUF_LIB
+            self.zlib = ZLIB_LIB
            self.thread = CMAKE_THREAD_LIB
            self.dl_libs = CMAKE_DL_LIBS
            self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON)
@@ -47,6 +48,7 @@ try:
            self.glog_libs = LIBGLOG_LIBRARY

            self.with_gflags = PaddleLDFlag.cmake_bool(WITH_GFLAGS)
+            self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS)
            self.gflags_libs = GFLAGS_LIBRARIES
            self.gflags_location = GFLAGS_LOCATION
            self.cblas_libs = CBLAS_LIBRARIES
@@ -82,6 +84,7 @@ try:
                "-lpaddle_cuda",
                "-lpaddle_api",
                self.normalize_flag(self.protolib),
+                self.normalize_flag(self.zlib),
                self.normalize_flag(self.thread),
                self.normalize_flag(self.dl_libs),
                self.normalize_flag(self.cblas_libs),
@@ -95,6 +98,8 @@ try:
                libs.append(self.normalize_flag(self.gflags_libs))
            if self.with_gpu:
                libs.append(self.normalize_flag(self.curt))
+            if self.with_coverage:
+                libs.append("-fprofile-arcs")
            return " ".join(filter(lambda l: len(l) != 0, libs))

        def normalize_flag(self, cmake_flag):
@@ -131,8 +136,14 @@ try:
                return False
            else:
                return True
-
+        def c_flag(self):
+            if self.with_coverage:
+                return ["-fprofile-arcs", "-ftest-coverage", "-O0", "-g"]
+            else:
+                return None
 except ImportError:
    class PaddleLDFlag(object):
        def ldflag_str(self):
            pass
+        def c_flag(self):
+            pass
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -209,6 +209,15 @@ typedef struct {
 #define HL_FLOAT_MIN        2.2250738585072014e-308
 #endif

+
+/**
+ * The maximum input value passed to exp, used to avoid floating-point overflow.
+ *
+ * Currently only used for tanh function.
+ */
+#define EXP_MAX_INPUT       40.0
+
+
 /**
 * @brief DIVUP(x, y) is similar to ceil(x / y).
 * @note  For CUDA, DIVUP will be used to specify

--- a/paddle/cuda/src/hl_avx_functions.cc
+++ b/paddle/cuda/src/hl_avx_functions.cc
@@ -38,7 +38,9 @@ namespace hppl {
  }

  __m256 tanh(const __m256 a) {
+    __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
    __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
+    tmp = _mm256_min_ps(tmp, max);
    tmp = exp(tmp);
    return _mm256_sub_ps(
        _mm256_div_ps(_mm256_set1_ps(2.0f),

--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
@@ -30,7 +30,9 @@ namespace hppl {
  }

  real tanh(const real a) {
-    return (2.0 / (1.0 + exp(-2.0*a))) - 1.0;
+    real tmp = -2.0 * a;
+    tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+    return (2.0 / (1.0 + exp(tmp))) - 1.0;
  }

  real linear(const real a) {

--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -50,7 +50,7 @@ if(NOT WITH_PYTHON)
 endif()

 if(WITH_GPU)
-    add_paddle_culib(paddle_gserver ${GSERVER_SOURCES})
+    cuda_add_library(paddle_gserver ${GSERVER_SOURCES})
 else()
    add_library(paddle_gserver STATIC
        ${GSERVER_SOURCES})

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1021,7 +1021,7 @@ TEST(Layer, LstmLayer) {
  TestConfig config;
  config.layerConfig.set_type("lstmemory");
  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_type("tanh");
  config.layerConfig.set_active_state_type("sigmoid");
  config.layerConfig.set_active_gate_type("sigmoid");
  config.biasSize = 28;

--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -369,7 +369,7 @@ TEST(Layer, LstmLayer) {
  LayerConfig layerConfig;
  layerConfig.set_type("lstmemory");
  layerConfig.set_active_type("relu");
-  layerConfig.set_active_state_type("sigmoid");
+  layerConfig.set_active_state_type("tanh");
  layerConfig.set_active_gate_type("sigmoid");

  layerConfig.add_inputs();

--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -625,7 +625,10 @@ void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
  applyBinary(binary::SquareDerivative<T>(), b);
 }

-DEFINE_MATRIX_BINARY_OP(Tanh, b = 2.0 / (1.0 + exp(-2 * a)) - 1.0);
+DEFINE_MATRIX_BINARY_OP(Tanh,
+    T tmp = -2.0 * a;
+    tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+    b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template<>
 void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
  applyBinary(binary::Tanh<real>(), b);

--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -23,7 +23,7 @@ if(NOT WITH_GPU)
    add_library(paddle_math STATIC
        ${MATH_SOURCES})
 else()
-    add_paddle_culib(paddle_math ${MATH_SOURCES})
+    cuda_add_library(paddle_math ${MATH_SOURCES})
 endif()



--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -200,7 +200,10 @@ void vLog1p(const int n, const T* a, T* r) {
    binary::vLog1p<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }

-DEFINE_MATRIX_BINARY_OP(vTanh, b = 2.0 / (1.0 + std::exp(-2 * a)) - 1.0);
+DEFINE_MATRIX_BINARY_OP(vTanh,
+    T tmp = -2.0 * a;
+    tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+    b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template<class T>
 void vTanh(const int n, const T* a, T* r) {
  hl_cpu_apply_binary_op<T, binary::vTanh<T>, 0, 0>(

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -3468,9 +3468,7 @@ void CpuMatrix::tanh(Matrix& output) {
  size_t dim = getWidth();
  CHECK_EQ(output.getHeight(), numSamples);
  CHECK_EQ(output.getWidth(), dim);
-  errno = 0;
  vTanh(numSamples * dim, getData(), output.getData());
-  CHECK_EQ(errno, 0) << "vTanh error";
 }

 void CpuMatrix::tanhDerivative(Matrix& output) {
@@ -3492,10 +3490,8 @@ void CpuMatrix::softrelu(Matrix& output) {
      out[j] = x;
    }
  }
-  errno = 0;
  vExp(numSamples * dim, output.getData(), output.getData());
  vLog1p(numSamples * dim, output.getData(), output.getData());
-  CHECK_EQ(errno, 0) << "vExp+vLog1p error";
 }

 void CpuMatrix::softreluDerivative(Matrix& output) {
@@ -3510,9 +3506,7 @@ void CpuMatrix::softreluDerivative(Matrix& output) {
  MatrixPtr tmpMat = Matrix::create(numSamples, dim);
  real* tmp = tmpMat->getData();

-  errno = 0;
  vExp(size, output.getData(), tmpMat->getData());
-  CHECK_EQ(errno, 0) << "vExp error";

  for (size_t i = 0; i < size; ++i) {
    grad[i] *= (1.0 - 1.0 / tmp[i]);
@@ -3535,10 +3529,7 @@ void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
    out[i] = p2 * in[i];
  }

-  // out = tanh(out)
-  errno = 0;
  vTanh(numSamples * dim, out, out);
-  CHECK_EQ(errno, 0) << "vTanh error";

  // out = p1 * out
  for (size_t i = 0; i < numSamples * dim; ++i) {

--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -13,3 +13,4 @@ add_simple_unittest(test_sparseMatrixCompare)
 add_simple_unittest(test_perturbation)
 add_simple_unittest(test_CpuGpuVector)
 add_simple_unittest(test_Allocator)
+add_simple_unittest(test_FPException)
--- a/paddle/math/tests/test_FPException.cpp
+++ b/paddle/math/tests/test_FPException.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+/**
+ * This test is about floating point calculation exception.
+ * Paddle catches FE_INVALID, FE_DIVBYZERO and FE_OVERFLOW exceptions.
+ *
+ * Some exceptions occur in the middle of a set of formulas, 
+ * that can be circumvented by some tricks.
+ * For example, 
+ * calculate tanh
+ *   b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
+ *
+ * If the result of (-2 * a) is too large,
+ * an FE_OVERFLOW exception is raised when calculating exp.
+ * The result of tanh itself never overflows, however,
+ * so we can clamp the argument of exp to keep it from overflowing.
+ *
+ */
+#include <fenv.h>
+#include <gtest/gtest.h>
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/Excepts.h"
+
+using namespace paddle;     // NOLINT
+
+/**
+ * Overwrite one randomly chosen element in every row of `matrix` with
+ * `value`.
+ *
+ * A CpuMatrix is written through the data pointer directly, a GpuMatrix
+ * through hl_memcpy. Any other dynamic type aborts via LOG(FATAL).
+ */
+void SetTensorValue(Matrix& matrix, real value) {
+  const int rows = matrix.getHeight();
+  const int cols = matrix.getWidth();
+  const int rowStride = matrix.getStride();
+  real* base = matrix.getData();
+  for (int row = 0; row < rows; ++row) {
+    // Pick the column first so the rand() call order matches row order.
+    const int col = rand() % cols;  // NOLINT
+    const int offset = row * rowStride + col;
+    if (typeid(matrix) == typeid(CpuMatrix)) {
+      base[offset] = value;
+      continue;
+    }
+    if (typeid(matrix) == typeid(GpuMatrix)) {
+      hl_memcpy(&base[offset], &value, sizeof(real));
+      continue;
+    }
+    LOG(FATAL) << "should not reach here";
+  }
+}
+
+/**
+ * Run tanh on a 10x10 matrix of type MatrixType in which one element of
+ * every row has been replaced by `illegal`.
+ *
+ * The function makes no assertions of its own; it only exercises
+ * Matrix::tanh on the planted values.
+ */
+template<typename MatrixType>
+void testTanh(real illegal) {
+  MatrixPtr input = std::make_shared<MatrixType>(10, 10);
+  MatrixPtr output = std::make_shared<MatrixType>(10, 10);
+  input->randomizeUniform();
+  output->randomizeUniform();
+
+  // Plant the extreme value after randomizing so it is not overwritten.
+  SetTensorValue(*input, illegal);
+
+  input->tanh(*output);
+}
+
+/**
+ * Run sigmoid on a 10x10 matrix of type MatrixType in which one element of
+ * every row has been replaced by `illegal`.
+ *
+ * The function makes no assertions of its own; it only exercises
+ * Matrix::sigmoid on the planted values.
+ */
+template<typename MatrixType>
+void testSigmoid(real illegal) {
+  MatrixPtr input = std::make_shared<MatrixType>(10, 10);
+  MatrixPtr output = std::make_shared<MatrixType>(10, 10);
+  input->randomizeUniform();
+  output->randomizeUniform();
+
+  // Plant the extreme value after randomizing so it is not overwritten.
+  SetTensorValue(*input, illegal);
+
+  input->sigmoid(*output);
+}
+
+// Runs the CPU tanh and sigmoid on large-magnitude inputs of both signs.
+// main() enables floating-point exception trapping before the tests run.
+TEST(fp, overflow) {
+  const double kIllegalValues[] = {-90.0, 90.0};
+  for (const double illegal : kIllegalValues) {
+    LOG(INFO) << " illegal=" << illegal;
+    testTanh<CpuMatrix>(illegal);
+    testSigmoid<CpuMatrix>(illegal);
+  }
+}
+
+// Test entry point: initializes gtest and Paddle, enables the
+// floating-point exceptions named below, then runs all registered tests.
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+
+  // Enabled only after initialization, so set-up code is unaffected.
+  const int kTrappedExceptions = FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW;
+  feenableexcept(kTrappedExceptions);
+
+  const int status = RUN_ALL_TESTS();
+  return status;
+}
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -3,6 +3,8 @@ source ./common.sh
 CMAKE_EXTRA=""
 if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
  CMAKE_EXTRA="-DPYTHON_LIBRARY=/usr/local/Cellar/python/2.7.12_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib"
+else
+  CMAKE_EXTRA="-DWITH_SWIG_PY=ON"
 fi



--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -31,8 +31,8 @@ is_lin = (system == 'linux')
 #   because generate paddle LDFLAGS is too complicated to do in setup.py
 #   it just read COMAKE generated LDFLAGS.
 extra_links = []
-ldflags = api.paddle_ld_flags.PaddleLDFlag()
-ldflags = ldflags.ldflag_str()
+obj = api.paddle_ld_flags.PaddleLDFlag()
+ldflags = obj.ldflag_str()
 if ldflags is not None:
  extra_links.extend(ldflags.split(" "))

@@ -51,13 +51,20 @@ elif is_osx == True:

 include_dirs = [np.get_include(), "../"]    # include numpy and paddle.

+extra_c = obj.c_flag()
+
+attr=dict()
+if extra_c is not None:
+  attr["extra_compile_args"] = extra_c
+
 setup(name="py_paddle",
  version="@PADDLE_VERSION@",
  ext_modules=[
    Extension('py_paddle._swig_paddle',      # Build SWIG Extension.
       ['Paddle_wrap.cxx'],
       include_dirs = include_dirs,
-       extra_link_args = extra_links
+       extra_link_args = extra_links,
+       **attr
    )
  ],
  packages=['py_paddle'],

--- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
@@ -13,5 +13,5 @@ for file in $files
 do
    base_protostr=$protostr/$file
    new_protostr=$protostr/$file.unitest
-    diff $base_protostr $new_protostr
+    diff $base_protostr $new_protostr -u
 done