diff --git a/CMakeLists.txt b/CMakeLists.txt
index 40f18f15508daef6055194adfc119107f0a9123c..fcb839851787fe71d23ce2accd58bc3e4693f70f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
 project(paddle CXX C)
 set(PADDLE_MAJOR_VERSION 0)
 set(PADDLE_MINOR_VERSION 9)
-set(PADDLE_PATCH_VERSION 0a0)
+set(PADDLE_PATCH_VERSION 0)
 set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
diff --git a/RELEASE.md b/RELEASE.md
new file mode 100644
index 0000000000000000000000000000000000000000..a8a245ab442ba0fc63d1f1fda932e7590a6fe4ca
--- /dev/null
+++ b/RELEASE.md
@@ -0,0 +1,69 @@
+# Release v0.9.0
+
+## New Features:
+
+* New Layers
+  * bilinear interpolation layer.
+  * spatial pyramid-pool layer.
+  * de-convolution layer.
+  * maxout layer.
+* Support rectangle padding, stride, window and input for Pooling Operation.
+* Add `--job=time` in trainer, which can be used to print timing info without the compile option `-DWITH_TIMER=ON`.
+* Expose cost_weight/nce_layer in `trainer_config_helpers`
+* Add FAQ, concepts, h-rnn docs.
+* Add Bidi-LSTM and DB-LSTM to quick start demo @alvations
+* Add usage track scripts.
+
+## Improvements
+
+* Add Travis-CI for Mac OS X. Enable swig unittest in Travis-CI. Skip Travis-CI when only docs are changed.
+* Add code coverage tools.
+* Refine convolution layer to speed it up and reduce GPU memory usage.
+* Speed up PyDataProvider2
+* Add ubuntu deb package build scripts.
+* Make Paddle use git-flow branching model.
+* PServer supports no parameter blocks.
+
+## Bug Fixes
+
+* Add zlib link to py_paddle
+* Add input sparse data check for sparse layer at runtime
+* Bug fix for sparse matrix multiplication
+* Fix floating-point overflow problem of tanh
+* Fix some nvcc compile options
+* Fix a bug in yield dictionary in DataProvider
+* Fix SRL hang on exit.
+
+# Release v0.8.0beta.1
+New features:
+
+* Mac OSX is supported by source code. #138
+   * Both GPU and CPU versions of PaddlePaddle are supported.
+
+* Support CUDA 8.0
+
+* Enhance `PyDataProvider2`
+   * Add dictionary yield format. `PyDataProvider2` can yield a dictionary whose keys are data_layer names and whose values are the features.
+   * Add `min_pool_size` to control memory pool in provider.
+
+* Add `deb` install package & docker image for no_avx machines.
+   * Especially for cloud computing and virtual machines
+
+* Automatically disable `avx` instructions in cmake when the machine's CPU doesn't support `avx` instructions.
+
+* Add Parallel NN api in trainer_config_helpers.
+
+* Add `travis ci` for Github
+
+Bug fixes:
+
+* Fix several bugs in trainer_config_helpers. Also complete the unit tests for trainer_config_helpers.
+* Check whether PaddlePaddle is installed when running unit tests.
+* Fix bugs in GTX series GPU
+* Fix bug in MultinomialSampler
+
+Also, more documentation has been written since the last release.
+
+# Release v0.8.0beta.0
+
+PaddlePaddle v0.8.0beta.0 release. The install package is not stable yet and it's a pre-release version.
diff --git a/paddle/scripts/docker/Dockerfile.m4 b/paddle/scripts/docker/Dockerfile.m4
index e14493ed9e842351125ab458db53fcc3f38233f6..761aa975d693631556c162dc29ae288ad6bd980b 100644
--- a/paddle/scripts/docker/Dockerfile.m4
+++ b/paddle/scripts/docker/Dockerfile.m4
@@ -1,7 +1,7 @@
 FROM PADDLE_BASE_IMAGE
 MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
 COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
+ENV GIT_CHECKOUT=v0.9.0
 ENV WITH_GPU=PADDLE_WITH_GPU
 ENV IS_DEVEL=PADDLE_IS_DEVEL
 ENV WITH_DEMO=PADDLE_WITH_DEMO
diff --git a/plugin/opencv/CMakeLists.txt b/plugin/opencv/CMakeLists.txt
index 4a253f346a0b67a8cce8b6b663220d7a163e0660..bc0a6e635475bec67cbcf2b08c75ef3e72631a03 100644
--- a/plugin/opencv/CMakeLists.txt
+++ b/plugin/opencv/CMakeLists.txt
@@ -32,8 +32,13 @@ list(APPEND DEJPEG_LINKER_LIBS ${Boost_LIBRARIES})
 file(GLOB DEJPEG_HEADER "${CMAKE_CURRENT_SOURCE_DIR}" "*.h")
 file(GLOB DEJPEG_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}" "*.cpp")
 
-set(CMAKE_CXX_FLAGS "-std=c++11 -O3 -fPIC -Wno-unused-parameter")
+set(BUILD_PRIVATE_FLAGS
+    -Wno-all
+    -Wno-error
+    -Wno-non-virtual-dtor
+    -Wno-delete-non-virtual-dtor)
 
 add_library(DeJpeg SHARED ${DEJPEG_SOURCES})
+target_compile_options(DeJpeg BEFORE PRIVATE ${BUILD_PRIVATE_FLAGS})
 target_link_libraries(DeJpeg ${DEJPEG_LINKER_LIBS})
 set_target_properties(DeJpeg PROPERTIES PREFIX "")
diff --git a/plugin/opencv/DataTransformer.cpp b/plugin/opencv/DataTransformer.cpp
index f4e21db8861bcbeafc25fe7bcaec1b9ada39ef2d..d9e8883443c7db49057f77d4eb4641122266b324 100644
--- a/plugin/opencv/DataTransformer.cpp
+++ b/plugin/opencv/DataTransformer.cpp
@@ -50,7 +50,7 @@ DataTransformer::DataTransformer(int threadNum,
     prefetchFree_.enqueue(prefetch_[i]);
   }
 
-  numThreads_ = 12;
+  numThreads_ = threadNum;
   syncThreadPool_.reset(new SyncThreadPool(numThreads_, false));
 }
 
@@ -154,7 +154,7 @@ void DataTransformer::transform(Mat& cvImgOri, float* target) {
 
 void DataTransformer::start(vector<char*>& data, int* datalen, int* labels) {
   auto job = [&](int tid, int numThreads) {
-    for (int i = tid; i < data.size(); i += numThreads) {
+    for (size_t i = tid; i < data.size(); i += numThreads) {
       DataTypePtr ret = prefetchFree_.dequeue();
       char* buf = data[i];
       int size = datalen[i];
diff --git a/plugin/opencv/DataTransformer.h b/plugin/opencv/DataTransformer.h
index c4f04a58785f0f0c34149caba80d0b2832ab0dd5..52abab928b0514ea37964443c0d17d8efc6d9ad2 100644
--- a/plugin/opencv/DataTransformer.h
+++ b/plugin/opencv/DataTransformer.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include <iostream>
 #include <fstream>
-// #define OPENCV_CAN_BREAK_BINARY_COMPATIBILITY
 #include <opencv2/opencv.hpp>
 #include <vector>
 #include <string>
@@ -22,7 +21,6 @@ limitations under the License. */
 
 #include "paddle/utils/Thread.h"
 
-using namespace std;
 using namespace cv;
 using namespace paddle;
 
@@ -113,7 +111,7 @@ private:
    */
   int Rand(int min, int max);
 
-  typedef pair<float*, int> DataType;
+  typedef std::pair<float*, int> DataType;
   typedef std::shared_ptr<DataType> DataTypePtr;
   std::vector<DataTypePtr> prefetch_;
   std::unique_ptr<SyncThreadPool> syncThreadPool_;
diff --git a/plugin/opencv/PyDecodejpeg.cpp b/plugin/opencv/PyDecodejpeg.cpp
index b004d7cad8051c121a96c36dcdaa71cf0fdadd55..66054302f881bbf6e4b06a26780b70da8f2d806b 100644
--- a/plugin/opencv/PyDecodejpeg.cpp
+++ b/plugin/opencv/PyDecodejpeg.cpp
@@ -19,13 +19,11 @@ limitations under the License. */
 #include <unistd.h>
 #include <glog/logging.h>
 #include <numpy/arrayobject.h>
-
 #include <boost/python.hpp>
 
 #include "DataTransformer.h"
 
 using namespace boost::python;
-using namespace std;
 
 /**
  * DecodeJpeg is an image processing API for interfacing Python and C++
@@ -37,7 +35,7 @@ using namespace std;
 class DecodeJpeg {
 public:
   /**
-   * The constructor will create and nitialize an object of DataTransformer.
+   * The constructor will create and initialize an object of DataTransformer.
    */
   DecodeJpeg(int threadNum,
              int capacity,
@@ -56,11 +54,11 @@ public:
         LOG(FATAL) << "Object is not a numpy array";
       }
       pyTypeCheck(meanValues);
-      int size = PyArray_SIZE(meanValues);
+      int size = PyArray_SIZE(reinterpret_cast<PyArrayObject*>(meanValues));
       isChannelMean = (size == channel) ? true : false;
       isEltMean = (size == channel * cropSizeH * cropSizeW) ? true : false;
       CHECK(isChannelMean != isEltMean);
-      mean = (float*)PyArray_DATA(meanValues);
+      mean = (float*)PyArray_DATA(reinterpret_cast<PyArrayObject*>(meanValues));
     }
     tfhandlerPtr_ = std::make_shared<DataTransformer>(threadNum,
                                                       capacity,
@@ -91,8 +89,9 @@ public:
       char* src = boost::python::extract<char*>(pysrc[t]);
       data.push_back(src);
     }
-    int* dlen = (int*)PyArray_DATA(pydlen);
-    int* dlabels = (int*)PyArray_DATA(pylabel);
+    int* dlen = (int*)PyArray_DATA(reinterpret_cast<PyArrayObject*>(pydlen));
+    int* dlabels =
+        (int*)PyArray_DATA(reinterpret_cast<PyArrayObject*>(pylabel));
     tfhandlerPtr_->start(data, dlen, dlabels);
   }
 
@@ -106,8 +105,8 @@ public:
     pyWritableCheck(pylab);
     pyContinuousCheck(pytrg);
     pyContinuousCheck(pylab);
-    float* data = (float*)PyArray_DATA(pytrg);
-    int* label = (int*)PyArray_DATA(pylab);
+    float* data = (float*)PyArray_DATA(reinterpret_cast<PyArrayObject*>(pytrg));
+    int* label = (int*)PyArray_DATA(reinterpret_cast<PyArrayObject*>(pylab));
     tfhandlerPtr_->obtain(data, label);
   }
 
@@ -121,8 +120,8 @@ private:
   /**
    * @brief Check whether the type of PyObject is valid or not.
    */
-  void pyTypeCheck(const PyObject* o) {
-    int typenum = PyArray_TYPE(o);
+  void pyTypeCheck(PyObject* o) {
+    int typenum = PyArray_TYPE(reinterpret_cast<PyArrayObject*>(o));
 
     // clang-format off
     int type =
@@ -143,13 +142,17 @@ private:
   /**
    * @brief Check whether the PyObject is writable or not.
    */
-  void pyWritableCheck(PyObject* o) { CHECK(PyArray_ISWRITEABLE(o)); }
+  void pyWritableCheck(PyObject* o) {
+    CHECK(PyArray_ISWRITEABLE(reinterpret_cast<PyArrayObject*>(o)));
+  }
 
   /**
    * @brief Check whether the PyObject is c-contiguous or not.
    */
-  void pyContinuousCheck(PyObject* o) { CHECK(PyArray_IS_C_CONTIGUOUS(o)); }
-};
+  void pyContinuousCheck(PyObject* o) {
+    CHECK(PyArray_IS_C_CONTIGUOUS(reinterpret_cast<PyArrayObject*>(o)));
+  }
+};  // DecodeJpeg
 
 /**
  * @brief Initialize the Python interpreter and numpy.