Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fixdownloadbug

5b2f9939 · gongweibao · 48fdfd5a · c5dc0b73 · 5b2f9939 · 5b2f9939
44 changed file
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -7,8 +7,17 @@ INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/eigen3)
 ExternalProject_Add(
    eigen3
    ${EXTERNAL_PROJECT_LOG_ARGS}
-    URL            "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz"
-    URL_MD5        "1a47e78efe365a97de0c022d127607c3"
+    # for latest version, please get from official website
+    # URL            "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz"
+    # URL_MD5        "1a47e78efe365a97de0c022d127607c3"
+
+    # for no-ssl http support, please get from bazel's mirror
+    # URL           "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz"
+    # URL_MD5       "4645c66075982da6fa0bcf6b20f3e8f7"
+
+    # get from github mirror
+    GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
+    GIT_TAG         "a46d2e7337c4656f00abe54a8115f6d76153a048"
    PREFIX          ${EIGEN_SOURCE_DIR}
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ""

--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -13,6 +13,10 @@
 # limitations under the License.

 INCLUDE(ExternalProject)
+# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
+FIND_PACKAGE(Protobuf QUIET)
+SET(PROTOBUF_FOUND "OFF")
+

 # Print and set the protobuf library information,
 # finish this cmake process and exit from this file.
@@ -39,12 +43,19 @@ macro(PROMPT_PROTOBUF_LIB)
    ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL)
    SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY})

-    ADD_LIBRARY(protoc ${protobuf_LIBTYPE} IMPORTED GLOBAL)
-    SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY})
+    ADD_LIBRARY(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY})
+
+    ADD_EXECUTABLE(protoc IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE})
+    # CMake's FindProtobuf.cmake module reads `Protobuf_PROTOC_EXECUTABLE`;
+    # set it so that `protobuf_generate_cpp` can locate protoc.
+    SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})

    FOREACH(dep ${protobuf_DEPS})
        ADD_DEPENDENCIES(protobuf ${dep})
        ADD_DEPENDENCIES(protobuf_lite ${dep})
+        ADD_DEPENDENCIES(libprotoc ${dep})
        ADD_DEPENDENCIES(protoc ${dep})
    ENDFOREACH()


--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -87,6 +87,9 @@
 #   go_library(example SHARED)
 #

+# including binary directory for generated headers.
+include_directories(${CMAKE_BINARY_DIR})
+
 if(NOT APPLE)
    find_package(Threads REQUIRED)
    link_libraries(${CMAKE_THREAD_LIBS_INIT})
@@ -331,3 +334,13 @@ function(go_test TARGET_NAME)
  add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS})
  add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
 endfunction(go_test)
+
+# proto_library(<target> SRCS <file.proto>...): generate C++ sources from the .proto files and build them as cc_library <target>.
+function(proto_library TARGET_NAME)
+  # Keyword specs are passed literally; `${options}` was never set here and could leak in from the caller's scope.
+  cmake_parse_arguments(proto_library "" "" "SRCS" ${ARGN})
+  set(proto_srcs)
+  set(proto_hdrs)
+  protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
+  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS protobuf)
+endfunction()
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -27,10 +27,6 @@ sphinx_add_target(paddle_docs
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_EN})

-add_dependencies(paddle_docs
-  gen_proto_py)
-
-
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")

@@ -51,6 +47,3 @@ sphinx_add_target(paddle_docs_cn
                  ${SPHINX_CACHE_DIR_CN}
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_CN})
-
-add_dependencies(paddle_docs_cn
-  gen_proto_py)
--- a/doc/design/scope.md
+++ b/doc/design/scope.md
@@ -41,7 +41,7 @@ class Scope {
  const Variable* GetVariable(const std::string& name) const;

 private:
-    std::unordered_map<std::string, std::unique_ptr<Vairable>> vars_;
+    std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
 };
 ```

@@ -59,9 +59,9 @@ class Scope {
  Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}

  Variable* GetVariable(const std::string& name) const {
-    Variable* var = GetVarLocally(name);
-    if (var != nullptr) {
-      return var;
+    auto it = vars_.find(name);
+    if (it != vars_.end()) {
+      return it->second.get();
    } else if (parent_ != nullptr) {
      return parent_->GetVariable(name);
    } else {
@@ -97,8 +97,8 @@ class Scope {
  // return nullptr if not found.
  Variable* GetVariable(const std::string& name) const;

-  // return Error if already contains same name variable.
-  Error CreateVariable(const std::string& name);
+  // return the existing variable if one with the same name already exists.
+  Variable* CreateVariable(const std::string& name);

 private:
  std::shared_ptr<Scope> parent_;

--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -13,10 +13,13 @@ typedef int paddle_master_client;
 import "C"

 import (
+	"strings"
 	"sync"
+	"time"
 	"unsafe"

 	"github.com/PaddlePaddle/Paddle/go/master"
+	"github.com/coreos/etcd/clientv3"
 	log "github.com/sirupsen/logrus"
 )

@@ -48,16 +51,33 @@ func remove(client C.paddle_master_client) *master.Client {
 	return h
 }

-type addresser string
-
-func (a addresser) Address() string {
-	return string(a)
+//export paddle_new_etcd_master_client
+func paddle_new_etcd_master_client(etcdEndpoints *C.char, timeout int, bufSize int) C.paddle_master_client {
+	p := C.GoString(etcdEndpoints)
+	cli, err := clientv3.New(clientv3.Config{
+		Endpoints:   strings.Split(p, ","),
+		DialTimeout: time.Second * time.Duration(timeout),
+	})
+	if err != nil {
+		panic(err)
+	}
+	ch := make(chan string, 1)
+	a, err := master.GetKey(cli, master.DefaultAddrPath, timeout)
+	if err != nil {
+		panic(err)
+	}
+	ch <- a
+	go master.WatchKey(cli, master.DefaultAddrPath, ch)
+	c := master.NewClient(ch, bufSize)
+	return add(c)
 }

 //export paddle_new_master_client
 func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client {
 	a := C.GoString(addr)
-	c := master.NewClient(addresser(a), bufSize)
+	ch := make(chan string, 1)
+	ch <- a
+	c := master.NewClient(ch, bufSize)
 	return add(c)
 }


--- a/go/master/client.go
+++ b/go/master/client.go
@@ -2,18 +2,12 @@ package master

 import (
 	"os"
-	"time"

 	"github.com/PaddlePaddle/Paddle/go/connection"
 	"github.com/PaddlePaddle/recordio"
 	log "github.com/sirupsen/logrus"
 )

-// Addresser provide the address of the master server.
-type Addresser interface {
-	Address() string
-}
-
 // Client is the client of the master server.
 type Client struct {
 	conn *connection.Conn
@@ -24,11 +18,11 @@ type Client struct {
 //
 // bufSize is the record buffer size. NextRecord will read from this
 // buffer.
-func NewClient(addr Addresser, bufSize int) *Client {
+func NewClient(addrCh <-chan string, bufSize int) *Client {
 	c := &Client{}
 	c.conn = connection.New()
 	c.ch = make(chan []byte, bufSize)
-	go c.monitorMaster(addr)
+	go c.monitorMaster(addrCh)
 	go c.getRecords()
 	return c
 }
@@ -72,12 +66,10 @@ func (c *Client) getRecords() {
 	}
 }

-func (c *Client) monitorMaster(addr Addresser) {
+func (c *Client) monitorMaster(addrCh <-chan string) {
 	lastMaster := ""
-	monitor := func() {
-		// get the lastest address of the master server,
+	for curMaster := range addrCh {
 		// connect to the new address once address changed.
-		curMaster := addr.Address()
 		if curMaster != lastMaster {
 			if curMaster == "" {
 				err := c.conn.Close()
@@ -94,18 +86,10 @@ func (c *Client) monitorMaster(addr Addresser) {
 					// to retry next time.
 					curMaster = lastMaster
 				}
-
 			}
 		}
-
 		lastMaster = curMaster
 	}
-
-	monitor()
-	ticker := time.NewTicker(10 * time.Second)
-	for _ = range ticker.C {
-		monitor()
-	}
 }

 // SetDataset set dataset for the master server to dispatch.

--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
@@ -26,12 +26,6 @@ func init() {
 	log.SetLevel(log.ErrorLevel)
 }

-type TestAddresser string
-
-func (a TestAddresser) Address() string {
-	return string(a)
-}
-
 func TestGetFinishTask(t *testing.T) {
 	const path = "/tmp/master_client_test_0"

@@ -45,7 +39,6 @@ func TestGetFinishTask(t *testing.T) {
 	if err != nil {
 		panic(err)
 	}
-
 	go func(l net.Listener) {
 		s, err := NewService(&InMemStore{}, chunkPerTask, time.Second, 1)
 		if err != nil {
@@ -82,9 +75,11 @@ func TestGetFinishTask(t *testing.T) {
 	// Manually intialize client to avoid calling c.getRecords()
 	c := &Client{}
 	c.conn = connection.New()
-	go c.monitorMaster(TestAddresser(fmt.Sprintf(":%d", p)))
+	addr := fmt.Sprintf(":%d", p)
+	ch := make(chan string, 1)
+	ch <- addr
+	go c.monitorMaster(ch)
 	c.SetDataset([]string{path})
-
 	checkOnePass := func(i int) {
 		var tasks []Task
 		for idx := 0; idx < totalTask; idx++ {

--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@@ -20,7 +20,6 @@ func TestNextRecord(t *testing.T) {
 		path  = "/tmp/master_client_TestFull"
 		total = 50
 	)
-
 	l, err := net.Listen("tcp", ":0")
 	if err != nil {
 		panic(err)
@@ -31,7 +30,6 @@ func TestNextRecord(t *testing.T) {
 	if err != nil {
 		panic(err)
 	}
-
 	go func(l net.Listener) {
 		s, err := master.NewService(&master.InMemStore{}, 10, time.Second, 1)
 		if err != nil {
@@ -63,10 +61,10 @@ func TestNextRecord(t *testing.T) {
 	}
 	w.Close()
 	f.Close()
-
-	c := master.NewClient(master.TestAddresser(fmt.Sprintf(":%d", p)), 10)
+	curAddr := make(chan string, 1)
+	curAddr <- fmt.Sprintf(":%d", p)
+	c := master.NewClient(curAddr, 10)
 	c.SetDataset([]string{path})
-
 	for pass := 0; pass < 50; pass++ {
 		received := make(map[byte]bool)
 		for i := 0; i < total; i++ {

--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
@@ -142,3 +142,31 @@ func (e *EtcdClient) Load() ([]byte, error) {
 	state := kvs[0].Value
 	return state, nil
 }
+
+// GetKey gets the value of the specified key.
+func GetKey(c *clientv3.Client, key string, timeout int) (string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout))
+	resp, err := c.Get(ctx, key)
+	cancel()
+	if err != nil {
+		return "", err
+	}
+	kvs := resp.Kvs
+	if len(kvs) == 0 {
+		return "", nil
+	}
+	v := kvs[0].Value
+	return string(v), nil
+}
+
+// WatchKey watches the specified key and sends the event's value to valChan for every event received.
+func WatchKey(c *clientv3.Client, key string, valChan chan<- string) {
+	rch := c.Watch(context.Background(), key)
+	for wresp := range rch {
+		for _, ev := range wresp.Events {
+			// if received event is DELETE, the value will be an empty string
+			log.Infof("received event %s, %q : %q\n", ev.Type, ev.Kv.Key, ev.Kv.Value)
+			valChan <- string(ev.Kv.Value)
+		}
+	}
+}
--- a/go/pserver/client.go
+++ b/go/pserver/client.go
 package pserver

 import (
+	"errors"
 	"hash/fnv"
 	"sort"
 	"time"
@@ -123,6 +124,9 @@ func (c *Client) FinishInitParams() error {
 // SendGrads sends gradients to parameter servers for updating
 // parameters.
 func (c *Client) SendGrads(grads []Gradient) error {
+	if len(grads) == 0 {
+		return errors.New("no gradient received")
+	}
 	errCh := make(chan error, len(grads))
 	for _, g := range grads {
 		go func(g Gradient) {

--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -16,7 +16,7 @@ set(API_HEADER
    Internal.h)

 add_library(paddle_api STATIC ${API_SOURCES})
-add_dependencies(paddle_api gen_proto_cpp paddle_trainer_lib)
+add_dependencies(paddle_api paddle_proto paddle_trainer_lib)

 INCLUDE(${SWIG_USE_FILE})
 INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)

--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -26,7 +26,7 @@ target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
 add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
  ${CAPI_PRIVATE_HEADER})

-add_dependencies(paddle_capi gen_proto_cpp)
+add_dependencies(paddle_capi paddle_proto)


 # combine all paddle static libraries together, into libpaddle_capi_whole.a

--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -83,7 +83,7 @@ else()
                ${CUDA_CXX_SOURCES})
 endif()

-add_dependencies(paddle_cuda ${external_project_dependencies})
+add_dependencies(paddle_cuda paddle_proto ${external_project_dependencies})

 add_style_check_target(paddle_cuda
                       ${CUDA_SOURCES}

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
+# ddim lib
 cc_library(ddim SRCS ddim.cc)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
-
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-
 cc_test(variable_test SRCS variable_test.cc)
+cc_test(scope_test SRCS scope_test.cc)
+cc_test(enforce_test SRCS enforce_test.cc)
--- a/paddle/framework/enforce.h
+++ b/paddle/framework/enforce.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <paddle/string/printf.h>
+#include <exception>
+#include <sstream>
+
+namespace paddle {
+namespace framework {
+
+/**
+ * @brief Enforce exception. Inherits std::exception
+ *
+ * Any enforce condition that is not met throws an EnforceNotMet exception.
+ */
+class EnforceNotMet : public std::exception {
+ public:
+  EnforceNotMet(const std::string& msg, const char* file, int fileline) {
+    std::ostringstream sout;
+    sout << msg << " at [" << file << ":" << fileline << "];";
+    all_msg_ = sout.str();
+  }
+
+  const char* what() const noexcept override { return all_msg_.c_str(); }
+
+ private:
+  std::string all_msg_;
+};
+
+// From https://stackoverflow.com/questions/30130930/
+// __builtin_expect is a GCC/Clang compiler builtin, not part of the C++
+// standard. Since an enforced condition should hold in most situations,
+// marking the failure branch with `UNLIKELY` lets the compiler generate
+// faster code for the common path.
+#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
+
+/**
+ * @brief Throw a EnforceNotMet exception, automatically filled __FILE__ &
+ * __LINE__
+ *
+ * This macro take __VA_ARGS__, user can pass any type if that type can
+ * serialize to std::ostream
+ */
+#define PADDLE_THROW(...)                                            \
+  do {                                                               \
+    throw ::paddle::framework::EnforceNotMet(                        \
+        ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
+  } while (0)
+
+/**
+ * @brief Enforce a condition, otherwise throw an EnforceNotMet
+ */
+#define PADDLE_ENFORCE(condition, ...) \
+  do {                                 \
+    if (UNLIKELY(!(condition))) {      \
+      PADDLE_THROW(__VA_ARGS__);       \
+    }                                  \
+  } while (0)
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/enforce_test.cc
+++ b/paddle/framework/enforce_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/framework/enforce.h>
+
+TEST(ENFORCE, OK) {
+  PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345);
+  size_t val = 1;
+  const size_t limit = 10;
+  PADDLE_ENFORCE(val < limit, "Enforce is OK too");
+}
+
+// A failed PADDLE_ENFORCE must throw EnforceNotMet whose what() starts with
+// the formatted message.
+TEST(ENFORCE, FAILED) {
+  bool in_catch = false;
+  try {
+    PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123);
+  } catch (const paddle::framework::EnforceNotMet& err) {
+    // Catch by const reference so the exception object is not copied.
+    in_catch = true;
+    const std::string msg = "Enforce is not ok 123 at all";
+    // what() appends " at [file:line];" after the message, so only the
+    // leading msg.length() characters are compared.
+    const std::string what = err.what();
+    ASSERT_EQ(what.substr(0, msg.length()), msg);
+  }
+  ASSERT_TRUE(in_catch);
+}
\ No newline at end of file
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/framework/variable.h"
+
+namespace paddle {
+namespace framework {
+
+/**
+ * @brief Scope that manages all variables.
+ *
+ * Scope is an association of a name to Variable. All variables belong to
+ * Scope. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`.
+ * One net can run in different scopes and update different variables in
+ * each scope.
+ */
+class Scope {
+ public:
+  /**
+   * @brief Initialize a Scope without parent.
+   */
+  Scope() {}
+
+  /**
+   * @brief Initialize a Scope with parent.
+   */
+  explicit Scope(const std::shared_ptr<Scope>& parent) : parent_(parent) {}
+
+  /**
+   * @brief Create Variable
+   *
+   * Return the Variable named `name` if it is already visible from this
+   * Scope (found here or in any ancestor scope); otherwise create a new
+   * Variable owned by this Scope and return it.
+   */
+  Variable* CreateVariable(const std::string& name) {
+    Variable* var = GetVariable(name);
+    if (var == nullptr) {
+      // Build the Variable before touching the map, then return the new
+      // pointer directly instead of looking the name up a second time.
+      std::unique_ptr<Variable> created(new Variable());
+      var = created.get();
+      vars_[name].swap(created);
+    }
+    return var;
+  }
+
+  /**
+   * @brief Get Variable.
+   *
+   * Look the name up in this Scope first, then recursively in its parent
+   * scope. Return nullptr if not found.
+   */
+  Variable* GetVariable(const std::string& name) const {
+    auto it = vars_.find(name);
+    if (it != vars_.end()) {
+      return it->second.get();
+    } else if (parent_ != nullptr) {
+      return parent_->GetVariable(name);
+    } else {
+      return nullptr;
+    }
+  }
+
+  /**
+   * @brief If this scope has a Var named name.
+   *
+   * Find if there is a Variable in this scope or in its parent scope
+   * (searched recursively).
+   */
+  bool HasVariable(const std::string& name) const {
+    return (vars_.find(name) != vars_.end() ||
+            (parent_ && parent_->HasVariable(name)));
+  }
+
+ private:
+  // Variables created in this scope, keyed by name; the map owns them.
+  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+  // Enclosing scope consulted when a name is not found locally; may be null.
+  std::shared_ptr<Scope> parent_{nullptr};
+};
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/scope.h"
+#include "gtest/gtest.h"
+
+TEST(Scope, Create) {
+  using paddle::framework::Scope;
+  using paddle::framework::Variable;
+
+  auto scope = std::make_shared<Scope>();
+
+  Variable* var0 = scope->CreateVariable("");
+  EXPECT_NE(var0, nullptr);
+
+  /// GetVariable will return nullptr if not exist.
+  Variable* var1 = scope->GetVariable("a");
+  EXPECT_EQ(var1, nullptr);
+
+  /// CreateVariable will return one.
+  Variable* var2 = scope->CreateVariable("a");
+  EXPECT_NE(var2, nullptr);
+
+  /// Get the created variable.
+  Variable* var3 = scope->GetVariable("a");
+  EXPECT_EQ(var2, var3);
+
+  /// CreateVariable will just return the variable if it's
+  /// already exist.
+  Variable* var4 = scope->CreateVariable("a");
+  EXPECT_EQ(var4, var2);
+}
+
+TEST(Scope, Parent) {
+  using paddle::framework::Scope;
+  using paddle::framework::Variable;
+
+  auto parent_scope = std::make_shared<Scope>();
+  auto scope = std::make_shared<Scope>(parent_scope);
+
+  Variable* var0 = parent_scope->CreateVariable("a");
+  EXPECT_NE(var0, nullptr);
+
+  /// GetVariable will get Variable from parent scope if exist.
+  Variable* var1 = scope->GetVariable("a");
+  EXPECT_EQ(var0, var1);
+}
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -12,7 +12,7 @@ endif()

 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
 add_dependencies(paddle_function ${external_project_dependencies})
-add_dependencies(paddle_function gen_proto_cpp)
+add_dependencies(paddle_function paddle_proto)

 if(WITH_TESTING)
 if(WITH_GPU)

--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -58,7 +58,7 @@ endif()

 add_style_check_target(paddle_gserver ${GSERVER_SOURCES})
 add_style_check_target(paddle_gserver ${GSERVER_HEADER})
-add_dependencies(paddle_gserver gen_proto_cpp)
+add_dependencies(paddle_gserver paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
    add_subdirectory(tests)
 endif()
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -33,7 +33,7 @@ endif()
 add_style_check_target(paddle_math ${MATH_SOURCES})
 add_style_check_target(paddle_math ${MATH_HEADERS})

-add_dependencies(paddle_math gen_proto_cpp)  # depends
+add_dependencies(paddle_math paddle_proto ${external_project_dependencies})  # depends
 if(WITH_TESTING)
    add_subdirectory(tests)
 endif()
--- a/paddle/optimizer/CMakeLists.txt
+++ b/paddle/optimizer/CMakeLists.txt
@@ -10,7 +10,7 @@ set(OPITMIZER_SRCS
  )

 add_library(paddle_optimizer STATIC ${OPITMIZER_SRCS})
-add_dependencies(paddle_optimizer gen_proto_cpp)
+add_dependencies(paddle_optimizer paddle_proto ${external_project_dependencies})

 if(WITH_TESTING)
  add_simple_unittest(serialization_test)

--- a/paddle/parameter/CMakeLists.txt
+++ b/paddle/parameter/CMakeLists.txt
@@ -7,7 +7,7 @@ add_library(paddle_parameter STATIC
        ${PARAMETERS_SOURCES})
 add_style_check_target(paddle_parameter ${PARAMETERS_SOURCES})
 add_style_check_target(paddle_parameter ${PARAMETERS_HEADERS})
-add_dependencies(paddle_parameter gen_proto_cpp)
+add_dependencies(paddle_parameter paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
    add_subdirectory(tests)
 endif()
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -17,7 +17,7 @@ add_library(paddle_network STATIC
 add_style_check_target(paddle_network ${NETWORK_SOURCES})
 add_style_check_target(paddle_network ${NETWORK_HEADERS})

-add_dependencies(paddle_network gen_proto_cpp)
+add_dependencies(paddle_network paddle_proto ${external_project_dependencies})

 ################### paddle_pserver ######################
 set(PSERVER_SOURCES
@@ -40,7 +40,7 @@ add_library(paddle_pserver STATIC
 add_style_check_target(paddle_pserver ${PSERVER_SOURCES})
 add_style_check_target(paddle_pserver ${PSERVER_HEADERS})

-add_dependencies(paddle_pserver gen_proto_cpp)
+add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies})

 set(PSERVER_MAIN_SOURCES
    ParameterServer2Main.cpp)

--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -144,7 +144,7 @@ class DenseScanner(IScanner):
        if len(self.__shape__) > 1:
            # The last-two dimenstions are the frame height and width.
            # For example, the layout is CHW for 3-D feature of image.
-            # The H and W are the fram height and width.
+            # The H and W are the frame height and width.
            h, w = self.__shape__[-2:]
            argument.setSlotFrameHeight(self.pos, h)
            argument.setSlotFrameWidth(self.pos, w)

--- a/paddle/string/CMakeLists.txt
+++ b/paddle/string/CMakeLists.txt
 cc_library(stringpiece SRCS piece.cc)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
+
+cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
--- a/paddle/string/printf.h
+++ b/paddle/string/printf.h
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+// Compared with std::stringstream, string::Printf has two primary
+// purposes:
+//
+// 1. Type-safe printing, with why and how explained in
+//    http://www.drdobbs.com/stringprintf-a-typesafe-printf-family-fo/184401999.
+//    Implementation includes
+//
+//    https://github.com/c42f/tinyformat
+//    boost::format
+//    std::stringstream
+//
+//    std::stringstream is not convenient enough in many cases.  For example:
+//
+//      std::cout << std::setprecision(2) << std::fixed << 1.23456 << "\n";
+//
+//    boost::format is the most convenient one.  We can have
+//
+//      std::cout << format("%2% %1%") % 36 % 77;
+//
+//    or
+//
+//      format fmter("%2% %1%");
+//      fmter % 36; fmter % 77;
+//      std::cout << fmter.c_str();
+//
+//    But the overloading of % might be overkilling and it would be
+//    more efficient if it can write to std::cout directly.
+//
+//    tinyformat has an interface compatible with the C-printf style,
+//    and it can write to a stream or return a std::string:
+//
+//      std::cout << tfm::format(
+//                  "%s, %s %d, %.2d:%.2d\n",
+//                  weekday, month, day, hour, min);
+//
+//    or
+//
+//      tfm::format(std::cout,
+//                  "%s, %s %d, %.2d:%.2d\n",
+//                  weekday, month, day, hour, min);
+//
+// 2. High-performance -- most printed strings are not too long and
+//    don't need dynamic memory allocation.  Many StringPrintf
+//    implementations don't enforce type safety, but are
+//    high-performance, including
+//
+//    https://developers.google.com/optimization/reference/base/stringprintf/
+//    https://github.com/adobe/chromium/blob/master/base/stringprintf.h
+//    https://github.com/google/protobuf/blob/master/src/google/protobuf/stubs/stringprintf.h
+//
+// According to
+// https://github.com/c42f/tinyformat#compile-time-and-code-bloat,
+// boost::format runs too slow and results in large executable binary
+// files.  So here we port tinyformat.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include "paddle/string/tinyformat/tinyformat.h"  // https://github.com/c42f/tinyformat
+
+namespace paddle {
+namespace string {
+
+// Formats `args` according to the printf-style format string `fmt` and
+// writes the result to `out`, delegating to tinyformat::vformat.
+template <typename... Args>
+void Fprintf(std::ostream& out, const char* fmt, const Args&... args) {
+  tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...));
+}
+
+// Formats `args` according to `fmt` via Fprintf into a std::ostringstream
+// and returns the accumulated text as a std::string.
+template <typename... Args>
+std::string Sprintf(const char* fmt, const Args&... args) {
+  std::ostringstream oss;
+  Fprintf(oss, fmt, args...);
+  return oss.str();
+}
+
+// Formats `args` according to `fmt` via Fprintf, with std::cout as the
+// destination stream.
+template <typename... Args>
+void Printf(const char* fmt, const Args&... args) {
+  Fprintf(std::cout, fmt, args...);
+}
+
+}  // namespace string
+}  // namespace paddle
--- a/paddle/string/printf_test.cc
+++ b/paddle/string/printf_test.cc
+#include "paddle/string/printf.h"
+
+#include <string>
+
+#include "gtest/gtest.h"
+
+TEST(StringPrintf, StringPrintf) {
+  std::string weekday = "Wednesday";
+  const char* month = "July";
+  size_t day = 27;
+  long hour = 14;
+  int min = 44;
+  EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
+            paddle::string::Sprintf(
+                "%s, %s %d, %.2d:%.2d", weekday, month, day, hour, min));
+}
--- a/paddle/string/tinyformat/tinyformat.h
+++ b/paddle/string/tinyformat/tinyformat.h
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -2,7 +2,7 @@

 if(WITH_TESTING)
  add_library(paddle_test_main STATIC TestMain.cpp)
-  add_dependencies(paddle_test_main gen_proto_cpp)
+  add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies})
  add_library(paddle_test_util STATIC TestUtil.cpp)
-  add_dependencies(paddle_test_util gen_proto_cpp)
+  add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
 endif()
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
@@ -41,7 +41,8 @@ add_style_check_target(paddle_trainer_lib
 add_style_check_target(paddle_trainer_lib
    ${TRAINER_HEADERS})
 add_dependencies(paddle_trainer_lib
-    gen_proto_cpp)
+    paddle_proto
+    ${external_project_dependencies})

 macro(add_paddle_exe TARGET_NAME)
  add_executable(${TARGET_NAME} ${ARGN})

--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -17,7 +17,7 @@ add_library(paddle_utils STATIC
 add_style_check_target(paddle_utils ${UTIL_HEADERS})
 add_style_check_target(paddle_utils ${UTIL_SOURCES}
    ${UTIL_ARCH_SOURCES})
-add_dependencies(paddle_utils gen_proto_cpp)
+add_dependencies(paddle_utils paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
    add_subdirectory(tests)
 endif()
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
-set(proto_filenames
-    DataConfig.proto
-    DataFormat.proto
-    ModelConfig.proto
-    ParameterConfig.proto
-    ParameterService.proto
-    TrainerConfig.proto
-    OptimizerConfig.proto
-    ParameterServerConfig.proto)
+file(GLOB proto_filenames . *.proto)
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+proto_library(paddle_proto SRCS ${proto_filenames})

 set(PROTO_GEN)
 set(PROTO_GEN_PY)

 foreach(filename ${proto_filenames})
-    get_filename_component(base_filename ${filename} NAME_WE)
-    set(CUR_PROTO_GEN
-        ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.pb.h
-        ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.pb.cc)
-    set(PROTO_GEN
-        ${PROTO_GEN}
-        ${CUR_PROTO_GEN})
-    add_custom_command(OUTPUT ${CUR_PROTO_GEN}
-        COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} 
-                  --cpp_out ${CMAKE_CURRENT_BINARY_DIR}
-          --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-        DEPENDS ${filename} ${external_project_dependencies})
-
+    get_filename_component(ABS_FIL ${filename} ABSOLUTE)
+    get_filename_component(FIL_WE ${filename} NAME_WE)
+    # The declared OUTPUT must be the file protoc really writes (under the
+    # --python_out directory below); otherwise the rule is always out of date.
    set(CUR_PROTO_GEN_PY
-        ${PROJ_ROOT}/paddle/python/paddle/proto/${base_filename}_pb2.py)
+            ${PROJ_ROOT}/python/paddle/proto/${FIL_WE}_pb2.py)
    set(PROTO_GEN_PY
            ${CUR_PROTO_GEN_PY}
            ${PROTO_GEN_PY})
    add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
-        COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
-    --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-        DEPENDS ${filename} ${external_project_dependencies})
+            COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+            ARGS "--python_out=${PROJ_ROOT}/python/paddle/proto"
+            "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
+            DEPENDS ${ABS_FIL} ${external_project_dependencies})
 endforeach()

-add_custom_target(gen_proto_cpp ALL DEPENDS ${PROTO_GEN})
 add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})
-
-add_library(paddle_proto STATIC ${PROTO_GEN})
-target_include_directories(paddle_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1149,10 +1149,10 @@ def pooling_layer(input,
 @layer_support(DROPOUT)
 def lstmemory(input,
              name=None,
+              size=None,
              reverse=False,
              act=None,
              gate_act=None,
-              size=None,
              state_act=None,
              bias_attr=None,
              param_attr=None,
@@ -1194,6 +1194,8 @@ def lstmemory(input,

    :param name: The lstmemory layer name.
    :type name: basestring
+    :param size: DEPRECATED. size of the lstm cell
+    :type size: int
    :param input: input layer name.
    :type input: LayerOutput
    :param reverse: is sequence process reversed or not.
@@ -1220,15 +1222,15 @@ def lstmemory(input,
    assert state_act.support_hppl
    assert act.support_hppl
    assert input.size is not None and input.size % 4 == 0
+
    if size is not None:
        if input.size / 4 == size:
            plog = logger.warning
        else:
            plog = logger.fatal
-
-        plog("NOTE: The lstmemory layer[%s]'s size is set by previous input "
-             "layer. The lstm size should be equal with input layer size/4. The"
-             " size which is set explicitly will be ignored." % name)
+        plog("size of lstmemory layer: %s is automatically set to "
+             "size of input layer / 4. The parameter size passing to "
+             "this layer is ignored." % (name))

    Layer(
        name=name,
@@ -1255,11 +1257,11 @@ def lstmemory(input,
 @wrap_name_default("gru")
 @layer_support(DROPOUT)
 def grumemory(input,
+              size=None,
              name=None,
              reverse=False,
              act=None,
              gate_act=None,
-              size=None,
              bias_attr=None,
              param_attr=None,
              layer_attr=None):
@@ -1318,6 +1320,8 @@ def grumemory(input,
    :type name: None|basestring
    :param input: input layer.
    :type input: LayerOutput.
+    :param size: DEPRECATED. size of the gru cell
+    :type size: int
    :param reverse: Whether sequence process is reversed or not.
    :type reverse: bool
    :param act: activation type, TanhActivation by default. This activation
@@ -1334,9 +1338,6 @@ def grumemory(input,
    :type param_attr: ParameterAttribute|None|False
    :param layer_attr: Extra Layer attribute
    :type layer_attr: ExtraLayerAttribute|None
-    :param size: Stub parameter of size, but actually not used. If set this size
-                 will get a warning.
-    :type size: None
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -1348,9 +1349,9 @@ def grumemory(input,
            plog = logger.warning
        else:
            plog = logger.fatal
-        plog("NOTE: the gru memory layer's size is set by previous input layer,"
-             " and should be input size / 3. Set size explicitly will be "
-             "ignored.")
+        plog("size of grumemory layer: %s is automatically set to "
+             "size of input layer / 3. The parameter size passing to this "
+             "layer is ignored." % (name))

    Layer(
        name=name,
@@ -2524,8 +2525,8 @@ def img_cmrnorm_layer(input,


 @wrap_bias_attr_default()
-@wrap_param_attr_default(default_factory=lambda _: ParamAttr(initial_mean=1.0,
-                                                             initial_std=0.))
+@wrap_param_attr_default(
+    default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.))
 @wrap_act_default(act=ReluActivation())
 @wrap_name_default("batch_norm")
 @layer_support(DROPOUT)
@@ -3013,25 +3014,25 @@ def lstm_step_layer(input,
                    bias_attr=None,
                    layer_attr=None):
    """
-    LSTM Step Layer. It used in recurrent_group. The lstm equations are shown
-    as follow.
+    LSTM Step Layer. This function is used only in recurrent_group.
+    The lstm equations are shown as follows.

    ..  math::

-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)

-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)

-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)

-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)

        h_t & = o_t tanh(c_t)


    The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
    :code:`mixed_layer` and :code:`full_matrix_projection` to calculate these
-    input vector.
+    input vectors.

    The state of lstm step is :math:`c_{t-1}`. And lstm step layer will do

@@ -3042,14 +3043,14 @@ def lstm_step_layer(input,
        ...


-    This layer contains two outputs. Default output is :math:`h_t`. The other
-    output is :math:`o_t`, which name is 'state' and can use
+    This layer has two outputs. Default output is :math:`h_t`. The other
+    output is :math:`o_t`, whose name is 'state' and can use
    :code:`get_output_layer` to extract this output.

    :param name: Layer's name.
    :type name: basestring
-    :param size: Layer's size. NOTE: lstm layer's size, should be equal as
-                 :code:`input.size/4`, and should be equal as
+    :param size: Layer's size. NOTE: lstm layer's size, should be equal to
+                 :code:`input.size/4`, and should be equal to
                 :code:`state.size`.
    :type size: int
    :param input: input layer. :math:`Wx_t + Wh_{t-1}`

--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -614,6 +614,7 @@ def simple_lstm(input,

 @wrap_name_default('lstm_unit')
 def lstmemory_unit(input,
+                   memory_boot=None,
                   name=None,
                   size=None,
                   param_attr=None,
@@ -626,9 +627,9 @@ def lstmemory_unit(input,
                   lstm_layer_attr=None,
                   get_output_layer_attr=None):
    """
-    Define calculations that a LSTM unit performs in a single time step.
-    This function itself is not a recurrent layer, so that it can not be
-    directly applied to sequence input. This function is always used in
+    Define calculations that a LSTM unit performs during a single time step.
+    This function itself is not a recurrent layer, so it can not be
+    directly used to process sequence inputs. This function is always used in
    recurrent_group (see layers.py for more details) to implement attention
    mechanism.

@@ -638,13 +639,13 @@ def lstmemory_unit(input,

    ..  math::

-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)

-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)

-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)

-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)

        h_t & = o_t tanh(c_t)

@@ -661,6 +662,8 @@ def lstmemory_unit(input,

    :param input: input layer name.
    :type input: LayerOutput
+    :param memory_boot: the initialization state of the LSTM cell.
+    :type memory_boot: LayerOutput | None
    :param name: lstmemory unit name.
    :type name: basestring
    :param size: lstmemory unit size.
@@ -692,7 +695,8 @@ def lstmemory_unit(input,
        assert input.size % 4 == 0
        size = input.size / 4
    out_mem = memory(name=name, size=size)
-    state_mem = memory(name="%s_state" % name, size=size)
+    state_mem = memory(
+        name="%s_state" % name, size=size, boot_layer=memory_boot)

    with mixed_layer(
            name="%s_input_recurrent" % name,
@@ -726,6 +730,7 @@ def lstmemory_unit(input,
 def lstmemory_group(input,
                    size=None,
                    name=None,
+                    memory_boot=None,
                    reverse=False,
                    param_attr=None,
                    act=None,
@@ -737,7 +742,7 @@ def lstmemory_group(input,
                    lstm_layer_attr=None,
                    get_output_layer_attr=None):
    """
-    lstm_group is a recurrent layer group version of Long Short Term Memory. It
+    lstm_group is a recurrent_group version of Long Short Term Memory. It
    does exactly the same calculation as the lstmemory layer (see lstmemory in
    layers.py for the maths) does. A promising benefit is that LSTM memory
    cell states, or hidden states in every time step are accessible to the
@@ -748,8 +753,8 @@ def lstmemory_group(input,

    NOTE: In PaddlePaddle's implementation, the following input-to-hidden
    multiplications:
-    :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
-    :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in lstmemory_unit to
+    :math:`W_{x_i}x_{t}` , :math:`W_{x_f}x_{t}`,
+    :math:`W_{x_c}x_t`, :math:`W_{x_o}x_{t}` are not done in lstmemory_unit to
    speed up the calculations. Consequently, an additional mixed_layer with
    full_matrix_projection must be included before lstmemory_unit is called.

@@ -765,10 +770,12 @@ def lstmemory_group(input,

    :param input: input layer name.
    :type input: LayerOutput
-    :param name: lstmemory group name.
-    :type name: basestring
    :param size: lstmemory group size.
    :type size: int
+    :param name: name of the lstmemory group.
+    :type name: basestring
+    :param memory_boot: the initialization state of LSTM cell.
+    :type memory_boot: LayerOutput | None
    :param reverse: is lstm reversed
    :type reverse: bool
    :param param_attr: Parameter config, None if use default.
@@ -798,6 +805,7 @@ def lstmemory_group(input,
    def __lstm_step__(ipt):
        return lstmemory_unit(
            input=ipt,
+            memory_boot=memory_boot,
            name=name,
            size=size,
            mixed_bias_attr=mixed_bias_attr,
@@ -819,6 +827,7 @@ def lstmemory_group(input,

 @wrap_name_default('gru_unit')
 def gru_unit(input,
+             memory_boot=None,
             size=None,
             name=None,
             gru_bias_attr=None,
@@ -829,8 +838,8 @@ def gru_unit(input,
             naive=False):
    """
    Define calculations that a gated recurrent unit performs in a single time
-    step. This function itself is not a recurrent layer, so that it can not be
-    directly applied to sequence input. This function is almost always used in
+    step. This function itself is not a recurrent layer, so it can not be
+    directly used to process sequence inputs. This function is always used in
    the recurrent_group (see layers.py for more details) to implement attention
    mechanism.

@@ -838,6 +847,8 @@ def gru_unit(input,

    :param input: input layer name.
    :type input: LayerOutput
+    :param memory_boot: the initialization state of the GRU output memory.
+    :type memory_boot: LayerOutput | None
    :param name: name of the gru group.
    :type name: basestring
    :param size: hidden size of the gru.
@@ -856,7 +867,7 @@ def gru_unit(input,
    if size is None:
        size = input.size / 3

-    out_mem = memory(name=name, size=size)
+    out_mem = memory(name=name, size=size, boot_layer=memory_boot)

    if naive:
        __step__ = gru_step_naive_layer
@@ -878,6 +889,7 @@ def gru_unit(input,

 @wrap_name_default('gru_group')
 def gru_group(input,
+              memory_boot=None,
              size=None,
              name=None,
              reverse=False,
@@ -888,7 +900,7 @@ def gru_group(input,
              gru_layer_attr=None,
              naive=False):
    """
-    gru_group is a recurrent layer group version of Gated Recurrent Unit. It
+    gru_group is a recurrent_group version of Gated Recurrent Unit. It
    does exactly the same calculation as the grumemory layer does. A promising
    benefit is that gru hidden states are accessible to the user. This is
    especially useful in attention model. If you do not need to access
@@ -908,6 +920,8 @@ def gru_group(input,

    :param input: input layer name.
    :type input: LayerOutput
+    :param memory_boot: the initialization state of the GRU output memory.
+    :type memory_boot: LayerOutput | None
    :param name: name of the gru group.
    :type name: basestring
    :param size: hidden size of the gru.
@@ -929,6 +943,7 @@ def gru_group(input,
    def __gru_step__(ipt):
        return gru_unit(
            input=ipt,
+            memory_boot=memory_boot,
            name=name,
            size=size,
            gru_bias_attr=gru_bias_attr,
@@ -1083,7 +1098,6 @@ def simple_gru2(input,

    return grumemory(
        name=name,
-        size=size,
        input=m,
        reverse=reverse,
        bias_attr=gru_bias_attr,

--- a/python/paddle/v2/dataset/__init__.py
+++ b/python/paddle/v2/dataset/__init__.py
@@ -25,8 +25,9 @@ import uci_housing
 import sentiment
 import wmt14
 import mq2007
+import flowers

 __all__ = [
    'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment'
-    'uci_housing', 'wmt14', 'mq2007'
+    'uci_housing', 'wmt14', 'mq2007', 'flowers'
 ]
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -34,9 +34,9 @@ from common import download
 import tarfile
 import scipy.io as scio
 from paddle.v2.image import *
+from paddle.v2.reader import *
 import os
 import numpy as np
-import paddle.v2 as paddle
 from multiprocessing import cpu_count
 __all__ = ['train', 'test', 'valid']

@@ -46,6 +46,12 @@ SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
 DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
 LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
 SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
+# In the official 'readme', tstid is the flag of the test data and trnid is
+# the flag of the train data. But there is more test data than train data,
+# so we exchange the train data and the test data.
+TRAIN_FLAG = 'tstid'
+TEST_FLAG = 'trnid'
+VALID_FLAG = 'valid'


 def default_mapper(sample):
@@ -53,8 +59,8 @@ def default_mapper(sample):
    map image bytes data to type needed by model input layer
    '''
    img, label = sample
-    img = paddle.image.load_image_bytes(img)
-    img = paddle.image.simple_transform(img, 256, 224, True)
+    img = load_image_bytes(img)
+    img = simple_transform(img, 256, 224, True)
    return img.flatten().astype('float32'), label


@@ -63,7 +69,8 @@ def reader_creator(data_file,
                   setid_file,
                   dataset_name,
                   mapper=default_mapper,
-                   buffered_size=1024):
+                   buffered_size=1024,
+                   use_xmap=True):
    '''
    1. read images from tar file and
        merge images into batch files in 102flowers.tgz_batch/
@@ -105,11 +112,13 @@ def reader_creator(data_file,
            for sample, label in itertools.izip(data, batch['label']):
                yield sample, int(label)

-    return paddle.reader.xmap_readers(mapper, reader,
-                                      cpu_count(), buffered_size)
+    if use_xmap:
+        return xmap_readers(mapper, reader, cpu_count(), buffered_size)
+    else:
+        return map_readers(mapper, reader)


-def train(mapper=default_mapper, buffered_size=1024):
+def train(mapper=default_mapper, buffered_size=1024, use_xmap=True):
    '''
    Create flowers training set reader.
    It returns a reader, each sample in the reader is
@@ -128,11 +137,11 @@ def train(mapper=default_mapper, buffered_size=1024):
    return reader_creator(
        download(DATA_URL, 'flowers', DATA_MD5),
        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), 'trnid', mapper,
-        buffered_size)
+        download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper,
+        buffered_size, use_xmap)


-def test(mapper=default_mapper, buffered_size=1024):
+def test(mapper=default_mapper, buffered_size=1024, use_xmap=True):
    '''
    Create flowers test set reader.
    It returns a reader, each sample in the reader is
@@ -151,11 +160,11 @@ def test(mapper=default_mapper, buffered_size=1024):
    return reader_creator(
        download(DATA_URL, 'flowers', DATA_MD5),
        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), 'tstid', mapper,
-        buffered_size)
+        download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper,
+        buffered_size, use_xmap)


-def valid(mapper=default_mapper, buffered_size=1024):
+def valid(mapper=default_mapper, buffered_size=1024, use_xmap=True):
    '''
    Create flowers validation set reader.
    It returns a reader, each sample in the reader is
@@ -174,8 +183,8 @@ def valid(mapper=default_mapper, buffered_size=1024):
    return reader_creator(
        download(DATA_URL, 'flowers', DATA_MD5),
        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), 'valid', mapper,
-        buffered_size)
+        download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
+        buffered_size, use_xmap)


 def fetch():

--- a/python/paddle/v2/dataset/tests/flowers_test.py
+++ b/python/paddle/v2/dataset/tests/flowers_test.py
@@ -31,13 +31,13 @@ class TestFlowers(unittest.TestCase):
    def test_train(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.flowers.train())
-        self.assertEqual(instances, 1020)
+        self.assertEqual(instances, 6149)
        self.assertEqual(max_label_value, 102)

    def test_test(self):
        instances, max_label_value = self.check_reader(
            paddle.v2.dataset.flowers.test())
-        self.assertEqual(instances, 6149)
+        self.assertEqual(instances, 1020)
        self.assertEqual(max_label_value, 102)

    def test_valid(self):

--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -51,7 +51,7 @@ class Parameters(object):
    def __init__(self):
        self.__param_conf__ = dict()
        self.__gradient_machines__ = []
-        self.__tmp_params__ = []
+        self.__tmp_params__ = dict()

    def __append_config__(self, param_conf):
        """
@@ -128,12 +128,9 @@ class Parameters(object):

        if len(self.__gradient_machines__) == 0:
            # create new parameter in python numpy.
-            if len(self.__tmp_params__) != 0:
-                ret_list = [
-                    mat for name, mat in self.__tmp_params__ if name == key
-                ]
-                if len(ret_list) == 1:
-                    return ret_list[0]
+            if key in self.__tmp_params__:
+                return self.__tmp_params__[key]
+            else:
                return np.ndarray(shape=shape, dtype=np.float32)
        else:
            for each_gradient_machine in self.__gradient_machines__:
@@ -187,7 +184,7 @@ class Parameters(object):
                             (shape, value.shape))

        if len(self.__gradient_machines__) == 0:
-            self.__tmp_params__.append((key, value))
+            self.__tmp_params__[key] = value
        else:
            for each_gradient_machine in self.__gradient_machines__:
                __copy_parameter_to_gradient_machine__(each_gradient_machine,
@@ -231,7 +228,7 @@ class Parameters(object):
            raise ValueError("gradient_machine should be api.GradientMachine")

        if len(self.__tmp_params__) != 0:
-            for name, val in self.__tmp_params__:
+            for name, val in self.__tmp_params__.iteritems():
                try:
                    __copy_parameter_to_gradient_machine__(gradient_machine,
                                                           name, val)
@@ -287,6 +284,18 @@ class Parameters(object):

    @staticmethod
    def from_tar(f):
+        """
+        Create a `Parameters` object from the given file. The
+        returned `Parameters` contains only the parameters stored
+        in this file. It is suitable when the parameters of the
+        defined network and the given file are the same, for
+        example, in inference.
+
+        :param f: the initialized model file.
+        :type f: tar file
+        :return: A Parameters object.
+        :rtype: Parameters.
+        """
        params = Parameters()
        tar = tarfile.TarFile(fileobj=f, mode='r')
        for finfo in tar:
@@ -302,6 +311,21 @@ class Parameters(object):
            params.deserialize(param_name, f)
        return params

+    def init_from_tar(self, f):
+        """
+        Different from `from_tar`, this interface initializes only the
+        parameters whose names exist in both this object and the saved model.
+
+        :param f: the initialized model file.
+        :type f: tar file
+        :return: Nothing.
+        """
+
+        tar_param = Parameters.from_tar(f)
+        for pname in tar_param.names():
+            if pname in self.names():
+                self.set(pname, tar_param.get(pname))
+

 def __get_parameter_in_gradient_machine__(gradient_machine, name):
    """

--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -248,9 +248,6 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
    :rtype: callable
    """
    end = XmapEndSignal()
-    in_queue = Queue(buffer_size)
-    out_queue = Queue(buffer_size)
-    out_order = [0]

    # define a worker to read samples from reader to in_queue
    def read_worker(reader, in_queue):
@@ -266,12 +263,6 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
            in_order += 1
        in_queue.put(end)

-    # start a read worker in a thread
-    target = order_read_worker if order else read_worker
-    t = Thread(target=target, args=(reader, in_queue))
-    t.daemon = True
-    t.start()
-
    # define a worker to handle samples from in_queue by mapper
    # and put mapped samples into out_queue
    def handle_worker(in_queue, out_queue, mapper):
@@ -298,6 +289,15 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
        in_queue.put(end)
        out_queue.put(end)

+    def xreader():
+        in_queue = Queue(buffer_size)
+        out_queue = Queue(buffer_size)
+        out_order = [0]
+        # start a read worker in a thread
+        target = order_read_worker if order else read_worker
+        t = Thread(target=target, args=(reader, in_queue))
+        t.daemon = True
+        t.start()
        # start several handle_workers
        target = order_handle_worker if order else handle_worker
        args = (in_queue, out_queue, mapper, out_order) if order else (
@@ -310,7 +310,6 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
        for w in workers:
            w.start()

-    def xreader():
        sample = out_queue.get()
        while not isinstance(sample, XmapEndSignal):
            yield sample

--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -132,10 +132,12 @@ class TestXmap(unittest.TestCase):
        for order in orders:
            for tNum in thread_nums:
                for size in buffered_size:
-                    result = []
-                    for i in paddle.v2.reader.xmap_readers(mapper,
+                    reader = paddle.v2.reader.xmap_readers(mapper,
                                                           reader_creator_10(0),
-                                                           tNum, size, order)():
+                                                           tNum, size, order)
+                    for n in xrange(3):
+                        result = []
+                        for i in reader():
                            result.append(i)
                        if not order:
                            result.sort()

--- a/python/paddle/v2/tests/test_parameters.py
+++ b/python/paddle/v2/tests/test_parameters.py
@@ -20,14 +20,17 @@ import cStringIO
 import numpy


-def __rand_param_config__(name):
+def __rand_param_config__(name, psize=None):
    conf = ParameterConfig()
    conf.name = name
    size = 1
+    if psize is None:
        for i in xrange(2):
            dim = random.randint(1, 1000)
            conf.dims.append(dim)
            size *= dim
+    else:
+        size = psize
    conf.size = size
    assert conf.IsInitialized()
    return conf
@@ -77,6 +80,50 @@ class TestParameters(unittest.TestCase):
        expected = numpy.array([[1, 1], [1, 2], [1, 1]], numpy.float32)
        assert numpy.logical_and.reduce(numpy.reshape(val == expected, 6))

+    def test_init_from_tar(self):
+        def get_param(names, size):
+            p = parameters.Parameters()
+            for k, v in zip(names, size):
+                p.__append_config__(__rand_param_config__(k, v))
+            for name in p.names():
+                param = p.get(name)
+                param[:] = numpy.random.uniform(
+                    -1.0, 1.0, size=p.get_shape(name))
+                p.set(name, param)
+            return p
+
+        def get_parames():
+            name1 = ['param_0', 'param_1']
+            size1 = [128, 256]
+            p1 = get_param(name1, size1)
+            file1 = cStringIO.StringIO()
+            p1.to_tar(file1)
+            file1.seek(0)
+
+            name2 = ['param_0', 'param_1', 'param_2']
+            size2 = [128, 256, 288]
+            p2 = get_param(name2, size2)
+            file2 = cStringIO.StringIO()
+            p2.to_tar(file2)
+            file2.seek(0)
+            return p1, file1, p2, file2
+
+        p1, file1, p2, file2 = get_parames()
+        p2.init_from_tar(file1)
+        for name in p1.names():
+            self.assertEqual(p1.get_shape(name), p2.get_shape(name))
+            v1 = p1.get(name)
+            v2 = p2.get(name)
+            self.assertTrue(numpy.isclose(v1, v2).all())
+
+        p1, file1, p2, file2 = get_parames()
+        p1.init_from_tar(file2)
+        for name in p1.names():
+            self.assertEqual(p1.get_shape(name), p2.get_shape(name))
+            v1 = p1.get(name)
+            v2 = p2.get(name)
+            self.assertTrue(numpy.isclose(v1, v2).all())
+

 if __name__ == '__main__':
    unittest.main()
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -15,7 +15,8 @@ setup_requires=["requests",
                "protobuf==3.1",
                "recordio",
                "matplotlib",
-                "rarfile"]
+                "rarfile",
+                "scipy>=0.19.0"]

 if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
    setup_requires+=["opencv-python"]