Merge branch 'develop' into add_sequence_slice_layer

7ff689f5 · caoying03 · 26bc5b12 · 56faf513 · 7ff689f5 · 7ff689f5
88 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -24,4 +24,5 @@ cmake-build-*
 python/paddle/v2/framework/core.so
 CMakeFiles
 cmake_install.cmake
+paddle/.timestamp
+python/paddlepaddle.egg-info/
--- a/Dockerfile
+++ b/Dockerfile
@@ -28,7 +28,7 @@ RUN apt-get update && \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
    python-matplotlib gcc-4.8 g++-4.8 \
-    automake locales clang-format-3.8 swig doxygen cmake  \
+    automake locales clang-format swig doxygen cmake  \
    liblapack-dev liblapacke-dev libboost-dev \
    clang-3.8 llvm-3.8 libclang-3.8-dev \
    net-tools && \

--- a/go/glide.lock
+++ b/go/glide.lock
 hash: 1b9b07408ca7fac27a374dc2ccd2433e4bff090484008a037df967284949a582
-updated: 2017-08-03T21:46:51.744995189Z
+updated: 2017-08-07T23:37:48.867469328Z
 imports:
 - name: github.com/beorn7/perks
  version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9
@@ -10,7 +10,7 @@ imports:
 - name: github.com/cockroachdb/cmux
  version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92
 - name: github.com/coreos/etcd
-  version: c31bec0f29facff13f7c3e3d948e55dd6689ed42
+  version: d0d1a87aa96ae14914751d42264262cb69eda170
  subpackages:
  - alarm
  - auth
@@ -24,6 +24,7 @@ imports:
  - error
  - etcdserver
  - etcdserver/api
+  - etcdserver/api/etcdhttp
  - etcdserver/api/v2http
  - etcdserver/api/v2http/httptypes
  - etcdserver/api/v3client
@@ -210,11 +211,6 @@ testImports:
  version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9
  subpackages:
  - spew
- name: github.com/docker/docker
-  version: b6d164e6c46d8115b146e4c3ac93784e9ef8b49e
-  subpackages:
-  - pkg/ioutils
-  - pkg/longpath
 - name: github.com/pmezard/go-difflib
  version: d8ed2627bdf02c080bf22230dbb337003b7aba2d
  subpackages:

--- a/go/master/service_test.go
+++ b/go/master/service_test.go
 package master_test
 import (
+	"io/ioutil"
+	"net/url"
 	"os"
+	"strings"
 	"testing"
 	"time"
 	"github.com/PaddlePaddle/Paddle/go/master"
 	"github.com/coreos/etcd/clientv3"
 	"github.com/coreos/etcd/embed"
-	"github.com/docker/docker/pkg/ioutils"
 	"github.com/stretchr/testify/assert"
 )
 func TestNewServiceWithEtcd(t *testing.T) {
 	// setup an embed etcd server
-	etcdDir, err := ioutils.TempDir("", "")
+	etcdDir, err := ioutil.TempDir("", "")
 	if err != nil {
 		t.Fatal(err)
 	}
 	cfg := embed.NewConfig()
+	lpurl, _ := url.Parse("http://localhost:0")
+	lcurl, _ := url.Parse("http://localhost:0")
+	cfg.LPUrls = []url.URL{*lpurl}
+	cfg.LCUrls = []url.URL{*lcurl}
 	cfg.Dir = etcdDir
 	e, err := embed.StartEtcd(cfg)
 	if err != nil {
@@ -30,15 +36,13 @@ func TestNewServiceWithEtcd(t *testing.T) {
 			t.Fatal(err)
 		}
 	}()
-	select {
-	case <-e.Server.ReadyNotify():
-		t.Log("Server is ready!")
-	case <-time.After(60 * time.Second):
-		e.Server.Stop() // trigger a shutdown
-		t.Fatal("Server took too long to start!")
-	}
-	ep := []string{"127.0.0.1:2379"}
+	<-e.Server.ReadyNotify()
+	port := strings.Split(e.Clients[0].Addr().String(), ":")[1]
+	endpoint := "127.0.0.1:" + port
+	ep := []string{endpoint}
 	masterAddr := "127.0.0.1:3306"
 	store, err := master.NewEtcdClient(ep, masterAddr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, 30)
 	if err != nil {

--- a/go/pserver/client/c/cclient.go
+++ b/go/pserver/client/c/cclient.go
@@ -90,8 +90,12 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte {
 type selector bool
-func (s selector) Select() bool {
+func (s selector) Select() (bool, error) {
-	return bool(s)
+	return bool(s), nil
+}
+func (s selector) Done() error {
+	return nil
 }
 type lister []client.Server
@@ -114,11 +118,10 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_cli
 }
 //export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcdEndpoints *C.char, selected int) C.paddle_pserver_client {
+func paddle_new_etcd_pserver_client(etcdEndpoints *C.char) C.paddle_pserver_client {
-	// TODO(Longfei: use etcd lock to decide which trainer to initialize the parameters)
 	addr := C.GoString(etcdEndpoints)
 	etcdClient := client.NewEtcd(addr)
-	c := client.NewClient(etcdClient, etcdClient.Desired(), selector(selected != 0))
+	c := client.NewClient(etcdClient, etcdClient.Desired(), etcdClient)
 	return add(c)
 }
@@ -136,7 +139,12 @@ func paddle_pserver_client_release(client C.paddle_pserver_client) {
 //export paddle_begin_init_params
 func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
 	c := get(client)
-	if selected := c.BeginInitParams(); selected {
+	selected, err := c.BeginInitParams()
+	if err != nil {
+		panic(err)
+	}
+	if selected {
 		return 1
 	}
 	return 0

--- a/go/pserver/client/client.go
+++ b/go/pserver/client/client.go
@@ -27,9 +27,13 @@ import (
 // TODO(helin): add RPC call retry logic
-// Selector selects if the client should initialize parameter servers.
+// Selector selects if the client should initialize parameters and
+// reports the initialization process done.
 type Selector interface {
-	Select() bool
+	// Select selects if the client should initialize parameter servers.
+	Select() (bool, error)
+	// Done indicates the initialization process is done.
+	Done() error
 }
 // Server is the identification of a parameter Server.
@@ -115,7 +119,7 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) {
 // servers. Other trainers will be blocked until the initialization is
 // done, and they need to get the initialized parameters from
 // parameter servers using GetParams.
-func (c *Client) BeginInitParams() bool {
+func (c *Client) BeginInitParams() (bool, error) {
 	return c.sel.Select()
 }

--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
@@ -124,8 +124,12 @@ func initEtcdClient() {
 type selector bool
-func (s selector) Select() bool {
+func (s selector) Select() (bool, error) {
-	return bool(s)
+	return bool(s), nil
+}
+func (s selector) Done() error {
+	return nil
 }
 type lister []client.Server
@@ -135,7 +139,11 @@ func (l lister) List() []client.Server {
 }
 func testClient(t *testing.T, c *client.Client) {
-	selected := c.BeginInitParams()
+	selected, err := c.BeginInitParams()
+	if err != nil {
+		t.Fatal(err)
+	}
 	if !selected {
 		t.Fatal("should be selected.")
 	}

--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
@@ -16,53 +16,60 @@ package client
 import (
 	"context"
+	"errors"
+	"fmt"
 	"strconv"
 	"strings"
 	"time"
 	"github.com/PaddlePaddle/Paddle/go/pserver"
 	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/clientv3/concurrency"
 	log "github.com/sirupsen/logrus"
 )
 const (
 	defaultEtcdTimeout time.Duration = 5 * time.Second
+	initLockPath = "/init_ps/lock"
+	initDonePath = "/init_ps/done"
+	initDoneVal  = "1"
 )
-// EtcdClient is used by pserver client that is a part of trainer process.
+// Etcd is used by pserver client that is a part of trainer process.
 // TODO:
-// 1. add watcher to watch the change state of pservers)
+// 1. add watcher to watch the change state of pservers.
-// 1. add etcd lock)
+type Etcd struct {
-type EtcdClient struct {
 	client    *clientv3.Client
 	timeout   time.Duration
 	endpoints []string
+	lock      *concurrency.Mutex
 }
 // Desired read ps desired number from etcd.
-func (p *EtcdClient) Desired() int {
+func (e *Etcd) Desired() int {
 	var psDesired int
 	for {
-		ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
+		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
-		resp, err := p.client.Get(ctx, pserver.PsDesired)
+		resp, err := e.client.Get(ctx, pserver.PsDesired)
 		cancel()
 		if err != nil {
 			log.Errorf("Get ps dresire number failed! recnnectiong..., %v", err)
-			time.Sleep(p.timeout)
+			time.Sleep(e.timeout)
 			continue
 		}
 		kvs := resp.Kvs
 		if len(kvs) == 0 {
 			log.Infoln("Waiting for ps desired registered ...")
-			time.Sleep(p.timeout)
+			time.Sleep(e.timeout)
 			continue
 		}
 		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
 		if err != nil {
 			log.Errorf("psDesired %d invalid %v", psDesired, err)
-			time.Sleep(p.timeout)
+			time.Sleep(e.timeout)
 			continue
 		}
@@ -73,26 +80,26 @@ func (p *EtcdClient) Desired() int {
 }
 // List return the pserver list read from etcd.
-func (p *EtcdClient) List() []Server {
+func (e *Etcd) List() []Server {
-	psDesired := p.Desired()
+	psDesired := e.Desired()
 	servers := make([]Server, psDesired)
 	for {
 		for i := 0; i < psDesired; i++ {
-			ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
+			ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
 			psKey := pserver.PsPath + strconv.Itoa(i)
 			log.Debugf("checking %s", psKey)
-			resp, err := p.client.Get(ctx, psKey)
+			resp, err := e.client.Get(ctx, psKey)
 			cancel()
 			if err != nil {
 				log.Infof("Get psKey= %s error, %v", psKey, err)
-				time.Sleep(p.timeout)
+				time.Sleep(e.timeout)
 				continue
 			}
 			kvs := resp.Kvs
 			if len(kvs) == 0 {
 				log.Infof("Waiting for ps addr registered ...")
-				time.Sleep(p.timeout)
+				time.Sleep(e.timeout)
 				continue
 			}
@@ -100,7 +107,7 @@ func (p *EtcdClient) List() []Server {
 			// TODO(Longfei) check the ps address
 			if psAddr == "" {
 				log.Infof("Get psKey = %s, psAddr is empty", psKey)
-				time.Sleep(p.timeout)
+				time.Sleep(e.timeout)
 				continue
 			}
 			log.Debugf("got value (%s) for key: %s", psAddr, psKey)
@@ -113,7 +120,7 @@ func (p *EtcdClient) List() []Server {
 }
 // NewEtcd create a etcd client to return the state of pserver on etcd.
-func NewEtcd(endpoints string) *EtcdClient {
+func NewEtcd(endpoints string) *Etcd {
 	ep := strings.Split(endpoints, ",")
 	var cli *clientv3.Client
 	var err error
@@ -130,10 +137,118 @@ func NewEtcd(endpoints string) *EtcdClient {
 		break
 	}
 	log.Infof("Connected to etcd: %s\n", endpoints)
-	client := &EtcdClient{
+	client := &Etcd{
 		client:    cli,
 		timeout:   defaultEtcdTimeout,
 		endpoints: ep,
 	}
 	return client
 }
+// Select indicates if the current trainer is selected to initialize
+// the pserver parameters.
+//
+// It acquires the etcd lock at initLockPath (blocking until the lock
+// is obtained) and, while still owning the lock, reads initDonePath:
+//   - key absent: this trainer is selected. The lock stays held and is
+//     remembered in e.lock so that Done or Close can release it;
+//     (true, nil) is returned.
+//   - key equals initDoneVal: initialization already happened. The
+//     lock is released and (false, nil) is returned.
+//   - any other value, or any etcd failure: (false, err) is returned.
+//
+// Whenever the trainer is not selected, the etcd session created here
+// is closed before returning, so that a failed or unneeded Select
+// never keeps holding the init lock and never leaks the session.
+func (e *Etcd) Select() (bool, error) {
+	sess, err := concurrency.NewSession(e.client, concurrency.WithTTL(5))
+	if err != nil {
+		return false, err
+	}
+	// keepSession is set only when this trainer is selected. In every
+	// other case the session must be closed: closing it revokes the
+	// session lease, which also drops a lock still held under it.
+	// Without this, an error after lock.Lock would leave the lock
+	// held for as long as this process lives, blocking all other
+	// trainers.
+	keepSession := false
+	defer func() {
+		if keepSession {
+			return
+		}
+		if cErr := sess.Close(); cErr != nil {
+			log.Errorln(cErr)
+		}
+	}()
+	lock := concurrency.NewMutex(sess, initLockPath)
+	log.Infof("Trying to acquire lock at %s.", initLockPath)
+	// Do not use timeout context here, since we don't know how
+	// long does it take for other trainers to initialize the
+	// parameters.
+	if err = lock.Lock(context.Background()); err != nil {
+		return false, err
+	}
+	log.Infof("Successfully acquired lock at %s.", initLockPath)
+	// Read the done marker in a transaction guarded by lock ownership,
+	// so the answer is only trusted while we still own the lock.
+	get := clientv3.OpGet(initDonePath)
+	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
+	tresp, err := e.client.Txn(ctx).If(lock.IsOwner()).Then(get).Commit()
+	cancel()
+	if err != nil {
+		return false, err
+	}
+	if !tresp.Succeeded {
+		return false, errors.New("no longer the owner of the lock")
+	}
+	resp := tresp.Responses[0].GetResponseRange()
+	if len(resp.Kvs) == 0 {
+		// Key value not set, select current trainer. The lock (and
+		// therefore the session) must outlive this call.
+		e.lock = lock
+		keepSession = true
+		log.Infoln("Trainer selected.")
+		return true, nil
+	}
+	if string(resp.Kvs[0].Value) == initDoneVal {
+		log.Infoln("Initialization is already done.")
+		ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
+		err = lock.Unlock(ctx)
+		cancel()
+		if err != nil {
+			// Best effort: the deferred session close still drops the lock.
+			log.Errorln(err)
+		}
+		return false, nil
+	}
+	// %q prints the stored bytes as a quoted string; %v would print a
+	// list of numeric byte values.
+	return false, fmt.Errorf("key %s have unexpected value: %q", initDonePath, resp.Kvs[0].Value)
+}
+// Done indicates the parameter initialization process is done.
+//
+// It writes initDoneVal to initDonePath in a transaction that only
+// commits while e.lock is still owned, then releases the lock. An
+// error is returned when no lock is held, when the transaction fails,
+// or when lock ownership was lost. A failure to release the lock
+// afterwards is only logged; in that case e.lock is kept so that Close
+// can retry the unlock.
+func (e *Etcd) Done() error {
+	held := e.lock
+	if held == nil {
+		return errors.New("lock is nil, Done called unexpectedly")
+	}
+	markDone := clientv3.OpPut(initDonePath, initDoneVal)
+	txnCtx, txnCancel := context.WithTimeout(context.Background(), e.timeout)
+	txnResp, txnErr := e.client.Txn(txnCtx).If(held.IsOwner()).Then(markDone).Commit()
+	txnCancel()
+	switch {
+	case txnErr != nil:
+		return txnErr
+	case !txnResp.Succeeded:
+		return errors.New("no longer the owner of the lock")
+	}
+	unlockCtx, unlockCancel := context.WithTimeout(context.Background(), e.timeout)
+	unlockErr := held.Unlock(unlockCtx)
+	unlockCancel()
+	if unlockErr != nil {
+		// The done marker is already written, so report success and
+		// leave the lock recorded for a later release attempt.
+		log.Errorln(unlockErr)
+		return nil
+	}
+	e.lock = nil
+	return nil
+}
+// Close closes the etcd client.
+//
+// If a lock is still recorded in e.lock it is released first. The
+// underlying client is always closed, even when the unlock fails.
+// When both steps fail, the client close error is logged and the
+// unlock error is returned; otherwise whichever error occurred (if
+// any) is returned.
+func (e *Etcd) Close() error {
+	var unlockErr error
+	if held := e.lock; held != nil {
+		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
+		unlockErr = held.Unlock(ctx)
+		cancel()
+		if unlockErr == nil {
+			e.lock = nil
+		}
+	}
+	closeErr := e.client.Close()
+	switch {
+	case closeErr == nil:
+		return unlockErr
+	case unlockErr == nil:
+		return closeErr
+	default:
+		// Only one error can be returned; keep the other in the log.
+		log.Errorln(closeErr)
+		return unlockErr
+	}
+}
--- a/go/pserver/client/etcd_client_test.go
+++ b/go/pserver/client/etcd_client_test.go
+package client_test
+import (
+	"io/ioutil"
+	"net/url"
+	"os"
+	"strings"
+	"sync"
+	"testing"
+	"github.com/PaddlePaddle/Paddle/go/pserver/client"
+	"github.com/coreos/etcd/embed"
+)
+// TestSelector starts an embedded etcd server and checks that, among
+// several clients racing through Select/Done, exactly one is selected
+// to initialize the parameters. A fourth client that calls Select
+// after initialization finished must not be selected either.
+func TestSelector(t *testing.T) {
+	// Listen on port 0; the actual client port is read back from the
+	// started server below.
+	lpurl, err := url.Parse("http://localhost:0")
+	if err != nil {
+		t.Fatal(err)
+	}
+	lcurl, err := url.Parse("http://localhost:0")
+	if err != nil {
+		t.Fatal(err)
+	}
+	etcdDir, err := ioutil.TempDir("", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	cfg := embed.NewConfig()
+	cfg.LPUrls = []url.URL{*lpurl}
+	cfg.LCUrls = []url.URL{*lcurl}
+	cfg.Dir = etcdDir
+	e, err := embed.StartEtcd(cfg)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer func() {
+		e.Close()
+		if err := os.RemoveAll(etcdDir); err != nil {
+			t.Fatal(err)
+		}
+	}()
+	<-e.Server.ReadyNotify()
+	// Take everything after the last ':' as the port so that an IPv6
+	// listener address such as "[::1]:1234" is handled as well.
+	addr := e.Clients[0].Addr().String()
+	port := addr[strings.LastIndex(addr, ":")+1:]
+	endpoint := "127.0.0.1:" + port
+	var mu sync.Mutex
+	selectedCount := 0
+	var wg sync.WaitGroup
+	// selectAndDone runs on its own goroutine, so it must report
+	// failures with t.Error: t.Fatal/FailNow may only be called from
+	// the goroutine running the test, and a panic would take down the
+	// whole test binary.
+	selectAndDone := func(c *client.Etcd) {
+		defer wg.Done()
+		selected, err := c.Select()
+		if err != nil {
+			t.Error(err)
+			return
+		}
+		if !selected {
+			return
+		}
+		mu.Lock()
+		selectedCount++
+		mu.Unlock()
+		if err := c.Done(); err != nil {
+			t.Error(err)
+		}
+	}
+	c0 := client.NewEtcd(endpoint)
+	c1 := client.NewEtcd(endpoint)
+	c2 := client.NewEtcd(endpoint)
+	c3 := client.NewEtcd(endpoint)
+	wg.Add(3)
+	go selectAndDone(c0)
+	go selectAndDone(c1)
+	go selectAndDone(c2)
+	wg.Wait()
+	// simulate trainer crashed and restarted after the
+	// initialization process.
+	wg.Add(1)
+	go selectAndDone(c3)
+	wg.Wait()
+	mu.Lock()
+	count := selectedCount
+	mu.Unlock()
+	if count != 1 {
+		t.Fatal("selected count wrong:", count)
+	}
+	for _, c := range []*client.Etcd{c0, c1, c2, c3} {
+		if err := c.Close(); err != nil {
+			t.Fatal(err)
+		}
+	}
+}
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -7,6 +7,9 @@ cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
+cc_library(lod_tensor SRCS lod_tensor.cc details/lod_tensor.cc DEPS ddim place tensor)
+cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor)
 cc_test(variable_test SRCS variable_test.cc)
 cc_library(scope SRCS scope.cc)
@@ -40,11 +43,13 @@ if(WITH_PYTHON)
 cc_library(paddle_pybind SHARED
    SRCS pybind.cc
    DEPS pybind python backward
-	fc_op
+    fc_op
-	sgd_op
+    sgd_op
-	add_op
+    add_op
-	mean_op
+    mean_op
-	cross_entropy_op
+    cross_entropy_op
-	fill_zeros_like_op
+    recurrent_op
-	recurrent_op)
+    uniform_random_op
+    gaussian_random_op
+    fill_zeros_like_op)
 endif(WITH_PYTHON)
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -13,6 +13,7 @@
   limitations under the License. */
 #include "paddle/framework/backward.h"
 #include <list>
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
@@ -132,8 +133,9 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
    std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp);
    for (std::string& grad_input : grad_op->inputs_) {
      if (no_grad_names.count(grad_input)) {
-        std::string prefix =
+        // +1 for \0
-            grad_input.substr(0, grad_input.size() - kGradVarSuffix.size());
+        std::string prefix = grad_input.substr(
+            0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
        grad_input = prefix + kZeroVarSuffix;
        // If part of input gradient of that operator is not calculated, fill
@@ -166,7 +168,7 @@ std::shared_ptr<OperatorBase> Backward(
  std::unordered_set<std::string> no_grad_names;
  no_grad_names.reserve(no_grad_vars.size());
-  no_grad_names.insert(kEmptyVarName + kGradVarSuffix);
+  no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
  for (auto& name : no_grad_vars) {
    no_grad_names.insert(name + kGradVarSuffix);

--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -17,16 +17,21 @@
 #include <gtest/gtest.h>
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
-#include "paddle/operators/type_alias.h"
 namespace paddle {
 namespace framework {
+using OperatorBase = framework::OperatorBase;
+using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker;
+using OpProto = framework::OpProto;
+using OpAttrChecker = framework::OpAttrChecker;
+using Scope = framework::Scope;
+using DeviceContext = platform::DeviceContext;
 class EmptyOp : public OperatorBase {
 public:
  void InferShape(const Scope &scope) const override {}
-  void Run(const Scope &scope,
+  void Run(const Scope &scope, const DeviceContext &dev_ctx) const override {}
-           const platform::DeviceContext &dev_ctx) const override {}
 };
 class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
@@ -71,7 +76,7 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker {
  }
 };
-class FcOp : public ops::NetOp {
+class FcOp : public operators::NetOp {
 public:
  void Init() override {
    AddOp(OpRegistry::CreateOp("mul", {Input("X"), Input("W")},
@@ -143,6 +148,7 @@ class AddOpMaker : public OpProtoAndCheckerMaker {
 }  // namespace paddle
 namespace f = paddle::framework;
+namespace ops = paddle::operators;
 using EnforceNotMet = paddle::platform::EnforceNotMet;
 REGISTER_OP(rowwise_add, f::EmptyOp, f::RowWiseAddOpMaker);
 REGISTER_GRADIENT_OP(rowwise_add, rowwise_add_grad, f::EmptyOp);
@@ -165,10 +171,10 @@ TEST(Backward, simple_op_grad) {
  ASSERT_EQ(4UL, gop->inputs_.size());
  ASSERT_EQ(f::kEmptyVarName, gop->inputs_[0]);
  ASSERT_EQ("rowwise_add_grad", gop->type_);
-  ASSERT_EQ("X" + f::kGradVarSuffix, gop->outputs_[0]);
+  ASSERT_EQ(f::GradVarName("X"), gop->outputs_[0]);
-  ASSERT_EQ("b" + f::kGradVarSuffix, gop->outputs_[1]);
+  ASSERT_EQ(f::GradVarName("b"), gop->outputs_[1]);
-  ASSERT_EQ("X" + f::kGradVarSuffix, gop->Output("X" + f::kGradVarSuffix));
+  ASSERT_EQ(f::GradVarName("X"), gop->Output(f::GradVarName("X")));
 }
 TEST(Backward, simple_op_not_need_grad) {
@@ -176,7 +182,7 @@ TEST(Backward, simple_op_not_need_grad) {
  ASSERT_NE(fwd, nullptr);
  auto gop = f::Backward(*fwd, {"X"});
  ASSERT_EQ(std::find(gop->outputs_.begin(), gop->outputs_.end(),
-                      "X" + f::kGradVarSuffix),
+                      f::GradVarName("X")),
            gop->outputs_.end());
  auto no_input_gop = f::Backward(*fwd, {"X", "b"});
@@ -244,18 +250,18 @@ TEST(Backward, net_input_of_network_not_need_grad) {
  all_output.erase(f::kEmptyVarName);
  for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) {
-    ASSERT_NE(all_output.find(out + f::kGradVarSuffix), all_output.end());
+    ASSERT_NE(all_output.find(f::GradVarName(out)), all_output.end());
  }
  // Not Generated X
-  ASSERT_EQ(all_output.find("X" + f::kGradVarSuffix), all_output.end());
+  ASSERT_EQ(all_output.find(f::GradVarName("X")), all_output.end());
  ASSERT_EQ(2UL, bwd_net->ops_.size());
  ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
  auto first_fc_grad = static_cast<ops::NetOp *>(bwd_net->ops_[1].get());
  ASSERT_EQ(3UL, first_fc_grad->ops_.size());
  ASSERT_EQ(f::kEmptyVarName,
-            first_fc_grad->ops_[2]->Output("A" + f::kGradVarSuffix));
+            first_fc_grad->ops_[2]->Output(f::GradVarName("A")));
 }
 TEST(Backward, net_shared_weight) {
@@ -307,15 +313,15 @@ TEST(Backward, op_part_of_output_are_not_need) {
  ASSERT_EQ(1UL, fill_zero.inputs_.size());
  ASSERT_EQ("Z", fill_zero.inputs_[0]);
  ASSERT_EQ(1UL, fill_zero.outputs_.size());
-  ASSERT_EQ("Z" + f::kZeroVarSuffix, fill_zero.outputs_[0]);
+  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.outputs_[0]);
  auto &d_many_out = *net->ops_[1];
  ASSERT_EQ("many_output_op_grad", d_many_out.type_);
  ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.inputs_.size());  // I/O/OG
-  ASSERT_EQ("Z" + f::kZeroVarSuffix, d_many_out.Input("z" + f::kGradVarSuffix));
+  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix,
-  ASSERT_EQ("Y" + f::kGradVarSuffix, d_many_out.Input("y" + f::kGradVarSuffix));
+            d_many_out.Input(f::GradVarName("z")));
-  ASSERT_EQ("X" + f::kGradVarSuffix,
+  ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y")));
-            d_many_out.Output("x" + f::kGradVarSuffix));
+  ASSERT_EQ(f::GradVarName("X"), d_many_out.Output(f::GradVarName("x")));
 }
 TEST(Backward, op_part_of_input_are_not_need) {
@@ -325,10 +331,9 @@ TEST(Backward, op_part_of_input_are_not_need) {
  ASSERT_EQ(grad_mul.type_, "mul_grad");
  ASSERT_EQ(grad_mul.inputs_.size(), 2UL + 1UL + 1UL);
  ASSERT_EQ(grad_mul.outputs_.size(), 2UL);
-  ASSERT_EQ(grad_mul.Output("A" + f::kGradVarSuffix), f::kEmptyVarName);
+  ASSERT_EQ(grad_mul.Output(f::GradVarName("A")), f::kEmptyVarName);
-  ASSERT_EQ(grad_mul.Output("B" + f::kGradVarSuffix), "b" + f::kGradVarSuffix);
+  ASSERT_EQ(grad_mul.Output(f::GradVarName("B")), f::GradVarName("b"));
-  ASSERT_EQ(grad_mul.Input("Out" + f::kGradVarSuffix),
+  ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out"));
-            "out" + f::kGradVarSuffix);
  ASSERT_EQ(grad_mul.Input("A"), "a");
  ASSERT_EQ(grad_mul.Input("B"), "b");
  ASSERT_EQ(grad_mul.Input("Out"), "out");

--- a/paddle/framework/details/lod_tensor.cc
+++ b/paddle/framework/details/lod_tensor.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/framework/lod_tensor.h"
+#include <memory>
+namespace paddle {
+namespace framework {
+namespace details {
+using LOD = LODTensor::LOD;
+std::shared_ptr<LOD> SliceLOD(const LOD &lod, size_t level_begin,
+                              size_t level_end) {
+  // Copy levels [level_begin, level_end) of `lod` into a freshly
+  // allocated LOD and hand ownership to the caller.
+  auto sliced = std::make_shared<LOD>();
+  sliced->reserve(level_end - level_begin);
+  size_t level = level_begin;
+  while (level < level_end) {
+    sliced->emplace_back(lod[level]);
+    ++level;
+  }
+  return sliced;
+}
+std::shared_ptr<LOD> SliceLOD(const LOD &lod, size_t level, size_t elem_begin,
+                              size_t elem_end, bool tensor_shared) {
+  // Build a new LOD from `level` down to the last level, keeping in each
+  // level only the offsets between the values found at elem_begin and
+  // elem_end of `level` (both ends included).
+  auto sliced = std::make_shared<LOD>();
+  sliced->reserve(lod.size() - level);
+  const auto &top_level = lod.at(level);
+  auto first_offset = top_level[elem_begin];
+  auto last_offset = top_level[elem_end];
+  // Shifts an offset so that the slice starts counting from zero.
+  auto rebase = [first_offset](int v) { return v - first_offset; };
+  for (auto lvl = lod.begin() + level; lvl != lod.end(); ++lvl) {
+    auto from = std::find(lvl->begin(), lvl->end(), first_offset);
+    auto to = std::find(from, lvl->end(), last_offset);
+    PADDLE_ENFORCE(from != lvl->end(), "error in parsing lod info");
+    PADDLE_ENFORCE(to != lvl->end(), "error in parsing lod info");
+    sliced->emplace_back(from, to + 1);
+    if (tensor_shared) {
+      // Offsets keep their original values in this case.
+      continue;
+    }
+    // The tensor is copied and sliced: reset the offsets of this level.
+    auto &offsets = sliced->back();
+    std::transform(offsets.begin(), offsets.end(), offsets.begin(), rebase);
+    PADDLE_ENFORCE(offsets.front() == 0, "error in slice LOD");
+  }
+  return sliced;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/details/lod_tensor.h
+++ b/paddle/framework/details/lod_tensor.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <memory>
+namespace paddle {
+namespace framework {
+namespace details {
+/*
+ * Slice levels from LOD.
+ *
+ * @lod: LOD to slice.
+ * @level_begin: level to begin slice.
+ * @level_end: level to end slice.
+ */
+std::shared_ptr<LODTensor::LOD> SliceLOD(const LODTensor::LOD &lod,
+                                         size_t level_begin, size_t level_end);
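+/*
+ * Slice elements from a level of LOD.
+ *
+ * @lod: LOD to slice.
+ * @level: which level to slice.
+ * @elem_begin: element's index to begin slice.
+ * @elem_end: element's index to end slice.
+ * @tensor_shared: if false, the sliced offsets are shifted so that each
+ *                 sliced level starts at 0.
+ */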
+std::shared_ptr<LODTensor::LOD> SliceLOD(const LODTensor::LOD &lod,
+                                         size_t level, size_t elem_begin,
+                                         size_t elem_end, bool tensor_shared);
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@@ -83,21 +83,19 @@ TEST(GradOpBuilder, MutiInOut) {
  EXPECT_EQ(grad_test_op->Input("Out1"), "out1");
  EXPECT_EQ(grad_test_op->Inputs("Out2_mult"),
            std::vector<std::string>({"out2_1", "out2_2"}));
-  EXPECT_EQ(grad_test_op->Input("Out1" + f::kGradVarSuffix),
+  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out1")),
-            "out1" + f::kGradVarSuffix);
+            f::GradVarName("out1"));
-  EXPECT_EQ(grad_test_op->Inputs("Out2_mult" + f::kGradVarSuffix),
+  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out2_mult")),
            std::vector<std::string>(
-                {"out2_1" + f::kGradVarSuffix, "out2_2" + f::kGradVarSuffix}));
+                {f::GradVarName("out2_1"), f::GradVarName("out2_2")}));
  ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
-  EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix),
+  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
-            "in1" + f::kGradVarSuffix);
+  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
-  EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix),
+            std::vector<std::string>({f::GradVarName("in2_1"),
-            std::vector<std::string>({"in2_1" + f::kGradVarSuffix,
+                                      f::GradVarName("in2_2"),
-                                      "in2_2" + f::kGradVarSuffix,
+                                      f::GradVarName("in2_3")}));
-                                      "in2_3" + f::kGradVarSuffix}));
+  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In3")), f::GradVarName("in3"));
-  EXPECT_EQ(grad_test_op->Output("In3" + f::kGradVarSuffix),
-            "in3" + f::kGradVarSuffix);
 }
 TEST(GradOpBuilder, IOIgnoredInGradient) {
@@ -119,19 +117,18 @@ TEST(GradOpBuilder, IOIgnoredInGradient) {
  EXPECT_EQ(grad_test_op->Inputs("Out1_mult"),
            std::vector<std::string>({"out1_1", "out1_2"}));
  EXPECT_EQ(grad_test_op->Input("Out2"), f::kEmptyVarName);
-  EXPECT_EQ(grad_test_op->Inputs("Out1_mult" + f::kGradVarSuffix),
+  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out1_mult")),
            std::vector<std::string>(
-                {"out1_1" + f::kGradVarSuffix, "out1_2" + f::kGradVarSuffix}));
+                {f::GradVarName("out1_1"), f::GradVarName("out1_2")}));
-  EXPECT_EQ(grad_test_op->Input("Out2" + f::kGradVarSuffix),
+  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out2")),
-            "out2" + f::kGradVarSuffix);
+            f::GradVarName("out2"));
  ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
-  EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix),
+  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
-            "in1" + f::kGradVarSuffix);
+  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
-  EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix),
            std::vector<std::string>(
-                {"in2_1" + f::kGradVarSuffix, "in2_2" + f::kGradVarSuffix}));
+                {f::GradVarName("in2_1"), f::GradVarName("in2_2")}));
-  EXPECT_EQ(grad_test_op->Outputs("In3_mult" + f::kGradVarSuffix),
+  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In3_mult")),
            std::vector<std::string>(
-                {"in3_1" + f::kGradVarSuffix, "in3_2" + f::kGradVarSuffix}));
+                {f::GradVarName("in3_1"), f::GradVarName("in3_2")}));
 }
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/framework/lod_tensor.h"
+#include <glog/logging.h>
+namespace paddle {
+namespace framework {
+LODTensor LODTensor::SliceShared(size_t level_begin, size_t level_end) const {
+  PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced.");
+  auto new_lod = details::SliceLOD(*lod_start_pos_, level_begin, level_end);
+  // Slicing levels only changes the LOD info: every level still spans the
+  // whole tensor_, so the result shares tensor_ with this object unmodified.
+  return LODTensor(tensor_, new_lod);
+}
+LODTensor LODTensor::SliceShared(size_t level, size_t elem_begin,
+                                 size_t elem_end) const {
+  PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced.");
+  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
+                 NumLevels());
+  PADDLE_ENFORCE(elem_begin < NumElements(level),
+                 "element begin [%d] out of range [%d]", elem_begin,
+                 NumElements(level));
+  PADDLE_ENFORCE(elem_end < NumElements(level) + 1,  // exclusive end: may equal the count
+                 "element end [%d] out of range [%d]", elem_end,
+                 NumElements(level));
+  auto new_lod = details::SliceLOD(*lod_start_pos_, level, elem_begin, elem_end,
+                                   true /*tensor_shared*/);
+  // Slicing elements only needs new LOD info here: the offsets are left
+  // unchanged, so the original tensor_ is reused (shared) by the result.
+  return LODTensor(tensor_, new_lod);
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <memory>
+#if (!PADDLE_ONLY_CPU)
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#endif
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/enforce.h"
+namespace paddle {
+namespace framework {
+/*
+ * LODTensor (Level of details Tensor)
+ * see https://en.wikipedia.org/wiki/Level_of_details for reference.
+ *
+ * Pairs a Tensor with LOD (per-level offset) information. Both are held
+ * through std::shared_ptr, so a copied LODTensor refers to the same Tensor
+ * and LOD objects as its source.
+ */
+class LODTensor {
+ public:
+// Level saves the offsets of each unit.
+#ifdef PADDLE_ONLY_CPU
+  using Level = std::vector<size_t>;
+#else
+  using Level = thrust::device_vector<size_t>;
+#endif
+  // LOD stores the offsets of each level of units, the largest units level
+  // first, then the smaller units levels. Each Level stores the offsets of
+  // its units in the Tensor.
+  typedef std::vector<Level> LOD;
+  LODTensor() {}  // starts with neither a tensor nor LOD info
+  LODTensor(const std::shared_ptr<Tensor> &tensor,
+            const std::shared_ptr<LOD> &lod) {
+    Reset(tensor, lod);
+  }
+  void Reset(const std::shared_ptr<Tensor> &tensor,  // replaces both pointers
+             const std::shared_ptr<LOD> &lod) {
+    tensor_ = tensor;
+    lod_start_pos_ = lod;
+  }
+  /*
+   * Get an element from LOD: the offset stored at position `elem` of `level`.
+   * Both indices are checked with PADDLE_ENFORCE; `elem` must be smaller than
+   * NumElements(level), so the trailing end offset of a level cannot be read
+   * through this accessor.
+   */
+  size_t lod_element(size_t level, size_t elem) const {
+    PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
+                   NumLevels());
+    PADDLE_ENFORCE(elem < NumElements(level),
+                   "element begin [%d] out of range [%d]", elem,
+                   NumElements(level));
+    return (*lod_start_pos_)[level][elem];
+  }
+  /*
+   * Number of LODTensor's levels, each level has units of data, for example,
+   * in the sentence's view, article, paragraph, sentence are 3 levels.
+   * Returns 0 when no LOD info is attached.
+   */
+  size_t NumLevels() const {
+    return lod_start_pos_ ? lod_start_pos_->size() : 0UL;
+  }
+  /*
+   * Number of elements in a level: one less than the number of offsets stored
+   * for that level. `level` must be smaller than NumLevels().
+   */
+  size_t NumElements(size_t level = 0) const {
+    PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
+                   NumLevels());
+    // the last offset is the end of the last element, not an element start
+    return lod_start_pos_->at(level).size() - 1;
+  }
+  /*
+   * Slice of levels[level_begin:level_end], with tensor copied.
+   * T and dst_place are forwarded to Tensor::CopyFrom<T>.
+   */
+  template <typename T>
+  LODTensor SliceCopied(size_t level_begin, size_t level_end,
+                        const platform::Place &dst_place) const;
+  /*
+   * Slice of levels[level_begin:level_end], with tensor shared.
+   */
+  LODTensor SliceShared(size_t level_begin, size_t level_end) const;
+  /*
+   * Slice of elements of a level, [elem_begin: elem_end], with tensor copied.
+   * elem_end is exclusive and may equal NumElements(level).
+   * @note: low performance in slice lod_start_pos_.
+   */
+  template <typename T>
+  LODTensor SliceCopied(size_t level, size_t elem_begin, size_t elem_end,
+                        const platform::Place &dst_place) const;
+  /*
+   * Slice of elements of a level, [elem_begin: elem_end], with tensor shared.
+   * elem_end is exclusive and may equal NumElements(level).
+   * @note: low performance in slice lod_start_pos_.
+   */
+  LODTensor SliceShared(size_t level, size_t elem_begin, size_t elem_end) const;
+  /*
+   * Copy other's lod_start_pos_, to share LOD info.
+   * @note: the LOD info should not be changed.
+   */
+  void ShareLOD(const LODTensor &other) {
+    lod_start_pos_ = other.lod_start_pos_;
+  }
+  /*
+   * Copy other's lod_start_pos_'s content, free to mutate.
+   * @note: other must have LOD info; its pointer is dereferenced unchecked.
+   */
+  void CopyLOD(const LODTensor &other) {
+    lod_start_pos_ = std::make_shared<LOD>(*other.lod_start_pos_);
+  }
+  /*
+   * Determine whether LODTensor has a valid LOD info.
+   */
+  bool HasLOD() const { return bool(lod_start_pos_); }
+  LOD *lod() const { return lod_start_pos_.get(); }  // non-owning; null without LOD
+  std::shared_ptr<Tensor> &tensor() { return tensor_; }
+  Tensor *raw_tensor() { return tensor_.get(); }  // non-owning raw pointer
+ private:
+  std::shared_ptr<LOD> lod_start_pos_;  // per-level offsets; null when no LOD
+  std::shared_ptr<Tensor> tensor_;  // underlying data, possibly shared
+};
+}  // namespace framework
+}  // namespace paddle
+#include "paddle/framework/lod_tensor_impl.h"
--- a/paddle/framework/lod_tensor_impl.h
+++ b/paddle/framework/lod_tensor_impl.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/framework/details/lod_tensor.h"
+namespace paddle {
+namespace framework {
+template <typename T>
+LODTensor LODTensor::SliceCopied(size_t level_begin, size_t level_end,
+                                 const platform::Place &dst_place) const {
+  PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced.");
+  auto new_lod = details::SliceLOD(*lod_start_pos_, level_begin, level_end);
+  auto new_tensor = std::make_shared<Tensor>();
+  new_tensor->CopyFrom<T>(*tensor_, dst_place);  // the whole tensor is copied
+  return LODTensor(new_tensor, new_lod);
+}
+template <typename T>
+LODTensor LODTensor::SliceCopied(size_t level, size_t elem_begin,
+                                 size_t elem_end,
+                                 const platform::Place &dst_place) const {
+  PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced.");
+  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
+                 NumLevels());
+  PADDLE_ENFORCE(elem_begin < NumElements(level),
+                 "element begin [%d] out of range [%d]", elem_begin,
+                 NumElements(level));
+  PADDLE_ENFORCE(elem_end < NumElements(level) + 1,  // exclusive end: may equal the count
+                 "element end [%d] out of range [%d]", elem_end,
+                 NumElements(level));
+  auto new_lod = details::SliceLOD(*lod_start_pos_, level, elem_begin, elem_end,
+                                   false /*tensor_shared*/);
+  auto start_idx = new_lod->front().front();  // first offset of the sliced LOD's top level
+  auto end_idx = new_lod->front().back() - 1 /*the next element's start*/;
+  auto sliced_tensor = tensor_->Slice<T>(start_idx, end_idx);  // NOTE(review): confirm the "- 1" and these offsets match Slice's range on tensor_
+  auto new_tensor = std::make_shared<Tensor>();
+  new_tensor->CopyFrom<T>(sliced_tensor, dst_place);  // copy only the sliced part
+  return LODTensor(new_tensor, new_lod);
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+#include "paddle/framework/lod_tensor.h"
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <memory>
+namespace paddle {
+namespace framework {
+class LODTensorTester : public ::testing::Test {
+ public:
+  virtual void SetUp() override {
+    lod_tensor.reset(new LODTensor);
+    // tensor's batch_size: 20
+    // 3 levels
+    // 0 10 20
+    // 0 5 10 15 20
+    // 0 2 5 7 10 12 15 17 20
+    auto lod = std::make_shared<LODTensor::LOD>();
+    lod->push_back(std::vector<size_t>{0, 10, 20});
+    lod->push_back(std::vector<size_t>{0, 5, 10, 15, 20});
+    lod->push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
+    auto tensor = std::make_shared<Tensor>();
+    tensor->Resize({20 /*batch size*/, 128 /*dim*/});
+    // allocate the tensor's memory on `place`
+    tensor->mutable_data<float>(place);
+    lod_tensor->Reset(tensor, lod);
+  }
+ protected:
+  std::unique_ptr<LODTensor> lod_tensor;
+  platform::CPUPlace place;
+};
+TEST_F(LODTensorTester, NumLevels) { ASSERT_EQ(lod_tensor->NumLevels(), 3UL); }  // fixture pushes 3 levels
+TEST_F(LODTensorTester, NumElements) {
+  ASSERT_EQ(lod_tensor->NumElements(0), 2UL);  // 3 offsets -> 2 elements
+  ASSERT_EQ(lod_tensor->NumElements(1), 4UL);  // 5 offsets -> 4 elements
+  ASSERT_EQ(lod_tensor->NumElements(2), 8UL);  // 9 offsets -> 8 elements
+}
+TEST_F(LODTensorTester, SliceShared_Level) {
+  // slice a single level: [level, level + 1)
+  for (size_t level = 0; level < 3UL; ++level) {
+    auto new_lod_tensor = lod_tensor->SliceShared(level, level + 1);
+    ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
+    ASSERT_EQ(new_lod_tensor.NumElements(0UL), lod_tensor->NumElements(level));
+    ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor());  // same shared tensor
+  }
+  // slice two adjacent levels: [level, level + 2)
+  for (size_t level = 0; level < 2UL; ++level) {
+    auto new_lod_tensor = lod_tensor->SliceShared(level, level + 2);
+    ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
+    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor->NumElements(level));
+    ASSERT_EQ(new_lod_tensor.NumElements(1),
+              lod_tensor->NumElements(level + 1));
+    ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor());  // same shared tensor
+  }
+}
+TEST_F(LODTensorTester, SliceCopied_Level) {
+  // slice a single level: [level, level + 1)
+  for (size_t level = 0; level < 3UL; ++level) {
+    auto new_lod_tensor =
+        lod_tensor->SliceCopied<float>(level, level + 1, place);
+    ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
+    ASSERT_EQ(new_lod_tensor.NumElements(0UL), lod_tensor->NumElements(level));
+    // ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor());
+    // TODO(superjom) add tensor comparison here.
+  }
+  // slice two adjacent levels: [level, level + 2)
+  for (size_t level = 0; level < 2UL; ++level) {
+    auto new_lod_tensor =
+        lod_tensor->SliceCopied<float>(level, level + 2, place);
+    ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
+    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor->NumElements(level));
+    ASSERT_EQ(new_lod_tensor.NumElements(1),
+              lod_tensor->NumElements(level + 1));
+    // ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor());
+    // TODO(superjom) add tensor comparison here.
+  }
+}
+TEST_F(LODTensorTester, SliceShared_Element) {
+  size_t level = 0;
+  auto new_lod_tensor = lod_tensor->SliceShared(level, 0, 2);  // all of level 0
+  ASSERT_EQ(new_lod_tensor.NumLevels(), 3UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(2), 8UL);
+  ASSERT_EQ(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor());  // tensor is shared
+  level = 1;
+  new_lod_tensor = lod_tensor->SliceShared(level, 0, 2);  // first 2 elements of level 1
+  ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);  // only `level` and the levels below it remain
+  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
+  ASSERT_EQ(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor());  // tensor is shared
+}
+TEST_F(LODTensorTester, SliceCopied_Element) {
+  size_t level = 0;
+  auto new_lod_tensor = lod_tensor->SliceCopied<float>(level, 0, 2, place);
+  ASSERT_EQ(new_lod_tensor.NumLevels(), 3UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(2), 8UL);
+  ASSERT_NE(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor());  // a new tensor was made
+  level = 1;
+  new_lod_tensor = lod_tensor->SliceCopied<float>(level, 0, 2, place);
+  ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
+  ASSERT_NE(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor());  // a new tensor was made
+  level = 1;
+  // expected LOD of the slice below (offsets restart at 0):
+  //    0 5 10
+  //    0 2 5 7 10
+  new_lod_tensor = lod_tensor->SliceCopied<float>(level, 1, 3, place);
+  ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
+  ASSERT_EQ(new_lod_tensor.lod_element(0, 0), 0UL);
+  ASSERT_EQ(new_lod_tensor.lod_element(0, 1), 5UL);
+  ASSERT_EQ(new_lod_tensor.lod_element(1, 0), 0UL);
+  ASSERT_EQ(new_lod_tensor.lod_element(1, 1), 2UL);
+  ASSERT_EQ(new_lod_tensor.lod_element(1, 2), 5UL);
+  ASSERT_EQ(new_lod_tensor.lod_element(1, 3), 7UL);
+  // TODO(superjom) compare the contents of these tensors
+}
+TEST_F(LODTensorTester, ShareLOD) {
+  LODTensor new_lod_tensor;
+  new_lod_tensor.ShareLOD(*lod_tensor);
+  ASSERT_EQ(new_lod_tensor.lod(), lod_tensor->lod());  // both point at one LOD object
+}
+TEST_F(LODTensorTester, CopyLOD) {
+  LODTensor new_lod_tensor;
+  new_lod_tensor.CopyLOD(*lod_tensor);
+  ASSERT_NE(new_lod_tensor.lod(), lod_tensor->lod());  // the copy owns a distinct LOD object
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -260,12 +260,6 @@ class OpRegistry {
    return CreateOp(op_desc.type(), inputs, outputs, attrs);
  }
-  static bool SupportGPU(const std::string& op_type) {
-    OperatorWithKernel::OpKernelKey key;
-    key.place_ = platform::GPUPlace();
-    return OperatorWithKernel::AllOpKernels().at(op_type).count(key) != 0;
-  }
  static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op) {
    PADDLE_ENFORCE(!op.IsNetOp(),
                   "Use framework::Backward to get backward ops");

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -33,19 +33,19 @@ namespace paddle {
 namespace framework {
 /// If a variable is a empty variable, that name will be used.
-const std::string kEmptyVarName = "@EMPTY@";
+constexpr char kEmptyVarName[] = "@EMPTY@";
 /// If a variable is a temporary variable, that name will be set in Python,
 /// but it will be convert to a unique name in scope after OpCreator.
-const std::string kTempVarName = "@TEMP@";
+constexpr char kTempVarName[] = "@TEMP@";
 /// If a variable's name has a certain suffix, it means that the
 /// variable is the gradient of another varibale.
 /// e.g. Variable "x@GRAD" is the gradient of varibale "x".
-const std::string kGradVarSuffix = "@GRAD";
+constexpr char kGradVarSuffix[] = "@GRAD";
 /// Variables with this suffix are supposed to be filled up with zeros.
-const std::string kZeroVarSuffix = "@ZERO";
+constexpr char kZeroVarSuffix[] = "@ZERO";
 inline std::string GradVarName(const std::string& var_name) {
  return var_name + kGradVarSuffix;
@@ -88,6 +88,8 @@ class OperatorBase {
  virtual bool IsNetOp() const { return false; }
+  virtual bool SupportGPU() const { return false; }
  /// rename inputs outputs name
  void Rename(const std::string& old_name, const std::string& new_name);
@@ -118,10 +120,10 @@ class OperatorBase {
  std::shared_ptr<std::unordered_map<std::string, int>> in_out_idxs_;
 };
-class OperatorContext {
+class InferShapeContext {
 public:
-  OperatorContext(const OperatorBase* op, const Scope& scope)
+  InferShapeContext(const OperatorBase& op, const Scope& scope)
-      : op_(*op), scope_(scope) {}
+      : op_(op), scope_(scope) {}
  size_t InputSize() const { return op_.inputs_.size(); }
@@ -232,12 +234,6 @@ class OperatorContext {
  const Scope& scope_;
 };
-class InferShapeContext : public OperatorContext {
- public:
-  InferShapeContext(const OperatorBase* op, const Scope& scope)
-      : OperatorContext(op, scope) {}
-};
 template <typename T>
 struct EigenDeviceConverter;
@@ -253,11 +249,11 @@ struct EigenDeviceConverter<platform::GPUPlace> {
 };
 #endif
-class ExecutionContext : public OperatorContext {
+class ExecutionContext : public InferShapeContext {
 public:
-  ExecutionContext(const OperatorBase* op, const Scope& scope,
+  ExecutionContext(const OperatorBase& op, const Scope& scope,
                   const platform::DeviceContext* device_context)
-      : OperatorContext(op, scope), device_context_(device_context) {}
+      : InferShapeContext(op, scope), device_context_(device_context) {}
  template <typename PlaceType,
            typename DeviceType =
@@ -308,14 +304,14 @@ class OperatorWithKernel : public OperatorBase {
  using OpKernelMap =
      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
-  void InferShape(const Scope& scope) const {
+  void InferShape(const Scope& scope) const override {
-    InferShape(InferShapeContext(this, scope));
+    InferShape(InferShapeContext(*this, scope));
  }
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const final {
    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
-    opKernel->Compute(ExecutionContext(this, scope, &dev_ctx));
+    opKernel->Compute(ExecutionContext(*this, scope, &dev_ctx));
  }
  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
@@ -324,6 +320,12 @@ class OperatorWithKernel : public OperatorBase {
    return g_all_op_kernels;
  }
+  bool SupportGPU() const override {
+    OperatorWithKernel::OpKernelKey key;
+    key.place_ = platform::GPUPlace();
+    return OperatorWithKernel::AllOpKernels().at(type_).count(key) != 0;
+  }
 protected:
  virtual void InferShape(const InferShapeContext& ctx) const = 0;
 };

--- a/paddle/framework/pybind.cc
+++ b/paddle/framework/pybind.cc
@@ -18,13 +18,11 @@ limitations under the License. */
 #include "paddle/framework/backward.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/scope.h"
 #include "paddle/framework/tensor_py.h"
 #include "paddle/operators/net_op.h"
-#include "paddle/operators/type_alias.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
+#include "paddle/string/to_string.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
@@ -42,8 +40,14 @@ USE_OP(softmax);
 USE_OP(rowwise_add);
 USE_OP(fill_zeros_like);
 USE_OP_WITHOUT_KERNEL(recurrent_op);
+USE_OP(gaussian_random);
+USE_OP(uniform_random);
 namespace paddle {
 namespace framework {
+using Tensor = framework::Tensor;
 template <typename ClassType>
 void ExposeOperator(ClassType &m) {
  m.def("infer_shape", &ClassType::type::InferShape)
@@ -56,6 +60,26 @@ void ExposeOperator(ClassType &m) {
           [](const typename ClassType::type &op) -> std::vector<std::string> {
             return op.outputs_;
           })
+      .def("inputs",
+           [](const typename ClassType::type &op) -> std::vector<std::string> {
+             return op.inputs_;
+           })
+      .def("support_gpu", &ClassType::type::SupportGPU)
+      .def("temp_outputs",
+           [](const typename ClassType::type &op) -> std::vector<std::string> {
+             auto iter = op.attrs_.find("temporary_index");
+             std::vector<std::string> ret;
+             if (iter == op.attrs_.end()) {
+               return ret;
+             } else {
+               auto tmp_idx = boost::get<std::vector<int>>(iter->second);
+               ret.reserve(tmp_idx.size());
+               for (auto &index : tmp_idx) {
+                 ret.push_back(op.outputs_.at(index));
+               }
+               return ret;
+             }
+           })
      .def("__str__", &ClassType::type::DebugString);
 }
@@ -129,8 +153,8 @@ All parameter, weight, gradient are variables in Paddle.
           [](Variable &self) -> Tensor * { return self.GetMutable<Tensor>(); },
           py::return_value_policy::reference)
      .def("get_net",
-           [](Variable &self) -> ops::NetOp * {
+           [](Variable &self) -> operators::NetOp * {
-             return self.GetMutable<ops::NetOp>();
+             return self.GetMutable<operators::NetOp>();
           },
           py::return_value_policy::reference);
@@ -184,9 +208,13 @@ All parameter, weight, gradient are variables in Paddle.
                  });
  // clang-format on
-  py::class_<paddle::platform::GPUPlace>(m, "GPUPlace").def(py::init<int>());
+  py::class_<platform::GPUPlace>(m, "GPUPlace")
+      .def(py::init<int>())
+      .def("__str__", string::to_string<const platform::GPUPlace &>);
-  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace").def(py::init<>());
+  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
+      .def(py::init<>())
+      .def("__str__", string::to_string<const platform::CPUPlace &>);
  py::class_<OperatorBase, std::shared_ptr<OperatorBase>> operator_base(
      m, "Operator");
@@ -201,8 +229,6 @@ All parameter, weight, gradient are variables in Paddle.
    return OpRegistry::CreateOp(desc);
  });
-  operator_base.def_static("support_gpu", &OpRegistry::SupportGPU);
  operator_base.def("backward",
                    [](const OperatorBase &forwardOp,
                       const std::unordered_set<std::string> &no_grad_vars) {
@@ -211,23 +237,24 @@ All parameter, weight, gradient are variables in Paddle.
  ExposeOperator(operator_base);
-  py::class_<ops::NetOp, std::shared_ptr<ops::NetOp>> net(m, "Net");
+  py::class_<operators::NetOp, std::shared_ptr<operators::NetOp>> net(m, "Net");
  net.def_static("create",
-                 []() -> std::shared_ptr<ops::NetOp> {
+                 []() -> std::shared_ptr<operators::NetOp> {
-                   auto retv = std::make_shared<ops::NetOp>();
+                   auto retv = std::make_shared<operators::NetOp>();
                   retv->type_ = "plain_net";
                   return retv;
                 })
-      .def("add_op", &ops::NetOp::AddOp)
+      .def("add_op", &operators::NetOp::AddOp)
-      .def(
+      .def("add_op",
-          "add_op",
+           [](operators::NetOp &self,
-          [](ops::NetOp &self, const std::shared_ptr<ops::NetOp> &net) -> void {
+              const std::shared_ptr<operators::NetOp> &net) -> void {
-            self.AddOp(std::static_pointer_cast<OperatorBase>(net));
+             self.AddOp(std::static_pointer_cast<OperatorBase>(net));
-          })
+           })
-      .def("complete_add_op", &ops::NetOp::CompleteAddOp)
+      .def("complete_add_op", &operators::NetOp::CompleteAddOp)
-      .def("complete_add_op",
+      .def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) {
-           [](std::shared_ptr<ops::NetOp> &self) { self->CompleteAddOp(); });
+        self->CompleteAddOp();
+      });
  ExposeOperator(net);

--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -18,6 +18,8 @@ limitations under the License. */
 #include <cstring>
 #include <memory>
 #include <typeindex>
+#include <vector>
 #include "paddle/framework/ddim.h"
 #include "paddle/memory/memory.h"
 #include "paddle/platform/device_context.h"

--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -19,7 +19,7 @@ TEST(Tensor, Dims) {
  using namespace paddle::framework;
  using namespace paddle::platform;
  Tensor tt;
-  tt.Resize(make_ddim({2, 3, 4}));
+  tt.Resize({2, 3, 4});
  DDim dims = tt.dims();
  ASSERT_EQ(arity(dims), 3);
  for (int i = 0; i < 3; ++i) {

--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
@@ -93,8 +93,8 @@ TEST(Arguments, Matrix) {
  MatrixPtr matrix = Matrix::create(100, 200);
  CheckBufferArg check = [=](const BufferArg& arg) {
    EXPECT_EQ(arg.shape().ndims(), 2U);
-    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.shape()[0], 100U);
-    EXPECT_EQ(arg.shape()[1], 200);
+    EXPECT_EQ(arg.shape()[1], 200U);
    EXPECT_EQ(arg.data(), matrix->getData());
    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
@@ -112,8 +112,8 @@ TEST(Arguments, Matrix) {
 TEST(Arguments, Vector) {
  VectorPtr vector = Vector::create(100, false);
  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 1);
+    EXPECT_EQ(arg.shape().ndims(), 1U);
-    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.shape()[0], 100U);
    EXPECT_EQ(arg.data(), vector->getData());
    CpuVector inVector = arg.vector<real, DEVICE_TYPE_CPU>();
@@ -131,9 +131,9 @@ TEST(Arguments, Vector) {
 TEST(Arguments, CpuSparseMatrix) {
  CpuSparseMatrix sparse(200, 300, 50);
  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape().ndims(), 2U);
-    EXPECT_EQ(arg.shape()[0], 200);
+    EXPECT_EQ(arg.shape()[0], 200U);
-    EXPECT_EQ(arg.shape()[1], 300);
+    EXPECT_EQ(arg.shape()[1], 300U);
    EXPECT_EQ(arg.data(), sparse.getData());
    // CHECK_EQ(arg.sparse().nnz(), 50);
    // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT);
@@ -152,10 +152,10 @@ TEST(Arguments, CpuSparseMatrix) {
 TEST(Arguments, BufferArg) {
  BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});
  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 3);
+    EXPECT_EQ(arg.shape().ndims(), 3U);
-    EXPECT_EQ(arg.shape()[0], 1);
+    EXPECT_EQ(arg.shape()[0], 1U);
-    EXPECT_EQ(arg.shape()[1], 2);
+    EXPECT_EQ(arg.shape()[1], 2U);
-    EXPECT_EQ(arg.shape()[2], 3);
+    EXPECT_EQ(arg.shape()[2], 3U);
  };
  BufferArgs argments;

--- a/paddle/function/TensorShapeTest.cpp
+++ b/paddle/function/TensorShapeTest.cpp
@@ -44,7 +44,7 @@ TEST(TensorShape, GetAndSet) {
  EXPECT_EQ(t.ndims(), 3U);
  EXPECT_EQ(t.getElements(), 6U);
-  EXPECT_EQ(t[1], 2);
+  EXPECT_EQ(t[1], 2U);
  t.setDim(1, 100);
  EXPECT_EQ(t.getElements(), 300U);
  EXPECT_EQ(t[1], 100U);

--- a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
+++ b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
@@ -98,10 +98,12 @@ void KmaxSeqScoreLayer::forward(PassType passType) {
  }
  // TODO(caoying)
-  // Here selSubSeqIdx is automatically converted from real to int
+  // In PaddlePaddle, the currently available matrixes all have a real-typed
-  // This is very dangerous if user fill this matrix himself, invalid data may
+  // data field, but the selected indices information are actually int-typed
-  // occur. The selected indices should be stored in
+  // (with -1 as a special token). Storing indices information in real-typed
-  // CpuSparseMatrix with SparseValueType set to NO_VALUE.
+  // Matrix leads to converting real to int. This is very dangerous if a user
+  // fills this matrix himself, invalid data may occur.
+  // The selected indices should be stored in an int-typed matrix.
  Matrix::resizeOrCreate(
      output_.value,
      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),

--- a/paddle/gserver/layers/SequenceSliceLayer.cpp
+++ b/paddle/gserver/layers/SequenceSliceLayer.cpp
@@ -32,10 +32,13 @@ public:
 private:
  // TODO(caoying)
-  // Here selSubSeqIdx is automatically converted from real to int
+  // In PaddlePaddle, the currently available matrixes all have a real-typed
-  // This is very dangerous if user fill this matrix himself, invalid data
+  // data field, but the selected indices information are actually int-typed
-  // may occur. The selected indices should be stored in CpuSparseMatrix
+  // (with -1 as a special token). Storing indices information in real-typed
-  // with SparseValueType set to NO_VALUE.
+  // Matrix leads to converting real to int. This is very dangerous if a user
+  // fills this matrix himself, invalid data may occur.
+  // The selected indices should be stored in an int-typed matrix.
  MatrixPtr startIdsOnCpu_;
  MatrixPtr endIdsOnCpu_;

--- a/paddle/gserver/layers/SubNestedSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
@@ -59,6 +59,13 @@ private:
                       const std::vector<std::vector<int>>& inputSeqInfo);
  // if the second input of this layer is on GPU memory, copy it to CPU memory.
+  // TODO(caoying)
+  // In PaddlePaddle, the currently available matrixes all have a real-typed
+  // data field, but the selected indices information are actually int-typed
+  // (with -1 as a special token). Storing indices information in real-typed
+  // Matrix leads to converting real to int. This is very dangerous if a user
+  // fills this matrix himself, invalid data may occur.
+  // The selected indices should be stored in an int-typed matrix.
  MatrixPtr selIdsCpu_;
  // reorganized sequenceStartPositions and subSequenceStartPositions
@@ -95,12 +102,7 @@ void SubNestedSequenceLayer::calSelectedRows(
  for (size_t i = 0; i < seqNum; ++i) {
    for (size_t j = 0; j < beamSize; ++j) {
      if (selectedIndices->getElement(i, j) == -1.) break;
-      // TODO(caoying)
+      size_t selSubSeqIdx = selectedIndices->getElement(i, j);
-      // Here selSubSeqIdx is automatically converted from real to int
-      // This is very dangerous if user fill this matrix himself, invalid data
-      // may occur. The selected indices should be stored in
-      // CpuSparseMatrix with SparseValueType set to NO_VALUE.
-      int selSubSeqIdx = selectedIndices->getElement(i, j);
      CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx);
      size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] -
@@ -139,7 +141,7 @@ void SubNestedSequenceLayer::forward(PassType passType) {
  CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer "
                              << "must be a nested sequence.";
  const MatrixPtr selectedIndices = getInputValue(1);
-  CHECK_EQ(inputSeq.getNumSequences(), selectedIndices->getHeight());
+  CHECK_EQ(size_t(inputSeq.getNumSequences()), selectedIndices->getHeight());
  if (dynamic_cast<GpuMatrix*>(selectedIndices.get())) {
    /*

--- a/paddle/gserver/tests/test_KmaxSeqScore.cpp
+++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp
@@ -88,7 +88,7 @@ void checkLayerOut(vector<vector<int>> groundTruth,
 TEST(Layer, kmaxSeqScoreLayer) {
  const size_t maxBeamSize = 100;
-  int beamSize = 1 + (rand() % maxBeamSize);
+  size_t beamSize = 1 + (rand() % maxBeamSize);
  vector<int> seqStartPosition;
  vector<int> subSeqStartPosition;

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -45,16 +45,15 @@ cc_library(net_op SRCS net_op.cc DEPS op_registry)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 op_library(add_op SRCS add_op.cc add_op.cu)
-cc_test(add_op_test SRCS add_op_test.cc DEPS add_op)
 op_library(mean_op SRCS mean_op.cc mean_op.cu)
-cc_test(mean_op_test SRCS mean_op_test.cc DEPS mean_op)
 op_library(mul_op SRCS mul_op.cc mul_op.cu)
 op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)
 op_library(sigmoid_op SRCS sigmoid_op.cc sigmoid_op.cu)
 op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
+op_library(gaussian_random_op SRCS gaussian_random_op.cc gaussian_random_op.cu)
 op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu)
 op_library(fill_zeros_like_op SRCS fill_zeros_like_op.cc fill_zeros_like_op.cu)
@@ -66,3 +65,5 @@ op_library(fc_op
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
    DEPS op_desc tensor op_registry operator net_op)
 cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op)
+op_library(uniform_random_op
+        SRCS uniform_random_op.cc uniform_random_op.cu)
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -17,9 +17,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-class AddOp : public OperatorWithKernel {
+class AddOp : public framework::OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_EQ(ctx.InputSize(), 2);
    PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1);
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(0), "Inputs of AddOp must all be set");
@@ -31,9 +31,9 @@ class AddOp : public OperatorWithKernel {
  }
 };
-class AddOpMaker : public OpProtoAndCheckerMaker {
+class AddOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  AddOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of add op");
    AddInput("Y", "The second input of add op");
@@ -46,14 +46,17 @@ The equation is: Out = X + Y
  }
 };
-class AddOpGrad : public OperatorWithKernel {
+class AddOpGrad : public framework::OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {}
+  void InferShape(const framework::InferShapeContext &ctx) const override {}
 };
 }  // namespace operators
 }  // namespace paddle
+namespace ops = paddle::operators;
 REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker);
 REGISTER_GRADIENT_OP(add_two, add_two_grad, ops::AddOpGrad);
-REGISTER_OP_CPU_KERNEL(add_two, ops::AddKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(add_two,
+                       ops::AddKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
@@ -16,4 +16,6 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/add_op.h"
-REGISTER_OP_GPU_KERNEL(add_two, ops::AddKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(add_two,
+                       ops::AddKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
@@ -13,15 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class AddKernel : public OpKernel {
+class AddKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    auto input0 = context.Input<Tensor>(0);
    auto input1 = context.Input<Tensor>(1);
    auto output = context.Output<Tensor>(0);

--- a/paddle/operators/add_op_test.cc
+++ b/paddle/operators/add_op_test.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <gtest/gtest.h>
-#define private public
-#include <paddle/framework/op_registry.h>
-USE_OP(add_two);
-// USE_OP(add_two_grad);
-TEST(AddOp, GetOpProto) {
-  auto& protos = paddle::framework::OpRegistry::protos();
-  auto it = protos.find("add_two");
-  ASSERT_NE(it, protos.end());
-  auto& op_creators = paddle::framework::OpRegistry::op_creators();
-  auto it1 = op_creators.find("add_two_grad");
-  ASSERT_NE(it1, op_creators.end());
-}
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -17,9 +17,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-class OnehotCrossEntropyOp : public OperatorWithKernel {
+class OnehotCrossEntropyOp : public framework::OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_EQ(ctx.InputSize(), 2,
                      "Input size of OnehotCrossEntropyOp must be two");
    PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1,
@@ -37,9 +37,9 @@ class OnehotCrossEntropyOp : public OperatorWithKernel {
  }
 };
-class OnehotCrossEntropyGradientOp : public OperatorWithKernel {
+class OnehotCrossEntropyGradientOp : public framework::OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto X = ctx.Input<Tensor>("X");
@@ -48,9 +48,10 @@ class OnehotCrossEntropyGradientOp : public OperatorWithKernel {
  }
 };
-class OnehotCrossEntropyOpMaker : public OpProtoAndCheckerMaker {
+class OnehotCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  OnehotCrossEntropyOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  OnehotCrossEntropyOpMaker(framework::OpProto *proto,
+                            framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of OnehotCrossEntropyOp");
    AddInput("label", "The second input of OnehotCrossEntropyOp");
@@ -66,11 +67,14 @@ OnehotCrossEntropy Operator.
 }  // namespace operators
 }  // namespace paddle
+namespace ops = paddle::operators;
 REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp,
            ops::OnehotCrossEntropyOpMaker);
-REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
+REGISTER_OP_CPU_KERNEL(
-                       ops::OnehotCrossEntropyOpKernel<ops::CPUPlace, float>);
+    onehot_cross_entropy,
+    ops::OnehotCrossEntropyOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_GRADIENT_OP(onehot_cross_entropy, onehot_cross_entropy_grad,
+                     ops::OnehotCrossEntropyGradientOp);
 REGISTER_OP_CPU_KERNEL(
    onehot_cross_entropy_grad,
-    ops::OnehotCrossEntropyGradientOpKernel<ops::CPUPlace, float>);
+    ops::OnehotCrossEntropyGradientOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -14,3 +14,8 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/cross_entropy_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    onehot_cross_entropy,
+    ops::OnehotCrossEntropyOpKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -13,11 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {
+using Tensor = framework::Tensor;
 template <typename T>
 T tolerable_value(T x) {
  static_assert(std::is_floating_point<T>::value,
@@ -38,9 +40,9 @@ T tolerable_value(T x) {
 }
 template <typename Place, typename T>
-class OnehotCrossEntropyOpKernel : public OpKernel {
+class OnehotCrossEntropyOpKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
    auto X = ctx.Input<Tensor>("X");
    const T* Xdata = X->data<T>();
    const int* label_data = ctx.Input<Tensor>(1)->data<int>();
@@ -61,9 +63,9 @@ class OnehotCrossEntropyOpKernel : public OpKernel {
 };
 template <typename Place, typename T>
-class OnehotCrossEntropyGradientOpKernel : public OpKernel {
+class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
    auto X = ctx.Input<Tensor>("X");
    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));

--- a/paddle/operators/fc_op.cc
+++ b/paddle/operators/fc_op.cc
@@ -12,11 +12,16 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
-#include "type_alias.h"
+#include "paddle/operators/net_op.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {
+using OpRegistry = framework::OpRegistry;
 class FullyConnectedOp : public NetOp {
 public:
  void Init() override {
@@ -39,9 +44,10 @@ class FullyConnectedOp : public NetOp {
  }
 };
-class FullyConnectedOpMaker : public OpProtoAndCheckerMaker {
+class FullyConnectedOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FullyConnectedOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  FullyConnectedOpMaker(framework::OpProto *proto,
+                        framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "the input of fc operator");
    AddInput("W", "the weight of fc operator");
@@ -66,4 +72,5 @@ USE_OP(rowwise_add);
 USE_OP(sigmoid);
 USE_OP(softmax);
+namespace ops = paddle::operators;
 REGISTER_OP(fc, ops::FullyConnectedOp, ops::FullyConnectedOpMaker);
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -50,8 +50,8 @@ The output will have the same size with input.
 }  // namespace operators
 }  // namespace paddle
-REGISTER_OP(fill_zeros_like, paddle::operators::FillZerosLikeOp,
+namespace ops = paddle::operators;
-            paddle::operators::FillZerosLikeOpMaker);
+REGISTER_OP(fill_zeros_like, ops::FillZerosLikeOp, ops::FillZerosLikeOpMaker);
 REGISTER_OP_CPU_KERNEL(
    fill_zeros_like,
-    paddle::operators::FillZerosLikeKernel<paddle::platform::CPUPlace, float>);
+    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/fill_zeros_like_op.cu
+++ b/paddle/operators/fill_zeros_like_op.cu
@@ -16,6 +16,7 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/fill_zeros_like_op.h"
+namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
    fill_zeros_like,
-    paddle::operators::FillZerosLikeKernel<paddle::platform::GPUPlace, float>);
+    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {

--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <random>
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+template <typename T>
+class GaussianRandomKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    float mean = context.op_.GetAttr<float>("mean");
+    float std = context.op_.GetAttr<float>("std");
+    auto* tensor = context.Output<framework::Tensor>(0);
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+    // TODO(dzh): attribute does not support unsigned int.
+    // And we need a global random seed configuration.
+    int seed = context.op_.GetAttr<int>("seed");
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    std::mt19937 g(seed);
+    std::normal_distribution<T> distribution(mean, std);
+    ssize_t size = framework::product(tensor->dims());
+    for (int i = 0; i < size; ++i) {
+      data[i] = distribution(g);
+    }
+  }
+};
+class GaussianRandomOp : public framework::OperatorWithKernel {
+ protected:
+  void InferShape(const framework::InferShapeContext& context) const override {
+    auto* tensor = context.Output<framework::Tensor>(0);
+    auto dims = GetAttr<std::vector<int>>("dims");
+    PADDLE_ENFORCE(dims.size() > 0UL,
+                   "dims can be one int or array. dims must be set.");
+    tensor->Resize(framework::make_ddim(dims));
+  }
+};
+class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GaussianRandomOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "output matrix of random op");
+    AddComment(R"DOC(
+GaussianRandom operator.
+Use to initialize tensor with gaussian random generator.
+)DOC");
+    AddAttr<std::vector<int>>("dims", "The dimension of random tensor.");
+    AddAttr<float>("mean", "mean value of random.").SetDefault(.0f);
+    AddAttr<float>("std", "minimum value of random value.").SetDefault(1.0f);
+    AddAttr<int>("seed",
+                 "Random seed of generator."
+                 "0 means use system wide seed")
+        .SetDefault(0);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker);
+REGISTER_OP_CPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>);
--- a/paddle/operators/type_alias.h
+++ b/paddle/operators/type_alias.h
@@ -12,44 +12,41 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
-#pragma once
+#include <memory>
+#include <random>
+#include "paddle/platform/dynload/curand.h"
+#include "paddle/platform/gpu_info.h"
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/operators/net_op.h"
 namespace paddle {
 namespace operators {
-using OpKernel = framework::OpKernel;
+template <typename T>
-using OperatorBase = framework::OperatorBase;
+class GaussianRandomKernel : public framework::OpKernel {
-using InferShapeContext = framework::InferShapeContext;
+ public:
-using ExecutionContext = framework::ExecutionContext;
+  void Compute(const framework::ExecutionContext& context) const override {
-using Variable = framework::Variable;
+    float mean = context.op_.GetAttr<float>("mean");
-template <typename T, int MajorType = Eigen::RowMajor,
+    float std = context.op_.GetAttr<float>("std");
-          typename IndexType = Eigen::DenseIndex>
+    auto* tensor = context.Output<framework::Tensor>(0);
-using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+    T* data = tensor->mutable_data<T>(context.GetPlace());
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
+    int seed = context.op_.GetAttr<int>("seed");
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+    if (seed == 0) {
-template <typename T, int MajorType = Eigen::RowMajor,
+      seed = std::random_device()();
-          typename IndexType = Eigen::DenseIndex>
+    }
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+    curandGenerator_t g;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+    PADDLE_ENFORCE(platform::dynload::curandCreateGenerator(
-          typename IndexType = Eigen::DenseIndex>
+        &g, CURAND_RNG_PSEUDO_DEFAULT));
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+    PADDLE_ENFORCE(
-using Tensor = framework::Tensor;
+        platform::dynload::curandSetPseudoRandomGeneratorSeed(g, seed));
-using Scope = framework::Scope;
+    curandGenerateNormal(g, data, framework::product(tensor->dims()), mean,
-using OperatorWithKernel = framework::OperatorWithKernel;
+                         std);
-using OperatorBase = framework::OperatorBase;
+  }
-using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker;
+};
-using OpProto = framework::OpProto;
-using OpAttrChecker = framework::OpAttrChecker;
-using CPUPlace = platform::CPUPlace;
-using GPUPlace = platform::GPUPlace;
-using OpRegistry = framework::OpRegistry;
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>);
\ No newline at end of file
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -17,9 +17,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-class MeanOp : public OperatorWithKernel {
+class MeanOp : public framework::OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_EQ(ctx.InputSize(), 1, "Input size of AddOp must be one");
    PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1, "Output size of AddOp must be one");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(0), "input should be set");
@@ -28,9 +28,9 @@ class MeanOp : public OperatorWithKernel {
  }
 };
-class MeanOpMaker : public OpProtoAndCheckerMaker {
+class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  MeanOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of mean op");
    AddOutput("Out", "The output of mean op").IgnoreGradient();
@@ -38,10 +38,10 @@ class MeanOpMaker : public OpProtoAndCheckerMaker {
  }
 };
-class MeanGradOp : public OperatorWithKernel {
+class MeanGradOp : public framework::OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<Tensor>("X" + framework::kGradVarSuffix)
+    ctx.Output<Tensor>(framework::GradVarName("X"))
        ->Resize(ctx.Input<Tensor>("X")->dims());
  }
 };
@@ -49,7 +49,10 @@ class MeanGradOp : public OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle
+namespace ops = paddle::operators;
 REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker);
-REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mean,
+                       ops::MeanKernel<paddle::platform::CPUPlace, float>);
 REGISTER_GRADIENT_OP(mean, mean_grad, ops::MeanGradOp);
-REGISTER_OP_CPU_KERNEL(mean_grad, ops::MeanGradKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mean_grad,
+                       ops::MeanGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/mean_op.cu
+++ b/paddle/operators/mean_op.cu
@@ -16,5 +16,8 @@
 #include "paddle/operators/mean_op.h"
-REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(mean_grad, ops::MeanGradKernel<ops::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(mean,
+                       ops::MeanKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(mean_grad,
+                       ops::MeanGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
@@ -13,15 +13,24 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class MeanKernel : public OpKernel {
+class MeanKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    auto input = context.Input<Tensor>(0);
    auto output = context.Output<Tensor>(0);
@@ -36,13 +45,13 @@ class MeanKernel : public OpKernel {
 };
 template <typename Place, typename T>
-class MeanGradKernel : public OpKernel {
+class MeanGradKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
-    auto OG = context.Input<Tensor>("Out" + framework::kGradVarSuffix);
+    auto OG = context.Input<Tensor>(framework::GradVarName("Out"));
    PADDLE_ENFORCE(framework::product(OG->dims()) == 1,
                   "Mean Gradient should be scalar");
-    auto IG = context.Output<Tensor>("X" + framework::kGradVarSuffix);
+    auto IG = context.Output<Tensor>(framework::GradVarName("X"));
    IG->mutable_data<T>(context.GetPlace());
    T ig_size = (T)framework::product(IG->dims());

--- a/paddle/operators/mean_op_test.cc
+++ b/paddle/operators/mean_op_test.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <gtest/gtest.h>
-#include <paddle/framework/op_registry.h>
-USE_OP(mean);
-TEST(MeanOp, GetOpProto) {
-  auto& protos = paddle::framework::OpRegistry::protos();
-  auto it = protos.find("mean");
-  ASSERT_NE(it, protos.end());
-}
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -17,9 +17,9 @@
 namespace paddle {
 namespace operators {
-class MulOp : public OperatorWithKernel {
+class MulOp : public framework::OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.InputSize() == 2, "The mul op must take two inputs");
    auto dim0 = ctx.Input<Tensor>(0)->dims();
    auto dim1 = ctx.Input<Tensor>(1)->dims();
@@ -37,9 +37,9 @@ class MulOp : public OperatorWithKernel {
  }
 };
-class MulOpMaker : public OpProtoAndCheckerMaker {
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of mul op");
    AddInput("Y", "The second input of mul op");
@@ -52,9 +52,9 @@ The equation is: Out = X * Y
  }
 };
-class MulOpGrad : public OperatorWithKernel {
+class MulOpGrad : public framework::OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {}
+  void InferShape(const framework::InferShapeContext &ctx) const override {}
  std::string DebugString() const override {
    LOG(INFO) << "MulGrad";
    return "";
@@ -64,7 +64,8 @@ class MulOpGrad : public OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle
+namespace ops = paddle::operators;
 REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker);
 REGISTER_GRADIENT_OP(mul, mul_grad, ops::MulOpGrad);
-REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@@ -15,4 +15,6 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/mul_op.h"
-REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -13,16 +13,21 @@
   limitations under the License. */
 #pragma once
+#include "paddle/framework/eigen.h"
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class MulKernel : public OpKernel {
+class MulKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
        {Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
@@ -40,5 +45,6 @@ class MulKernel : public OpKernel {
    Z.device(place) = X.contract(Y, dim_pair);
  }
 };
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/net_op.cc
+++ b/paddle/operators/net_op.cc
@@ -15,7 +15,6 @@
 */
 #include "paddle/operators/net_op.h"
-#include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {

--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
@@ -14,13 +14,7 @@ limitations under the License. */
 #pragma once
-#include "paddle/framework/op_desc.pb.h"
-#include "paddle/framework/op_proto.pb.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/scope.h"
-#include "paddle/operators/type_alias.h"
-#include "paddle/platform/device_context.h"
 namespace paddle {
 namespace operators {
@@ -65,6 +59,15 @@ class NetOp : public framework::OperatorBase {
    }
  }
+  bool SupportGPU() const override {
+    for (auto& op : ops_) {
+      if (!op->SupportGPU()) {
+        return false;
+      }
+    }
+    return true;
+  }
  /**
   * @brief Add an operator by ptr
   */

--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
@@ -2,31 +2,27 @@
 #include <gtest/gtest.h>
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
 namespace paddle {
 namespace operators {
+using Scope = framework::Scope;
+using DeviceContext = platform::DeviceContext;
 static int infer_shape_cnt = 0;
 static int run_cnt = 0;
-class TestOp : public OperatorBase {
+class TestOp : public framework::OperatorBase {
 public:
-  void InferShape(const framework::Scope& scope) const override {
+  void InferShape(const Scope& scope) const override { ++infer_shape_cnt; }
-    ++infer_shape_cnt;
+  void Run(const Scope& scope,
-  }
+           const platform::DeviceContext& dev_ctx) const override {
-  void Run(const framework::Scope& scope,
-           const paddle::platform::DeviceContext& dev_ctx) const override {
    ++run_cnt;
  }
 };
-class EmptyOp : public OperatorBase {
+class EmptyOp : public framework::OperatorBase {
 public:
  void InferShape(const Scope& scope) const override {}
-  void Run(const Scope& scope,
+  void Run(const Scope& scope, const DeviceContext& dev_ctx) const override {}
-           const platform::DeviceContext& dev_ctx) const override {}
 };
 template <typename T>
@@ -72,7 +68,7 @@ TEST(OpKernel, all) {
  net->Run(scope, dev_ctx);
  ASSERT_EQ(2, infer_shape_cnt);
  ASSERT_EQ(2, run_cnt);
-  ASSERT_THROW(net->AddOp(op2), paddle::platform::EnforceNotMet);
+  ASSERT_THROW(net->AddOp(op2), platform::EnforceNotMet);
 }
 TEST(NetOp, insert_op) {

--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -14,17 +14,19 @@
 #include "paddle/operators/recurrent_op.h"
-#include <glog/logging.h>
 #include <cstring>
 #include <sstream>
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
-#include "paddle/platform/enforce.h"
 namespace paddle {
 namespace operators {
+using Scope = framework::Scope;
+using Variable = framework::Variable;
+using Tensor = framework::Tensor;
 void RecurrentAlgorithm::InferShape(const Scope& scope) const {
  seq_len_ = scope.FindVar((arg_->inlinks[0]).external)
                 ->GetMutable<Tensor>()
@@ -135,10 +137,11 @@ void RecurrentOp::Init() {
  alg_.Init(std::move(arg));
 }
-class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+class RecurrentAlgorithmProtoAndCheckerMaker
+    : public framework::OpProtoAndCheckerMaker {
 public:
-  RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto,
+  RecurrentAlgorithmProtoAndCheckerMaker(framework::OpProto* proto,
-                                         OpAttrChecker* op_checker)
+                                         framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    const auto& name = RecurrentOp::kArgName;
    // inputs and outputs stored in proto

--- a/paddle/operators/recurrent_op_test.cc
+++ b/paddle/operators/recurrent_op_test.cc
@@ -27,6 +27,10 @@ namespace operators {
 using framework::make_ddim;
 using framework::DDim;
+using framework::Tensor;
+using framework::Variable;
+using framework::Scope;
+using framework::OpRegistry;
 class RecurrentOpTest : public ::testing::Test {
 protected:
@@ -164,7 +168,7 @@ class RecurrentOpTest : public ::testing::Test {
  // father scope
  Scope scope_;
-  std::shared_ptr<OperatorBase> rnn_op_;
+  std::shared_ptr<framework::OperatorBase> rnn_op_;
 };
 TEST_F(RecurrentOpTest, Run) {

--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -18,7 +18,9 @@ namespace paddle {
 namespace operators {
 namespace rnn {
-namespace fmw = paddle::framework;
+namespace f = paddle::framework;
+using Tensor = framework::Tensor;
 void SegmentInputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& inlinks, const size_t seq_len,
@@ -30,10 +32,10 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
                   inlinks[i].external);
    Tensor* input = input_var->GetMutable<Tensor>();
-    fmw::DDim dims = input->dims();
+    f::DDim dims = input->dims();
    PADDLE_ENFORCE(static_cast<size_t>(dims[0]) == seq_len,
                   "all the inlinks must have same length");
-    fmw::DDim step_dims = slice_ddim(dims, 1, dims.size());
+    f::DDim step_dims = slice_ddim(dims, 1, dims.size());
    for (size_t j = 0; j < seq_len; j++) {
      Tensor* step_input =
          step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable<Tensor>();
@@ -58,11 +60,10 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
      auto step_scope_var = step_scopes[0]->FindVar(outlinks[i].internal);
      PADDLE_ENFORCE(step_scope_var != nullptr, "%s not in scope",
                     outlinks[i].internal);
-      fmw::DDim step_dims =
+      f::DDim step_dims = step_scope_var->template GetMutable<Tensor>()->dims();
-          step_scope_var->template GetMutable<Tensor>()->dims();
      std::vector<int> dims_vec = vectorize(step_dims);
      dims_vec.insert(dims_vec.begin(), seq_len);
-      output->Resize(fmw::make_ddim(dims_vec));
+      output->Resize(f::make_ddim(dims_vec));
    } else {
      output->mutable_data<float>(platform::CPUPlace());
      for (size_t j = 0; j < seq_len; j++) {
@@ -104,7 +105,7 @@ void LinkMemories(const std::vector<Scope*>& scopes,
 }
 void InitArgument(const ArgumentName& name, Argument* arg,
-                  const OperatorBase& op) {
+                  const framework::OperatorBase& op) {
  arg->step_net = op.Input(name.step_net);
  arg->step_scopes = op.Output(name.step_scopes);

--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
@@ -17,12 +17,13 @@
 #include <string>
 #include "paddle/framework/operator.h"
-#include "paddle/operators/type_alias.h"
 namespace paddle {
 namespace operators {
 namespace rnn {
+using Scope = framework::Scope;
 /**
 * Memory of an RNN (same as the role of `Memory` in PaddlePaddle).
 *
@@ -86,7 +87,7 @@ void LinkMemories(const std::vector<Scope*>& step_scopes,
                  const int offset, bool infer_shape_mode);
 void InitArgument(const ArgumentName& name, Argument* arg,
-                  const OperatorBase& op);
+                  const framework::OperatorBase& op);
 }  // namespace rnn
 }  // namespace operators

--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -13,12 +13,13 @@
   limitations under the License. */
 #include "paddle/operators/rowwise_add_op.h"
 namespace paddle {
 namespace operators {
-class RowWiseAddOp : public OperatorWithKernel {
+class RowWiseAddOp : public framework::OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.InputSize() == 2UL,
                   "Two inputs is needed by rowwise add");
    auto dim0 = ctx.Input<Tensor>(0)->dims();
@@ -32,9 +33,10 @@ class RowWiseAddOp : public OperatorWithKernel {
  }
 };
-class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
+class RowWiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  RowWiseAddOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The left input of row-wise add op, must be matrix");
    AddInput("b", "The right input of row-wise add op, must be vector");
@@ -50,6 +52,7 @@ for i in xrange(X.shape[0]):
 }  // namespace operators
 }  // namespace paddle
+namespace ops = paddle::operators;
 REGISTER_OP(rowwise_add, ops::RowWiseAddOp, ops::RowWiseAddOpMaker);
-REGISTER_OP_CPU_KERNEL(rowwise_add,
+REGISTER_OP_CPU_KERNEL(
-                       ops::RowWiseAddKernel<ops::CPUPlace, float>);
+    rowwise_add, ops::RowWiseAddKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@@ -15,5 +15,6 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/rowwise_add_op.h"
-REGISTER_OP_GPU_KERNEL(rowwise_add,
+namespace ops = paddle::operators;
-                       ops::RowWiseAddKernel<ops::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    rowwise_add, ops::RowWiseAddKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -13,15 +13,24 @@
   limitations under the License. */
 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class RowWiseAddKernel : public OpKernel {
+class RowWiseAddKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    auto out = context.Output<Tensor>(0);
    out->mutable_data<T>(context.GetPlace());

--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -17,9 +17,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-class SGDOp : public OperatorWithKernel {
+class SGDOp : public framework::OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_EQ(ctx.InputSize(), 2, "Input size of SGDOp must be two");
    PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1, "Output size of SGDOp must be one");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(0), "inputs[0] mast be set");
@@ -31,9 +31,9 @@ class SGDOp : public OperatorWithKernel {
  }
 };
-class SGDOpMaker : public OpProtoAndCheckerMaker {
+class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SGDOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SGDOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("param", "input parameter");
    AddInput("grad", "input gradient");
@@ -51,5 +51,7 @@ param_out = param - learning_rate * grad;
 }  // namespace operators
 }  // namespace paddle
+namespace ops = paddle::operators;
 REGISTER_OP(sgd, ops::SGDOp, ops::SGDOpMaker);
-REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(sgd,
+                       ops::SGDOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@@ -15,4 +15,6 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/sgd_op.h"
-REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(sgd,
+                       ops::SGDOpKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -13,15 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class SGDOpKernel : public OpKernel {
+class SGDOpKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
    auto param = ctx.Input<Tensor>("param");
    auto grad = ctx.Input<Tensor>("grad");
    auto param_out = ctx.Output<Tensor>(0);

--- a/paddle/operators/sgd_op_test.cc
+++ b/paddle/operators/sgd_op_test.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <gtest/gtest.h>
-#include <paddle/framework/op_registry.h>
-USE_OP(sgd);
-TEST(SGDOp, GetOpProto) {
-  auto& protos = paddle::framework::OpRegistry::protos();
-  auto it = protos.find("sgd");
-  ASSERT_NE(it, protos.end());
-}
--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
@@ -13,21 +13,23 @@
   limitations under the License. */
 #include "paddle/operators/sigmoid_op.h"
 namespace paddle {
 namespace operators {
-class SigmoidOp : public OperatorWithKernel {
+class SigmoidOp : public framework::OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.InputSize() == 1, "Sigmoid Op only have one input");
    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Sigmoid Op only have one output");
    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
  }
 };
-class SigmoidOpMaker : public OpProtoAndCheckerMaker {
+class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SigmoidOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "sigmoid input");
    AddOutput("Y", "sigmoid output");
@@ -35,9 +37,9 @@ class SigmoidOpMaker : public OpProtoAndCheckerMaker {
  }
 };
-class SigmoidOpGrad : public OperatorWithKernel {
+class SigmoidOpGrad : public framework::OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
  }
 };
@@ -45,9 +47,11 @@ class SigmoidOpGrad : public OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle
+namespace ops = paddle::operators;
 REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker);
 REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, ops::SigmoidOpGrad);
-REGISTER_OP_CPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(sigmoid,
-REGISTER_OP_CPU_KERNEL(sigmoid_grad,
+                       ops::SigmoidKernel<paddle::platform::CPUPlace, float>);
-                       ops::SigmoidGradKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    sigmoid_grad, ops::SigmoidGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
@@ -15,6 +15,9 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/sigmoid_op.h"
-REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(sigmoid_grad,
-                       ops::SigmoidGradKernel<ops::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(sigmoid,
+                       ops::SigmoidKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    sigmoid_grad, ops::SigmoidGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
@@ -13,16 +13,21 @@
   limitations under the License. */
 #pragma once
+#include "paddle/framework/eigen.h"
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class SigmoidKernel : public OpKernel {
+class SigmoidKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    auto input = context.Input<Tensor>(0);
    auto output = context.Output<Tensor>(0);
    output->mutable_data<T>(context.GetPlace());
@@ -37,9 +42,9 @@ class SigmoidKernel : public OpKernel {
 };
 template <typename Place, typename T>
-class SigmoidGradKernel : public OpKernel {
+class SigmoidGradKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    auto Y_t = context.Input<Tensor>("Y");
    auto dY_t = context.Input<Tensor>(framework::GradVarName("Y"));
    auto dX_t = context.Output<Tensor>(framework::GradVarName("X"));

--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -17,9 +17,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-class SoftmaxOp : public OperatorWithKernel {
+class SoftmaxOp : public framework::OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_EQ(ctx.InputSize(), 1UL,
                      "Only one input is need for softmax");
    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims().size(), 2UL,
@@ -30,9 +30,10 @@ class SoftmaxOp : public OperatorWithKernel {
  }
 };
-class SoftmaxOpMaker : public OpProtoAndCheckerMaker {
+class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SoftmaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SoftmaxOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "input of softmax");
    AddOutput("Y", "output of softmax");
@@ -40,9 +41,9 @@ class SoftmaxOpMaker : public OpProtoAndCheckerMaker {
  }
 };
-class SoftmaxOpGrad : public OperatorWithKernel {
+class SoftmaxOpGrad : public framework::OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_EQ(ctx.InputSize(), 3UL,
                      "Input of SoftmaxOpGrad should be 3, X, Y, YG");
    PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1UL,
@@ -61,8 +62,11 @@ class SoftmaxOpGrad : public OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle
+namespace ops = paddle::operators;
 REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
-REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(softmax,
+                       ops::SoftmaxKernel<paddle::platform::CPUPlace, float>);
 REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad);
-REGISTER_OP_CPU_KERNEL(softmax_grad,
+REGISTER_OP_CPU_KERNEL(
-                       ops::SoftmaxGradKernel<ops::CPUPlace, float>);
+    softmax_grad, ops::SoftmaxGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
@@ -13,9 +13,11 @@
   limitations under the License. */
 #define EIGEN_USE_GPU
-#include "paddle/framework/op_registry.h"
 #include "paddle/operators/softmax_op.h"
-REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(softmax_grad,
-                       ops::SoftmaxGradKernel<ops::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(softmax,
+                       ops::SoftmaxKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    softmax_grad, ops::SoftmaxGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -13,19 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include "paddle/framework/eigen.h"
-#include "paddle/framework/ddim.h"
+#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/operators/type_alias.h"
 namespace paddle {
 namespace operators {
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename Place, typename T>
-class SoftmaxKernel : public OpKernel {
+class SoftmaxKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    auto input = context.Input<Tensor>("X");
    auto output = context.Output<Tensor>("Y");
    output->mutable_data<T>(context.GetPlace());
@@ -62,9 +64,9 @@ class SoftmaxKernel : public OpKernel {
 };
 template <typename Place, typename T>
-class SoftmaxGradKernel : public OpKernel {
+class SoftmaxGradKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    std::shared_ptr<Tensor> scale_ = std::make_shared<Tensor>();
    auto Y = context.Input<Tensor>("Y");

--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <random>
+#include <type_traits>
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+namespace paddle {
+namespace operators {
+// Eigen::Tensor::random appears to SEGFAULT on GPU, so uniform random is
+// implemented with std::random on CPU and thrust::random (Thrust ships with
+// CUDA) on GPU.
+//
+// CPU kernel: fills output 0 with samples from a std::uniform_real_distribution
+// built from the "min" and "max" attributes. A "seed" attribute of 0 means the
+// seed is taken from std::random_device.
+template <typename T>
+class CPUUniformRandomKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* tensor = context.Output<framework::Tensor>(0);
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+    unsigned int seed =
+        static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    std::minstd_rand engine;
+    engine.seed(seed);
+    std::uniform_real_distribution<T> dist(
+        static_cast<T>(context.op_.GetAttr<float>("min")),
+        static_cast<T>(context.op_.GetAttr<float>("max")));
+    // Compute the element count once instead of on every loop iteration.
+    const ssize_t size = framework::product(tensor->dims());
+    for (ssize_t i = 0; i < size; ++i) {
+      data[i] = dist(engine);
+    }
+  }
+};
+// Operator definition for uniform_random: checks that the "min" attribute is
+// strictly below "max" and resizes output 0 to the "dims" attribute.
+class UniformRandomOp : public framework::OperatorWithKernel {
+ protected:
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE(GetAttr<float>("min") < GetAttr<float>("max"),
+                   "uniform_random's min must be less than max");
+    auto* tensor = ctx.Output<framework::Tensor>(0);
+    auto dims = GetAttr<std::vector<int>>("dims");
+    tensor->Resize(framework::make_ddim(dims));
+  }
+};
+// Declares the proto of uniform_random: one output "Out" and the attributes
+// "dims", "min" (default -1.0), "max" (default 1.0) and "seed" (default 0).
+class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  UniformRandomOpMaker(framework::OpProto* proto,
+                       framework::OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "The output tensor of uniform random op");
+    AddComment(R"DOC(Uniform random operator.
+Used to initialize tensor with uniform random generator.
+)DOC");
+    AddAttr<std::vector<int>>("dims", "the dimension of random tensor");
+    AddAttr<float>("min", "Minimum value of uniform random").SetDefault(-1.0f);
+    AddAttr<float>("max", "Maximum value of uniform random").SetDefault(1.0f);
+    AddAttr<int>("seed",
+                 "Random seed of uniform random. "
+                 "0 means generate a seed by system")
+        .SetDefault(0);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+// Register the uniform_random operator and its float CPU kernel.
+namespace ops = paddle::operators;
+REGISTER_OP(uniform_random, ops::UniformRandomOp, ops::UniformRandomOpMaker);
+REGISTER_OP_CPU_KERNEL(uniform_random, ops::CPUUniformRandomKernel<float>);
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+namespace paddle {
+namespace operators {
+// Functor that maps an element index n to a uniform random value drawn from
+// a distribution built from min_ and max_. Each call seeds a fresh engine
+// with seed_ and discards n draws, so the result depends only on seed_, n and
+// the range.
+template <typename T>
+struct UniformGenerator {
+  T min_, max_;
+  unsigned int seed_;
+  // seed is unsigned int to match seed_ and the value the kernel passes in,
+  // so the seed never round-trips through a signed int.
+  __host__ __device__ UniformGenerator(T min, T max, unsigned int seed)
+      : min_(min), max_(max), seed_(seed) {}
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed_);
+    thrust::uniform_real_distribution<T> dist(min_, max_);
+    // Skip ahead so that index n gets its own position in the sequence.
+    rng.discard(n);
+    return dist(rng);
+  }
+};
+// GPU kernel for uniform_random. Eigen::Tensor::random appears to SEGFAULT on
+// GPU, so values are produced with thrust::random (Thrust ships with CUDA)
+// through UniformGenerator instead.
+// NOTE(review): std::random_device is used below, but <random> is not
+// included by this file directly -- confirm it is available transitively.
+template <typename T>
+class GPUUniformRandomKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* out = context.Output<framework::Tensor>(0);
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    unsigned int seed =
+        static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
+    // A seed attribute of 0 asks for a seed generated by the system.
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    T lower = static_cast<T>(context.op_.GetAttr<float>("min"));
+    T upper = static_cast<T>(context.op_.GetAttr<float>("max"));
+    // One generator call per element index in [0, numel).
+    thrust::counting_iterator<unsigned int> first(0);
+    ssize_t numel = framework::product(out->dims());
+    UniformGenerator<T> generator(lower, upper, seed);
+    thrust::transform(first, first + numel, thrust::device_ptr<T>(out_data),
+                      generator);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+REGISTER_OP_GPU_KERNEL(uniform_random,
+                       paddle::operators::GPUUniformRandomKernel<float>);
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -8,7 +8,7 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 add_subdirectory(dynload)
-cc_test(enforce_test SRCS enforce_test.cc)
+cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece)
 IF(WITH_GPU)
    set(GPU_CTX_DEPS dynload_cuda dynamic_loader)

--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -15,11 +15,12 @@ limitations under the License. */
 #pragma once
 #include <execinfo.h>
-#include <paddle/string/printf.h>
 #include <iomanip>
 #include <sstream>
 #include <stdexcept>
 #include <string>
+#include "paddle/string/printf.h"
+#include "paddle/string/to_string.h"
 #ifndef PADDLE_ONLY_CPU
@@ -194,8 +195,8 @@ inline void throw_on_error(T e) {
 #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)        \
  PADDLE_ENFORCE(__VAL0 __CMP __VAL1,                                         \
                 "enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \
-                 #__VAL0, #__VAL1, std::to_string(__VAL0),                    \
+                 #__VAL0, #__VAL1, paddle::string::to_string(__VAL0),         \
-                 std::to_string(__VAL1),                                      \
+                 paddle::string::to_string(__VAL1),                           \
                 paddle::string::Sprintf("" __VA_ARGS__));
 }  // namespace platform

--- a/paddle/platform/enforce_test.cc
+++ b/paddle/platform/enforce_test.cc
@@ -9,10 +9,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <array>
+#include <iostream>
 #include <memory>
 #include "gtest/gtest.h"
 #include "paddle/platform/enforce.h"
+#include "paddle/string/piece.h"
+using StringPiece = paddle::string::Piece;
+using paddle::string::HasPrefix;
 TEST(ENFORCE, OK) {
  PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345);
@@ -22,19 +28,15 @@ TEST(ENFORCE, OK) {
 }
 TEST(ENFORCE, FAILED) {
-  bool in_catch = false;
+  bool caught_exception = false;
  try {
    PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123);
  } catch (paddle::platform::EnforceNotMet error) {
-    // your error handling code here
+    caught_exception = true;
-    in_catch = true;
+    EXPECT_TRUE(
-    std::string msg = "Enforce is not ok 123 at all";
+        HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all"));
-    const char* what = error.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
  }
-  ASSERT_TRUE(in_catch);
+  EXPECT_TRUE(caught_exception);
 }
 TEST(ENFORCE, NO_ARG_OK) {
@@ -47,41 +49,27 @@ TEST(ENFORCE, NO_ARG_OK) {
 TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) {
  int a = 2;
-  bool in_catch = false;
+  bool caught_exception = false;
  try {
    PADDLE_ENFORCE_EQ(a, 1 + 3);
  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
+    caught_exception = true;
-    const std::string msg = "enforce a == 1 + 3 failed, 2 != 4";
+    // The prefix check must be asserted; a bare HasPrefix() call verifies
+    // nothing because its result is discarded.
+    EXPECT_TRUE(HasPrefix(StringPiece(error.what()),
+                          "enforce a == 1 + 3 failed, 2 != 4"));
-    const char* what = error.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
  }
+  EXPECT_TRUE(caught_exception);
-  ASSERT_TRUE(in_catch);
 }
 TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) {
  int a = 2;
-  bool in_catch = false;
+  bool caught_exception = false;
  try {
    PADDLE_ENFORCE_EQ(a, 1 + 3, "%s size not match", "their");
  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
+    caught_exception = true;
-    const std::string msg =
+    // The prefix check must be asserted; a bare HasPrefix() call verifies
+    // nothing because its result is discarded.
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()),
-        "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match";
+                  "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match"));
-    const char* what = error.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
  }
+  EXPECT_TRUE(caught_exception);
-  ASSERT_TRUE(in_catch);
 }
 TEST(ENFORCE_NE, OK) {
@@ -89,42 +77,32 @@ TEST(ENFORCE_NE, OK) {
  PADDLE_ENFORCE_NE(1.0, 2UL);
 }
 TEST(ENFORCE_NE, FAIL) {
-  bool in_catch = false;
+  bool caught_exception = false;
  try {
    // 1UL here to check data type compatibility
    PADDLE_ENFORCE_NE(1.0, 1UL);
  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
+    caught_exception = true;
-    const std::string msg = "enforce 1.0 != 1UL failed, 1.000000 == 1";
+    EXPECT_TRUE(HasPrefix(StringPiece(error.what()),
-    const char* what = error.what();
+                          "enforce 1.0 != 1UL failed, 1 == 1"))
-    for (size_t i = 0; i < msg.length(); ++i) {
+        << error.what() << " does not have expected prefix";
-      ASSERT_EQ(what[i], msg[i]);
-    }
  }
+  EXPECT_TRUE(caught_exception);
-  ASSERT_TRUE(in_catch);
 }
 TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); }
 TEST(ENFORCE_GT, FAIL) {
-  bool in_catch = false;
+  bool caught_exception = false;
  try {
-    // 2UL here to check data type compatible
    PADDLE_ENFORCE_GT(1, 2UL);
  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
+    caught_exception = true;
-    const std::string msg = "enforce 1 > 2UL failed, 1 <= 2";
+    EXPECT_TRUE(
-    const char* what = error.what();
+        HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2"));
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
  }
+  EXPECT_TRUE(caught_exception);
-  ASSERT_TRUE(in_catch);
 }
 TEST(ENFORCE_GE, OK) {
@@ -134,21 +112,16 @@ TEST(ENFORCE_GE, OK) {
  PADDLE_ENFORCE_GE(3.21, 2UL);
 }
 TEST(ENFORCE_GE, FAIL) {
-  bool in_catch = false;
+  bool caught_exception = false;
  try {
    PADDLE_ENFORCE_GE(1, 2UL);
  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
+    caught_exception = true;
-    const std::string msg = "enforce 1 >= 2UL failed, 1 < 2";
+    EXPECT_TRUE(
-    const char* what = error.what();
+        HasPrefix(StringPiece(error.what()), "enforce 1 >= 2UL failed, 1 < 2"));
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
  }
+  EXPECT_TRUE(caught_exception);
-  ASSERT_TRUE(in_catch);
 }
 TEST(ENFORCE_LE, OK) {
@@ -159,21 +132,16 @@ TEST(ENFORCE_LE, OK) {
  PADDLE_ENFORCE_LE(2UL, 3.2);
 }
 TEST(ENFORCE_LE, FAIL) {
-  bool in_catch = false;
+  // Verifies that a violated PADDLE_ENFORCE_LE throws EnforceNotMet and that
+  // the message starts with the failed comparison.
+  bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_GT(1, 2UL);
+    // Exercise the LE macro itself (this test previously invoked the GT
+    // macro). int vs. unsigned long operands, as in the sibling FAIL tests.
+    PADDLE_ENFORCE_LE(2, 1UL);
   } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
+    caught_exception = true;
-    const std::string msg = "enforce 1 > 2UL failed, 1 <= 2";
+    // NOTE(review): expected text follows the "enforce A op B failed, a inv b"
+    // pattern of the sibling tests -- confirm against enforce.h.
+    EXPECT_TRUE(
-    const char* what = error.what();
+        HasPrefix(StringPiece(error.what()), "enforce 2 <= 1UL failed, 2 > 1"))
+        << error.what() << " does not have expected prefix";
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
   }
+  EXPECT_TRUE(caught_exception);
-  ASSERT_TRUE(in_catch);
 }
 TEST(ENFORCE_LT, OK) {
@@ -182,21 +150,15 @@ TEST(ENFORCE_LT, OK) {
  PADDLE_ENFORCE_LT(2UL, 3);
 }
 TEST(ENFORCE_LT, FAIL) {
-  bool in_catch = false;
+  bool caught_exception = false;
  try {
    PADDLE_ENFORCE_LT(1UL, 0.12);
  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
+    caught_exception = true;
-    const std::string msg = "enforce 1UL < 0.12 failed, 1 >= 0.12";
+    EXPECT_TRUE(HasPrefix(StringPiece(error.what()),
-    const char* what = error.what();
+                          "enforce 1UL < 0.12 failed, 1 >= 0.12"));
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
  }
+  EXPECT_TRUE(caught_exception);
-  ASSERT_TRUE(in_catch);
 }
 TEST(ENFORCE_NOT_NULL, OK) {
@@ -205,20 +167,50 @@ TEST(ENFORCE_NOT_NULL, OK) {
  delete a;
 }
 TEST(ENFORCE_NOT_NULL, FAIL) {
-  bool in_catch = false;
+  bool caught_exception = false;
-  int* a{nullptr};
  try {
+    int* a = nullptr;
    PADDLE_ENFORCE_NOT_NULL(a);
  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
+    caught_exception = true;
-    const std::string msg = "a should not be null";
+    EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null"));
-    const char* what = error.what();
+  }
-    for (size_t i = 0; i < msg.length(); ++i) {
+  EXPECT_TRUE(caught_exception);
-      ASSERT_EQ(what[i], msg[i]);
+}
+struct Dims {
+  size_t dims_[4];
+  bool operator==(const Dims& o) const {
+    for (size_t i = 0; i < 4; ++i) {
+      if (dims_[i] != o.dims_[i]) return false;
    }
+    return true;
  }
+};
-  ASSERT_TRUE(in_catch);
+std::ostream& operator<<(std::ostream& os, const Dims& d) {
+  for (size_t i = 0; i < 4; ++i) {
+    if (i == 0) {
+      os << "[";
+    }
+    os << d.dims_[i];
+    if (i == 4 - 1) {
+      os << "]";
+    } else {
+      os << ", ";
+    }
+  }
+  return os;
 }
+TEST(ENFORCE_USER_DEFINED_CLASS, EQ) {
+  Dims a{{1, 2, 3, 4}}, b{{1, 2, 3, 4}};
+  PADDLE_ENFORCE_EQ(a, b);
+}
+TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
+  Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
+  ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet);
+}
\ No newline at end of file
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
-cc_library(paddle_pybind SHARED
-    SRCS pybind.cc
-    DEPS pybind python backward
-	fc_op
-	sgd_op
-	add_op
-	mean_op
-	cross_entropy_op
-	recurrent_op
-	fill_zeros_like_op)
--- a/paddle/string/CMakeLists.txt
+++ b/paddle/string/CMakeLists.txt
@@ -2,3 +2,4 @@ cc_library(stringpiece SRCS piece.cc)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
 cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
+cc_test(to_string_test SRCS to_string_test.cc)
--- a/paddle/string/to_string.h
+++ b/paddle/string/to_string.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <sstream>
+#include <string>
+namespace paddle {
+namespace string {
+// Generic conversion: formats `v` through operator<< into a string stream and
+// returns the accumulated text.
+template <typename T>
+inline std::string to_string(T v) {
+  std::ostringstream buffer;
+  buffer << v;
+  return buffer.str();
+}
+// std::string and const char* arguments bypass the stream entirely.
+template <>
+inline std::string to_string(std::string v) {
+  return v;
+}
+template <>
+inline std::string to_string(const char* v) {
+  std::string text(v);
+  return text;
+}
+}  // namespace string
+}  // namespace paddle
--- a/paddle/string/to_string_test.cc
+++ b/paddle/string/to_string_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/string/to_string.h"
+#include <gtest/gtest.h>
+constexpr char kOutputString[] = "User Defined Output";
+class UserDefinedClass {
+public:
+};
+std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) {
+  s << kOutputString;
+  return s;
+}
+TEST(to_string, normal) {
+  using namespace paddle::string;
+  ASSERT_EQ("10", to_string(10));
+  ASSERT_EQ("abc", to_string("abc"));
+  ASSERT_EQ("1.2", to_string(1.2));
+}
+TEST(to_string, user_defined) {
+  using namespace paddle::string;
+  UserDefinedClass instance;
+  ASSERT_EQ(kOutputString, to_string(instance));
+}
\ No newline at end of file
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
@@ -50,8 +50,8 @@ void NewRemoteParameterUpdater::init(
  // create parameter server client.
  if (useEtcd_) {
-    parameterClient_ = paddle_new_etcd_pserver_client(
+    parameterClient_ =
-        (char *)pserverSpec_.c_str(), FLAGS_trainer_id == 0);
+        paddle_new_etcd_pserver_client((char *)pserverSpec_.c_str());
  } else {
    parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
                                                 FLAGS_trainer_id == 0);

--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -13,6 +13,7 @@ py_test(test_protobuf SRCS test_protobuf.py)
 py_test(test_add_two_op SRCS test_add_two_op.py)
 py_test(test_sigmoid_op SRCS test_sigmoid_op.py)
 py_test(test_softmax_op SRCS test_softmax_op.py)
+py_test(test_cross_entropy_op SRCS test_cross_entropy_op.py)
 py_test(test_fill_zeros_like_op SRCS test_fill_zeros_like_op.py)
 py_test(gradient_checker SRCS gradient_checker.py)
@@ -20,4 +21,8 @@ py_test(gradient_checker SRCS gradient_checker.py)
 py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py)
 py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py)
 py_test(test_operator SRCS test_operator.py)
+py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py)
+py_test(test_uniform_random_op SRCS test_uniform_random_op.py)
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
+import unittest
+import numpy
 import paddle.v2.framework.core as core
 from paddle.v2.framework.op import Operator
-import numpy
-import unittest
 __all__ = ['get_numeric_gradient']
+def create_op(op_type):
+    """Build an Operator of `op_type` in which every input slot and every
+    output slot is bound to a variable carrying the slot's own name."""
+    slot_names = list(Operator.get_op_input_names(op_type))
+    slot_names.extend(Operator.get_op_output_names(op_type))
+    bindings = dict((slot, slot) for slot in slot_names)
+    return Operator(op_type, **bindings)
+def grad_var_name(var_name):
+    """Return `var_name` with the "@GRAD" suffix appended."""
+    grad_suffix = "@GRAD"
+    return var_name + grad_suffix
 def get_numeric_gradient(op,
                         input_values,
                         output_name,
                         input_to_check,
-                         delta=1e-2,
+                         delta=0.005,
                         local_scope=None):
    """
    Get Numeric Gradient for an operator's input.
@@ -76,6 +91,119 @@ def get_numeric_gradient(op,
    return gradient_flat.reshape(tensor_to_check.get_dims())
+class GradientChecker(unittest.TestCase):
+    """TestCase base class that compares the gradients produced by an
+    operator's backward op with numerically estimated gradients."""
+    def assert_is_close(self, numeric_grads, scope, max_relative_error,
+                        msg_prefix):
+        """
+        Assert that each gradient tensor stored in scope matches its numeric
+        estimate.
+        :param numeric_grads: dict mapping a variable name to the numeric
+            gradient of that variable.
+        :param scope: scope in which the gradient variables (named by
+            grad_var_name) are looked up.
+        :param max_relative_error: largest tolerated element-wise error.
+        :param msg_prefix: text prepended to the failure message.
+        """
+        for name in numeric_grads:
+            b = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
+            a = numeric_grads[name]
+            abs_a = numpy.abs(a)
+            # if abs_a is nearly zero, then use abs error for a, not relative
+            # error.
+            abs_a[abs_a < 1e-3] = 1
+            diff_mat = numpy.abs(a - b) / abs_a
+            max_diff = numpy.max(diff_mat)
+            def err_msg():
+                # flat index of the first element whose error is over the limit
+                offset = numpy.argmax(diff_mat > max_relative_error)
+                return "%s Variable %s max gradient diff %f over limit %f, the first " \
+                       "error element is %d" % (
+                       msg_prefix, name, max_diff, max_relative_error, offset)
+            self.assertLessEqual(max_diff, max_relative_error, err_msg())
+    def check_grad(self,
+                   forward_op,
+                   input_vars,
+                   inputs_to_check,
+                   output_name,
+                   no_grad_set=None,
+                   only_cpu=False,
+                   max_relative_error=0.005):
+        """
+        Run forward_op and its backward op on every selected place and compare
+        the resulting input gradients with numeric estimates.
+        :param forward_op: used to create backward_op
+        :param input_vars: numpy value of input variable. The following
+            computation will use these variables.
+        :param inputs_to_check: inputs var names that should check gradient.
+        :param output_name: name of the forward_op output that is passed to
+            get_numeric_gradient when estimating the numeric gradient.
+        :param max_relative_error: The relative tolerance parameter.
+        :param no_grad_set: used when create backward ops
+        :param only_cpu: only compute and check gradient on cpu kernel.
+        :return:
+        :raises ValueError: if forward_op does not have exactly one
+            non-temporary output, if a no_grad_set entry is not an input of
+            forward_op, or if input_vars holds a name that is not an input of
+            forward_op.
+        """
+        if no_grad_set is None:
+            no_grad_set = set()
+        tmp_outs = forward_op.temp_outputs()
+        # A list (not a lazy filter object) so that len() below always works.
+        no_tmp_out = [
+            out for out in forward_op.outputs() if out not in tmp_outs
+        ]
+        if len(no_tmp_out) != 1:
+            raise ValueError("non temp out_names should be 1")
+        in_names = forward_op.inputs()
+        for no_grad in no_grad_set:
+            if no_grad not in in_names:
+                raise ValueError("no_grad should be in in_names")
+        backward_op = core.Operator.backward(forward_op, no_grad_set)
+        places = [core.CPUPlace()]
+        if not only_cpu and core.is_compile_gpu() and backward_op.support_gpu():
+            places.append(core.GPUPlace(0))
+        numeric_grad = dict()
+        # get numeric gradient
+        for check_name in inputs_to_check:
+            numeric_grad[check_name] = \
+                get_numeric_gradient(forward_op, input_vars, output_name,
+                                     check_name)
+        # get operator gradient according to different device
+        for place in places:
+            scope = core.Scope()
+            ctx = core.DeviceContext.create(place)
+            # create input var and set value
+            for name, value in input_vars.items():
+                if name not in in_names:
+                    raise ValueError(name + " not in op.inputs_")
+                var = scope.new_var(name).get_tensor()
+                var.set_dims(value.shape)
+                var.set(value, place)
+            # create output var
+            for out_name in forward_op.outputs():
+                scope.new_var(out_name).get_tensor()
+            # infer the shape of output var and compute/set value of output var
+            forward_op.infer_shape(scope)
+            forward_op.run(scope, ctx)
+            # create output grad var
+            # set shape as the output var
+            # set value of this grad to ones
+            for name in forward_op.outputs():
+                out_tensor = scope.find_var(name).get_tensor()
+                grad_tensor = scope.new_var(grad_var_name(name)).get_tensor()
+                grad_tensor.set_dims(out_tensor.shape())
+                data = 1.0 * numpy.ones(out_tensor.shape())
+                grad_tensor.set(data, place)
+            # create input grad var
+            for name in backward_op.outputs():
+                scope.new_var(name).get_tensor()
+            # infer the shape of input gradient var and compute/set its value
+            # with backward op
+            backward_op.infer_shape(scope)
+            backward_op.run(scope, ctx)
+            self.assert_is_close(numeric_grad, scope, max_relative_error,
+                                 "Gradient Check On %s" % str(place))
 if __name__ == '__main__':
    class GetNumericGradientTest(unittest.TestCase):
@@ -87,4 +215,28 @@ if __name__ == '__main__':
            arr = get_numeric_gradient(add_op, {'X': x, "Y": y}, 'Z', 'X')
            self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-2)
+        def test_softmax_op(self):
+            """Compare the numeric gradient of the softmax operator with an
+            analytic reference computed in numpy."""
+            def stable_softmax(x):
+                """Compute the softmax of vector x in a numerically stable way."""
+                shiftx = x - numpy.max(x)
+                exps = numpy.exp(shiftx)
+                return exps / numpy.sum(exps)
+            def label_softmax_grad(Y, dY):
+                """Row-wise reference gradient: dX[i] = Y[i] * (dY[i] - Y[i].dY[i])."""
+                dX = Y * 0.0
+                for i in range(Y.shape[0]):
+                    d = numpy.dot(Y[i, :], dY[i, :])
+                    dX[i, :] = Y[i, :] * (dY[i, :] - d)
+                return dX
+            softmax_op = Operator("softmax", X="X", Y="Y")
+            X = numpy.random.random((2, 2)).astype("float32")
+            Y = numpy.apply_along_axis(stable_softmax, 1, X)
+            dY = numpy.ones(Y.shape)
+            dX = label_softmax_grad(Y, dY)
+            arr = get_numeric_gradient(softmax_op, {"X": X}, 'Y', 'X')
+            # `decimal` is a count of decimal places, not a tolerance: the
+            # check is abs(desired - actual) < 1.5 * 10**(-decimal), so the
+            # former decimal=1e-2 allowed an error of about 1.47.
+            numpy.testing.assert_almost_equal(arr, dX, decimal=2)
    unittest.main()
--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
-import paddle.v2.framework.core as core
-import unittest
 import numpy
+import paddle.v2.framework.core as core
 from paddle.v2.framework.op import Operator
@@ -24,7 +23,7 @@ class OpTestMeta(type):
            scope = core.Scope()
            kwargs = dict()
            places = [core.CPUPlace()]
-            if core.is_compile_gpu() and core.Operator.support_gpu(self.type):
+            if core.is_compile_gpu():
                places.append(core.GPUPlace(0))
            for place in places:
@@ -53,6 +52,8 @@ class OpTestMeta(type):
                        kwargs[attr_name] = self.attrs[attr_name]
                op = Operator(self.type, **kwargs)
+                if isinstance(place, core.GPUPlace) and not op.support_gpu():
+                    return
                op.infer_shape(scope)

--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
 import unittest
 import numpy
 from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op
-class TestSGD(unittest.TestCase):
+class TestCrossEntropy(unittest.TestCase):
    __metaclass__ = OpTestMeta
    def setUp(self):
@@ -20,7 +21,18 @@ class TestSGD(unittest.TestCase):
        self.outputs = {'Y': numpy.array(Y).astype("float32")}
-# TODO(superjom) add gradient check
+class CrossEntropyGradOpTest(GradientChecker):
+    """Numeric gradient check for the onehot_cross_entropy operator."""
+    def test_softmax_grad(self):
+        op = create_op("onehot_cross_entropy")
+        batch_size = 100
+        class_num = 10
+        inputs = {
+            "X": numpy.random.uniform(
+                0.1, 1.0, [batch_size, class_num]).astype("float32"),
+            # Floor division keeps the multiplier an int even when `/` is
+            # true division, so the label array stays int32.
+            "label": (class_num // 2) * numpy.ones(batch_size).astype("int32")
+        }
+        # set(["X"]) rather than set("X"): set() over a string yields its
+        # individual characters, which is only right for one-letter names.
+        self.check_grad(op, inputs, set(["X"]), "Y")
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+import unittest
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+import numpy
+class GaussianRandomTest(unittest.TestCase):
+    """Runs the gaussian_random operator and checks the sample statistics."""
+    def test_cpu(self):
+        self.gaussian_random_test(place=core.CPUPlace())
+    def test_gpu(self):
+        # The GPU run only happens when core.is_compile_gpu() is true.
+        if not core.is_compile_gpu():
+            return
+        self.gaussian_random_test(place=core.GPUPlace(0))
+    def gaussian_random_test(self, place):
+        """Run the operator on `place` with dims=[1000, 784], mean=0, std=1
+        and check the mean and std of the output to within 0.1."""
+        scope = core.Scope()
+        scope.new_var("Out").get_tensor()
+        attrs = dict(dims=[1000, 784], mean=.0, std=1., seed=10)
+        op = Operator("gaussian_random", Out="Out", **attrs)
+        op.infer_shape(scope)
+        op.run(scope, core.DeviceContext.create(place))
+        samples = numpy.array(scope.find_var("Out").get_tensor())
+        self.assertAlmostEqual(samples.mean(), .0, delta=0.1)
+        self.assertAlmostEqual(samples.std(), 1., delta=0.1)
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_op.py
 import unittest
 import numpy as np
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+from gradient_checker import GradientChecker, create_op
 from op_test_util import OpTestMeta
@@ -25,62 +24,11 @@ class TestSoftmaxOp(unittest.TestCase):
        }
-class TestSoftmaxGradOp(unittest.TestCase):
+class SoftmaxGradOpTest(GradientChecker):
-    def test_softmax_grad(self):
+    def test_softmax(self):
-        op = Operator('softmax', X="X", Y="Y")
+        op = create_op("softmax")
-        backward_op = core.Operator.backward(op, set())
+        inputs = {"X": np.random.uniform(0.1, 1, [10, 10]).astype("float32")}
-        self.assertEqual(backward_op.type(), "softmax_grad")
+        self.check_grad(op, inputs, set("X"), "Y")
-        expected = '''Op(softmax_grad), inputs:(X, Y, Y@GRAD), outputs:(X@GRAD).'''
-        self.assertEqual(expected, str(backward_op))
-        batch_size = 3
-        class_num = 5
-        # Initialize X and add 1e-2 for numerical stability
-        Y = np.random.rand(batch_size, class_num).astype(np.float32)
-        Y = Y + 1e-2
-        dY = np.random.rand(batch_size, class_num).astype(np.float32)
-        # Reference implementation of cross entropy with soft labels
-        def label_softmax_grad(Y, dY):
-            dX = Y * 0.0
-            for i in range(batch_size):
-                d = np.dot(Y[i, :], dY[i, :])
-                dX[i, :] = Y[i, :] * (dY[i, :] - d)
-            return dX
-        expected = label_softmax_grad(Y, dY)
-        scope = core.Scope()
-        places = []
-        places.append(core.CPUPlace())
-        if core.is_compile_gpu():
-            places.append(core.GPUPlace(0))
-        for place in places:
-            y = scope.new_var("Y")
-            y_tensor = y.get_tensor()
-            y_tensor.set_dims([batch_size, class_num])
-            y_tensor.alloc_float(place)
-            y_tensor.set(Y, place)
-            dy = scope.new_var("Y@GRAD")
-            dy_tensor = dy.get_tensor()
-            dy_tensor.set_dims([batch_size, class_num])
-            dy_tensor.alloc_float(place)
-            dy_tensor.set(dY, place)
-            x = scope.new_var("X")
-            dx = scope.new_var("X@GRAD")
-            tensor = scope.find_var("X@GRAD").get_tensor()
-            backward_op.infer_shape(scope)
-            self.assertEqual([batch_size, class_num], tensor.shape())
-            ctx = core.DeviceContext.create(place)
-            backward_op.run(scope, ctx)
-            actual = np.array(tensor)
-            np.testing.assert_almost_equal(actual, expected, decimal=3)
 if __name__ == '__main__':

--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py
+import unittest
+from paddle.v2.framework.op import Operator
+import paddle.v2.framework.core as core
+import numpy
+class UniformRandomTest(unittest.TestCase):
+    """Runs the uniform_random operator and checks the sample mean."""
+    def test_uniform_random_cpu(self):
+        self.uniform_random_test(place=core.CPUPlace())
+    def test_uniform_random_gpu(self):
+        # The GPU run only happens when core.is_compile_gpu() is true.
+        if not core.is_compile_gpu():
+            return
+        self.uniform_random_test(place=core.GPUPlace(0))
+    def uniform_random_test(self, place):
+        """Run the operator on `place` with dims=[1000, 784], min=-5.0 and
+        max=10.0; the output mean must be within 0.1 of 2.5."""
+        scope = core.Scope()
+        scope.new_var("X").get_tensor()
+        attrs = dict(dims=[1000, 784], min=-5.0, max=10.0, seed=10)
+        op = Operator("uniform_random", Out="X", **attrs)
+        op.infer_shape(scope)
+        op.run(scope, core.DeviceContext.create(place))
+        samples = numpy.array(scope.find_var("X").get_tensor())
+        self.assertAlmostEqual(samples.mean(), 2.5, delta=0.1)
+if __name__ == '__main__':
+    unittest.main()