Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into lstm_bp

1d7c03e7 · dangqingqing · b50c33fd · 03136f63 · 1d7c03e7 · 1d7c03e7
201 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
 paddle/pybind/pybind.h
+python/paddle/v2/framework/tests/tmp/*
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -67,7 +67,7 @@ func main() {
 		cp, err = pserver.LoadCheckpoint(e, idx)
 		if err != nil {
 			if err == pserver.ErrCheckpointNotFound {
-				log.Info("Could not find the pserver checkpoint.")
+				log.Info("load checkpoint error", "error", err)
 			} else {
 				panic(err)
 			}
@@ -99,7 +99,7 @@ func main() {
 	candy.Must(err)

 	go func() {
-		log.Info("starting pserver", log.Ctx{"port": *port})
+		log.Info("serving pserver", log.Ctx{"port": *port})
 		err = http.Serve(l, nil)
 		candy.Must(err)
 	}()

--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -123,7 +123,8 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int
 	}
 	err := c.SetDataset(paths)
 	if err != nil {
-		log.Error("error set dataset", log.Ctx{"error": err})
+		log.Error("error set dataset",
+			log.Ctx{"error": err, "paths": paths})
 		return C.PADDLE_MASTER_ERROR
 	}


--- a/go/master/client.go
+++ b/go/master/client.go
@@ -121,6 +121,7 @@ func (c *Client) StartGetRecords(passID int) {
 }

 func (c *Client) getRecords(passID int) {
+	i := 0
 	for {
 		t, err := c.getTask(passID)
 		if err != nil {
@@ -130,12 +131,20 @@ func (c *Client) getRecords(passID int) {
 				c.ch <- record{nil, err}
 				break
 			}
-			if err.Error() == ErrPassAfter.Error() {
-				// wait util last pass finishes
-				time.Sleep(time.Second * 3)
-				continue
+
+			if i%60 == 0 {
+				log.Debug("getTask of passID error.",
+					log.Ctx{"error": err, "passID": passID})
+				i = 0
 			}
-			log.Error("getTask error.", log.Ctx{"error": err})
+
+			// if err.Error() == ErrPassAfter.Error()
+			//   wait util last pass finishes
+			// if other error such as network error
+			//   wait to reconnect or task time out
+			time.Sleep(time.Second * 3)
+			i += 3
+			continue
 		}

 		for _, chunk := range t.Chunks {

--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@@ -117,6 +117,7 @@ func TestNextRecord(t *testing.T) {
 			if e != nil {
 				panic(e)
 			}
+
 			// test for n passes
 			for pass := 0; pass < 10; pass++ {
 				c.StartGetRecords(pass)

--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@@ -71,9 +71,15 @@ func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer
 		cstate = unsafe.Pointer(&s[0])
 	}

+	var cptr (*C.uchar)
+	if len(c) > 0 {
+		cptr = (*C.uchar)(&c[0])
+	} else {
+		log.Error("empty config", "param name", paramWithConfigs.Param.Name)
+	}
 	o.config = c
 	o.opt = C.paddle_create_optimizer(
-		(*C.uchar)(&c[0]),
+		cptr,
 		C.int(len(c)),
 		C.paddle_element_type(p.ElementType),
 		cbuffer,

--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -17,12 +17,11 @@ package pserver
 import (
 	"bufio"
 	"bytes"
-	"crypto/md5"
 	"encoding/gob"
-	"encoding/hex"
 	"encoding/json"
 	"errors"
 	"fmt"
+	"hash/crc32"
 	"io/ioutil"
 	"os"
 	"path"
@@ -40,7 +39,7 @@ type ElementType int

 // ErrCheckpointNotFound indicates that the pserver checkpoint could
 // not be found.
-var ErrCheckpointNotFound = errors.New("checkpoint not found")
+var ErrCheckpointNotFound = errors.New("checkpoint not found in etcd")

 // RPC error message.
 const (
@@ -76,7 +75,7 @@ type ParameterWithConfig struct {
 type checkpointMeta struct {
 	UUID      string `json:"uuid"`
 	Path      string `json:"path"`
-	MD5       string `json:"md5"`
+	CRC32     uint32 `json:"crc32"`
 	Timestamp int64  `json:"timestamp"`
 }

@@ -92,7 +91,7 @@ type Service struct {
 	idx                int
 	checkpointInterval time.Duration
 	checkpointPath     string
-	client             *EtcdClient
+	client             KVStore

 	mu     sync.Mutex
 	optMap map[string]*optimizer
@@ -104,7 +103,12 @@ type parameterCheckpoint struct {
 	State []byte
 }

-func loadMeta(e *EtcdClient, idx int) (meta checkpointMeta, err error) {
+type KVStore interface {
+	GetKey(key string, timeout time.Duration) ([]byte, error)
+	PutKey(key string, value []byte, timeout time.Duration, withLease bool) error
+}
+
+func loadMeta(e KVStore, idx int) (meta checkpointMeta, err error) {
 	v, err := e.GetKey(PsCheckpoint+strconv.Itoa(idx), 3*time.Second)
 	if err != nil {
 		return
@@ -123,7 +127,7 @@ func loadMeta(e *EtcdClient, idx int) (meta checkpointMeta, err error) {
 }

 // LoadCheckpoint loads checkpoint from file.
-func LoadCheckpoint(e *EtcdClient, idx int) (Checkpoint, error) {
+func LoadCheckpoint(e KVStore, idx int) (Checkpoint, error) {
 	log.Info("Loading checkpoint", "pserver index", idx)
 	defer traceTime(time.Now(), "load checkpoint")

@@ -137,11 +141,8 @@ func LoadCheckpoint(e *EtcdClient, idx int) (Checkpoint, error) {
 		return nil, err
 	}

-	// TODO(helin): change MD5 to CRC since CRC is better for file
-	// checksum in our use case (emphasize speed over security).
-	h := md5.New()
-	md5 := hex.EncodeToString(h.Sum(content))
-	if md5 != cpMeta.MD5 {
+	crc32 := crc32.ChecksumIEEE(content)
+	if crc32 != cpMeta.CRC32 {
 		return nil, errors.New(WrongChecksum)
 	}

@@ -150,12 +151,13 @@ func LoadCheckpoint(e *EtcdClient, idx int) (Checkpoint, error) {
 	if err = dec.Decode(&cp); err != nil {
 		return nil, err
 	}
+
 	return cp, nil
 }

 // NewService creates a new service, will bypass etcd registration if no
 // endpoints specified. It will recovery from checkpoint file if a exists a specified checkpoint.
-func NewService(idx int, interval time.Duration, path string, client *EtcdClient, cp Checkpoint) (*Service, error) {
+func NewService(idx int, interval time.Duration, path string, client KVStore, cp Checkpoint) (*Service, error) {
 	s := &Service{
 		idx:                idx,
 		checkpointInterval: interval,
@@ -173,6 +175,7 @@ func NewService(idx int, interval time.Duration, path string, client *EtcdClient
 			}
 			s.optMap[p.Param.Name] = newOptimizer(p, item.State)
 		}
+		close(s.initialized)
 	}
 	return s, nil
 }
@@ -221,7 +224,7 @@ func (s *Service) FinishInitParams(_ int, _ *int) error {
 		for range t {
 			err := s.checkpoint()
 			if err != nil {
-				log.Error("finish init params error", log.Ctx{"error": err})
+				log.Error("checkpoint error", log.Ctx{"error": err})
 			}
 		}
 	}()
@@ -274,6 +277,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	parameter.Name = name
 	parameter.ElementType = opt.elementType
 	parameter.Content = opt.GetWeights()
+
 	log.Info("sending parameter to the trainer", "name", parameter.Name, "size", len(parameter.Content), "type", parameter.ElementType)
 	return nil
 }
@@ -354,20 +358,29 @@ func (s *Service) checkpoint() (err error) {

 	oldMeta, err := loadMeta(s.client, s.idx)
 	if err == ErrCheckpointNotFound {
-		log.Info("Do not have existing checkpoint.")
+		log.Info("old meta not found, skip removing old meta")
 		err = nil
+	} else if err == nil {
+		log.Info("removing old meta")
+		if oldMeta.Path != "" {
+			rmErr := os.Remove(oldMeta.Path)
+			if rmErr != nil {
+				// log error, but still treat checkpoint as
+				// successful.
+				log.Error("remove old meta file error", log.Ctx{"error": rmErr})
+			}
+		}
 	}

 	if err != nil {
 		return
 	}

-	h := md5.New()
-	md5 := hex.EncodeToString(h.Sum(buf.Bytes()))
+	crc32 := crc32.ChecksumIEEE(buf.Bytes())
 	cpMeta := checkpointMeta{
 		UUID:      id,
 		Timestamp: time.Now().UnixNano(),
-		MD5:       md5,
+		CRC32:     crc32,
 		Path:      p,
 	}

@@ -381,14 +394,5 @@ func (s *Service) checkpoint() (err error) {
 		return
 	}

-	if oldMeta.Path != "" {
-		rmErr := os.Remove(oldMeta.Path)
-		if rmErr != nil {
-			// log error, but still treat checkpoint as
-			// successful.
-			log.Error("remove old meta file error", log.Ctx{"error": rmErr})
-		}
-	}
-
 	return
 }
--- a/go/pserver/service_internal_test.go
+++ b/go/pserver/service_internal_test.go
+package pserver
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+const testDir = "./test_data"
+
+type myKV struct {
+	m map[string][]byte
+}
+
+func (m *myKV) GetKey(key string, timeout time.Duration) ([]byte, error) {
+	if m.m == nil {
+		m.m = make(map[string][]byte)
+	}
+	return m.m[key], nil
+}
+
+func (m *myKV) PutKey(key string, value []byte, timeout time.Duration, withLease bool) error {
+	if m.m == nil {
+		m.m = make(map[string][]byte)
+	}
+	m.m[key] = value
+	return nil
+}
+
+func TestCheckpoint(t *testing.T) {
+	kv := &myKV{}
+	s, err := NewService(0, time.Hour, testDir, kv, nil)
+	assert.Nil(t, err)
+	err = s.checkpoint()
+	assert.Nil(t, err)
+	_, err = LoadCheckpoint(kv, 0)
+	assert.Nil(t, err)
+}
+
+func float32ToByte(f float32) []byte {
+	var buf bytes.Buffer
+	err := binary.Write(&buf, binary.LittleEndian, f)
+	if err != nil {
+		fmt.Println("binary.Write failed:", err)
+	}
+	return buf.Bytes()
+}
+
+func TestCheckpointWithData(t *testing.T) {
+	kv := &myKV{}
+	s, err := NewService(0, time.Hour, testDir, kv, nil)
+	assert.Nil(t, err)
+
+	var content []byte
+	for i := 0; i < 50000; i++ {
+		content = append(content, float32ToByte(float32(i))...)
+	}
+
+	p1 := Parameter{Name: "p1", ElementType: 1, Content: content}
+	err = s.InitParam(ParameterWithConfig{Param: p1}, nil)
+	assert.Nil(t, err)
+
+	err = s.FinishInitParams(0, nil)
+	assert.Nil(t, err)
+
+	var p2 Parameter
+	err = s.GetParam(p1.Name, &p2)
+	assert.Nil(t, err)
+	assert.Equal(t, p1, p2)
+
+	err = s.checkpoint()
+	assert.Nil(t, err)
+	cp, err := LoadCheckpoint(kv, 0)
+	assert.Nil(t, err)
+	s1, err := NewService(0, time.Hour, testDir, kv, cp)
+	assert.Nil(t, err)
+
+	var p3 Parameter
+	err = s1.GetParam(p1.Name, &p3)
+	assert.Nil(t, err)
+	assert.Equal(t, p1, p3)
+}
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -178,7 +178,3 @@ func TestBlockUntilInitialized(t *testing.T) {

 	wg.Wait()
 }
-
-func TestCheckpointSpeed(t *testing.T) {
-	//TODO(zhihong): test speed
-}
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -15,7 +15,7 @@ nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)

 cc_test(variable_test SRCS variable_test.cc)

-cc_library(scope SRCS scope.cc)
+cc_library(scope SRCS scope.cc DEPS glog)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)


@@ -24,9 +24,10 @@ cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog)
+cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
-cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info operator)
+cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)

 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
@@ -42,7 +43,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})

 cc_library(backward SRCS backward.cc DEPS net_op)
-cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
+cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)

 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog)


--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -315,6 +315,7 @@ static void CreateGradVarInBlock(
                     return false; /* not break */
                   });
    if (need_infer_shape) {
+      ops[op_index]->InferVarType(block_desc);
      ops[op_index]->InferShape(*block_desc);
    }
  }
@@ -452,11 +453,16 @@ ParamGradInfoMap AppendBackward(
  std::transform(target_shape_desc.begin(), target_shape_desc.end(),
                 std::back_inserter(target_shape),
                 [](int64_t dim) { return static_cast<int>(dim); });
+  VLOG(3) << "backward from loss=" << target.Name()
+          << " data_type=" << target.GetDataType();
  std::unique_ptr<OpDescBind> fill_one_op(
      new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
                     {{"shape", target_shape},
                      {"value", static_cast<float>(1.0)},
-                      {"data_type", framework::DataType::FP32}}));
+                      {"data_type", target.GetDataType()}}));
+  // infer var type of fill_one_op
+  fill_one_op->InferVarType(root_block);
+
  root_block->AppendAllocatedOp(std::move(fill_one_op));
  size_t forward_op_num = root_block->OpSize();
  size_t forward_block_num = program_desc.Size();
@@ -475,8 +481,7 @@ ParamGradInfoMap AppendBackward(
  std::unordered_map<std::string, GradVarInfo> retv;

  auto var = root_block->Var(fill_one_op_out);
-  // FIXME(qiao) infer the data type
-  var->SetDataType(framework::DataType::FP32);
+  var->SetDataType(target.GetDataType());
  var->SetShape(target.Shape());
  auto& target_grad = retv[target.Name()];
  target_grad.name_ = fill_one_op_out;

--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -21,6 +21,8 @@
 #include "paddle/framework/var_desc.h"
 #include "paddle/operators/net_op.h"

+USE_OP(fill_constant);
+
 namespace paddle {
 namespace framework {


--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -120,6 +120,17 @@ BlockDesc *BlockDescBind::Proto() {
  Flush();
  return desc_;
 }
+
+BlockDescBind::BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
+    : prog_(prog), desc_(desc), need_update_(false) {
+  for (const VarDesc &var_desc : desc_->vars()) {
+    vars_[var_desc.name()].reset(new VarDescBind(var_desc));
+  }
+  for (const OpDesc &op_desc : desc_->ops()) {
+    ops_.emplace_back(new OpDescBind(op_desc, prog));
+  }
+}
+
 BlockDescBind::BlockDescBind(const BlockDescBind &other, BlockDesc *desc,
                             ProgramDescBind *prog)
    : prog_(prog), desc_(desc) {

--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -36,8 +36,7 @@ class ProgramDescBind;

 class BlockDescBind {
 public:
-  BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
-      : prog_(prog), desc_(desc), need_update_(false) {}
+  BlockDescBind(ProgramDescBind *prog, BlockDesc *desc);

  BlockDescBind(const BlockDescBind &other, BlockDesc *desc,
                ProgramDescBind *prog);

--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -34,5 +34,25 @@ inline DataType ToDataType(std::type_index type) {
  }
 }

+template <typename Visitor>
+inline void VisitDataType(DataType type, Visitor visitor) {
+  switch (type) {
+    case DataType::FP32:
+      visitor.template operator()<float>();
+      break;
+    case DataType::FP64:
+      visitor.template operator()<double>();
+      break;
+    case DataType::INT32:
+      visitor.template operator()<int>();
+      break;
+    case DataType::INT64:
+      visitor.template operator()<int64_t>();
+      break;
+    default:
+      PADDLE_THROW("Not supported");
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -195,6 +195,14 @@ std::vector<int64_t> vectorize(const DDim& ddim) {
  return result;
 }

+// NOTE: framework::vectorize converts to type int64_t
+//       which does not fit cudnn inputs.
+std::vector<int> vectorize2int(const DDim& ddim) {
+  std::vector<int64_t> temp = vectorize(ddim);
+  std::vector<int> result(temp.begin(), temp.end());
+  return result;
+}
+
 struct ProductVisitor : public boost::static_visitor<int64_t> {
  template <int D>
  int64_t operator()(const Dim<D>& dim) {

--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -93,6 +93,7 @@ int64_t get(const DDim& dim, int idx);
 void set(DDim& dim, int idx, int val);

 std::vector<int64_t> vectorize(const DDim& ddim);
+std::vector<int> vectorize2int(const DDim& ddim);

 int64_t product(const DDim& ddim);


--- a/paddle/framework/details/op_registry.h
+++ b/paddle/framework/details/op_registry.h
@@ -28,7 +28,8 @@ enum OpInfoFillType {
  kOperator = 0,
  kOpProtoAndCheckerMaker = 1,
  kGradOpDescMaker = 2,
-  kVarTypeInference = 3
+  kVarTypeInference = 3,
+  kShapeInference = 4
 };

 template <typename T>
@@ -42,7 +43,10 @@ struct OpInfoFillTypeID {
                             ? kGradOpDescMaker
                             : (std::is_base_of<VarTypeInference, T>::value
                                    ? kVarTypeInference
-                                    : static_cast<OpInfoFillType>(-1))));
+                                    : (std::is_base_of<InferShapeBase, T>::value
+                                           ? kShapeInference
+                                           : static_cast<OpInfoFillType>(
+                                                 -1)))));
  }
 };

@@ -121,6 +125,16 @@ struct OpInfoFiller<T, kVarTypeInference> {
  }
 };

+template <typename T>
+struct OpInfoFiller<T, kShapeInference> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->infer_shape_ = [](InferShapeContext* ctx) {
+      T inference;
+      inference(ctx);
+    };
+  }
+};
+
 }  // namespace details

 }  // namespace framework

--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <set>
 #include <vector>

+#include "paddle/framework/feed_fetch_type.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/scope.h"
@@ -56,6 +57,22 @@ Executor::~Executor() {
  }
 }

+static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
+  if (var_type == VarDesc::LOD_TENSOR) {
+    var->GetMutable<LoDTensor>();
+  } else if (var_type == VarDesc::SELECTED_ROWS) {
+    var->GetMutable<SelectedRows>();
+  } else if (var_type == VarDesc::FEED_MINIBATCH) {
+    var->GetMutable<FeedFetchList>();
+  } else if (var_type == VarDesc::FETCH_LIST) {
+    var->GetMutable<FeedFetchList>();
+  } else {
+    PADDLE_THROW(
+        "Variable type must be "
+        "LoDTensor/SelectedRows/FEED_MINIBATCH/FETCH_LIST.");
+  }
+}
+
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
  // TODO(tonyyang-svail):
  //    - only runs on the first device (i.e. no interdevice communication)
@@ -69,10 +86,12 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
  for (auto& var : block.vars()) {
    if (var.persistable()) {
      auto* ptr = scope->Var(var.name());
+      CreateTensor(ptr, var.type());
      VLOG(3) << "Create Variable " << var.name()
              << " global, which pointer is " << ptr;
    } else {
      auto* ptr = local_scope.Var(var.name());
+      CreateTensor(ptr, var.type());
      VLOG(3) << "Create Variable " << var.name()
              << " locally, which pointer is " << ptr;
    }

--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -14,26 +14,97 @@ limitations under the License. */

 #include "paddle/framework/op_desc.h"
 #include <functional>
+#include <mutex>
 #include <unordered_map>
+#include "glog/logging.h"
 #include "paddle/framework/block_desc.h"
 #include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/shape_inference.h"

 namespace paddle {
 namespace framework {

+class OpDescBind;
+class BlockDescBind;
+class CompileTimeInferShapeContext : public InferShapeContext {
+ public:
+  CompileTimeInferShapeContext(const OpDescBind &op,
+                               const BlockDescBind &block);
+
+  bool HasInput(const std::string &name) const override;
+
+  bool HasOutput(const std::string &name) const override;
+
+  bool HasInputs(const std::string &name) const override;
+
+  bool HasOutputs(const std::string &name) const override;
+
+  DDim GetInputDim(const std::string &name) const override;
+
+  void SetOutputDim(const std::string &name, const DDim &dim) override;
+
+  AttrReader Attrs() const override;
+
+  const std::vector<std::string> &Inputs(
+      const std::string &name) const override;
+
+  const std::vector<std::string> &Outputs(
+      const std::string &name) const override;
+
+ private:
+  DDim GetDim(const std::string &name) const override;
+
+  void SetDim(const std::string &name, const DDim &dim) override;
+
+  const OpDescBind &op_;
+  const BlockDescBind &block_;
+};
+
 OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs,
                       const VariableNameMap &outputs,
                       const AttributeMap &attrs) {
-  op_desc_.set_type(type);
+  desc_.set_type(type);
  inputs_ = inputs;
  outputs_ = outputs;
  attrs_ = attrs;
  need_update_ = true;
 }

+OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog)
+    : desc_(desc), need_update_(false) {
+  // restore inputs_
+  int input_size = desc_.inputs_size();
+  for (int i = 0; i < input_size; ++i) {
+    const OpDesc::Var &var = desc_.inputs(i);
+    std::vector<std::string> &args = inputs_[var.parameter()];
+    int argu_size = var.arguments_size();
+    args.reserve(argu_size);
+    for (int j = 0; j < argu_size; ++j) {
+      args.push_back(var.arguments(j));
+    }
+  }
+  // restore outputs_
+  int output_size = desc_.outputs_size();
+  for (int i = 0; i < output_size; ++i) {
+    const OpDesc::Var &var = desc_.outputs(i);
+    std::vector<std::string> &args = outputs_[var.parameter()];
+    int argu_size = var.arguments_size();
+    args.reserve(argu_size);
+    for (int j = 0; j < argu_size; ++j) {
+      args.push_back(var.arguments(j));
+    }
+  }
+  // restore attrs_
+  for (const OpDesc::Attr &attr : desc_.attrs()) {
+    std::string attr_name = attr.name();
+    attrs_[attr_name] = GetAttrValue(attr, prog->Proto());
+  }
+}
+
 OpDesc *OpDescBind::Proto() {
  Flush();
-  return &op_desc_;
+  return &desc_;
 }

 const std::vector<std::string> &OpDescBind::Input(
@@ -167,23 +238,23 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {

 void OpDescBind::Flush() {
  if (need_update_) {
-    this->op_desc_.mutable_inputs()->Clear();
+    this->desc_.mutable_inputs()->Clear();
    for (auto &ipt : inputs_) {
-      auto *input = op_desc_.add_inputs();
+      auto *input = desc_.add_inputs();
      input->set_parameter(ipt.first);
      VectorToRepeated(ipt.second, input->mutable_arguments());
    }

-    this->op_desc_.mutable_outputs()->Clear();
+    this->desc_.mutable_outputs()->Clear();
    for (auto &opt : outputs_) {
-      auto *output = op_desc_.add_outputs();
+      auto *output = desc_.add_outputs();
      output->set_parameter(opt.first);
      VectorToRepeated(opt.second, output->mutable_arguments());
    }

-    this->op_desc_.mutable_attrs()->Clear();
+    this->desc_.mutable_attrs()->Clear();
    for (auto &attr : attrs_) {
-      auto *attr_desc = op_desc_.add_attrs();
+      auto *attr_desc = desc_.add_attrs();
      attr_desc->set_name(attr.first);
      attr_desc->set_type(
          static_cast<framework::AttrType>(attr.second.which() - 1));
@@ -195,26 +266,26 @@ void OpDescBind::Flush() {
  }
 }

-using InferShapeFuncMap =
-    std::unordered_map<std::string /*op_type*/,
-                       std::function<void(InferShapeContext *)>>;
+static std::once_flag init_infer_shape_funcs;

-static InferShapeFuncMap &InferShapeFuncs() {
-  static InferShapeFuncMap *g_map = nullptr;
-  if (g_map == nullptr) {
-    g_map = new InferShapeFuncMap();
-    auto &info_map = OpInfoMap::Instance();
-    // all registered kernels
-    for (auto &pair : OperatorWithKernel::AllOpKernels()) {
-      auto &info = info_map.Get(pair.first);
-      // use empty type here to avoid runtime checks.
+static void InitInferShapeFuncs() {
+  std::call_once(init_infer_shape_funcs, [] {
+    auto &map = OpInfoMap::Instance();
+    auto &info_map = *map.mutable_map();
+
+    for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) {
+      auto op_type = kern_pair.first;
+      auto &op_info = info_map.at(op_type);
      auto op =
-          static_cast<OperatorWithKernel *>(info.Creator()("", {}, {}, {}));
-      g_map->insert(
-          {pair.first, [op](InferShapeContext *ctx) { op->InferShape(ctx); }});
+          static_cast<OperatorWithKernel *>(op_info.Creator()("", {}, {}, {}));
+      if (op_info.infer_shape_) {  // infer_shape has been registered.
+        continue;
+      }
+      op_info.infer_shape_ = [op](InferShapeContext *ctx) {
+        op->InferShape(ctx);
+      };
    }
-  }
-  return *g_map;
+  });
 }

 void OpDescBind::CheckAttrs() {
@@ -230,13 +301,13 @@ void OpDescBind::CheckAttrs() {
 }

 void OpDescBind::InferShape(const BlockDescBind &block) const {
-  auto &funcs = InferShapeFuncs();
-  auto it = funcs.find(this->Type());
-  if (it == funcs.end()) {
-    PADDLE_THROW("Operator %s has not been registered", this->Type());
-  }
+  VLOG(3) << "CompileTime infer shape on " << Type();
+  InitInferShapeFuncs();
+  auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
+  PADDLE_ENFORCE(static_cast<bool>(infer_shape),
+                 "%s's infer_shape has not been registered", this->Type());
  CompileTimeInferShapeContext ctx(*this, block);
-  it->second(&ctx);
+  infer_shape(&ctx);
 }

 void OpDescBind::InferVarType(BlockDescBind *block) const {
@@ -253,5 +324,97 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
  }
 }

+CompileTimeInferShapeContext::CompileTimeInferShapeContext(
+    const OpDescBind &op, const BlockDescBind &block)
+    : op_(op), block_(block) {}
+
+bool CompileTimeInferShapeContext::HasInput(const std::string &name) const {
+  const std::vector<std::string> &input_names = op_.Input(name);
+  auto length = input_names.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Input(%s) should have only one value, "
+                    "but it have %d now",
+                    name, length);
+  return block_.HasVarRecursive(input_names[0]);
+}
+
+bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const {
+  const std::vector<std::string> &output_names = op_.Output(name);
+  auto length = output_names.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Output(%s) should have only one value, "
+                    "but it have %d now",
+                    name, length);
+  return block_.HasVarRecursive(output_names[0]);
+}
+
+bool CompileTimeInferShapeContext::HasInputs(const std::string &name) const {
+  const std::vector<std::string> &input_names = op_.Input(name);
+  if (input_names.empty()) {
+    return false;
+  }
+  for (auto &input : input_names) {
+    if (!block_.HasVarRecursive(input)) return false;
+  }
+  return true;
+}
+
+bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const {
+  const std::vector<std::string> &output_names = op_.Output(name);
+  if (output_names.empty()) {
+    return false;
+  }
+  for (auto &output : output_names) {
+    if (!block_.HasVarRecursive(output)) return false;
+  }
+  return true;
+}
+
+DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const {
+  std::vector<DDim> ddims = GetInputsDim(name);
+  auto length = ddims.size();
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Input(%s) should have 1 value, "
+                    "but it has %d now",
+                    name, length);
+  return ddims[0];
+}
+
+void CompileTimeInferShapeContext::SetOutputDim(const std::string &name,
+                                                const DDim &dim) {
+  SetOutputsDim(name, {dim});
+}
+
+AttrReader CompileTimeInferShapeContext::Attrs() const {
+  return AttrReader(op_.GetAttrMap());
+}
+
+const std::vector<std::string> &CompileTimeInferShapeContext::Inputs(
+    const std::string &name) const {
+  return op_.Input(name);
+}
+
+const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
+    const std::string &name) const {
+  return op_.Output(name);
+}
+
+DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
+  auto var = block_.FindVarRecursive(name);
+  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  return framework::make_ddim(var->Shape());
+}
+
+void CompileTimeInferShapeContext::SetDim(const std::string &name,
+                                          const DDim &dim) {
+  block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -24,6 +24,7 @@ namespace paddle {
 namespace framework {

 class BlockDescBind;
+class ProgramDescBind;

 class OpDescBind {
 public:
@@ -32,11 +33,13 @@ class OpDescBind {
  OpDescBind(const std::string &type, const VariableNameMap &inputs,
             const VariableNameMap &outputs, const AttributeMap &attrs);

+  OpDescBind(const OpDesc &desc, ProgramDescBind *prog);
+
  OpDesc *Proto();

-  std::string Type() const { return op_desc_.type(); }
+  std::string Type() const { return desc_.type(); }

-  void SetType(const std::string &type) { op_desc_.set_type(type); }
+  void SetType(const std::string &type) { desc_.set_type(type); }

  const std::vector<std::string> &Input(const std::string &name) const;

@@ -104,6 +107,8 @@ class OpDescBind {

  void InferVarType(BlockDescBind *block) const;

+  void MarkAsTarget() { desc_.set_is_target(true); }
+
  void Flush();

 private:
@@ -117,7 +122,7 @@ class OpDescBind {
    return ret_val;
  }

-  OpDesc op_desc_;
+  OpDesc desc_;
  VariableNameMap inputs_;
  VariableNameMap outputs_;
  AttributeMap attrs_;

--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@@ -25,12 +25,19 @@
 namespace paddle {
 namespace framework {

+class InferShapeBase {
+ public:
+  virtual ~InferShapeBase() = default;
+  virtual void operator()(InferShapeContext*) const = 0;
+};
+
 struct OpInfo {
  OpCreator creator_;
  GradOpMakerFN grad_op_maker_;
  OpProto* proto_{nullptr};
  OpAttrChecker* checker_{nullptr};
  InferVarTypeFN infer_var_type_;
+  InferShapeFN infer_shape_;

  bool HasOpProtoAndChecker() const {
    return proto_ != nullptr && checker_ != nullptr;
@@ -87,13 +94,13 @@ class OpInfoMap {
    }
  }

-  const std::unordered_map<std::string, const OpInfo>& map() const {
-    return map_;
-  }
+  const std::unordered_map<std::string, OpInfo>& map() const { return map_; }
+
+  std::unordered_map<std::string, OpInfo>* mutable_map() { return &map_; }

 private:
  OpInfoMap() = default;
-  std::unordered_map<std::string, const OpInfo> map_;
+  std::unordered_map<std::string, OpInfo> map_;

  DISABLE_COPY_AND_ASSIGN(OpInfoMap);
 };

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -29,6 +29,7 @@ limitations under the License. */
 #include "paddle/framework/op_desc.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
+#include "paddle/framework/shape_inference.h"

 namespace paddle {
 namespace framework {
@@ -161,6 +162,10 @@ class OpKernelRegistrar : public Registrar {
  REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \
                    op_maker_class);

+#define REGISTER_OP_WITH_KERNEL(op_type, ...)                         \
+  REGISTER_OPERATOR(op_type, ::paddle::framework::OperatorWithKernel, \
+                    ##__VA_ARGS__)
+
 #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
  REGISTER_OPERATOR(op_type, op_class, op_maker_class)

@@ -223,6 +228,10 @@ class OpKernelRegistrar : public Registrar {
  USE_OP_ITSELF(op_type);        \
  USE_OP_DEVICE_KERNEL(op_type, CPU);

+#define USE_GPU_ONLY_OP(op_type) \
+  USE_OP_ITSELF(op_type);        \
+  USE_OP_DEVICE_KERNEL(op_type, GPU)
+
 #define USE_OP(op_type)   \
  USE_OP_ITSELF(op_type); \
  USE_OP_KERNEL(op_type)

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/framework/operator.h"
 #include <algorithm>
 #include <atomic>
+#include "paddle/framework/shape_inference.h"

 namespace paddle {
 namespace framework {
@@ -33,24 +34,6 @@ ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
 }
 #endif

-const Tensor* GetTensorFromVar(const Variable* var) {
-  if (var->IsType<LoDTensor>()) {
-    return &var->Get<LoDTensor>();
-  }
-  PADDLE_ENFORCE(var->IsType<Tensor>(),
-                 "The Input must be LoDTensor or Tensor.");
-  return &var->Get<Tensor>();
-}
-
-Tensor* GetTensorFromVar(Variable* var) {
-  if (var->IsType<LoDTensor>()) {
-    return var->GetMutable<LoDTensor>();
-  }
-  PADDLE_ENFORCE(var->IsType<Tensor>(),
-                 "The Input must be LoDTensor or Tensor.");
-  return var->GetMutable<Tensor>();
-}
-
 std::string OperatorBase::Input(const std::string& name) const {
  auto& ins = Inputs(name);
  PADDLE_ENFORCE_LE(ins.size(), 1UL,
@@ -204,6 +187,30 @@ void OperatorBase::GenerateTemporaryNames() {
  }
 }

+static const Tensor* GetTensorFromVar(const Variable* var) {
+  const Tensor* t = nullptr;
+  if (var->IsType<LoDTensor>()) {
+    t = &(var->Get<LoDTensor>());
+  } else if (var->IsType<SelectedRows>()) {
+    t = &(var->Get<SelectedRows>().value());
+  } else {
+    PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
+  }
+  return t;
+}
+
+static Tensor* GetMutableTensorFromVar(Variable* var) {
+  Tensor* t = nullptr;
+  if (var->IsType<LoDTensor>()) {
+    t = var->GetMutable<LoDTensor>();
+  } else if (var->IsType<SelectedRows>()) {
+    t = var->GetMutable<SelectedRows>()->mutable_value();
+  } else {
+    PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
+  }
+  return t;
+}
+
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
  auto* var = InputVar(name);
@@ -227,7 +234,7 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
 template <>
 Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
  auto var = OutputVar(name);
-  return var == nullptr ? nullptr : var->GetMutable<LoDTensor>();
+  return var == nullptr ? nullptr : GetMutableTensorFromVar(var);
 }

 template <>
@@ -240,7 +247,7 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
                 [&](const std::string& sub_name) {
                   auto var = scope_.FindVar(sub_name);
                   return var == nullptr ? nullptr
-                                         : var->GetMutable<LoDTensor>();
+                                         : GetMutableTensorFromVar(var);
                 });
  return res;
 }
@@ -267,5 +274,137 @@ bool OpSupportGPU(const std::string& op_type) {
  return false;
 }

+class RuntimeInferShapeContext : public InferShapeContext {
+ public:
+  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
+      : op_(op), scope_(scope) {}
+
+  bool HasInput(const std::string& name) const override {
+    auto& ins = Inputs(name);
+    size_t length = ins.size();
+    if (length == 0) {
+      return false;
+    }
+    PADDLE_ENFORCE_EQ(length, 1UL, "Input %s should have more than one inputs",
+                      name);
+    auto ipt = ins[0];
+    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
+    return var != nullptr;
+  }
+
+  bool HasOutput(const std::string& name) const override {
+    auto& outs = Outputs(name);
+    size_t length = outs.size();
+    if (length == 0) {
+      return false;
+    }
+    PADDLE_ENFORCE_EQ(length, 1UL, "Output %s should have more than one inputs",
+                      name);
+    auto ipt = outs[0];
+    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
+    return var != nullptr;
+  }
+
+  bool HasInputs(const std::string& name) const override {
+    auto inputs = op_.Inputs(name);
+    if (inputs.empty()) {
+      return false;
+    }
+    for (auto& input : inputs) {
+      if (scope_.FindVar(input) == nullptr) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool HasOutputs(const std::string& name) const override {
+    auto outputs = op_.Outputs(name);
+    if (outputs.empty()) {
+      return false;
+    }
+    for (auto& output : outputs) {
+      if (scope_.FindVar(output) == nullptr) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  DDim GetInputDim(const std::string& name) const override {
+    return GetDim(op_.Input(name));
+  }
+
+  void SetOutputDim(const std::string& name, const DDim& dim) override {
+    SetDim(op_.Output(name), dim);
+  }
+
+  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
+
+  const std::vector<std::string>& Inputs(
+      const std::string& name) const override {
+    return op_.Inputs(name);
+  }
+
+  const std::vector<std::string>& Outputs(
+      const std::string& name) const override {
+    return op_.Outputs(name);
+  }
+
+ private:
+  DDim GetDim(const std::string& name) const override {
+    Variable* var = scope_.FindVar(name);
+    if (var->IsType<LoDTensor>()) {
+      return var->Get<LoDTensor>().dims();
+    } else if (var->IsType<SelectedRows>()) {
+      return var->Get<SelectedRows>().GetCompleteDims();
+    } else {
+      PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
+    }
+  }
+
+  void SetDim(const std::string& name, const DDim& dim) override {
+    Variable* var = scope_.FindVar(name);
+    if (var->IsType<LoDTensor>()) {
+      var->GetMutable<LoDTensor>()->Resize(dim);
+    } else if (var->IsType<SelectedRows>()) {
+      var->GetMutable<SelectedRows>()->set_height(dim[0]);
+    } else {
+      PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
+    }
+  }
+
+  const OperatorBase& op_;
+  const Scope& scope_;
+};
+
+void OperatorWithKernel::Run(const Scope& scope,
+                             const platform::DeviceContext& dev_ctx) const {
+  VLOG(3) << "Running operator " << this->Type();
+  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
+  this->InferShape(&infer_shape_ctx);
+
+  ExecutionContext ctx(*this, scope, dev_ctx);
+
+  // check if op[type] has kernel registered.
+  auto& all_op_kernels = AllOpKernels();
+  auto kernels_iter = all_op_kernels.find(type_);
+  if (kernels_iter == all_op_kernels.end()) {
+    PADDLE_THROW(
+        "There are no kernels which are registered in the %s operator.", type_);
+  }
+
+  // check if op[type] have kernel for kernel_key
+  OpKernelMap& kernels = kernels_iter->second;
+  auto kernel_key = OpKernelKey(IndicateDataType(ctx), dev_ctx);
+  auto kernel_iter = kernels.find(kernel_key);
+
+  if (kernel_iter == kernels.end()) {
+    PADDLE_THROW("The operator %s does not support %s", type_, kernel_key);
+  }
+
+  kernel_iter->second->Compute(ctx);
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -28,7 +28,7 @@ limitations under the License. */
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/scope.h"
-#include "paddle/framework/shape_inference.h"
+#include "paddle/framework/selected_rows.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/place.h"
@@ -60,9 +60,6 @@ inline std::string GradVarName(const std::string& var_name) {
 class OperatorBase;
 class ExecutionContext;

-extern const Tensor* GetTensorFromVar(const Variable* var);
-extern Tensor* GetTensorFromVar(Variable* var);
-
 /**
 * OperatorBase has the basic element that Net will call to do computation.
 * Only CreateOperator from OpRegistry will new Operator directly. User
@@ -125,7 +122,7 @@ class OperatorBase {
 protected:
  std::string type_;
  // NOTE: in case of OpGrad, inputs_ contains:
-  // I (Inputs)opear
+  // I (Inputs)
  // O (Outputs)
  // OG (Output Gradients)
  VariableNameMap inputs_;
@@ -290,6 +287,16 @@ class ExecutionContext {
    return device_context_;
  }

+  //! Get actual name vector for this input.
+  const std::vector<std::string>& Inputs(const std::string& name) const {
+    return op_.Inputs(name);
+  }
+
+  //! Get actual name vector for this output.
+  const std::vector<std::string>& Outputs(const std::string& name) const {
+    return op_.Outputs(name);
+  }
+
 #ifdef PADDLE_WITH_CUDA
  const platform::CUDADeviceContext& cuda_device_context() const {
    PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
@@ -319,226 +326,6 @@ template <>
 std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
    const std::string& name) const;

-class CompileTimeInferShapeContext : public InferShapeContext {
- public:
-  CompileTimeInferShapeContext(const OpDescBind& op, const BlockDescBind& block)
-      : op_(op), block_(block) {}
-
-  bool HasInput(const std::string& name) const override {
-    const std::vector<std::string>& input_names = op_.Input(name);
-    auto length = input_names.size();
-    if (length == 0) {
-      return false;
-    }
-    PADDLE_ENFORCE_EQ(length, 1UL,
-                      "Input(%s) should have only one value, "
-                      "but it have %d now",
-                      name, length);
-    return block_.HasVarRecursive(input_names[0]);
-  }
-
-  bool HasOutput(const std::string& name) const override {
-    const std::vector<std::string>& output_names = op_.Output(name);
-    auto length = output_names.size();
-    if (length == 0) {
-      return false;
-    }
-    PADDLE_ENFORCE_EQ(length, 1UL,
-                      "Output(%s) should have only one value, "
-                      "but it have %d now",
-                      name, length);
-    return block_.HasVarRecursive(output_names[0]);
-  }
-
-  bool HasInputs(const std::string& name) const override {
-    const std::vector<std::string>& input_names = op_.Input(name);
-    if (input_names.empty()) {
-      return false;
-    }
-    for (auto& input : input_names) {
-      if (!block_.HasVarRecursive(input)) return false;
-    }
-    return true;
-  }
-
-  bool HasOutputs(const std::string& name) const override {
-    const std::vector<std::string>& output_names = op_.Output(name);
-    if (output_names.empty()) {
-      return false;
-    }
-    for (auto& output : output_names) {
-      if (!block_.HasVarRecursive(output)) return false;
-    }
-    return true;
-  }
-
-  DDim GetInputDim(const std::string& name) const override {
-    std::vector<DDim> ddims = GetInputsDim(name);
-    auto length = ddims.size();
-    PADDLE_ENFORCE_EQ(length, 1UL,
-                      "Input(%s) should have 1 value, "
-                      "but it has %d now",
-                      name, length);
-    return ddims[0];
-  }
-
-  void SetInputDim(const std::string& name, const DDim& dim) override {
-    SetInputsDim(name, {dim});
-  }
-
-  DDim GetOutputDim(const std::string& name) const override {
-    std::vector<DDim> ddims = GetOutputsDim(name);
-    auto length = ddims.size();
-    PADDLE_ENFORCE_EQ(length, 1UL,
-                      "Output(%s) should have 1 value, "
-                      "but it has %d now",
-                      name, length);
-    return ddims[0];
-  }
-
-  void SetOutputDim(const std::string& name, const DDim& dim) override {
-    SetOutputsDim(name, {dim});
-  }
-
-  AttrReader Attrs() const override { return AttrReader(op_.GetAttrMap()); }
-
-  const std::vector<std::string>& Inputs(
-      const std::string& name) const override {
-    return op_.Input(name);
-  }
-
-  const std::vector<std::string>& Outputs(
-      const std::string& name) const override {
-    return op_.Output(name);
-  }
-
- private:
-  DDim GetDim(const std::string& name) const override {
-    return framework::make_ddim(block_.FindVarRecursive(name)->Shape());
-  }
-
-  void SetDim(const std::string& name, const DDim& dim) override {
-    block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
-  }
-
-  const OpDescBind& op_;
-  const BlockDescBind& block_;
-};
-
-class RuntimeInferShapeContext : public InferShapeContext {
- public:
-  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
-      : op_(op), scope_(scope) {}
-
-  bool HasInput(const std::string& name) const override {
-    auto& ins = Inputs(name);
-    size_t length = ins.size();
-    if (length == 0) {
-      return false;
-    }
-    PADDLE_ENFORCE_EQ(length, 1UL, "Input %s should have more than one inputs",
-                      name);
-    auto ipt = ins[0];
-    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-    return var != nullptr;
-  }
-
-  bool HasOutput(const std::string& name) const override {
-    auto& outs = Outputs(name);
-    size_t length = outs.size();
-    if (length == 0) {
-      return false;
-    }
-    PADDLE_ENFORCE_EQ(length, 1UL, "Output %s should have more than one inputs",
-                      name);
-    auto ipt = outs[0];
-    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-    return var != nullptr;
-  }
-
-  bool HasInputs(const std::string& name) const override {
-    auto inputs = op_.Inputs(name);
-    if (inputs.empty()) {
-      return false;
-    }
-    for (auto& input : inputs) {
-      if (scope_.FindVar(input) == nullptr) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  bool HasOutputs(const std::string& name) const override {
-    auto outputs = op_.Outputs(name);
-    if (outputs.empty()) {
-      return false;
-    }
-    for (auto& output : outputs) {
-      if (scope_.FindVar(output) == nullptr) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  DDim GetInputDim(const std::string& name) const override {
-    return GetDim(op_.Input(name));
-  }
-
-  void SetInputDim(const std::string& name, const DDim& dim) override {
-    SetDim(op_.Input(name), dim);
-  }
-
-  DDim GetOutputDim(const std::string& name) const override {
-    return GetDim(op_.Output(name));
-  }
-
-  void SetOutputDim(const std::string& name, const DDim& dim) override {
-    SetDim(op_.Output(name), dim);
-  }
-
-  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
-
-  const std::vector<std::string>& Inputs(
-      const std::string& name) const override {
-    return op_.Inputs(name);
-  }
-
-  const std::vector<std::string>& Outputs(
-      const std::string& name) const override {
-    return op_.Outputs(name);
-  }
-
- private:
-  template <bool Allocate>
-  Tensor* GetTensor(const std::string& name) const {
-    Tensor* t = nullptr;
-    auto* var = scope_.FindVar(name);
-    if (!var->IsType<LoDTensor>() && !var->IsType<Tensor>()) {
-      if (Allocate) {
-        t = var->GetMutable<LoDTensor>();
-      } else {
-        PADDLE_THROW("Variable(%s) should be tensor", name);
-      }
-    } else {
-      t = GetTensorFromVar(scope_.FindVar(name));
-    }
-    return t;
-  }
-
-  DDim GetDim(const std::string& name) const override {
-    return GetTensor<false>(name)->dims();
-  }
-
-  void SetDim(const std::string& name, const DDim& dim) override {
-    GetTensor<true>(name)->Resize(dim);
-  }
-
-  const OperatorBase& op_;
-  const Scope& scope_;
-};
-
 class OpKernelBase {
 public:
  /**
@@ -597,32 +384,7 @@ class OperatorWithKernel : public OperatorBase {
      : OperatorBase(type, inputs, outputs, attrs) {}

  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const final {
-    VLOG(3) << "Running operator " << this->Type();
-    RuntimeInferShapeContext infer_shape_ctx(*this, scope);
-    this->InferShape(&infer_shape_ctx);
-
-    ExecutionContext ctx(*this, scope, dev_ctx);
-
-    // check if op[type] has kernel registered.
-    auto& all_op_kernels = AllOpKernels();
-    auto kernels_iter = all_op_kernels.find(type_);
-    if (kernels_iter == all_op_kernels.end()) {
-      PADDLE_THROW("op[%s] has no kernel", type_);
-    }
-
-    // check if op[type] have kernel for kernel_key
-    OpKernelMap& kernels = kernels_iter->second;
-    auto kernel_key = OpKernelKey(IndicateDataType(ctx), dev_ctx);
-    auto kernel_iter = kernels.find(kernel_key);
-
-    if (kernel_iter == kernels.end()) {
-      PADDLE_THROW("op[%s] has no kernel with kernel_key[%s]", type_,
-                   kernel_key);
-    }
-
-    kernel_iter->second->Compute(ctx);
-  }
+           const platform::DeviceContext& dev_ctx) const final;

  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
  AllOpKernels() {
@@ -638,12 +400,15 @@ class OperatorWithKernel : public OperatorBase {
                       });
  }

-  virtual void InferShape(InferShapeContext* ctx) const = 0;
+  virtual void InferShape(InferShapeContext* ctx) const {
+    OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
+  }

 protected:
  // indicate kernel DataType by input data. Defaultly all input data must be
  // same.
  virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
+    VLOG(3) << "Default IndicateDataType " << this->Type();
    auto& scope = ctx.scope();
    int data_type = -1;
    for (auto& input : this->inputs_) {
@@ -655,11 +420,14 @@ class OperatorWithKernel : public OperatorBase {
            t = &var->Get<Tensor>();
          } else if (var->IsType<LoDTensor>()) {
            t = &var->Get<LoDTensor>();
+          } else if (var->IsType<SelectedRows>()) {
+            t = &(var->Get<SelectedRows>().value());
          }
          if (t != nullptr) {
            int tmp = static_cast<int>(ToDataType(t->type()));
+            VLOG(3) << "Input " << ipt_name << " with data_type " << tmp;
            PADDLE_ENFORCE(tmp == data_type || data_type == -1,
-                           "DataType of Paddle Op must be same.");
+                           "DataType of Paddle Op %s must be same.", Type());
            data_type = tmp;
          }
        }

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -237,12 +237,12 @@ TEST(OpKernel, multi_inputs) {

  paddle::platform::CPUDeviceContext cpu_device_context;
  paddle::framework::Scope scope;
-  scope.Var("x0")->GetMutable<Tensor>();
-  scope.Var("x1")->GetMutable<Tensor>();
-  scope.Var("x2")->GetMutable<Tensor>();
-  scope.Var("k0")->GetMutable<Tensor>();
-  scope.Var("y0")->GetMutable<Tensor>();
-  scope.Var("y1")->GetMutable<Tensor>();
+  scope.Var("x0")->GetMutable<LoDTensor>();
+  scope.Var("x1")->GetMutable<LoDTensor>();
+  scope.Var("x2")->GetMutable<LoDTensor>();
+  scope.Var("k0")->GetMutable<LoDTensor>();
+  scope.Var("y0")->GetMutable<LoDTensor>();
+  scope.Var("y1")->GetMutable<LoDTensor>();

  auto op = paddle::framework::OpRegistry::CreateOp(op_desc, nullptr);
  op->Run(scope, cpu_device_context);

--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
@@ -19,9 +19,9 @@ namespace paddle {
 namespace framework {

 BlockDescBind *ProgramDescBind::AppendBlock(const BlockDescBind &parent) {
-  auto *b = prog_.add_blocks();
+  auto *b = desc_.add_blocks();
  b->set_parent_idx(parent.ID());
-  b->set_idx(prog_.blocks_size() - 1);
+  b->set_idx(desc_.blocks_size() - 1);
  blocks_.emplace_back(new BlockDescBind(this, b));
  return blocks_.back().get();
 }
@@ -30,23 +30,39 @@ ProgramDesc *ProgramDescBind::Proto() {
  for (auto &block : blocks_) {
    block->Flush();
  }
-  return &prog_;
+  return &desc_;
 }

 ProgramDescBind::ProgramDescBind() {
-  auto *block = prog_.mutable_blocks()->Add();
+  auto *block = desc_.mutable_blocks()->Add();
  block->set_idx(kRootBlockIndex);
  block->set_parent_idx(kNoneBlockIndex);
  blocks_.emplace_back(new BlockDescBind(this, block));
 }

 ProgramDescBind::ProgramDescBind(const ProgramDescBind &o) {
-  prog_ = o.prog_;
+  desc_ = o.desc_;

-  for (int i = 0; i < prog_.blocks_size(); ++i) {
-    auto *block = prog_.mutable_blocks(i);
+  for (int i = 0; i < desc_.blocks_size(); ++i) {
+    auto *block = desc_.mutable_blocks(i);
    blocks_.emplace_back(new BlockDescBind(*o.blocks_[i], block, this));
  }
 }
+
+ProgramDescBind::ProgramDescBind(const ProgramDesc &desc) {
+  desc_ = desc;
+  for (auto &block_desc : *desc_.mutable_blocks()) {
+    blocks_.emplace_back(new BlockDescBind(this, &block_desc));
+  }
+}
+
+ProgramDescBind::ProgramDescBind(const std::string &binary_str) {
+  PADDLE_ENFORCE(desc_.ParseFromString(binary_str),
+                 "Fail to parse program_desc from binary string.");
+  for (auto &block_desc : *desc_.mutable_blocks()) {
+    blocks_.emplace_back(new BlockDescBind(this, &block_desc));
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -29,8 +29,12 @@ class ProgramDescBind {
 public:
  ProgramDescBind();

+  explicit ProgramDescBind(const ProgramDesc &desc);
+
  ProgramDescBind(const ProgramDescBind &o);

+  explicit ProgramDescBind(const std::string &binary_str);
+
  BlockDescBind *AppendBlock(const BlockDescBind &parent);

  BlockDescBind *Block(size_t idx) { return blocks_[idx].get(); }
@@ -40,7 +44,7 @@ class ProgramDescBind {
  ProgramDesc *Proto();

 private:
-  ProgramDesc prog_;
+  ProgramDesc desc_;

  std::vector<std::unique_ptr<BlockDescBind>> blocks_;
 };

--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/framework/program_desc_test.cc
@@ -59,7 +59,7 @@ TEST(ProgramDesc, copy_ctor) {
  };

  ASSERT_EQ(global_block->LocalVarNames(), global_block_copy->LocalVarNames());
-  ASSERT_EQ(3, global_block_copy->LocalVarNames().size());
+  ASSERT_EQ(3UL, global_block_copy->LocalVarNames().size());
  assert_same_var("X", x);
  assert_same_var("Y", y);
  assert_same_var("Out", out);
@@ -79,5 +79,67 @@ TEST(ProgramDesc, copy_ctor) {
  // Not check block's protostr are same it because the order of vars could be
  // different and it is correct.
 }
+
+TEST(ProgramDescBind, serialize_and_deserialize) {
+  ProgramDescBind program_origin;
+  auto* global_block = program_origin.Block(0);
+  auto* x = global_block->Var("X");
+  x->SetType(VarDesc_VarType_LOD_TENSOR);
+  x->SetLoDLevel(0);
+  x->SetDataType(FP32);
+  x->SetShape({1000, 784});
+
+  auto* y = global_block->Var("Y");
+  y->SetType(VarDesc_VarType_LOD_TENSOR);
+  y->SetLoDLevel(0);
+  y->SetDataType(FP32);
+  y->SetShape({784, 100});
+
+  auto* op = global_block->AppendOp();
+  op->SetType("mul");
+  op->SetInput("X", {x->Name()});
+  op->SetInput("Y", {y->Name()});
+
+  auto* out = global_block->Var("Out");
+  out->SetType(VarDesc_VarType_LOD_TENSOR);
+  op->SetOutput("Y", {out->Name()});
+
+  std::string binary_str;
+  program_origin.Proto()->SerializeToString(&binary_str);
+
+  ProgramDescBind program_restored(binary_str);
+  auto* global_block_restored = program_restored.Block(0);
+  ASSERT_NE(global_block, global_block_restored);
+
+  auto assert_same_var = [&](const std::string& name, VarDescBind* var_before) {
+    ASSERT_TRUE(global_block_restored->HasVar(name));
+    auto* restored = global_block_restored->Var(name);
+    ASSERT_NE(restored, var_before);
+    ASSERT_EQ(restored->Name(), var_before->Name());
+    ASSERT_EQ(restored->GetType(), var_before->GetType());
+    ASSERT_EQ(restored->Shape(), var_before->Shape());
+    ASSERT_EQ(restored->Proto()->SerializeAsString(),
+              var_before->Proto()->SerializeAsString());
+  };
+
+  ASSERT_EQ(global_block->LocalVarNames(),
+            global_block_restored->LocalVarNames());
+  ASSERT_EQ(3UL, global_block_restored->LocalVarNames().size());
+  assert_same_var("X", x);
+  assert_same_var("Y", y);
+  assert_same_var("Out", out);
+
+  for (size_t i = 0; i < global_block->OpSize(); ++i) {
+    auto op_origin = global_block->Op(i);
+    auto op_restored = global_block->Op(i);
+
+    ASSERT_EQ(op_origin->Type(), op_restored->Type());
+    ASSERT_EQ(op_origin->Inputs(), op_restored->Inputs());
+    ASSERT_EQ(op_origin->Outputs(), op_restored->Outputs());
+
+    ASSERT_EQ(op_restored->Proto()->SerializeAsString(),
+              op_origin->Proto()->SerializeAsString());
+  }
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
@@ -46,7 +46,7 @@ bool IsTarget(const OpDesc& op_desc) {
  return false;
 }

-void prune_impl(const ProgramDesc& input, ProgramDesc& output, int block_id) {
+void prune_impl(const ProgramDesc& input, ProgramDesc* output, int block_id) {
  // TODO(tonyyang-svail):
  //    - will change to use multiple blocks for RNN op and Cond Op

@@ -91,8 +91,8 @@ void prune_impl(const ProgramDesc& input, ProgramDesc& output, int block_id) {
  // we reverse the should_run vector
  std::reverse(should_run.begin(), should_run.end());

-  output = input;
-  auto* op_field = output.mutable_blocks(block_id)->mutable_ops();
+  *output = input;
+  auto* op_field = output->mutable_blocks(block_id)->mutable_ops();
  op_field->Clear();
  for (size_t i = 0; i < should_run.size(); ++i) {
    if (should_run[i]) {
@@ -101,7 +101,8 @@ void prune_impl(const ProgramDesc& input, ProgramDesc& output, int block_id) {
  }
 }

-void Prune(const ProgramDesc& input, ProgramDesc& output) {
+// TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
+void Prune(const ProgramDesc& input, ProgramDesc* output) {
  prune_impl(input, output, 0);
 }


--- a/paddle/framework/prune.h
+++ b/paddle/framework/prune.h
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-void Prune(const ProgramDesc& input, ProgramDesc& output);
+void Prune(const ProgramDesc& input, ProgramDesc* output);

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/prune_test.cc
+++ b/paddle/framework/prune_test.cc
@@ -59,11 +59,11 @@ TEST(Prune, one_operator) {
  f::ProgramDesc *pdesc = program.Proto();
  f::ProgramDesc pruned;

-  Prune(*pdesc, pruned);
+  Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 0);

  pdesc->mutable_blocks(0)->mutable_ops(0)->set_is_target(true);
-  Prune(*pdesc, pruned);
+  Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 1);
 }

@@ -81,7 +81,7 @@ TEST(Prune, forward) {
  for (int i = 0; i < pdesc->blocks(0).ops_size(); ++i) {
    f::ProgramDesc pruned;
    pdesc->mutable_blocks(0)->mutable_ops(i)->set_is_target(true);
-    Prune(*pdesc, pruned);
+    Prune(*pdesc, &pruned);
    PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), i + 1);
  }
 }
@@ -100,7 +100,7 @@ TEST(Prune, multi_input_op) {
  pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true);

  f::ProgramDesc pruned;
-  Prune(*pdesc, pruned);
+  Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 4);
 }

@@ -116,7 +116,7 @@ TEST(Prune, multi_output_op) {
  pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);

  f::ProgramDesc pruned;
-  Prune(*pdesc, pruned);
+  Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 2);
 }

@@ -133,6 +133,6 @@ TEST(Prune, multi_target) {
  pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);

  f::ProgramDesc pruned;
-  Prune(*pdesc, pruned);
+  Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 3);
 }
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -16,6 +16,7 @@ limitations under the License. */

 #include <memory>  // for unique_ptr
 #include <mutex>   // for call_once
+#include "glog/logging.h"
 #include "paddle/string/printf.h"

 namespace paddle {
@@ -23,7 +24,10 @@ namespace framework {

 Scope::~Scope() {
  DropKids();
-  for (auto& kv : vars_) delete kv.second;
+  for (auto& kv : vars_) {
+    VLOG(3) << "Destroy variable " << kv.first;
+    delete kv.second;
+  }
 }

 Scope& Scope::NewScope() const {
@@ -38,6 +42,7 @@ Variable* Scope::Var(const std::string& name) {
  }
  Variable* v = new Variable();
  vars_[name] = v;
+  VLOG(3) << "Create variable " << name << " on scope";
  v->name_ = &(vars_.find(name)->first);
  return v;
 }

--- a/paddle/framework/selected_rows.h
+++ b/paddle/framework/selected_rows.h
@@ -23,7 +23,10 @@ class SelectedRows {
    value_.reset(new Tensor());
  }

-  SelectedRows() { value_.reset(new Tensor()); }
+  SelectedRows() {
+    height_ = 0;
+    value_.reset(new Tensor());
+  }

  platform::Place place() const { return value_->place(); }

@@ -37,6 +40,8 @@ class SelectedRows {

  const Vector<int64_t>& rows() const { return rows_; }

+  Vector<int64_t>* mutable_rows() { return &rows_; }
+
  void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }

  DDim GetCompleteDims() const {

--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/framework/shape_inference.h"
+
+namespace paddle {
+namespace framework {
+
+std::vector<framework::DDim> InferShapeContext::GetInputsDim(
+    const std::string &name) const {
+  const std::vector<std::string> &names = Inputs(name);
+  return GetDims(names);
+}
+
+void InferShapeContext::SetOutputsDim(
+    const std::string &name, const std::vector<framework::DDim> &dims) {
+  auto &names = Outputs(name);
+  SetDims(names, dims);
+}
+
+void InferShapeContext::ShareLoD(const std::string &in, const std::string &out,
+                                 size_t i, size_t j) const {}
+
+std::vector<framework::DDim> InferShapeContext::GetDims(
+    const std::vector<std::string> &names) const {
+  std::vector<framework::DDim> ret;
+  ret.reserve(names.size());
+  std::transform(
+      names.begin(), names.end(), std::back_inserter(ret),
+      [this](const std::string &name) { return this->GetDim(name); });
+  return ret;
+}
+
+void InferShapeContext::SetDims(const std::vector<std::string> &names,
+                                const std::vector<framework::DDim> &dims) {
+  size_t length = names.size();
+  PADDLE_ENFORCE_EQ(length, dims.size());
+  for (size_t i = 0; i < length; ++i) {
+    SetDim(names[i], dims[i]);
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include "paddle/framework/attribute.h"
 #include "paddle/framework/ddim.h"

 namespace paddle {
@@ -21,7 +22,7 @@ namespace framework {

 class InferShapeContext {
 public:
-  virtual ~InferShapeContext() {}
+  virtual ~InferShapeContext() = default;
  virtual bool HasInput(const std::string &name) const = 0;
  virtual bool HasOutput(const std::string &name) const = 0;

@@ -29,57 +30,32 @@ class InferShapeContext {
  virtual bool HasOutputs(const std::string &name) const = 0;

  virtual framework::DDim GetInputDim(const std::string &name) const = 0;
-  std::vector<framework::DDim> GetInputsDim(const std::string &name) const {
-    const std::vector<std::string> &names = Inputs(name);
-    return GetDims(names);
-  }
-  virtual void SetInputDim(const std::string &name,
-                           const framework::DDim &dim) = 0;
-  void SetInputsDim(const std::string &name,
-                    const std::vector<framework::DDim> &dims) {
-    auto &names = Inputs(name);
-    SetDims(names, dims);
-  }
-  virtual framework::DDim GetOutputDim(const std::string &name) const = 0;
-  std::vector<framework::DDim> GetOutputsDim(const std::string &name) const {
-    const std::vector<std::string> &names = Outputs(name);
-    return GetDims(names);
-  }
+
+  std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
+
  virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
  void SetOutputsDim(const std::string &name,
-                     const std::vector<framework::DDim> &dims) {
-    auto &names = Outputs(name);
-    SetDims(names, dims);
-  }
+                     const std::vector<framework::DDim> &dims);
+
  virtual AttrReader Attrs() const = 0;
  virtual const std::vector<std::string> &Inputs(
      const std::string &name) const = 0;
  virtual const std::vector<std::string> &Outputs(
      const std::string &name) const = 0;
+
  // TODO(qiao) implement this function
  void ShareLoD(const std::string &in, const std::string &out, size_t i = 0,
-                size_t j = 0) const {}
+                size_t j = 0) const;

 protected:
  virtual framework::DDim GetDim(const std::string &name) const = 0;
  virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
+
  std::vector<framework::DDim> GetDims(
-      const std::vector<std::string> &names) const {
-    std::vector<framework::DDim> ret;
-    ret.reserve(names.size());
-    std::transform(
-        names.begin(), names.end(), std::back_inserter(ret),
-        [this](const std::string &name) { return this->GetDim(name); });
-    return ret;
-  }
+      const std::vector<std::string> &names) const;
+
  void SetDims(const std::vector<std::string> &names,
-               const std::vector<framework::DDim> &dims) {
-    size_t length = names.size();
-    PADDLE_ENFORCE_EQ(length, dims.size());
-    for (size_t i = 0; i < length; ++i) {
-      SetDim(names[i], dims[i]);
-    }
-  }
+               const std::vector<framework::DDim> &dims);
 };

 }  // namespace framework

--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -126,11 +126,16 @@ class Tensor {
  inline Tensor Slice(const int& begin_idx, const int& end_idx) const;

  platform::Place place() const {
-    PADDLE_ENFORCE_NOT_NULL(holder_, "Tensor get place() must contains holder");
+    PADDLE_ENFORCE_NOT_NULL(
+        holder_, "Tensor not initialized yet when Tensor::place() is called.");
    return holder_->place();
  }

-  std::type_index type() const { return holder_->type(); }
+  std::type_index type() const {
+    PADDLE_ENFORCE_NOT_NULL(
+        holder_, "Tensor not initialized yet when Tensor::type() is called.");
+    return holder_->type();
+  }

  size_t memory_size() const;


--- a/paddle/framework/type_defs.h
+++ b/paddle/framework/type_defs.h
@@ -28,6 +28,8 @@ class OperatorBase;
 class OpDescBind;
 class BlockDescBind;
 class BlockDesc;
+class InferShapeContext;
+
 using VariableNameMap = std::map<std::string, std::vector<std::string>>;

 // The order should be as same as framework.proto
@@ -49,5 +51,7 @@ using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDescBind>>(
 using InferVarTypeFN = std::function<void(const OpDescBind& /*op_desc*/,
                                          BlockDescBind* /*block*/)>;

+using InferShapeFN = std::function<void(InferShapeContext*)>;
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@@ -59,6 +59,8 @@ class VarDescBind {
    desc_.set_type(VarDesc::LOD_TENSOR);
  }

+  explicit VarDescBind(const VarDesc &desc) : desc_(desc) {}
+
  VarDesc *Proto() { return &desc_; }

  std::string Name() const { return desc_.name(); }

--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@@ -216,17 +216,13 @@ void MKLDNNBatchNormLayer::resetFwdPD(
  }
  auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), EPS, flags_);
  pd.reset(new bn_fwd::primitive_desc(fwdDesc, engine_));
-  // TODO(TJ): use check macro
-  CHECK(out);
-  CHECK(out->getPrimitiveDesc() == pd->dst_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
  if (wgt) {
-    CHECK(wgt->getPrimitiveDesc() == pd->weights_primitive_desc());
+    CHECK_PRIMITIVE_DESC_EQ(wgt, pd->weights_primitive_desc());
  }
  if (passType_ != PASS_TEST || useGlobalStats_) {
-    CHECK(mean_);
-    CHECK(mean_->getPrimitiveDesc() == pd->mean_primitive_desc());
-    CHECK(var_);
-    CHECK(var_->getPrimitiveDesc() == pd->variance_primitive_desc());
+    CHECK_PRIMITIVE_DESC_EQ(mean_, pd->mean_primitive_desc());
+    CHECK_PRIMITIVE_DESC_EQ(var_, pd->variance_primitive_desc());
  }
 }

@@ -283,19 +279,14 @@ void MKLDNNBatchNormLayer::resetBwdPD(
  if (in == nullptr) {
    return;
  }
-  CHECK(out);
-  CHECK(out->getPrimitiveDesc() == in->getPrimitiveDesc());
+  CHECK_PRIMITIVE_DESC_EQ(out, in->getPrimitiveDesc());
  auto md = in->getMemoryDesc();
  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, EPS, flags_);
  pd.reset(new bn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
-  // TODO(TJ): use check macro
-  CHECK(wgt);
-  CHECK(wgt->getPrimitiveDesc() == pd->diff_weights_primitive_desc());
  CHECK(pd->weights_primitive_desc() == fwdPD_->weights_primitive_desc());
-  CHECK(mean_);
-  CHECK(mean_->getPrimitiveDesc() == pd->mean_primitive_desc());
-  CHECK(var_);
-  CHECK(var_->getPrimitiveDesc() == pd->variance_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(wgt, pd->diff_weights_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(mean_, pd->mean_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(var_, pd->variance_primitive_desc());
 }

 void MKLDNNBatchNormLayer::resetBwdPipeline(

--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -262,12 +262,15 @@ void MKLDNNConvLayer::resetBwdWgtPD(
                                            padR,
                                            padKind);
  pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
-  CHECK(pd->src_primitive_desc() == inVal_->getPrimitiveDesc())
-      << "primitive desc of in value should equal";
-  CHECK(pd->diff_dst_primitive_desc() == outVal_->getPrimitiveDesc())
-      << "primitive desc of out grad should equal the out value";
-  CHECK(pd->diff_weights_primitive_desc() == wgtVal_->getPrimitiveDesc())
-      << "primitive desc of weight grad should equal the weight value";
+  CHECK_PRIMITIVE_DESC_EQ(inVal_, pd->src_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(
+      outVal_,
+      pd->diff_dst_primitive_desc(),
+      "primitive desc of out value and grad should be equal");
+  CHECK_PRIMITIVE_DESC_EQ(
+      wgtVal_,
+      pd->diff_weights_primitive_desc(),
+      "primitive desc of weight value and grad should be equal");
 }

 void MKLDNNConvLayer::resetBwdDataPD(
@@ -292,10 +295,14 @@ void MKLDNNConvLayer::resetBwdDataPD(
                                        padR,
                                        padding_kind::zero);
  pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
-  CHECK(pd->diff_src_primitive_desc() == inVal_->getPrimitiveDesc())
-      << "primitive desc of in grad should equal the in value";
-  CHECK(pd->diff_dst_primitive_desc() == outVal_->getPrimitiveDesc())
-      << "primitive desc of out grad should equal";
+  CHECK_PRIMITIVE_DESC_EQ(
+      inVal_,
+      pd->diff_src_primitive_desc(),
+      "primitive desc of in value and grad should be equal");
+  CHECK_PRIMITIVE_DESC_EQ(
+      outVal_,
+      pd->diff_dst_primitive_desc(),
+      "primitive desc of out value and grad should be equal");
 }

 void MKLDNNConvLayer::resetBwdBuffers(
@@ -310,17 +317,20 @@ void MKLDNNConvLayer::resetBwdBuffers(

  resetWithMatrix(
      wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc());
-  CHECK(wgtVal_ != nullptr &&
-        wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc())
-      << "primitive desc of weight grad and value should be equal";
+  CHECK_PRIMITIVE_DESC_EQ(
+      wgtVal_,
+      wgt->getPrimitiveDesc(),
+      "primitive desc of weight grad and value should be equal");

  bias = nullptr;
  if (biases_ && biases_->getWGrad()) {
    resetWithMatrix(
        bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc());
-    CHECK(bias && biasVal_ &&
-          bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc())
-        << "primitive desc of bias grad should equal the bias value";
+    CHECK(bias);
+    CHECK_PRIMITIVE_DESC_EQ(
+        biasVal_,
+        bias->getPrimitiveDesc(),
+        "primitive desc of bias grad and value should be equal");
  }

  if (dataPD == nullptr) {

--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -235,8 +235,7 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
  in = MKLDNNMatrix::create(intPD, inMat);
  Argument& arg = input->getOutput(this->getName());
  arg.grad = std::dynamic_pointer_cast<Matrix>(in);
-  CHECK(inVal_);
-  CHECK(inVal_->getPrimitiveDesc() == intPD) << "the primitive desc must equal";
+  CHECK_PRIMITIVE_DESC_EQ(inVal_, intPD);
  if (inputIsOnlyMKLDNN()) {
    return;
  }
@@ -250,8 +249,7 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
  CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
      << "should have external input value and the format must be nchw(nc)";
  extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
-  CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
-      << "should have internal input value and primitive desc must equal";
+  CHECK_PRIMITIVE_DESC_EQ(inVal_, intPD);
  in = MKLDNNMatrix::create(intPD);
  cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
  CHECK(cvtInGrad_);
@@ -277,8 +275,7 @@ void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
  CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
      << "should have external output value and the format must be nchw(nc)";
  extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat);
-  CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD)
-      << "should have internal output value and primitive desc must equal";
+  CHECK_PRIMITIVE_DESC_EQ(outVal_, intPD);
  out = MKLDNNMatrix::create(intPD);
  cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
  CHECK(cvtOutGrad_);

--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -24,6 +24,12 @@ namespace paddle {
 class MKLDNNMatrix;
 typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;

+#define CHECK_PRIMITIVE_DESC_EQ(MAT, PD, ...)                        \
+  CHECK(MAT) << " can not be empty.";                                \
+  CHECK(MAT->getPrimitiveDesc() == PD)                               \
+      << #MAT "->getPrimitiveDesc() and " #PD " should be equal.\n " \
+      << "" __VA_ARGS__;
+
 /**
 * @brief MKLDNN Matrix.
 *

--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
 add_subdirectory(detail)

-cc_library(memory SRCS memory.cc)
+cc_library(memory SRCS memory.cc DEPS place)
 cc_library(memcpy SRCS memcpy.cc)

 cc_library(paddle_memory

--- a/paddle/memory/detail/meta_cache.cc
+++ b/paddle/memory/detail/meta_cache.cc
@@ -13,6 +13,7 @@
   limitations under the License. */

 #include "paddle/memory/detail/meta_cache.h"
+#include "glog/logging.h"
 #include "paddle/memory/detail/memory_block.h"
 #include "paddle/platform/assert.h"

@@ -28,7 +29,9 @@ Metadata MetadataCache::load(const MemoryBlock* block) {
    PADDLE_ASSERT(existing_metadata->second.check_guards());
    return existing_metadata->second;
  } else {
-    PADDLE_ASSERT(reinterpret_cast<const Metadata*>(block)->check_guards());
+    auto* meta = reinterpret_cast<const Metadata*>(block);
+    VLOG(3) << "Load MetaData type=" << meta->type;
+    PADDLE_ASSERT(meta->check_guards());
    return *reinterpret_cast<const Metadata*>(block);
  }
 }

--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -39,11 +39,15 @@ BuddyAllocator* GetCPUBuddyAllocator() {

 template <>
 void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
-  return GetCPUBuddyAllocator()->Alloc(size);
+  VLOG(3) << "Allocate " << size << " bytes on " << platform::Place(place);
+  void* p = GetCPUBuddyAllocator()->Alloc(size);
+  VLOG(3) << "  pointer=" << p;
+  return p;
 }

 template <>
 void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+  VLOG(3) << "Free pointer=" << p << " on " << platform::Place(place);
  GetCPUBuddyAllocator()->Free(p);
 }


--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -69,6 +69,13 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
    endif()

+    # pool_cudnn_op contains several operators
+    if ("${TARGET}" STREQUAL "pool_cudnn_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
+    endif()
+
    # save_restore_op contains several operators
    if ("${TARGET}" STREQUAL "save_restore_op")
        set(pybind_flag 1)
@@ -83,6 +90,13 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP(sigmoid);\n")
    endif()

+    # nccl_op contains several operators
+    if ("${TARGET}" STREQUAL "nccl_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
+    endif()
+    
    # reduce_op contains several operators
    if ("${TARGET}" STREQUAL "reduce_op")
        set(pybind_flag 1)
@@ -114,6 +128,7 @@ function(op_library TARGET)
 endfunction()

 add_subdirectory(math)
+add_subdirectory(nccl)

 set(DEPS_OPS
    recurrent_op
@@ -123,6 +138,8 @@ set(DEPS_OPS
    sum_op
    pool_op
    pool_with_index_op
+    nccl_op
+    sequence_conv_op
    lstm_op)


@@ -131,9 +148,13 @@ op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
-op_library(sum_op DEPS net_op)
+op_library(sum_op DEPS net_op selected_rows_functor)
 op_library(pool_op DEPS pooling)
 op_library(pool_with_index_op DEPS pooling)
+if(WITH_GPU)
+op_library(nccl_op DEPS nccl_common)
+endif()
+op_library(sequence_conv_op DEPS context_project)
 op_library(lstm_op DEPS sequence2batch lstm_compute)

 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
@@ -148,4 +169,8 @@ cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
 cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc DEPS dynamic_recurrent_op recurrent_op tensor_array)
+
+if(WITH_GPU)
+  nv_test(nccl_op_test SRCS nccl_op_test.cu DEPS nccl_op gpu_info device_context)
+endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -70,7 +70,5 @@ information, or not. But the output only shares the LoD with input `Inference`.
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    accuracy, ops::AccuracyKernel<paddle::platform::CPUPlace, float>,
-    ops::AccuracyKernel<paddle::platform::CPUPlace, int>,
-    ops::AccuracyKernel<paddle::platform::CPUPlace, double>,
+    accuracy, ops::AccuracyKernel<paddle::platform::CPUPlace, int>,
    ops::AccuracyKernel<paddle::platform::CPUPlace, int64_t>);
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -81,7 +81,5 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle

-REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
-                       paddle::operators::AccuracyOpCUDAKernel<double>,
-                       paddle::operators::AccuracyOpCUDAKernel<int>,
+REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<int>,
                       paddle::operators::AccuracyOpCUDAKernel<int64_t>);
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -446,12 +446,16 @@ REGISTER_OP(thresholded_relu, ops::ActivationOp,
 REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker<float>,
            hard_sigmoid_grad, ops::ActivationOpGrad);

-#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)        \
-  REGISTER_OP_CPU_KERNEL(                                                      \
-      act_type,                                                                \
-      ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<float>>); \
-  REGISTER_OP_CPU_KERNEL(act_type##_grad,                                      \
-                         ops::ActivationGradKernel<paddle::platform::CPUPlace, \
-                                                   ops::grad_functor<float>>);
+#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)       \
+  REGISTER_OP_CPU_KERNEL(                                                     \
+      act_type,                                                               \
+      ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<float>>, \
+      ops::ActivationKernel<paddle::platform::CPUPlace,                       \
+                            ops::functor<double>>);                           \
+  REGISTER_OP_CPU_KERNEL(                                                     \
+      act_type##_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace,  \
+                                                 ops::grad_functor<float>>,   \
+      ops::ActivationGradKernel<paddle::platform::CPUPlace,                   \
+                                ops::grad_functor<double>>);

 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
@@ -17,12 +17,16 @@

 namespace ops = paddle::operators;

-#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor)        \
-  REGISTER_OP_GPU_KERNEL(                                                      \
-      act_type,                                                                \
-      ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<float>>); \
-  REGISTER_OP_GPU_KERNEL(act_type##_grad,                                      \
-                         ops::ActivationGradKernel<paddle::platform::GPUPlace, \
-                                                   ops::grad_functor<float>>);
+#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor)       \
+  REGISTER_OP_GPU_KERNEL(                                                     \
+      act_type,                                                               \
+      ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<float>>, \
+      ops::ActivationKernel<paddle::platform::GPUPlace,                       \
+                            ops::functor<double>>);                           \
+  REGISTER_OP_GPU_KERNEL(                                                     \
+      act_type##_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace,  \
+                                                 ops::grad_functor<float>>,   \
+      ops::ActivationGradKernel<paddle::platform::GPUPlace,                   \
+                                ops::grad_functor<double>>);

 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL);
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -210,8 +210,8 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> {
  }
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
-    auto temp1 = (x < (threshold * -1)).template cast<T>().eval();
-    auto temp2 = (x > threshold).template cast<T>().eval();
+    auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>().eval();
+    auto temp2 = (x > static_cast<T>(threshold)).template cast<T>().eval();
    y.device(d) = x * (temp1 + temp2);
  }
 };
@@ -226,8 +226,8 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {

  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    auto temp1 = (x < (threshold * -1)).template cast<T>().eval();
-    auto temp2 = (x > threshold).template cast<T>().eval();
+    auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>().eval();
+    auto temp2 = (x > static_cast<T>(threshold)).template cast<T>().eval();
    dx.device(d) = dy * (temp1 + temp2).template cast<T>();
  }
 };
@@ -243,9 +243,10 @@ struct SoftShrinkFunctor : public BaseActivationFunctor<T> {

  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
-    auto temp1 = (x > lambda).template cast<T>().eval();
-    auto temp2 = (x < -lambda).template cast<T>().eval();
-    y.device(d) = temp1 * (x - lambda) + temp2 * (x + lambda);
+    auto lambdaT = static_cast<T>(lambda);
+    auto temp1 = (x > lambdaT).template cast<T>().eval();
+    auto temp2 = (x < -lambdaT).template cast<T>().eval();
+    y.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
  }
 };

@@ -257,8 +258,9 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    auto temp1 = (x > lambda).template cast<T>().eval();
-    auto temp2 = (x < -lambda).template cast<T>().eval();
+    auto lambdaT = static_cast<T>(lambda);
+    auto temp1 = (x > lambdaT).template cast<T>().eval();
+    auto temp2 = (x < -lambdaT).template cast<T>().eval();
    dx.device(d) = dy * (temp1 + temp2).template cast<T>();
  }
 };
@@ -362,7 +364,8 @@ struct BReluFunctor : public BaseActivationFunctor<T> {

  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.cwiseMax(t_min).cwiseMin(t_max);
+    y.device(d) =
+        x.cwiseMax(static_cast<T>(t_min)).cwiseMin(static_cast<T>(t_max));
  }
 };

@@ -375,7 +378,9 @@ struct BReluGradFunctor : public BaseActivationFunctor<T> {
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy * ((x > t_min) * (x < t_max)).template cast<T>();
+    dx.device(d) = dy *
+                   ((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
+                       .template cast<T>();
  }
 };

@@ -390,7 +395,8 @@ struct Relu6Functor : public BaseActivationFunctor<T> {

  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.cwiseMax(static_cast<T>(0)).cwiseMin(threshold);
+    y.device(d) =
+        x.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(threshold));
  }
 };

@@ -402,8 +408,9 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) =
-        dy * ((x > static_cast<T>(0)) * (x < threshold)).template cast<T>();
+    dx.device(d) = dy *
+                   ((x > static_cast<T>(0)) * (x < static_cast<T>(threshold)))
+                       .template cast<T>();
  }
 };

@@ -463,7 +470,8 @@ struct SoftReluFunctor : public BaseActivationFunctor<T> {

  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
-    auto temp = x.cwiseMax(-threshold).cwiseMin(threshold);
+    auto tmp = static_cast<T>(threshold);
+    auto temp = x.cwiseMax(-tmp).cwiseMin(tmp);
    y.device(d) = (static_cast<T>(1) + temp.exp()).log();
  }
 };
@@ -476,7 +484,8 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    auto temp = ((x > -threshold) * (x < threshold)).template cast<T>().eval();
+    auto tmp = static_cast<T>(threshold);
+    auto temp = ((x > -tmp) * (x < tmp)).template cast<T>().eval();
    dx.device(d) = dy * (static_cast<T>(1) - (-y).exp()) * temp;
  }
 };
@@ -490,7 +499,7 @@ struct LeakyReluFunctor : public BaseActivationFunctor<T> {

  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.cwiseMax(alpha * x);
+    y.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
  }
 };

@@ -502,7 +511,8 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    auto temp1 = alpha * (x < static_cast<T>(0)).template cast<T>().eval();
+    auto temp1 = static_cast<T>(alpha) *
+                 (x < static_cast<T>(0)).template cast<T>().eval();
    auto temp2 = (x >= static_cast<T>(0)).template cast<T>().eval();
    dx.device(d) = dy * (temp1 + temp2).template cast<T>();
  }
@@ -517,9 +527,9 @@ struct ELUFunctor : public BaseActivationFunctor<T> {

  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
-    y.device(d) =
-        x.cwiseMax(static_cast<T>(0)) +
-        (alpha * (x.exp() - static_cast<T>(1))).cwiseMin(static_cast<T>(0));
+    y.device(d) = x.cwiseMax(static_cast<T>(0)) +
+                  (static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)))
+                      .cwiseMin(static_cast<T>(0));
  }
 };

@@ -531,9 +541,9 @@ struct ELUGradFunctor : public BaseActivationFunctor<T> {
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) =
-        dy * (x > static_cast<T>(0)).template cast<T>() +
-        dy * (y + alpha) * (x < static_cast<T>(0)).template cast<T>();
+    dx.device(d) = dy * (x > static_cast<T>(0)).template cast<T>() +
+                   dy * (y + static_cast<T>(alpha)) *
+                       (x < static_cast<T>(0)).template cast<T>();
  }
 };

@@ -545,7 +555,7 @@ struct PowFunctor : public BaseActivationFunctor<T> {
  }
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
-    y.device(d) = x.pow(factor);
+    y.device(d) = x.pow(static_cast<T>(factor));
  }
 };

@@ -557,7 +567,8 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy * factor * x.pow(factor - static_cast<T>(1));
+    dx.device(d) = dy * static_cast<T>(factor) *
+                   x.pow(static_cast<T>(factor - static_cast<T>(1)));
  }
 };

@@ -571,7 +582,8 @@ struct STanhFunctor : public BaseActivationFunctor<T> {

  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
-    y.device(d) = scale_b * (scale_a * x).tanh();
+    y.device(d) =
+        static_cast<T>(scale_b) * (static_cast<T>(scale_a) * x).tanh();
  }
 };

@@ -585,8 +597,10 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {

  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    auto temp = (scale_a * x).tanh() * (scale_a * x).tanh();
-    dx.device(d) = dy * scale_a * scale_b * (static_cast<T>(1) - temp);
+    auto a = static_cast<T>(scale_a);
+    auto b = static_cast<T>(scale_b);
+    auto temp = (a * x).tanh() * (a * x).tanh();
+    dx.device(d) = dy * a * b * (static_cast<T>(1) - temp);
  }
 };

@@ -599,7 +613,8 @@ struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {

  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
-    y.device(d) = (x > static_cast<T>(threshold)).template cast<T>() * x;
+    auto th = static_cast<T>(threshold);
+    y.device(d) = (x > th).template cast<T>() * x;
  }
 };

@@ -612,7 +627,8 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {

  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
-    dx.device(d) = dy * (x > static_cast<T>(threshold)).template cast<T>();
+    auto th = static_cast<T>(threshold);
+    dx.device(d) = dy * (x > th).template cast<T>();
  }
 };


--- a/paddle/operators/auc_op.cc
+++ b/paddle/operators/auc_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/auc_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AucOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Inference"),
+                   "Input of Inference must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"),
+                   "Input of Label must be initialized.");
+    auto inference_dim = ctx->GetInputDim("Inference");
+    auto label_dim = ctx->GetInputDim("Label");
+
+    PADDLE_ENFORCE_EQ(inference_dim, label_dim,
+                      "inference and label should have same shape");
+
+    ctx->SetOutputDim("AUC", {1});
+    ctx->ShareLoD("Inference", /*->*/ "AUC");
+  }
+};
+
+class AucOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AucOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Inference",
+             "A floating point tensor of arbitrary shape and whose values"
+             "are in the range [0, 1].");
+    AddInput("Label",
+             "A tensor whose shape matches "
+             "Inference. Will be cast to bool.");
+    // TODO(typhoonzero): support weight input
+    AddOutput("AUC",
+              "A scalar representing the "
+              "current area-under-curve.");
+
+    AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
+        .SetDefault("ROC");
+    AddAttr<int>("num_thresholds",
+                 "The number of thresholds to use when discretizing the"
+                 " roc curve.")
+        .SetDefault(200);
+
+    AddComment(
+        R"DOC(Computes the AUC according forward output and label.
+Best to use for binary classification evaluations.
+
+If input label contains values other than 0 and 1, it will be cast
+to bool.
+
+You can find the definations here: 
+https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
+
+Possible curves are:
+- ROC: Receiver operating characteristic
+- PR: Precision Recall
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker);
+REGISTER_OP_CPU_KERNEL(auc, ops::AucKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/auc_op.h
+++ b/paddle/operators/auc_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class AucKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* inference = ctx.Input<Tensor>("Inference");
+    auto* label = ctx.Input<Tensor>("Label");
+    auto* auc = ctx.Output<Tensor>("AUC");
+
+    float* auc_data = auc->mutable_data<float>(ctx.GetPlace());
+
+    std::string curve = ctx.Attr<std::string>("curve");
+    int num_thresholds = ctx.Attr<int>("num_thresholds");
+    std::vector<float> thresholds_list;
+    thresholds_list.reserve(num_thresholds);
+    for (int i = 1; i < num_thresholds - 1; i++) {
+      thresholds_list[i] = (float)i / (num_thresholds - 1);
+    }
+    const float kEpsilon = 1e-7;
+    thresholds_list[0] = 0.0f - kEpsilon;
+    thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon;
+
+    size_t num_samples = inference->numel();
+
+    const T* inference_data = inference->data<T>();
+    Tensor label_casted;
+    label_casted.Resize(label->dims());
+    bool* label_casted_data = label_casted.mutable_data<bool>(ctx.GetPlace());
+
+    const int* label_data = label->data<int>();
+    // cast label_data to bool
+    for (size_t i = 0; i < num_samples; i++) {
+      label_casted_data[i] = static_cast<bool>(label_data[i]);
+    }
+
+    // Create local tensor for storing the curve: TP, FN, TN, FP
+    // TODO(typhoonzero): use eigen op to caculate these values.
+    Tensor true_positive, false_positive, true_negative, false_negative;
+
+    true_positive.Resize({num_thresholds});
+    false_negative.Resize({num_thresholds});
+    true_negative.Resize({num_thresholds});
+    false_positive.Resize({num_thresholds});
+
+    int* tp_data = true_positive.mutable_data<int>(ctx.GetPlace());
+    int* fn_data = false_negative.mutable_data<int>(ctx.GetPlace());
+    int* tn_data = true_negative.mutable_data<int>(ctx.GetPlace());
+    int* fp_data = false_positive.mutable_data<int>(ctx.GetPlace());
+
+    for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) {
+      // caculate TP, FN, TN, FP for current thresh
+      int tp = 0, fn = 0, tn = 0, fp = 0;
+      for (size_t i = 0; i < num_samples; i++) {
+        if (label_casted_data[i]) {
+          if (inference_data[i] >= (thresholds_list[idx_thresh])) {
+            tp++;
+          } else {
+            fn++;
+          }
+        } else {
+          if (inference_data[i] >= (thresholds_list[idx_thresh])) {
+            fp++;
+          } else {
+            tn++;
+          }
+        }
+      }
+      // store rates
+      tp_data[idx_thresh] = tp;
+      fn_data[idx_thresh] = fn;
+      tn_data[idx_thresh] = tn;
+      fp_data[idx_thresh] = fp;
+    }
+    // epsilon to avoid divide by zero.
+    float epsilon = 1e-6;
+    // Riemann sum to caculate auc.
+    Tensor tp_rate, fp_rate, rec_rate;
+    tp_rate.Resize({num_thresholds});
+    fp_rate.Resize({num_thresholds});
+    rec_rate.Resize({num_thresholds});
+    float* tp_rate_data = tp_rate.mutable_data<float>(ctx.GetPlace());
+    float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace());
+    float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace());
+    for (int i = 0; i < num_thresholds; i++) {
+      tp_rate_data[i] =
+          ((float)tp_data[i] + epsilon) / (tp_data[i] + fn_data[i] + epsilon);
+      fp_rate_data[i] = (float)fp_data[i] / (fp_data[i] + tn_data[i] + epsilon);
+      rec_rate_data[i] =
+          ((float)tp_data[i] + epsilon) / (tp_data[i] + fp_data[i] + epsilon);
+    }
+    *auc_data = 0.0f;
+    if (curve == "ROC") {
+      for (int i = 0; i < num_thresholds - 1; i++) {
+        auto dx = fp_rate_data[i] - fp_rate_data[i + 1];
+        auto y = (tp_rate_data[i] + tp_rate_data[i + 1]) / 2.0f;
+        *auc_data = *auc_data + dx * y;
+      }
+    } else if (curve == "PR") {
+      for (int i = 1; i < num_thresholds; i++) {
+        auto dx = tp_rate_data[i] - tp_rate_data[i - 1];
+        auto y = (rec_rate_data[i] + rec_rate_data[i - 1]) / 2.0f;
+        *auc_data = *auc_data + dx * y;
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -18,6 +18,7 @@ namespace paddle {
 namespace operators {

 using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
@@ -64,6 +65,9 @@ class BatchNormOp : public framework::OperatorWithKernel {
        (tensor_format == TensorFormat::NCHW ? x_dims[1]
                                             : x_dims[x_dims.size() - 1]);

+    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
+                   "Input x must have 3 to 5 dimensions.");
+
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
@@ -108,10 +112,12 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
              "Store the global Variance when training");
    AddOutput("SavedMean",
              "Mean of the current mini batch, "
-              "will apply to output when training");
+              "will apply to output when training")
+        .AsIntermediate();
    AddOutput("SavedVariance",
              "Variance of the current mini batch, "
-              "will apply to output when training");
+              "will apply to output when training")
+        .AsIntermediate();
    AddComment(R"DOC(
 https://arxiv.org/pdf/1502.03167.pdf

@@ -135,7 +141,6 @@ class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {

    const auto *x = ctx.Input<Tensor>("X");
    const auto &x_dims = x->dims();
-
    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
                   "The Input dim size should be between 3 and 5");
    const int N = x_dims[0];
@@ -289,6 +294,25 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
    ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
  }
+
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext &ctx) const override {
+    VLOG(3) << "IndicateDataType " << this->Type();
+    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
+    if (var == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    const Tensor *t = nullptr;
+    if (var->IsType<Tensor>()) {
+      t = &var->Get<Tensor>();
+    } else if (var->IsType<LoDTensor>()) {
+      t = &var->Get<LoDTensor>();
+    }
+    if (t == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    return framework::ToDataType(t->type());
+  }
 };

 template <typename T>

--- a/paddle/operators/batch_norm_op.cu
+++ b/paddle/operators/batch_norm_op.cu
@@ -117,9 +117,6 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
    math::SetConstant<platform::GPUPlace, T> functor;
    functor(ctx.device_context(), saved_mean, 0);
    functor(ctx.device_context(), saved_variance, 0);
-    // FIXME(qiao) should not set zero self
-    functor(ctx.device_context(), mean_out, 0);
-    functor(ctx.device_context(), variance_out, 0);

    auto handle = ctx.cuda_device_context().cudnn_handle();

@@ -211,8 +208,15 @@ class BatchNormGradKernel<platform::GPUPlace, T>
    mode_ = CUDNN_BATCHNORM_SPATIAL;
 #endif

-    std::vector<int> dims = {N, C, H, W, D};
-    std::vector<int> strides = {H * W * C * D, 1, W * D * C, D * C, C};
+    std::vector<int> dims;
+    std::vector<int> strides;
+    if (tensor_format == TensorFormat::NCHW) {
+      dims = {N, C, H, W, D};
+      strides = {C * H * W * D, H * W * D, W * D, D, 1};
+    } else {
+      dims = {N, C, H, W, D};
+      strides = {H * W * C * D, 1, W * D * C, D * C, C};
+    }
    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
        data_desc_, CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));

--- a/paddle/operators/cast_op.cc
+++ b/paddle/operators/cast_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/cast_op.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CastOpProtoMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "the input tensor of cast op");
+    AddOutput("Out", "the output tensor of cast op");
+    AddComment(R"DOC(Cast operator.
+cast the input tensor to other data type.
+)DOC");
+    AddAttr<int>("out_data_type", "output data type");
+    AddAttr<int>("in_data_type", "input data type");
+  }
+};
+
+class CastOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"), "The input of cast op must be set");
+    PADDLE_ENFORCE(context->HasOutput("Out"),
+                   "The output of cast op must be set");
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+    context->ShareLoD("X", "Out");
+  }
+};
+
+class CastOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto grad = new framework::OpDescBind();
+    grad->SetType("cast");
+    grad->SetInput("X", OutputGrad("Out"));
+    grad->SetOutput("Out", InputGrad("X"));
+    grad->SetAttr("out_data_type", GetAttr("in_data_type"));
+    grad->SetAttr("in_data_type", GetAttr("out_data_type"));
+    return std::unique_ptr<framework::OpDescBind>(grad);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUPlace;
+REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape,
+                        ops::CastOpProtoMaker);
+REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
+                       ops::CastOpKernel<CPU, double>,
+                       ops::CastOpKernel<CPU, int>,
+                       ops::CastOpKernel<CPU, int64_t>);
--- a/paddle/operators/cast_op.cu
+++ b/paddle/operators/cast_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/cast_op.h"
+
+template <typename T>
+using CastOpKernel =
+    paddle::operators::CastOpKernel<paddle::platform::GPUPlace, T>;
+
+REGISTER_OP_GPU_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
+                       CastOpKernel<int>, CastOpKernel<int64_t>);
--- a/paddle/operators/cast_op.h
+++ b/paddle/operators/cast_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename InT, typename OutT>
+struct CastOpTransformFunctor {
+  HOSTDEVICE OutT operator()(InT in) const { return static_cast<OutT>(in); }
+};
+
+template <typename Place, typename InT>
+struct CastOpFunctor {
+  const framework::Tensor* in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext& ctx_;
+  CastOpFunctor(const framework::Tensor* in, framework::Tensor* out,
+                const platform::DeviceContext& ctx)
+      : in_(in), out_(out), ctx_(ctx) {}
+
+  template <typename OutT>
+  void operator()() const {
+    auto* in_begin = in_->data<InT>();
+    auto numel = in_->numel();
+    auto* in_end = in_begin + numel;
+    auto* out_begin = out_->mutable_data<OutT>(ctx_.GetPlace());
+    platform::Transform<Place> trans;
+    trans(ctx_, in_begin, in_end, out_begin,
+          CastOpTransformFunctor<InT, OutT>());
+  }
+};
+
+template <typename Place, typename InT>
+class CastOpKernel : public framework::OpKernel<InT> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    framework::VisitDataType(
+        static_cast<framework::DataType>(context.Attr<int>("out_data_type")),
+        CastOpFunctor<Place, InT>(in, out, context.device_context()));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/conv_cudnn_op.cu
+++ b/paddle/operators/conv_cudnn_op.cu
@@ -31,16 +31,6 @@ using CUDADeviceContext = platform::CUDADeviceContext;

 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024;

-// NOTE: framework::vectorize converts to type int64_t
-//       which does not fit cudnn inputs.
-std::vector<int> Dims2Vector(const framework::DDim& dims) {
-  std::vector<int> ret;
-  for (int i = 0; i < dims.size(); i++) {
-    ret.push_back(dims[i]);
-  }
-  return ret;
-}
-
 template <typename T>
 class CudnnConvOpKernel : public framework::OpKernel<T> {
 public:
@@ -68,12 +58,12 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
    ScopedConvolutionDescriptor conv_desc;
    DataLayout layout = DataLayout::kNCHW;

-    cudnnTensorDescriptor_t cudnn_input_desc =
-        input_desc.descriptor<T>(layout, Dims2Vector(input->dims()), groups);
-    cudnnTensorDescriptor_t cudnn_output_desc =
-        output_desc.descriptor<T>(layout, Dims2Vector(output->dims()), groups);
-    cudnnFilterDescriptor_t cudnn_filter_desc =
-        filter_desc.descriptor<T>(layout, Dims2Vector(filter->dims()), groups);
+    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+        layout, framework::vectorize2int(input->dims()), groups);
+    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
+        layout, framework::vectorize2int(output->dims()), groups);
+    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
+        layout, framework::vectorize2int(filter->dims()), groups);
    cudnnConvolutionDescriptor_t cudnn_conv_desc =
        conv_desc.descriptor<T>(paddings, strides, dilations);

@@ -156,13 +146,13 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
    ScopedConvolutionDescriptor conv_desc;
    DataLayout layout = DataLayout::kNCHW;

-    cudnnTensorDescriptor_t cudnn_input_desc =
-        input_desc.descriptor<T>(layout, Dims2Vector(input->dims()), groups);
+    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+        layout, framework::vectorize2int(input->dims()), groups);
    cudnnTensorDescriptor_t cudnn_output_grad_desc =
-        output_grad_desc.descriptor<T>(layout, Dims2Vector(output_grad->dims()),
-                                       groups);
-    cudnnFilterDescriptor_t cudnn_filter_desc =
-        filter_desc.descriptor<T>(layout, Dims2Vector(filter->dims()), groups);
+        output_grad_desc.descriptor<T>(
+            layout, framework::vectorize2int(output_grad->dims()), groups);
+    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
+        layout, framework::vectorize2int(filter->dims()), groups);
    cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr;
    cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr;

@@ -192,7 +182,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
    auto handle = ctx.cuda_device_context().cudnn_handle();
    if (input_grad) {
      cudnn_input_grad_desc = input_grad_desc.descriptor<T>(
-          layout, Dims2Vector(input_grad->dims()), groups);
+          layout, framework::vectorize2int(input_grad->dims()), groups);
      PADDLE_ENFORCE(
          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
              handle, cudnn_filter_desc,
@@ -213,7 +203,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {

    if (filter_grad) {
      cudnn_filter_grad_desc = filter_grad_desc.descriptor<T>(
-          layout, Dims2Vector(filter_grad->dims()), groups);
+          layout, framework::vectorize2int(filter_grad->dims()), groups);
      PADDLE_ENFORCE(
          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,

--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -21,7 +21,7 @@ namespace {

 template <typename T>
 __global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
-                                           const int* label, const int N,
+                                           const int64_t* label, const int N,
                                           const int D) {
  // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
  // CUDA_1D_KERNEL_LOOP(i, N) {
@@ -77,8 +77,8 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
    const T* x_data = x->data<T>();

-    int batch_size = x->dims()[0];
-    int class_num = x->dims()[1];
+    int64_t batch_size = x->dims()[0];
+    int64_t class_num = x->dims()[1];

    int block = 512;
    int grid = (batch_size * class_num + block - 1) / block;
@@ -93,7 +93,7 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
    } else {
      math::SetConstant<platform::GPUPlace, T> functor;
      functor(ctx.device_context(), dx, 0);
-      auto* label_data = label->data<int>();
+      auto* label_data = label->data<int64_t>();
      grid = (batch_size + block - 1) / block;
      CrossEntropyGradientKernel<T><<<
          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(

--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -54,7 +54,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());

-    int class_num = x->dims()[1];
+    int64_t class_num = x->dims()[1];
    if (ctx.Attr<bool>("soft_label")) {
      auto x_mat = EigenMatrix<T>::From(*x);
      auto dy_mat = EigenMatrix<T>::From(*dy);
@@ -62,20 +62,20 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
      auto dx_mat = EigenMatrix<T>::From(*dx);

      dx_mat.device(ctx.GetEigenDevice<platform::CPUPlace>()) =
-          -(lbl_mat * dy_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) /
-            x_mat);
+          -(lbl_mat *
+            dy_mat.broadcast(Eigen::DSizes<int64_t, 2>(1, class_num)) / x_mat);
    } else {
-      int batch_size = x->dims()[0];
+      int64_t batch_size = x->dims()[0];
      const T* dy_data = dy->data<T>();
      const T* x_data = x->data<T>();
-      const int* label_data = label->data<int>();
+      const int64_t* label_data = label->data<int64_t>();

      math::SetConstant<platform::CPUPlace, T> functor;
      functor(ctx.device_context(), dx, 0);

-      for (int i = 0; i < batch_size; ++i) {
+      for (int64_t i = 0; i < batch_size; ++i) {
        PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
-        int index = i * class_num + label_data[i];
+        int64_t index = i * class_num + label_data[i];
        dx_data[index] = -dy_data[i] / x_data[index];
      }
    }

--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -30,7 +30,7 @@ class DropoutOp : public framework::OperatorWithKernel {

    auto x_dims = ctx->GetInputDim("X");
    ctx->SetOutputDim("Out", x_dims);
-    if (ctx->Attrs().Get<bool>("is_training") == 1) {
+    if (ctx->Attrs().Get<bool>("is_training") == true) {
      ctx->SetOutputDim("Mask", x_dims);
    }
    ctx->ShareLoD("X", /*->*/ "Out");
@@ -43,7 +43,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
  DropoutOpMaker(framework::OpProto* proto,
                 framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddAttr<AttrType>("dropout_prob", "Probability of setting units to zero.")
+    AddAttr<float>("dropout_prob", "Probability of setting units to zero.")
        .SetDefault(.5f);
    AddAttr<bool>("is_training", "Whether in training phase.").SetDefault(true);
    AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
@@ -69,7 +69,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_training"), 1,
+    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_training"), true,
                      "GradOp is only callable when is_training is true");

    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
@@ -77,8 +77,8 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) must not be null.");

-    PADDLE_ENFORCE_GE(ctx->Attrs().Get<AttrType>("dropout_prob"), 0);
-    PADDLE_ENFORCE_LE(ctx->Attrs().Get<AttrType>("dropout_prob"), 1);
+    PADDLE_ENFORCE_GE(ctx->Attrs().Get<float>("dropout_prob"), 0);
+    PADDLE_ENFORCE_LE(ctx->Attrs().Get<float>("dropout_prob"), 1);
    auto x_dims = ctx->GetInputDim("X");
    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
    PADDLE_ENFORCE_EQ(x_dims, out_dims,

--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -33,7 +33,7 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
    auto* y = context.Output<Tensor>("Out");
    const auto* x_data = x->data<T>();
    auto* y_data = y->mutable_data<T>(context.GetPlace());
-    AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
+    float dropout_prob = context.Attr<float>("dropout_prob");

    if (context.Attr<bool>("is_training")) {
      auto* mask = context.Output<Tensor>("Mask");
@@ -41,7 +41,7 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
      int seed = context.Attr<int>("seed");
      std::minstd_rand engine;
      engine.seed(seed);
-      std::uniform_real_distribution<AttrType> dist(0, 1);
+      std::uniform_real_distribution<float> dist(0, 1);
      size_t size = framework::product(mask->dims());
      for (size_t i = 0; i < size; ++i) {
        if (dist(engine) < dropout_prob) {

--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -41,7 +41,7 @@ class FeedOp : public framework::OperatorBase {

    auto col = Attr<int>("col");

-    VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var"
+    VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var "
            << out_name;

    auto &feed_list = feed_var->Get<framework::FeedFetchList>();

--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -52,6 +52,7 @@ class FetchOp : public framework::OperatorBase {
    // FIXME(yuyang18): Should we assume the fetch operator always generate
    // CPU outputs?
    dst_item.CopyFrom(src_item, platform::CPUPlace(), dev_ctx);
+    dev_ctx.Wait();
    dst_item.set_lod(src_item.lod());

    VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;

--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@@ -64,5 +64,6 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(fill_constant, ops::FillConstantOp,
                             ops::FillConstantOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    fill_constant,
-    ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>);
+    fill_constant, ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>,
+    ops::FillConstantOpKernel<paddle::platform::CPUPlace, double>,
+    ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>);
--- a/paddle/operators/fill_constant_op.cu
+++ b/paddle/operators/fill_constant_op.cu
@@ -18,5 +18,6 @@

 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    fill_constant,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>);
+    fill_constant, ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>,
+    ops::FillConstantOpKernel<paddle::platform::GPUPlace, double>,
+    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>);
--- a/paddle/operators/fill_constant_op.h
+++ b/paddle/operators/fill_constant_op.h
@@ -25,7 +25,7 @@ class FillConstantOpKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out = ctx.Output<framework::Tensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());
-    auto value = ctx.Attr<T>("value");
+    auto value = ctx.Attr<float>("value");

    auto out_eigen = framework::EigenVector<T>::Flatten(*out);
    auto place = ctx.GetEigenDevice<Place>();

--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
@@ -171,8 +171,7 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(
        weight_width, frame_size * 3,
        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
-    auto bias = Input("Bias");
-    if (bias != framework::kEmptyVarName) {
+    if (ctx->HasInput("Bias")) {
      auto bias_dims = ctx->GetInputDim("Bias");
      int bias_height = bias_dims[0];
      int bias_width = bias_dims[1];
@@ -203,6 +202,8 @@ namespace ops = paddle::operators;
 REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad,
            ops::GRUUnitGradOp);
 REGISTER_OP_CPU_KERNEL(gru_unit,
-                       ops::GRUUnitKernel<paddle::platform::CPUPlace, float>);
+                       ops::GRUUnitKernel<paddle::platform::CPUPlace, float>,
+                       ops::GRUUnitKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::CPUPlace, float>);
+    gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GRUUnitGradKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/gru_unit_op.cu
+++ b/paddle/operators/gru_unit_op.cu
@@ -17,6 +17,8 @@

 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(gru_unit,
-                       ops::GRUUnitKernel<paddle::platform::GPUPlace, float>);
+                       ops::GRUUnitKernel<paddle::platform::GPUPlace, float>,
+                       ops::GRUUnitKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
-    gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::GPUPlace, float>);
+    gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GRUUnitGradKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/huber_loss_op.cc
+++ b/paddle/operators/huber_loss_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/huber_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class HuberLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must be initialized.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE_EQ(x_dims, y_dims);
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2,
+                      "The rank of Input(X) must be 2 and the shape is "
+                      "[batch_size, 1].");
+    PADDLE_ENFORCE_EQ(x_dims[1], 1,
+                      "Each row of Input(X) contains a real value, "
+                      "so the 2nd dimension of Input(X) must be 1.");
+
+    ctx->SetOutputDim("Residual", x_dims);
+    ctx->SetOutputDim("Out", {x_dims[0], 1});
+    ctx->ShareLoD("X", "Out");
+  }
+};
+
+template <typename AttrType>
+class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  HuberLossOpMaker(framework::OpProto* proto,
+                   framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input value of huber loss op."
+             "X is a 2-D tensor with shape [batch_size, 1].");
+    AddInput("Y",
+             "The target value of huber loss op."
+             "Y is a 2-D tensor with shape [batch_size, 1].");
+    AddOutput("Residual",
+              "Intermediate tensor to cache residual value between Y and X."
+              "The shape is same as Input(X) and will be reused in backward.")
+        .AsIntermediate();
+    AddOutput("Out",
+              "The output tensor with shape [batch_size, 1] which represents "
+              "the huber loss.");
+    AddAttr<AttrType>("delta", "Hyper parameter in huber loss.");
+    AddComment(R"DOC(
+Huber loss is a loss function used in robust regression. We define X as the
+input value and Y as the target value. Huber loss can evaluate the fitness of
+X to Y. Different from MSE loss, Huber loss is more robust for outliers. The
+shape of X and Y are [batch_size, 1]. The equation is:
+
+L_{\delta}(y, f(x)) =
+\begin{cases}
+0.5 * (y - f(x))^2, \quad |y - f(x)| \leq \delta \\
+\delta * (|y - f(x)| - 0.5 * \delta),   \quad otherwise
+\end{cases}
+
+)DOC");
+  }
+};
+
+class HuberLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Residual"),
+                   "Input(Residual) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto residual_dims = ctx->GetInputDim("Residual");
+    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_EQ(residual_dims, x_dims);
+    PADDLE_ENFORCE_EQ(out_grad_dims, x_dims);
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
+            huber_loss_grad, ops::HuberLossGradOp);
+REGISTER_OP_CPU_KERNEL(huber_loss,
+                       ops::HuberLossKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    huber_loss_grad,
+    ops::HuberLossGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/huber_loss_op.cu
+++ b/paddle/operators/huber_loss_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/huber_loss_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(huber_loss,
+                       ops::HuberLossKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    huber_loss_grad,
+    ops::HuberLossGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/huber_loss_op.h
+++ b/paddle/operators/huber_loss_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename T>
+struct HuberLossForward {
+  HOSTDEVICE HuberLossForward(const T& delta) : delta(delta) {}
+
+  HOSTDEVICE T operator()(const T& val) const {
+    T abs_val = std::abs(val);
+    if (abs_val <= delta) {
+      return static_cast<T>(0.5) * val * val;
+    } else {
+      return delta * (abs_val - static_cast<T>(0.5) * delta);
+    }
+  }
+
+  T delta;
+};
+
+template <typename Place, typename T, typename AttrType = T>
+class HuberLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto* in1 = context.Input<Tensor>("Y");
+    auto* out0 = context.Output<Tensor>("Residual");
+    auto* out1 = context.Output<Tensor>("Out");
+    auto delta = static_cast<T>(context.Attr<AttrType>("delta"));
+    auto place = context.GetEigenDevice<Place>();
+
+    auto x = EigenVector<T>::Flatten(*in0);
+    auto y = EigenVector<T>::Flatten(*in1);
+    out0->mutable_data<T>(context.GetPlace());
+    auto residual = EigenVector<T>::Flatten(*out0);
+    residual.device(place) = y - x;
+    out1->mutable_data<T>(context.GetPlace());
+    auto loss = EigenVector<T>::Flatten(*out1);
+    loss.device(place) = residual.unaryExpr(HuberLossForward<T>(delta));
+  }
+};
+
+template <typename T>
+struct HuberLossBackward {
+  HOSTDEVICE HuberLossBackward(const T& delta, T sign)
+      : sign(sign), delta(delta) {}
+
+  HOSTDEVICE T operator()(const T& val) const {
+    T abs_val = std::abs(val);
+    if (abs_val <= delta) {
+      return sign * val;
+    } else {
+      if (val > 0) {
+        return sign * delta;
+      } else {
+        return -1 * sign * delta;
+      }
+    }
+  }
+
+  T sign;
+  T delta;
+};
+
+template <typename Place, typename T, typename AttrType = T>
+class HuberLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("Residual");
+    auto* in1 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* out1 = context.Output<Tensor>(framework::GradVarName("Y"));
+    auto delta = static_cast<T>(context.op().Attr<AttrType>("delta"));
+    auto place = context.GetEigenDevice<Place>();
+
+    auto residual = EigenVector<T>::Flatten(*in0);
+    auto out_grad = EigenVector<T>::Flatten(*in1);
+
+    if (out0) {
+      out0->mutable_data<T>(context.GetPlace());
+      auto x_grad = EigenVector<T>::Flatten(*out0);
+      x_grad.device(place) =
+          out_grad * residual.unaryExpr(HuberLossBackward<T>(delta, -1.0));
+    }
+
+    if (out1) {
+      out1->mutable_data<T>(context.GetPlace());
+      auto y_grad = EigenVector<T>::Flatten(*out1);
+      y_grad.device(place) =
+          out_grad * residual.unaryExpr(HuberLossBackward<T>(delta, 1.0));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/l1_norm_op.cc
+++ b/paddle/operators/l1_norm_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/l1_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class L1NormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
+
+    ctx->SetOutputDim("Out", {1});
+  }
+};
+
+class L1NormGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should be not null.");
+
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+class L1NormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  L1NormOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) The input of l1_norm op.");
+    AddOutput("Out", "(Scalar) The output of l1_norm op.");
+    AddComment(R"DOC(
+L1 Norm Operator.
+
+Computes the L1 norm of a tensor.
+
+Out = sum (abs(X))
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad,
+            ops::L1NormGradOp);
+REGISTER_OP_CPU_KERNEL(l1_norm,
+                       ops::L1NormKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    l1_norm_grad, ops::L1NormGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/l1_norm_op.cu
+++ b/paddle/operators/l1_norm_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/l1_norm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(l1_norm,
+                       ops::L1NormKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    l1_norm_grad, ops::L1NormGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/l1_norm_op.h
+++ b/paddle/operators/l1_norm_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Out = sum(abs(X))
+template <typename Place, typename T>
+class L1NormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const framework::Tensor *X = context.Input<framework::Tensor>("X");
+    framework::Tensor *Out = context.Output<framework::Tensor>("Out");
+    Out->mutable_data<T>(context.GetPlace());
+
+    auto x = framework::EigenVector<T>::Flatten(*X);
+    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto place = context.GetEigenDevice<Place>();
+
+    out.device(place) = x.abs().sum();
+  }
+};
+
+// dX = dout * sign(X)
+template <typename Place, typename T>
+class L1NormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const framework::Tensor *x = context.Input<framework::Tensor>("X");
+    const framework::Tensor *d_out =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    PADDLE_ENFORCE(d_out->numel() == 1, "L1 Norm Gradient should be scalar");
+    framework::Tensor *dx =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    dx->mutable_data<T>(context.GetPlace());
+
+    auto x_eigen = framework::EigenVector<T>::Flatten(*x);
+    auto d_out_eigen = framework::EigenVector<T>::Flatten(*d_out);
+    auto dx_eigen = framework::EigenVector<T>::Flatten(*dx);
+    auto place = context.GetEigenDevice<Place>();
+
+    Eigen::DSizes<int, 1> x_dsize(x->numel());
+    dx_eigen.device(place) = d_out_eigen.broadcast(x_dsize) * x_eigen.sign();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -13,6 +13,7 @@
   limitations under the License. */

 #include "paddle/operators/lookup_table_op.h"
+#include "paddle/framework/var_type_inference.h"

 namespace paddle {
 namespace operators {
@@ -60,6 +61,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
             "Ids must be a column vector with rank = 2."
             "The 2nd dimension size must be 1");
    AddOutput("Out", "The lookup results, which have the same type with W.");
+    AddAttr<bool>("is_sparse", "Sparse update").SetDefault(false);
    AddComment(R"DOC(
 This operator is used to perform lookups on the parameter W,
 then concatenated into a dense tensor.
@@ -70,6 +72,15 @@ or not. And the output only shares the LoD with input `Ids`.
  }
 };

+class LookupTableOpGradDescMaker
+    : public framework::DefaultGradOpDescMaker<true> {
+  using ::paddle::framework::DefaultGradOpDescMaker<
+      true>::DefaultGradOpDescMaker;
+
+ protected:
+  virtual std::string GradOpType() const { return "lookup_table_grad"; }
+};
+
 class LookupTableOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -86,12 +97,35 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
  }
 };

+class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDescBind& op_desc,
+                  framework::BlockDescBind* block) const override {
+    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
+    auto attr = op_desc.GetAttr("is_sparse");
+    bool is_sparse = boost::get<bool>(attr);
+    if (is_sparse) {
+      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
+              << " is set to SelectedRows";
+      block->Var(out_var_name)->SetType(framework::VarDesc::SELECTED_ROWS);
+    } else {
+      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
+              << " is set to LoDTensor";
+      block->Var(out_var_name)->SetType(framework::VarDesc::LOD_TENSOR);
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(lookup_table, ops::LookupTableOp, ops::LookupTableOpMaker,
-            lookup_table_grad, ops::LookupTableOpGrad);
-
-REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel<float>);
-REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel<float>);
+REGISTER_OPERATOR(lookup_table, ops::LookupTableOp,
+                  ops::LookupTableOpGradDescMaker, ops::LookupTableOpMaker);
+REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad,
+                  ops::LookupTableOpGradVarTypeInference);
+
+REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel<float>,
+                       ops::LookupTableKernel<double>);
+REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel<float>,
+                       ops::LookupTableGradKernel<double>);
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
-
   http://www.apache.org/licenses/LICENSE-2.0
-
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -14,22 +11,21 @@

 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/lookup_table_op.h"
 #include "paddle/platform/assert.h"
 #include "paddle/platform/cuda_helper.h"

 namespace paddle {
 namespace operators {

-using Tensor = framework::Tensor;
-
 template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
-__global__ void LookupTable(T* output, const T* table, const int32_t* ids,
-                            const int N, const int K, const int D) {
+__global__ void LookupTable(T* output, const T* table, const int64_t* ids,
+                            const int64_t N, const int64_t K, const int64_t D) {
  int idx = threadIdx.x;
  int idy = blockIdx.x + threadIdx.y * GridDimX;

  while (idy < K) {
-    int id = ids[idy];
+    int64_t id = ids[idy];
    PADDLE_ASSERT(id >= 0);
    PADDLE_ASSERT(id < N);
    T* out = output + idy * D;
@@ -42,8 +38,9 @@ __global__ void LookupTable(T* output, const T* table, const int32_t* ids,
 }

 template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
-__global__ void LookupTableGrad(T* table, const T* output, const int32_t* ids,
-                                const int N, const int K, const int D) {
+__global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids,
+                                const int64_t N, const int64_t K,
+                                const int64_t D) {
  int idx = threadIdx.x;
  int idy = blockIdx.x + threadIdx.y * GridDimX;

@@ -71,7 +68,7 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
    size_t N = table_t->dims()[0];
    size_t D = table_t->dims()[1];
    size_t K = ids_t->numel();
-    auto ids = ids_t->data<int32_t>();
+    auto ids = ids_t->data<int64_t>();
    auto table = table_t->data<T>();
    auto output = output_t->mutable_data<T>(context.GetPlace());

@@ -88,27 +85,63 @@ template <typename T>
 class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto ids_t = context.Input<Tensor>("Ids");
-    auto d_output_t = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto d_table_t = context.Output<Tensor>(framework::GradVarName("W"));
-
-    int N = d_table_t->dims()[0];
-    int D = d_table_t->dims()[1];
-    int K = ids_t->numel();
-    const int32_t* ids = ids_t->data<int32_t>();
-    const T* d_output = d_output_t->data<T>();
-    T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
-
-    auto t = framework::EigenVector<T>::Flatten(*d_table_t);
-    t.device(context.GetEigenDevice<platform::GPUPlace>()) =
-        t.constant(static_cast<T>(0));
-
-    dim3 threads(128, 8);
-    dim3 grids(8, 1);
-    LookupTableGrad<T, 128, 8, 8><<<
-        grids, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+    bool is_sparse = context.Attr<bool>("is_sparse");
+    if (is_sparse) {
+      auto* ids = context.Input<Tensor>("Ids");
+      auto* table = context.Input<Tensor>("W");
+      auto* d_output = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
+
+      auto* ids_data = ids->data<int64_t>();
+      auto ids_dim = ids->dims();
+
+      auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                        context.device_context())
+                        .stream();
+      // copy GPU memory to CPU pinned memory
+      framework::Vector<int64_t> new_rows;
+      new_rows.resize(ids_dim[0]);
+      auto gpu_place = boost::get<platform::GPUPlace>(context.GetPlace());
+
+      memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data,
+                   ids_dim[0] * sizeof(int64_t), stream);
+
+      d_table->set_rows(new_rows);
+
+      auto* d_table_value = d_table->mutable_value();
+      d_table_value->Resize({ids_dim[0], table->dims()[1]});
+      d_table_value->mutable_data<T>(context.GetPlace());
+
+      auto* d_table_data = d_table_value->data<T>();
+      auto* d_output_data = d_output->data<T>();
+      PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
+      memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data,
+                   d_output->numel(), stream);
+
+    } else {
+      auto ids_t = context.Input<Tensor>("Ids");
+      auto d_output_t = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto d_table_t = context.Output<Tensor>(framework::GradVarName("W"));
+
+      int N = d_table_t->dims()[0];
+      int D = d_table_t->dims()[1];
+      int K = ids_t->numel();
+      const int64_t* ids = ids_t->data<int64_t>();
+      const T* d_output = d_output_t->data<T>();
+      T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
+
+      auto t = framework::EigenVector<T>::Flatten(*d_table_t);
+      t.device(context.GetEigenDevice<platform::GPUPlace>()) =
+          t.constant(static_cast<T>(0));
+
+      dim3 threads(128, 8);
+      dim3 grids(8, 1);
+      LookupTableGrad<T, 128, 8,
+                      8><<<grids, threads, 0,
+                           reinterpret_cast<const platform::CUDADeviceContext&>(
                               context.device_context())
                               .stream()>>>(d_table, d_output, ids, N, K, D);
+    }
  }
 };

@@ -116,6 +149,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(lookup_table, ops::LookupTableCUDAKernel<float>);
-REGISTER_OP_GPU_KERNEL(lookup_table_grad,
-                       ops::LookupTableGradCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(lookup_table, ops::LookupTableCUDAKernel<float>,
+                       ops::LookupTableCUDAKernel<double>);
+REGISTER_OP_GPU_KERNEL(lookup_table_grad, ops::LookupTableGradCUDAKernel<float>,
+                       ops::LookupTableGradCUDAKernel<double>);
--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/operators/lookup_table_op.h
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
-
   http://www.apache.org/licenses/LICENSE-2.0
-
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,12 +12,15 @@
 #pragma once

 #include "paddle/framework/eigen.h"
+#include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/framework/selected_rows.h"

 namespace paddle {
 namespace operators {

 using Tensor = framework::Tensor;
+using SelectedRows = framework::SelectedRows;

 template <typename T>
 class LookupTableKernel : public framework::OpKernel<T> {
@@ -32,7 +32,7 @@ class LookupTableKernel : public framework::OpKernel<T> {

    int N = table_t->dims()[0];
    int D = table_t->dims()[1];
-    auto ids = ids_t->data<int32_t>();
+    auto ids = ids_t->data<int64_t>();
    auto table = table_t->data<T>();
    auto output = output_t->mutable_data<T>(context.GetPlace());
    for (int64_t i = 0; i < ids_t->numel(); ++i) {
@@ -47,25 +47,55 @@ template <typename T>
 class LookupTableGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto ids_t = context.Input<Tensor>("Ids");
-    auto d_output_t = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto d_table_t = context.Output<Tensor>(framework::GradVarName("W"));
+    bool is_sparse = context.Attr<bool>("is_sparse");
+    if (is_sparse) {
+      auto* ids = context.Input<Tensor>("Ids");
+      auto* table = context.Input<Tensor>("W");
+      auto* d_output = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* d_table = context.Output<SelectedRows>(framework::GradVarName("W"));

-    int N = d_table_t->dims()[0];
-    int D = d_table_t->dims()[1];
-    auto ids = ids_t->data<int32_t>();
-    const T* d_output = d_output_t->data<T>();
-    T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
+      auto* ids_data = ids->data<int64_t>();
+      auto ids_dim = ids->dims();

-    auto t = framework::EigenVector<T>::Flatten(*d_table_t);
-    t.device(context.GetEigenDevice<platform::CPUPlace>()) =
-        t.constant(static_cast<T>(0));
+      framework::Vector<int64_t> new_rows;
+      new_rows.reserve(ids_dim[0]);
+      for (int64_t i = 0; i < ids_dim[0]; i++) {
+        new_rows.push_back(ids_data[i]);
+      }
+      d_table->set_rows(new_rows);

-    for (int64_t i = 0; i < ids_t->numel(); ++i) {
-      PADDLE_ENFORCE_LT(ids[i], N);
-      PADDLE_ENFORCE_GE(ids[i], 0);
-      for (int j = 0; j < D; ++j) {
-        d_table[ids[i] * D + j] += d_output[i * D + j];
+      auto* d_table_value = d_table->mutable_value();
+      d_table_value->Resize({ids_dim[0], table->dims()[1]});
+      d_table_value->mutable_data<T>(context.GetPlace());
+
+      d_table->set_height(table->dims()[0]);
+
+      auto* d_output_data = d_output->data<T>();
+      auto* d_table_data = d_table_value->data<T>();
+
+      PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
+      memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
+    } else {
+      auto* ids = context.Input<Tensor>("Ids");
+      auto* d_output = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* d_table = context.Output<Tensor>(framework::GradVarName("W"));
+      auto* table = context.Input<Tensor>("W");
+
+      auto* ids_data = ids->data<int64_t>();
+      auto ids_dim = ids->dims();
+
+      int N = table->dims()[0];
+      int D = d_output->dims()[1];
+
+      auto* d_output_data = d_output->data<T>();
+      auto* d_table_data = d_table->mutable_data<T>(context.GetPlace());
+
+      for (int64_t i = 0; i < ids->numel(); ++i) {
+        PADDLE_ENFORCE_LT(ids_data[i], N);
+        PADDLE_ENFORCE_GE(ids_data[i], 0);
+        for (int j = 0; j < D; ++j) {
+          d_table_data[ids_data[i] * D + j] = d_output_data[i * D + j];
+        }
      }
    }
  }

--- a/paddle/operators/lrn_op.cc
+++ b/paddle/operators/lrn_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/lrn_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class LRNOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LRNOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LRNOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MidOut"),
+                   "MidOut(Out) of LRNOp should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4.");
+
+    ctx->SetOutputDim("Out", x_dim);
+    ctx->SetOutputDim("MidOut", x_dim);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+template <typename T>
+class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LRNOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", R"DOC(
+ (Tensor) The input of LRN operator. It must be a 4D tenor with NCHW format.
+ )DOC");
+
+    AddOutput("Out",
+              "(Tensor) The output of LRN operator, which is also the 4D "
+              "tensor with NCHW format.");
+    AddOutput("MidOut", R"Doc(
+(Tensor)Middle result of lrn op.It's computed in forward process 
+and also used in backward process.
+    )Doc");
+
+    AddAttr<int>("n", R"DOC(
+(int, default 5)n is “adjacent” kernel maps at the same spatial position.
+        )DOC")
+        .SetDefault(5)
+        .GreaterThan(0);
+
+    AddAttr<T>("k", R"DOC(
+(float, default 2.0)k is the bias.
+        )DOC")
+        .SetDefault(2.0)
+        .GreaterThan(0.0);
+
+    AddAttr<T>("alpha", R"DOC(
+(float, default 0.0001)alpha is the scale number.
+        )DOC")
+        .SetDefault(0.0001)
+        .GreaterThan(0.0);
+
+    AddAttr<T>("beta", R"DOC(
+(float, default 0.75)beta is the power number.
+        )DOC")
+        .SetDefault(0.75)
+        .GreaterThan(0.0);
+
+    AddComment(R"DOC(
+ Local Response Normalization.
+
+ This Function comes from the paper
+ "ImageNet Classification with Deep Convolutional Neural Networks".
+
+ The original formula is:
+
+                                Input(i, x, y)
+ Output(i, x, y) = ----------------------------------------------
+                                 -- upper
+                    (k + alpha * >  (Input(j, x, y))^2) ^ (beta)
+                                 -- j = lower
+
+ upper is `min(C, c + n/2)`
+ lower if `max(0, c - n/2)`
+
+ Function implementation:
+
+ inputs and outpus is NCHW format, while input.shape.ndims() is equal 4.
+ And the meaning of each dimension(0-3) is respectively batch size,
+ feature maps, rows and columns.
+
+ Input and Output in the above formula is for each map(i) of one image, and
+ Input(i, x, y), Output(i, x, y) represents an element in an image.
+
+ C is the number of feature maps of one image, and n is a hyper-parameters
+ is configured when Function is initialized. The sum in the denominator
+ is the sum of the same position in the neighboring maps.
+    )DOC");
+  }
+};
+
+class LRNOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("MidOut")),
+                   "Input(MidOut@GRAD) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker<float>, lrn_grad, ops::LRNOpGrad);
+REGISTER_OP_CPU_KERNEL(lrn, ops::LRNKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(lrn_grad,
+                       ops::LRNGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/lrn_op.cu
+++ b/paddle/operators/lrn_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/lrn_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(lrn, ops::LRNKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(lrn_grad,
+                       ops::LRNGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/lrn_op.h
+++ b/paddle/operators/lrn_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   You may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class LRNKernel : public framework::OpKernel<T> {
+ public:
+  using Tensor = framework::Tensor;
+
+  // f(x) = x * ( k + alpha * SUM((x)^2) )^(-beta)
+  // x represents inputs
+  // f(x) represents outputs
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // input
+    const Tensor* x = ctx.Input<Tensor>("X");
+    auto x_dims = x->dims();
+
+    // NCHW
+    int N = x_dims[0];
+    int C = x_dims[1];
+    int H = x_dims[2];
+    int W = x_dims[3];
+
+    Tensor* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+
+    // MidOut save the intermediate result for backward
+    Tensor* mid = ctx.Output<Tensor>("MidOut");
+    mid->mutable_data<T>(ctx.GetPlace());
+
+    int n = ctx.Attr<int>("n");
+    T alpha = ctx.Attr<float>("alpha");
+    T beta = ctx.Attr<float>("beta");
+    T k = ctx.Attr<float>("k");
+
+    PADDLE_ENFORCE(n > 0, "n should >= 0");
+    PADDLE_ENFORCE(alpha >= 0.0, "alpha should >= 0.0");
+    PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0");
+    PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0");
+
+    auto x_v = framework::EigenVector<T>::Flatten(*x);
+
+    const int start = -(n - 1) / 2;
+    const int end = start + n;
+
+    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
+    e_mid.device(ctx.GetEigenDevice<Place>()) = e_mid.constant(k);
+
+    auto e_x = framework::EigenTensor<T, 4>::From(*x);
+    for (int m = 0; m < N; m++) {
+      for (int i = 0; i < C; i++) {
+        for (int c = start; c <= end; c++) {
+          int ch = i + c;
+          if (ch >= 0 && ch < C) {
+            auto s = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+            auto r = e_x.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                               Eigen::array<int, 4>({{1, 1, H, W}}));
+
+            s.device(ctx.GetEigenDevice<Place>()) += alpha * r.square();
+          }
+        }
+      }
+    }
+
+    auto out_e = framework::EigenVector<T>::Flatten(*out);
+    out_e.device(ctx.GetEigenDevice<Place>()) =
+        x_v * e_mid.reshape(Eigen::DSizes<int, 1>(e_mid.size())).pow(-beta);
+  }
+};
+
+/**
+ * \brief Backward calculation for normalization with across maps.
+ *
+ * Function implementation:
+ *
+ * The implementation of this Function is derived from the
+ * CrossMapNormalFunc implementation.
+ *
+ * InputGrad = OutputGrad * denoms ^ (-beta)
+ *    -- upper
+ *  + > (OutputGrad * OutputValue * (-2 * alpha * beta) / MidOut) * InputValue
+ *    -- lower
+ *
+ * The data of inputs/outputs format is the same as the forward interface
+ * and is NCHW.
+ *
+ * The upper and lower is the same as forward. The logic of the sum
+ * is also the same as forward.
+ */
+template <typename Place, typename T>
+class LRNGradKernel : public framework::OpKernel<T> {
+ public:
+  using Tensor = framework::Tensor;
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* out = ctx.Input<Tensor>("Out");
+    const Tensor* out_g = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    const Tensor* mid = ctx.Input<Tensor>("MidOut");
+
+    auto x_g = ctx.Output<Tensor>(framework::GradVarName("X"));
+    x_g->mutable_data<T>(ctx.GetPlace());
+
+    auto x_g_e = framework::EigenVector<T>::Flatten(*x_g);
+    x_g_e.device(ctx.GetEigenDevice<Place>()) = x_g_e.constant(0.0);
+
+    auto x_dims = x->dims();
+    int N = x_dims[0];
+    int C = x_dims[1];
+    int H = x_dims[2];
+    int W = x_dims[3];
+
+    int n = ctx.Attr<int>("n");
+    T alpha = ctx.Attr<T>("alpha");
+    T beta = ctx.Attr<T>("beta");
+    T ratio = -2 * alpha * beta;
+
+    auto e_x = framework::EigenTensor<T, 4>::From(*x);
+    auto e_x_g = framework::EigenTensor<T, 4>::From(*x_g);
+    auto e_out = framework::EigenTensor<T, 4>::From(*out);
+    auto e_out_g = framework::EigenTensor<T, 4>::From(*out_g);
+    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
+
+    const int start = -(n - 1) / 2;
+    const int end = start + n;
+    for (int m = 0; m < N; m++) {
+      for (int i = 0; i < C; i++) {
+        auto i_x = e_x.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                             Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_x_g = e_x_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                     Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_mid = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        i_x_g.device(ctx.GetEigenDevice<Place>()) = i_mid.pow(-beta) * i_out_g;
+        for (int c = start; c <= end; c++) {
+          int ch = i + c;
+          if (ch < 0 || ch >= C) {
+            continue;
+          }
+
+          auto c_out = e_out.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                   Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          auto c_mid = e_mid.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                   Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          auto c_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                       Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          i_x_g.device(ctx.GetEigenDevice<Place>()) +=
+              ratio * c_out_g * c_out * i_x / c_mid;
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -9,6 +9,7 @@ if(WITH_GPU)
    nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
    nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
+    nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context)
    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
    nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
 else()
@@ -18,6 +19,7 @@ else()
    cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
    cc_library(pooling SRCS pooling.cc DEPS device_context)
    cc_library(vol2col SRCS vol2col.cc DEPS device_context)
+    cc_library(context_project SRCS context_project.cc DEPS device_context)
    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
    cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
 endif()

--- a/paddle/operators/math/context_project.cc
+++ b/paddle/operators/math/context_project.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/context_project.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template class ContextProjectFunctor<platform::CPUPlace, float>;
+template class ContextProjectFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/context_project.cu
+++ b/paddle/operators/math/context_project.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/math/context_project.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template class ContextProjectFunctor<platform::GPUPlace, float>;
+template class ContextProjectFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/context_project.h
+++ b/paddle/operators/math/context_project.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/operators/math/im2col.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+/*
+ * \brief Context projection concatenate features in adjacent time steps in
+ * a sequence. The i-th row of the output is the concatenation of
+ * context_length rows of the input. The context_length rows are the
+ * consecutive rows from the i+shift_start row.
+
+ * \param in            Input data.
+ * \param Shape         The shape of Input data,
+ *                      [minibatch, number_of_input_features].
+ * \param type          A float LoDTensor.
+ *
+ * \param padding_data  Padding data.
+ * \param Shape         The shape of Padding data,
+ *                      [up_pad + down_pad, number_of_input_features].
+ * \param type          A float Tensor.
+ *
+ * \param col           Col data.
+ * \param Shape         The shape of Col data,
+ *                      [minibatch, context_length * number_of_input_features].
+ * \param type           A float Tensor.
+ *
+ * For a mini-batch of 2 variable lengths sentences, containing 3, and 1
+ * time-steps:
+ *
+ * Assumed input (X) is a [4, M, N] float LoDTensor, and X->lod()[0] = [0, 3,
+ * 4].
+ * Besides, for the sake of simplicity, we assume M=1 and N=2.
+ *
+ * X = [[a1, a2;
+ *       b1, b2;
+ *       c1, c2]
+ *      [d1, d2]]
+ *
+ * This is to say that input (X) has 4 words and the dimension of each word
+ * representation is 2.
+ *
+ * - Case1:
+ * If context_start is -1 and padding_trainable is false, we use zero to pad
+ * instead of learned weight to pad,
+ * and the context_lenth is 3, the output (Out) is:
+ *
+ * Out =[[0,  0,  a1, a2, b1, b2;
+ *        a1, a2, b1, b2, c1, c2;
+ *        b1, b2, c1, c2, 0,  0 ]
+ *       [0,  0,  d1, d2, 0,  0 ]]
+ *
+ * - Case2:
+ * If context_start is -1 and padding_trainable is true, we use learned weight
+ * to pad,
+ * and the context_lenth is 3, the output (Out) is:
+ *
+ * Out = [[w1, w2, a1, a2, b1, b2;
+ *         a1, a2, b1, b2, c1, c2;
+ *         b1, b2, c1, c2, w3, w4]
+ *        [w1, w2, d1, d2, w3, w4]]
+ *
+ */
+
+template <typename Place, typename T>
+class ContextProjectFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  framework::LoDTensor& in, framework::Tensor& padding_data,
+                  framework::Tensor& col, bool padding_trainable,
+                  int context_start, int context_length, int context_stride,
+                  int up_pad, int down_pad, bool gradient, bool input_grad,
+                  bool pad_grad) {
+    auto lod_level_0 = in.lod()[0];
+
+    paddle::operators::math::Im2ColFunctor<
+        paddle::operators::math::ColFormat::kOCF, Place, float>
+        im2col_ocf;
+    paddle::operators::math::Col2ImFunctor<
+        paddle::operators::math::ColFormat::kOCF, Place, float>
+        col2im_ocf;
+
+    int input_row_begin, input_row_end;
+    int sequence_height, sequence_width;
+    sequence_width = in.dims()[1];
+    input_grad = gradient && input_grad;
+    pad_grad = gradient && pad_grad;
+
+    if (!gradient || input_grad) {
+      for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
+        input_row_begin = (context_start > 0)
+                              ? static_cast<int>(lod_level_0[i]) + context_start
+                              : static_cast<int>(lod_level_0[i]);
+        input_row_end = static_cast<int>(lod_level_0[i + 1]);
+
+        framework::Tensor out_t =
+            col.Slice(static_cast<int>(lod_level_0[i]),
+                      static_cast<int>(lod_level_0[i + 1]));
+
+        sequence_height = static_cast<int>(out_t.dims()[0]);
+
+        if (input_row_begin < input_row_end) {
+          framework::Tensor in_t = in.Slice(input_row_begin, input_row_end);
+
+          std::vector<int64_t> output_shape(
+              {sequence_height, 1, 1, context_length,
+               sequence_width});  // output_height, output_width,
+          // input_channels, filter_height, filter_width
+
+          out_t.Resize(framework::make_ddim(output_shape));
+
+          std::vector<int64_t> input_shape(
+              {1, input_row_end - input_row_begin,
+               sequence_width});  // input_channels, input_height, input_width
+          in_t.Resize(framework::make_ddim(input_shape));
+
+          if (gradient) {
+            col2im_ocf(context, in_t, out_t,
+                       /*stride_height*/ context_stride, /*stride_width*/ 1,
+                       up_pad, down_pad, 0, 0);
+          } else {
+            im2col_ocf(context, in_t, out_t,
+                       /*stride_height*/ context_stride, /*stride_width*/ 1,
+                       up_pad, down_pad, 0, 0);
+          }
+          out_t.Resize({sequence_height, context_length * sequence_width});
+        }
+      }
+    }
+    if (!gradient || pad_grad) {
+      if (padding_trainable) {
+        for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
+          framework::Tensor out_t =
+              col.Slice(static_cast<int>(lod_level_0[i]),
+                        static_cast<int>(lod_level_0[i + 1]));
+
+          sequence_height = static_cast<int>(out_t.dims()[0]);
+
+          // add up trainable data
+          out_t.Resize({sequence_height * context_length, sequence_width});
+
+          if (up_pad > 0) {  // add up pad
+            int padding_rows = std::min(
+                up_pad, static_cast<int>(lod_level_0[i + 1] - lod_level_0[i]));
+
+            for (int k = 0; k < padding_rows; ++k) {
+              int padding_size =
+                  k + context_length < up_pad ? context_length : up_pad - k;
+              framework::Tensor out_t_sub = out_t.Slice(
+                  k * context_length, k * context_length + padding_size);
+              framework::Tensor w_sub = padding_data.Slice(k, k + padding_size);
+              // in this block, using EigenVector<T>::Flatten is ok too.
+              auto out_t_sub_e = EigenMatrix<T>::From(out_t_sub);
+              auto w_sub_e = EigenMatrix<T>::From(w_sub);
+              if (gradient) {
+                w_sub_e.device(*context.GetEigenDevice<Place>()) =
+                    w_sub_e + out_t_sub_e;
+              } else {
+                out_t_sub_e.device(*context.GetEigenDevice<Place>()) = w_sub_e;
+              }
+            }
+          }
+          if (down_pad > 0) {  // add down pad
+            int down_pad_begin_row =
+                std::max(
+                    0, (sequence_height - context_start - context_length) + 1) +
+                1;
+            int padding_begin = std::max(0, context_start - sequence_height);
+            int padding_size =
+                sequence_height - context_start >= context_length
+                    ? 1
+                    : context_length - (sequence_height - context_start);
+            if (context_start >= sequence_height) padding_size = context_length;
+            int padding_idx = padding_begin;
+            for (int t = 0; t + down_pad_begin_row <= sequence_height;
+                 ++t, ++padding_size) {
+              if (context_start >= sequence_height)
+                padding_size = context_length;
+              if (padding_size > context_length) {
+                padding_size = context_length;
+                padding_idx++;
+              }
+              if (padding_begin > 0 || sequence_height == context_start)
+                padding_idx = padding_begin + t;
+              framework::Tensor out_t_sub = out_t.Slice(
+                  (down_pad_begin_row + t) * context_length - padding_size,
+                  (down_pad_begin_row + t) * context_length);
+              framework::Tensor w_sub = padding_data.Slice(
+                  up_pad + padding_idx, up_pad + padding_idx + padding_size);
+              auto out_t_sub_e = EigenMatrix<T>::From(out_t_sub);
+              auto w_sub_e = EigenMatrix<T>::From(w_sub);
+              if (gradient) {
+                w_sub_e.device(*context.GetEigenDevice<Place>()) =
+                    w_sub_e + out_t_sub_e;
+              } else {
+                out_t_sub_e.device(*context.GetEigenDevice<Place>()) = w_sub_e;
+              }
+            }
+          }
+          out_t.Resize({sequence_height, context_length * sequence_width});
+        }
+      }
+    }
+  }
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/cross_entropy.cc
+++ b/paddle/operators/math/cross_entropy.cc
@@ -44,7 +44,7 @@ class CrossEntropyFunctor<platform::CPUPlace, T> {
      const T* prob_data = prob->data<T>();
      T* loss_data = out->data<T>();

-      const int* label_data = labels->data<int>();
+      const int64_t* label_data = labels->data<int64_t>();
      for (int i = 0; i < batch_size; ++i) {
        int index = i * class_num + label_data[i];
        loss_data[i] = -math::TolerableValue<T>()(std::log(prob_data[index]));

--- a/paddle/operators/math/cross_entropy.cu
+++ b/paddle/operators/math/cross_entropy.cu
@@ -20,7 +20,7 @@ namespace math {

 namespace {
 template <typename T>
-__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
+__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
                                   const int N, const int D) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += blockDim.x * gridDim.x) {
@@ -115,7 +115,7 @@ class CrossEntropyFunctor<platform::GPUPlace, T> {
          reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
          loss_data, prob_data, label_data, class_num);
    } else {
-      const int* label_data = labels->data<int>();
+      const int64_t* label_data = labels->data<int64_t>();
      int block = 512;
      int grid = (batch_size + block - 1) / block;
      CrossEntropyKernel<T><<<

--- a/paddle/operators/math/selected_rows_functor.cc
+++ b/paddle/operators/math/selected_rows_functor.cc
@@ -68,6 +68,7 @@ struct SelectedRowsAdd<platform::CPUPlace, T> {
 };

 template struct SelectedRowsAdd<platform::CPUPlace, float>;
+template struct SelectedRowsAdd<platform::CPUPlace, double>;

 template <typename T>
 struct SelectedRowsAddTensor<platform::CPUPlace, T> {
@@ -108,6 +109,72 @@ struct SelectedRowsAddTensor<platform::CPUPlace, T> {
 };

 template struct SelectedRowsAddTensor<platform::CPUPlace, float>;
+template struct SelectedRowsAddTensor<platform::CPUPlace, double>;
+
+template <typename T>
+struct SelectedRowsAddTo<platform::CPUPlace, T> {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const int64_t input2_offset,
+                  framework::SelectedRows* input2) {
+    auto in1_height = input1.height();
+    PADDLE_ENFORCE_EQ(in1_height, input2->height());
+
+    auto& in1_rows = input1.rows();
+    auto& in2_rows = *(input2->mutable_rows());
+
+    auto& in1_value = input1.value();
+    auto* in2_value = input2->mutable_value();
+
+    // concat rows
+    in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
+
+    auto in1_place = input1.place();
+    PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
+    auto in2_place = input2->place();
+    PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
+
+    auto* in1_data = in1_value.data<T>();
+    auto* in2_data = in2_value->data<T>();
+    memory::Copy(boost::get<platform::CPUPlace>(in2_place),
+                 in2_data + input2_offset,
+                 boost::get<platform::CPUPlace>(in1_place), in1_data,
+                 in1_value.numel() * sizeof(T));
+  }
+};
+
+template struct SelectedRowsAddTo<platform::CPUPlace, float>;
+template struct SelectedRowsAddTo<platform::CPUPlace, double>;
+
+template <typename T>
+struct SelectedRowsAddToTensor<platform::CPUPlace, T> {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  framework::Tensor* input2) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+
+    auto* in1_data = in1_value.data<T>();
+    auto* input2_data = input2->data<T>();
+
+    for (size_t i = 0; i < in1_rows.size(); i++) {
+      for (int64_t j = 0; j < in1_row_numel; j++) {
+        input2_data[in1_rows[i] * in1_row_numel + j] +=
+            in1_data[i * in1_row_numel + j];
+      }
+    }
+  }
+};
+
+template struct SelectedRowsAddToTensor<platform::CPUPlace, float>;
+template struct SelectedRowsAddToTensor<platform::CPUPlace, double>;

 }  // namespace math
 }  // namespace operators

--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
@@ -73,12 +73,13 @@ struct SelectedRowsAdd<platform::GPUPlace, T> {
 };

 template struct SelectedRowsAdd<platform::GPUPlace, float>;
+template struct SelectedRowsAdd<platform::GPUPlace, double>;

 namespace {
-template <typename T>
+template <typename T, int block_size>
 __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
                                            const int64_t* rows, T* tensor_out,
-                                            int64_t row_numel, int block_size) {
+                                            int64_t row_numel) {
  const int ty = blockIdx.y;
  int tid = threadIdx.x;

@@ -119,14 +120,13 @@ struct SelectedRowsAddTensor<platform::GPUPlace, T> {
    SetConstant<platform::GPUPlace, T> functor;
    functor(context, output, 0.0);

-    int block_size = 256;
+    const int block_size = 256;
    dim3 threads(block_size, 1);
    dim3 grid(1, in1_rows.size());
-    SelectedRowsAddTensorKernel<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(in1_data, in1_rows.data(), out_data,
-                              in1_row_numel, block_size);
+    SelectedRowsAddTensorKernel<T, block_size><<<
+        grid, threads, 0,
+        reinterpret_cast<const platform::CUDADeviceContext&>(context)
+            .stream()>>>(in1_data, in1_rows.data(), out_data, in1_row_numel);

    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
@@ -136,6 +136,93 @@ struct SelectedRowsAddTensor<platform::GPUPlace, T> {
 };

 template struct SelectedRowsAddTensor<platform::GPUPlace, float>;
+template struct SelectedRowsAddTensor<platform::GPUPlace, double>;
+
+template <typename T>
+struct SelectedRowsAddTo<platform::GPUPlace, T> {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const int64_t input2_offset,
+                  framework::SelectedRows* input2) {
+    auto in1_height = input1.height();
+    PADDLE_ENFORCE_EQ(in1_height, input2->height());
+
+    auto& in1_rows = input1.rows();
+    auto& in2_rows = *(input2->mutable_rows());
+
+    auto& in1_value = input1.value();
+    auto* in2_value = input2->mutable_value();
+
+    // concat rows
+    in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
+
+    auto in1_place = input1.place();
+    PADDLE_ENFORCE(platform::is_gpu_place(in1_place));
+    auto in2_place = input2->place();
+    PADDLE_ENFORCE(platform::is_gpu_place(in2_place));
+
+    auto* in1_data = in1_value.data<T>();
+    auto* in2_data = in2_value->data<T>();
+    memory::Copy(
+        boost::get<platform::GPUPlace>(in2_place), in2_data + input2_offset,
+        boost::get<platform::GPUPlace>(in1_place), in1_data,
+        in1_value.numel() * sizeof(T),
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
+  }
+};
+
+template struct SelectedRowsAddTo<platform::GPUPlace, float>;
+template struct SelectedRowsAddTo<platform::GPUPlace, double>;
+
+namespace {
+template <typename T, int block_size>
+__global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
+                                              const int64_t* rows,
+                                              T* tensor_out,
+                                              int64_t row_numel) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  selected_rows += ty * row_numel;
+  tensor_out += rows[ty] * row_numel;
+
+  for (int index = tid; index < row_numel; index += block_size) {
+    // Since index in rows of SelectedRows can be duplicate, we have to use
+    // Atomic Operation to avoid concurrent write error.
+    paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
+  }
+}
+}  // namespace
+
+template <typename T>
+struct SelectedRowsAddToTensor<platform::GPUPlace, T> {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  framework::Tensor* input2) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+
+    auto* in1_data = in1_value.data<T>();
+    auto* in2_data = input2->data<T>();
+    const int block_size = 256;
+    dim3 threads(block_size, 1);
+    dim3 grid(1, in1_rows.size());
+    SelectedRowsAddToTensorKernel<T, block_size><<<
+        grid, threads, 0,
+        reinterpret_cast<const platform::CUDADeviceContext&>(context)
+            .stream()>>>(in1_data, in1_rows.data(), in2_data, in1_row_numel);
+  }
+};
+
+template struct SelectedRowsAddToTensor<platform::GPUPlace, float>;
+template struct SelectedRowsAddToTensor<platform::GPUPlace, double>;

 }  // namespace math
 }  // namespace operators

--- a/paddle/operators/math/selected_rows_functor.h
+++ b/paddle/operators/math/selected_rows_functor.h
@@ -36,6 +36,22 @@ struct SelectedRowsAddTensor {
                  const framework::Tensor& input2, framework::Tensor* output);
 };

+// input2 = input1 + input2
+template <typename Place, typename T>
+struct SelectedRowsAddTo {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const int64_t input2_offset, framework::SelectedRows* input2);
+};
+
+// input2 = input1 + input2
+template <typename Place, typename T>
+struct SelectedRowsAddToTensor {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  framework::Tensor* input2);
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/math/selected_rows_functor_test.cc
+++ b/paddle/operators/math/selected_rows_functor_test.cc
@@ -104,3 +104,91 @@ TEST(selected_rows_functor, cpu_add) {
  // row9: 2.0 + 3.0
  EXPECT_EQ(tensor2_data[9 * row_numel + 6], 5.0);
 }
+
+TEST(selected_rows_functor, cpu_add_to) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::operators::math;
+
+  CPUPlace cpu_place;
+  CPUDeviceContext ctx(cpu_place);
+  SetConstant<CPUPlace, float> functor;
+  int64_t height = 10;
+  int64_t row_numel = 10;
+
+  std::vector<int64_t> rows1{0, 4, 7};
+  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  auto* in1_value = selected_rows1->mutable_value();
+  in1_value->mutable_data<float>(
+      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), cpu_place);
+  functor(ctx, in1_value, 1.0);
+
+  std::vector<int64_t> rows2{0, 5, 7, 9};
+  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  auto* in2_value = selected_rows2->mutable_value();
+  in2_value->mutable_data<float>(
+      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), cpu_place);
+  functor(ctx, in2_value, 2.0);
+
+  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  output->set_height(height);
+  auto* out_value = output->mutable_value();
+
+  // simplely concat two SelectedRows
+  out_value->mutable_data<float>(make_ddim({7, 10}), cpu_place);
+
+  SelectedRowsAddTo<CPUPlace, float> add_to_functor;
+  add_to_functor(ctx, *selected_rows1, 0, output.get());
+  add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get());
+
+  auto out_height = output->height();
+  EXPECT_EQ(out_height, height);
+
+  auto& out_rows = output->rows();
+
+  // input1 rows
+  EXPECT_EQ(out_rows[0], 0);
+  EXPECT_EQ(out_rows[1], 4);
+  EXPECT_EQ(out_rows[2], 7);
+  // input2 rows
+  EXPECT_EQ(out_rows[3], 0);
+  EXPECT_EQ(out_rows[4], 5);
+  EXPECT_EQ(out_rows[5], 7);
+  EXPECT_EQ(out_rows[6], 9);
+
+  auto* out_data = output->value().data<float>();
+  // input1 value
+  EXPECT_EQ(out_data[0 * row_numel + 0], 1.0);
+  EXPECT_EQ(out_data[0 * row_numel + 8], 1.0);
+  EXPECT_EQ(out_data[1 * row_numel + 1], 1.0);
+  EXPECT_EQ(out_data[2 * row_numel + 6], 1.0);
+  // input2 value
+  EXPECT_EQ(out_data[3 * row_numel + 3], 2.0);
+  EXPECT_EQ(out_data[3 * row_numel + 8], 2.0);
+  EXPECT_EQ(out_data[4 * row_numel + 4], 2.0);
+  EXPECT_EQ(out_data[5 * row_numel + 7], 2.0);
+  EXPECT_EQ(out_data[6 * row_numel + 9], 2.0);
+
+  std::unique_ptr<Tensor> tensor1{new Tensor()};
+  tensor1->mutable_data<float>(make_ddim({height, row_numel}), cpu_place);
+  functor(ctx, tensor1.get(), 3.0);
+
+  SelectedRowsAddToTensor<CPUPlace, float> add_to_tensor_functor;
+  add_to_tensor_functor(ctx, *output, tensor1.get());
+
+  auto* tensor1_data = tensor1->data<float>();
+  // row0: 1.0 + 2.0 + 3.0
+  EXPECT_EQ(tensor1_data[0 * row_numel + 0], 6.0);
+  // row1: 3.0
+  EXPECT_EQ(tensor1_data[1 * row_numel + 1], 3.0);
+  // row4 : 1.0 + 3.0
+  EXPECT_EQ(tensor1_data[4 * row_numel + 6], 4.0);
+  // row5: 2.0 + 3.0
+  EXPECT_EQ(tensor1_data[5 * row_numel + 7], 5.0);
+  // row6: 3.0
+  EXPECT_EQ(tensor1_data[6 * row_numel + 1], 3.0);
+  // row7: 1.0 + 2.0 + 3.0
+  EXPECT_EQ(tensor1_data[7 * row_numel + 3], 6.0);
+  // row9: 2.0 + 3.0
+  EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0);
+}
--- a/paddle/operators/math/selected_rows_functor_test.cu
+++ b/paddle/operators/math/selected_rows_functor_test.cu
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -71,7 +71,8 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
 REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
-REGISTER_OP_CPU_KERNEL(mean,
-                       ops::MeanKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel<paddle::platform::CPUPlace, float>,
+                       ops::MeanKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(mean_grad,
-                       ops::MeanGradKernel<paddle::platform::CPUPlace, float>);
+                       ops::MeanGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::MeanGradKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/mean_op.cu
+++ b/paddle/operators/mean_op.cu
@@ -17,7 +17,8 @@
 #include "paddle/operators/mean_op.h"

 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(mean,
-                       ops::MeanKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel<paddle::platform::GPUPlace, float>,
+                       ops::MeanKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(mean_grad,
-                       ops::MeanGradKernel<paddle::platform::GPUPlace, float>);
+                       ops::MeanGradKernel<paddle::platform::GPUPlace, float>,
+                       ops::MeanGradKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -19,11 +19,9 @@ namespace operators {

 using framework::Tensor;

-class MulOp : public framework::OperatorWithKernel {
+class MulOpShapeInference : public framework::InferShapeBase {
 public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void operator()(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -137,7 +135,10 @@ class MulOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker,
+                  ops::MulOpShapeInference,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(mul_grad, ops::MulOpGrad);
 REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(mul_grad,
                       ops::MulGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/nccl/CMakeLists.txt
+++ b/paddle/operators/nccl/CMakeLists.txt
+if(WITH_GPU)
+  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
+endif()
--- a/paddle/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/operators/nccl/nccl_gpu_common.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/nccl/nccl_gpu_common.h"
+#include "paddle/platform/gpu_info.h"
+
+namespace paddle {
+namespace platform {}  // namespace platform
+}  // namespace paddle
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
--- a/paddle/operators/nccl_op.cu
+++ b/paddle/operators/nccl_op.cu
--- a/paddle/operators/nccl_op_test.cu
+++ b/paddle/operators/nccl_op_test.cu
--- a/paddle/operators/pool_cudnn_op.cc
+++ b/paddle/operators/pool_cudnn_op.cc
--- a/paddle/operators/pool_cudnn_op.cu
+++ b/paddle/operators/pool_cudnn_op.cu
--- a/paddle/operators/pool_cudnn_op.h
+++ b/paddle/operators/pool_cudnn_op.h
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
--- a/paddle/operators/proximal_adagrad_op.cc
+++ b/paddle/operators/proximal_adagrad_op.cc
--- a/paddle/operators/proximal_adagrad_op.cu
+++ b/paddle/operators/proximal_adagrad_op.cu
--- a/paddle/operators/proximal_adagrad_op.h
+++ b/paddle/operators/proximal_adagrad_op.h
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
--- a/paddle/operators/scale_op.cu
+++ b/paddle/operators/scale_op.cu
--- a/paddle/operators/scale_op.h
+++ b/paddle/operators/scale_op.h
--- a/paddle/operators/seq_expand_op.cc
+++ b/paddle/operators/seq_expand_op.cc
--- a/paddle/operators/seq_expand_op.cu
+++ b/paddle/operators/seq_expand_op.cu
--- a/paddle/operators/seq_expand_op.h
+++ b/paddle/operators/seq_expand_op.h
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/operators/sequence_conv_op.cc
--- a/paddle/operators/sequence_conv_op.cu
+++ b/paddle/operators/sequence_conv_op.cu
--- a/paddle/operators/sequence_conv_op.h
+++ b/paddle/operators/sequence_conv_op.h
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
--- a/paddle/operators/sign_op.cc
+++ b/paddle/operators/sign_op.cc
--- a/paddle/operators/sign_op.cu
+++ b/paddle/operators/sign_op.cu
--- a/paddle/operators/sign_op.h
+++ b/paddle/operators/sign_op.h
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
--- a/paddle/operators/split_op.cc
+++ b/paddle/operators/split_op.cc
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
--- a/python/paddle/utils/merge_model.py
+++ b/python/paddle/utils/merge_model.py
--- a/python/paddle/v2/framework/executor.py
+++ b/python/paddle/v2/framework/executor.py
--- a/python/paddle/v2/framework/framework.py
+++ b/python/paddle/v2/framework/framework.py
--- a/python/paddle/v2/framework/io.py
+++ b/python/paddle/v2/framework/io.py
--- a/python/paddle/v2/framework/layer_helper.py
+++ b/python/paddle/v2/framework/layer_helper.py
--- a/python/paddle/v2/framework/layers.py
+++ b/python/paddle/v2/framework/layers.py
--- a/python/paddle/v2/framework/nets.py
+++ b/python/paddle/v2/framework/nets.py
--- a/python/paddle/v2/framework/optimizer.py
+++ b/python/paddle/v2/framework/optimizer.py
--- a/python/paddle/v2/framework/regularizer.py
+++ b/python/paddle/v2/framework/regularizer.py
--- a/python/paddle/v2/framework/tests/.gitignore
+++ b/python/paddle/v2/framework/tests/.gitignore
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
--- a/python/paddle/v2/framework/tests/test_auc_op.py
+++ b/python/paddle/v2/framework/tests/test_auc_op.py
--- a/python/paddle/v2/framework/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/framework/tests/test_batch_norm_op.py
--- a/python/paddle/v2/framework/tests/test_cast_op.py
+++ b/python/paddle/v2/framework/tests/test_cast_op.py
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/framework/tests/test_cond_op.py
--- a/python/paddle/v2/framework/tests/test_conv2d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_op.py
--- a/python/paddle/v2/framework/tests/test_conv2dtranspose_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2dtranspose_op.py
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
--- a/python/paddle/v2/framework/tests/test_dropout_op.py
+++ b/python/paddle/v2/framework/tests/test_dropout_op.py
--- a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
--- a/python/paddle/v2/framework/tests/test_fit_a_line.py
+++ b/python/paddle/v2/framework/tests/test_fit_a_line.py
--- a/python/paddle/v2/framework/tests/test_gru_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_gru_unit_op.py
--- a/python/paddle/v2/framework/tests/test_huber_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_huber_loss_op.py
--- a/python/paddle/v2/framework/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/framework/tests/test_image_classification_layer.py
--- a/python/paddle/v2/framework/tests/test_image_classification_train.py
+++ b/python/paddle/v2/framework/tests/test_image_classification_train.py
--- a/python/paddle/v2/framework/tests/test_infer_shape.py
+++ b/python/paddle/v2/framework/tests/test_infer_shape.py
--- a/python/paddle/v2/framework/tests/test_inference_model_io.py
+++ b/python/paddle/v2/framework/tests/test_inference_model_io.py
--- a/python/paddle/v2/framework/tests/test_l1_norm_op.py
+++ b/python/paddle/v2/framework/tests/test_l1_norm_op.py
--- a/python/paddle/v2/framework/tests/test_layers.py
+++ b/python/paddle/v2/framework/tests/test_layers.py
--- a/python/paddle/v2/framework/tests/test_lookup_table_op.py
+++ b/python/paddle/v2/framework/tests/test_lookup_table_op.py
--- a/python/paddle/v2/framework/tests/test_lrn_op.py
+++ b/python/paddle/v2/framework/tests/test_lrn_op.py
--- a/python/paddle/v2/framework/tests/test_lstm_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
--- a/python/paddle/v2/framework/tests/test_mnist.py
+++ b/python/paddle/v2/framework/tests/test_mnist.py
--- a/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py
--- a/python/paddle/v2/framework/tests/test_nccl_init_op.py
+++ b/python/paddle/v2/framework/tests/test_nccl_init_op.py
--- a/python/paddle/v2/framework/tests/test_operator_desc.py
+++ b/python/paddle/v2/framework/tests/test_operator_desc.py
--- a/python/paddle/v2/framework/tests/test_optimizer.py
+++ b/python/paddle/v2/framework/tests/test_optimizer.py
--- a/python/paddle/v2/framework/tests/test_pool2d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool2d_op.py
--- a/python/paddle/v2/framework/tests/test_pool3d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool3d_op.py
--- a/python/paddle/v2/framework/tests/test_pool_max_op.py
+++ b/python/paddle/v2/framework/tests/test_pool_max_op.py
--- a/python/paddle/v2/framework/tests/test_program.py
+++ b/python/paddle/v2/framework/tests/test_program.py
--- a/python/paddle/v2/framework/tests/test_proximal_adagrad_op.py
+++ b/python/paddle/v2/framework/tests/test_proximal_adagrad_op.py
--- a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
+++ b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
--- a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
--- a/python/paddle/v2/framework/tests/test_regularizer.py
+++ b/python/paddle/v2/framework/tests/test_regularizer.py
--- a/python/paddle/v2/framework/tests/test_seq_conv.py
+++ b/python/paddle/v2/framework/tests/test_seq_conv.py
--- a/python/paddle/v2/framework/tests/test_seq_expand.py
+++ b/python/paddle/v2/framework/tests/test_seq_expand.py
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/framework/tests/test_seq_pool.py
--- a/python/paddle/v2/framework/tests/test_sign_op.py
+++ b/python/paddle/v2/framework/tests/test_sign_op.py
--- a/python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py
--- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
--- a/python/paddle/v2/framework/tests/test_word2vec.py
+++ b/python/paddle/v2/framework/tests/test_word2vec.py
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py