Merge branch 'develop' into feature/is_in_gpu

8539222a · Yu Yang · GitHub · 1dc53a28 · 875946ff · 8539222a
45 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,10 +21,10 @@
    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
    hooks:
    -   id: clang-formater
-   repo: https://github.com/dnephin/pre-commit-golang
-    sha: e4693a4c282b4fc878eda172a929f7a6508e7d16
+-   repo: https://github.com/PaddlePaddle/pre-commit-golang
+    sha: 16398aeccf263adaf53b2495eed0406347d76281
    hooks:
      -   id: go-fmt
-          files: (.*\.go)
-      -   id: go-lint
-          files: (.*\.go)
+          types: [go]
+      -   id: gometalinter
+          types: [go]
--- a/.travis.yml
+++ b/.travis.yml
@@ -41,6 +41,8 @@ before_install:
  - pip install rarfile
  - curl https://glide.sh/get | bash
  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+  - go get -u github.com/alecthomas/gometalinter
+  - gometalinter --install
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,7 +137,8 @@ if(WITH_GPU)
 endif(WITH_GPU)

 if(USE_NNPACK)
-  list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt")
+    include(external/nnpack)
+    list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
 endif(USE_NNPACK)

 add_subdirectory(proto)

--- a/paddle/function/nnpack/nnpack.cmake
+++ b/paddle/function/nnpack/nnpack.cmake
@@ -7,10 +7,24 @@ set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
 find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
 find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
 find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib)

 if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
  set(NNPACK_FOUND ON)
  INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
+
+  set(NNPACK_LIBS)
+  list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB})
+  if (NNPACK_UKERNELS_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB})
+  endif()
+  if (NNPACK_CPUFEATURES_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB})
+  endif()
+  if(NOT ANDROID)
+    list(APPEND NNPACK_LIBS "rt")
+  endif()
 else()
  message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
 endif()
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -23,7 +23,6 @@ import (
 	log "github.com/sirupsen/logrus"
 )

-var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_master_client]*master.Client)
 var curHandle C.paddle_master_client
@@ -114,13 +113,13 @@ func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
 	if err != nil {
 		// Error
 		// TODO: return the type of error?
-		*record = (*C.uchar)(nullPtr)
+		*record = (*C.uchar)(nil)
 		return -1
 	}

 	if len(r) == 0 {
 		// Empty record
-		*record = (*C.uchar)(nullPtr)
+		*record = (*C.uchar)(nil)
 		return 0
 	}


--- a/go/master/client.go
+++ b/go/master/client.go
@@ -69,7 +69,10 @@ func (c *Client) getRecords() {
 		// We treat a task as finished whenever the last data
 		// instance of the task is read. This is not exactly
 		// correct, but a reasonable approximation.
-		c.taskFinished(t.Meta.ID)
+		err = c.taskFinished(t.Meta.ID)
+		if err != nil {
+			log.Errorln(err)
+		}
 	}
 }


--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
@@ -66,11 +66,21 @@ func TestGetFinishTask(t *testing.T) {

 	for i := 0; i < totalTask*chunkPerTask; i++ {
 		w := recordio.NewWriter(f, -1, -1)
-		w.Write(nil)
+		_, err = w.Write(nil)
+		if err != nil {
+			panic(err)
+		}
+
 		// call Close to force RecordIO writing a chunk.
-		w.Close()
+		err = w.Close()
+		if err != nil {
+			panic(err)
+		}
+	}
+	err = f.Close()
+	if err != nil {
+		panic(err)
 	}
-	f.Close()

 	// Manually intialize client to avoid calling c.getRecords()
 	c := &Client{}
@@ -79,7 +89,11 @@ func TestGetFinishTask(t *testing.T) {
 	ch := make(chan string, 1)
 	ch <- addr
 	go c.monitorMaster(ch)
-	c.SetDataset([]string{path})
+	err = c.SetDataset([]string{path})
+	if err != nil {
+		panic(err)
+	}
+
 	checkOnePass := func(i int) {
 		var tasks []Task
 		for idx := 0; idx < totalTask; idx++ {

--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@@ -57,14 +57,30 @@ func TestNextRecord(t *testing.T) {

 	w := recordio.NewWriter(f, -1, -1)
 	for i := 0; i < total; i++ {
-		w.Write([]byte{byte(i)})
+		_, err = w.Write([]byte{byte(i)})
+		if err != nil {
+			panic(err)
+		}
+	}
+
+	err = w.Close()
+	if err != nil {
+		panic(err)
+	}
+
+	err = f.Close()
+	if err != nil {
+		panic(err)
 	}
-	w.Close()
-	f.Close()
+
 	curAddr := make(chan string, 1)
 	curAddr <- fmt.Sprintf(":%d", p)
 	c := master.NewClient(curAddr, 10)
-	c.SetDataset([]string{path})
+	err = c.SetDataset([]string{path})
+	if err != nil {
+		panic(err)
+	}
+
 	for pass := 0; pass < 50; pass++ {
 		received := make(map[byte]bool)
 		for i := 0; i < total; i++ {

--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
@@ -30,7 +30,7 @@ type EtcdClient struct {
 // NewEtcdClient creates a new EtcdClient.
 func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
 	log.Debugf("Connecting to etcd at %v", endpoints)
-	// TODO(helin): gracefully shutdown etcd store. Becuase etcd
+	// TODO(helin): gracefully shutdown etcd store. Because etcd
 	// store holds a etcd lock, even though the lock will expire
 	// when the lease timeout, we need to implement graceful
 	// shutdown to release the lock.
@@ -60,7 +60,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
 	}
 	log.Debugf("Successfully acquired lock at %s.", lockPath)

-	put := clientv3.OpPut(addrPath, string(addr))
+	put := clientv3.OpPut(addrPath, addr)
 	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
 	if err != nil {
 		return nil, err

--- a/go/master/inmem_store.go
+++ b/go/master/inmem_store.go
@@ -4,7 +4,7 @@ import "sync"

 // InMemStore is an in memory implementation of Store interface.
 //
-// It does not tolerate the fault that casues the program to crash.
+// It does not tolerate the fault that causes the program to crash.
 type InMemStore struct {
 	mu  sync.Mutex
 	buf []byte

--- a/go/master/service.go
+++ b/go/master/service.go
@@ -160,7 +160,7 @@ func (s *Service) recover() (bool, error) {

 // snapshot *must* be called with s.mu being held.
 func (s *Service) snapshot() error {
-	// TOOD(helin): etcd request has a size limit, so the snapshot
+	// TODO(helin): etcd request has a size limit, so the snapshot
 	// size is limited by the max request size. We should either
 	// divide the snapshot into smaller chunks and save under
 	// different keys, or configure the request size to be big
@@ -289,7 +289,6 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) {

 	log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
 	s.taskQueues.Todo = append(s.taskQueues.Todo, t)
-	return
 }

 func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {

--- a/go/pserver/client/c/cclient.go
+++ b/go/pserver/client/c/cclient.go
@@ -34,7 +34,6 @@ import (
 	log "github.com/sirupsen/logrus"
 )

-var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_pserver_client]*client.Client)
 var curHandle C.paddle_pserver_client
@@ -63,7 +62,7 @@ func remove(client C.paddle_pserver_client) *client.Client {
 }

 func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nullPtr {
+	if p == nil {
 		return nil
 	}

@@ -101,11 +100,11 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_cli
 }

 //export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcd_endpoints *C.char, selected int) C.paddle_pserver_client {
+func paddle_new_etcd_pserver_client(etcdEndpoints *C.char, selected int) C.paddle_pserver_client {
 	// TODO(Longfei: use etcd lock to decide which trainer to initialize the parameters)
-	addr := C.GoString(etcd_endpoints)
-	etcd_client := client.NewEtcd(addr)
-	c := client.NewClient(etcd_client, etcd_client.Desired(), selector(selected != 0))
+	addr := C.GoString(etcdEndpoints)
+	etcdClient := client.NewEtcd(addr)
+	c := client.NewClient(etcdClient, etcdClient.Desired(), selector(selected != 0))
 	return add(c)
 }

@@ -124,20 +123,20 @@ func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
 }

 //export paddle_init_param
-func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
+func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, paramConfig unsafe.Pointer, configLen C.int) C.int {
 	et := pserver.ElementType(param.element_type)
 	name := C.GoString(param.name)
 	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
 	pc := pserver.ParameterWithConfig{
 		Param:  pserver.Parameter{Name: name, ElementType: et, Content: content},
-		Config: cArrayToSlice(param_config, int(config_len)),
+		Config: cArrayToSlice(paramConfig, int(configLen)),
 	}
 	c := get(client)
 	err := c.InitParam(pc)

 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.", name)
+			log.Warningf("parameter %s already initialized, treat paddle_init_param as successful.", name)
 			return C.PSERVER_OK
 		}
 		log.Errorln(err)
@@ -153,7 +152,7 @@ func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
 	err := c.FinishInitParams()
 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningln("parameters already initialized, treat paddle_finish_init_params as sucessful.")
+			log.Warningln("parameters already initialized, treat paddle_finish_init_params as successful.")
 			return C.PSERVER_OK
 		}

@@ -223,12 +222,12 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
 		p := ps[i]
 		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))

-		if unsafe.Pointer(param) == nullPtr {
+		if unsafe.Pointer(param) == nil {
 			log.Errorln("must pre-allocate parameter.")
 			return C.PSERVER_ERROR
 		}

-		if unsafe.Pointer(param.content) != nullPtr {
+		if unsafe.Pointer(param.content) != nil {
 			if int(param.content_len) != len(p.Content) {
 				log.Errorf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
 				return C.PSERVER_ERROR

--- a/go/pserver/client/client.go
+++ b/go/pserver/client/client.go
@@ -233,7 +233,7 @@ func (c *Client) Save(path string) error {

 func strHash(s string) uint32 {
 	h := fnv.New32a()
-	h.Write([]byte(s))
+	_, _ = h.Write([]byte(s))
 	return h.Sum32()
 }


--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
@@ -79,15 +79,33 @@ func initEtcdClient() {
 		log.Errorf("err %v", err)
 	}
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	client.Delete(ctx, pserver.PsDesired)
-	client.Delete(ctx, pserver.PsPath)
-	client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
+	_, err = client.Delete(ctx, pserver.PsDesired)
+	if err != nil {
+		panic(err)
+	}
+
+	_, err = client.Delete(ctx, pserver.PsPath)
+	if err != nil {
+		panic(err)
+	}
+
+	_, err = client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
+	if err != nil {
+		panic(err)
+	}
+
 	ports := initClient()
 	for i := 0; i < numPserver; i++ {
-		client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
+		_, err = client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
+		if err != nil {
+			panic(err)
+		}
 	}
 	cancel()
-	client.Close()
+	err = client.Close()
+	if err != nil {
+		panic(err)
+	}
 }

 type selector bool

--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
@@ -12,8 +12,7 @@ import (
 )

 const (
-	// DefaultEtcdTimeout is the default etcd timeout
-	DefaultEtcdTimeout time.Duration = 5 * time.Second
+	defaultEtcdTimeout time.Duration = 5 * time.Second
 )

 // EtcdClient is used by pserver client that is a part of trainer process.
@@ -48,7 +47,7 @@ func (p *EtcdClient) Desired() int {

 		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
 		if err != nil {
-			log.Errorf("psDesired %s invalid %v", psDesired, err)
+			log.Errorf("psDesired %d invalid %v", psDesired, err)
 			time.Sleep(p.timeout)
 			continue
 		}
@@ -67,12 +66,12 @@ func (p *EtcdClient) List() []Server {
 	for {
 		for i := 0; i < psDesired; i++ {
 			ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
+			cancel()
 			psKey := pserver.PsPath + strconv.Itoa(i)
 			log.Debugf("checking %s", psKey)
 			resp, err := p.client.Get(ctx, psKey)
-			cancel()
 			if err != nil {
-				log.Infof("Get psKey=%s error, %v", psKey, err)
+				log.Infof("Get psKey= %s error, %v", psKey, err)
 				time.Sleep(p.timeout)
 				continue
 			}
@@ -107,11 +106,11 @@ func NewEtcd(endpoints string) *EtcdClient {
 	for {
 		cli, err = clientv3.New(clientv3.Config{
 			Endpoints:   ep,
-			DialTimeout: DefaultEtcdTimeout,
+			DialTimeout: defaultEtcdTimeout,
 		})
 		if err != nil {
 			log.Errorf("Init etcd connection failed: %v", err)
-			time.Sleep(DefaultEtcdTimeout)
+			time.Sleep(defaultEtcdTimeout)
 			continue
 		}
 		break
@@ -119,7 +118,7 @@ func NewEtcd(endpoints string) *EtcdClient {
 	log.Infof("Connected to etcd: %s\n", endpoints)
 	client := &EtcdClient{
 		client:    cli,
-		timeout:   DefaultEtcdTimeout,
+		timeout:   defaultEtcdTimeout,
 		endpoints: ep,
 	}
 	return client

--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
@@ -177,10 +177,10 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er
 				break
 			}
 		}
-		if registered == true {
+		if registered {
 			return nil
 		}
-		return errors.New("not registerd, may due to already have enough pservers")
+		return errors.New("not registered, may due to already have enough pservers")
 	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))

 	if err != nil {
@@ -211,8 +211,5 @@ func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) err
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	_, err := e.etcdClient.Put(ctx, key, string(value))
 	cancel()
-	if err != nil {
 	return err
-	}
-	return nil
 }
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@@ -14,8 +14,6 @@ import (
 	log "github.com/sirupsen/logrus"
 )

-var nullPtr = unsafe.Pointer(uintptr(0))
-
 type optimizer struct {
 	opt         *C.struct_paddle_optimizer
 	elementType ElementType
@@ -23,7 +21,7 @@ type optimizer struct {
 }

 func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nullPtr {
+	if p == nil {
 		return nil
 	}

@@ -92,8 +90,8 @@ func (o *optimizer) UpdateParameter(g Gradient) error {
 }

 func (o *optimizer) Cleanup() {
-	if unsafe.Pointer(o.opt) != nullPtr {
+	if unsafe.Pointer(o.opt) != nil {
 		C.paddle_release_optimizer(o.opt)
-		o.opt = (*C.struct_paddle_optimizer)(nullPtr)
+		o.opt = (*C.struct_paddle_optimizer)(nil)
 	}
 }
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -211,7 +211,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	// learning optimization methods are stochastic in
 	// nature. This race condition is allowed deliberately
 	// to save the program from making a copy of the
-	// paramter content.
+	// parameter content.
 	parameter.Name = name
 	parameter.ElementType = opt.elementType
 	parameter.Content = opt.GetWeights()
@@ -219,7 +219,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 }

 // pserver save checkpoint
-func (s *Service) doCheckpoint() error {
+func (s *Service) doCheckpoint() (err error) {
 	<-s.initialized
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -237,9 +237,9 @@ func (s *Service) doCheckpoint() error {
 	}
 	var buf bytes.Buffer
 	encoder := gob.NewEncoder(&buf)
-	err := encoder.Encode(cp)
+	err = encoder.Encode(cp)
 	if err != nil {
-		return err
+		return
 	}

 	cpMeta := checkpointMeta{}
@@ -248,10 +248,14 @@ func (s *Service) doCheckpoint() error {
 	h := md5.New()
 	cpMeta.MD5 = hex.EncodeToString(h.Sum(buf.Bytes()))

-	cpMetajson, _ := json.Marshal(cpMeta)
+	cpMetajson, err := json.Marshal(cpMeta)
+	if err != nil {
+		return
+	}
+
 	err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3*time.Second)
 	if err != nil {
-		return err
+		return
 	}
 	if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) {
 		log.Info("checkpoint does not exists.")
@@ -264,15 +268,32 @@ func (s *Service) doCheckpoint() error {
 		}
 	}
 	f, err := os.Create(cpMeta.UUID)
-	defer f.Close()
 	if err != nil {
-		return err
+		return
+	}
+
+	defer func() {
+		closeErr := f.Close()
+		if closeErr != nil {
+			if err != nil {
+				log.Errorln(closeErr)
+			} else {
+				// Set closeErr as return value.
+				err = closeErr
 			}
+		}
+	}()
+
 	writer := bufio.NewWriter(f)
 	_, err = writer.Write(buf.Bytes())
-	writer.Flush()
 	if err != nil {
-		return err
+		return
 	}
-	return nil
+
+	err = writer.Flush()
+	if err != nil {
+		return
+	}
+
+	return
 }
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -117,6 +117,8 @@ int DDim::operator[](int idx) const {
  return boost::apply_visitor(DynamicConstIndexer(idx), var);
 }

+ssize_t DDim::size() const { return arity(*this); }
+
 bool DDim::operator==(DDim d) const {
  if (var.which() != d.getVar().which()) {
    return false;
@@ -278,5 +280,9 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
  return os;
 }

+DDim::DDim(std::initializer_list<int> init_list) {
+  *this = make_ddim(init_list);
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -29,6 +29,8 @@ struct DDim {
  template <int D>
  explicit DDim(const Dim<D>& in) : var(in) {}

+  /*implicit*/ DDim(std::initializer_list<int> init_list);
+
  template <int D>
  DDim& operator=(const Dim<D>& in) {
    var = in;
@@ -57,6 +59,8 @@ struct DDim {
  DDim operator+(DDim d) const;

  DDim operator*(DDim d) const;
+
+  ssize_t size() const;
 };

 /**

--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
@@ -49,6 +49,7 @@ TEST(DDim, Equality) {

  // arity of a DDim
  EXPECT_EQ(paddle::framework::arity(ddim), 3);
+  EXPECT_EQ(ddim.size(), 3);

  // product of a DDim
  EXPECT_EQ(paddle::framework::product(vddim), 45);

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -198,6 +198,7 @@ Add a mark to which output is temporary is helpful for future optimization.

 class OpRegistry {
  using OpCreator = std::function<OperatorBase*()>;
+  using VarIndexMap = std::unordered_map<std::string, int>;

 public:
  template <typename OpType, typename ProtoMakerType>
@@ -212,6 +213,17 @@ class OpRegistry {
        op_proto.IsInitialized(),
        "Fail to initialize %s's OpProto, because %s is not initialized",
        op_type, op_proto.InitializationErrorString());
+
+    VarIndexMaps()[op_type].reset(new VarIndexMap());
+    auto& varmap = *VarIndexMaps()[op_type];
+    int idx = 0;
+    for (auto& var : op_proto.inputs()) {
+      varmap[var.name()] = idx++;
+    }
+    idx = 0;
+    for (auto& var : op_proto.outputs()) {
+      varmap[var.name()] = idx++;
+    }
  }

  static OperatorPtr CreateOp(const OpDesc& op_desc) {
@@ -220,7 +232,6 @@ class OpRegistry {
    OperatorPtr op(creators().at(op_type)());
    //! Fill op's data member. Not use constructor because it will be noising
    //! for Op developer.
-    const OpProto& op_proto = protos().at(op_type);
    op->type_ = op_desc.type();
    // set op's inputs_ from desc.
    op->inputs_.reserve((size_t)op_desc.inputs_size());
@@ -240,25 +251,31 @@ class OpRegistry {
    //! Convert Temporary variable name to an unique variable name.
    GenerateTempVariableName(op.get());

-    // set argument offsets stored in op.
-    CreateInOutOffsetMap(op, op_proto);
+    //! set argument offsets stored in op.
+    {
+      auto var_index_it = VarIndexMaps().find(op_type);
+      if (var_index_it != VarIndexMaps().end()) {
+        op->in_out_idxs_ = var_index_it->second;
+      }
+    }
    //! Other op's custom Init for a complex Op. For simple Op, the Init
    //! method do nothing.
    op->Init();
    return op;
  }

-  // init op.in_out_idxs_ to accelerate argument's offset lookup.
-  static void CreateInOutOffsetMap(OperatorPtr op, const OpProto& proto) {
-    op->CreateInOutOffsetMap(proto);
-  }
-
  static std::unordered_map<std::string, OpProto>& protos() {
    static std::unordered_map<std::string, OpProto> protos_;
    return protos_;
  };

 private:
+  static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>>&
+  VarIndexMaps() {
+    static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>> maps_;
+    return maps_;
+  }
+
  static void GenerateTempVariableName(OperatorBase* op) {
    static std::atomic<size_t> gUniqId(0UL);
    for (auto& outname : op->outputs_) {
@@ -311,7 +328,7 @@ class OpRegisterHelper {
 /**
 * Macro to Register OperatorKernel.
 */
-#define REGISTER_OP_KERNEL(type, DEVICE_TYPE, PlaceType, KernelType)      \
+#define REGISTER_OP_KERNEL(type, DEVICE_TYPE, PlaceType, ...)             \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
      __reg_op_kernel_##type##_##DEVICE_TYPE##__,                         \
      "REGISTER_OP_KERNEL must be in global namespace");                  \
@@ -320,17 +337,19 @@ class OpRegisterHelper {
      ::paddle::framework::OperatorWithKernel::OpKernelKey key;           \
      key.place_ = PlaceType();                                           \
      ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \
-          .reset(new KernelType());                                       \
+          .reset(new __VA_ARGS__());                                      \
    }                                                                     \
  };                                                                      \
  static __op_kernel_register__##type##__ __reg_kernel_##type##__;        \
  int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; }

-#define REGISTER_OP_GPU_KERNEL(type, KernelType) \
-  REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, KernelType)
+// (type, KernelType); variadic so a templated KernelType may contain commas.
+#define REGISTER_OP_GPU_KERNEL(type, ...) \
+  REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)

-#define REGISTER_OP_CPU_KERNEL(type, KernelType) \
-  REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, KernelType)
+// (type, KernelType); variadic so a templated KernelType may contain commas.
+#define REGISTER_OP_CPU_KERNEL(type, ...) \
+  REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)

 /**
 * Macro to mark what Operator and Kernel we will use and tell the compiler to

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -19,21 +19,10 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-void OperatorBase::CreateInOutOffsetMap(const OpProto& proto) {
-  PADDLE_ENFORCE(in_out_idxs_.empty(), "duplicate call CreateInOutOffsetMap");
-  for (int i = 0; i < proto.inputs_size(); i++) {
-    const auto& name = proto.inputs()[i].name();
-    in_out_idxs_[name] = i;
-  }
-  for (int i = 0; i < proto.outputs_size(); i++) {
-    const auto& name = proto.outputs()[i].name();
-    in_out_idxs_[name] = i;
-  }
-}
-
 const std::string& OperatorBase::Input(const std::string& name) const {
-  auto it = in_out_idxs_.find(name);
-  PADDLE_ENFORCE(it != in_out_idxs_.end(), "no key [%s] in in_out_idxs_", name);
+  auto it = in_out_idxs_->find(name);
+  PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
+                 name);

  if (attrs_.count("input_format") == 0) {
    return inputs_[it->second];
@@ -46,7 +35,7 @@ const std::string& OperatorBase::Input(const std::string& name) const {

 std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
  auto input_format = GetAttr<std::vector<int>>("input_format");
-  auto offset = in_out_idxs_.at(name);
+  auto offset = in_out_idxs_->at(name);

  return std::vector<std::string>{
      inputs_.begin() + input_format.at(offset),
@@ -54,8 +43,9 @@ std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
 }

 const std::string& OperatorBase::Output(const std::string& name) const {
-  auto it = in_out_idxs_.find(name);
-  PADDLE_ENFORCE(it != in_out_idxs_.end(), "no key [%s] in in_out_idxs_", name);
+  auto it = in_out_idxs_->find(name);
+  PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
+                 name);

  if (attrs_.count("output_format") == 0) {
    return outputs_[it->second];
@@ -68,7 +58,7 @@ const std::string& OperatorBase::Output(const std::string& name) const {

 std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
  auto output_format = GetAttr<std::vector<int>>("output_format");
-  auto offset = in_out_idxs_.at(name);
+  auto offset = in_out_idxs_->at(name);

  return std::vector<std::string>{
      outputs_.begin() + output_format.at(offset),

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -82,16 +82,13 @@ class OperatorBase {
  // TODO add a vector_view to prevent memory copy.
  std::vector<std::string> Outputs(const std::string& name) const;

-  // init in_out_idxs_ to accelerate argument's offset lookup.
-  void CreateInOutOffsetMap(const OpProto& proto);
-
 public:
  std::string type_;
  std::vector<std::string> inputs_;
  std::vector<std::string> outputs_;
  AttributeMap attrs_;
  // store the arguments' offset described in op_desc.
-  std::unordered_map<std::string, int> in_out_idxs_;
+  std::shared_ptr<std::unordered_map<std::string, int>> in_out_idxs_;
 };

 class KernelContext {

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -102,6 +102,7 @@ class OpWithKernelTest : public OperatorWithKernel {
                  const std::vector<Tensor*>& outputs) const override {}
 };

+template <typename T1, typename T2>
 class CPUKernelTest : public OpKernel {
 public:
  void Compute(const KernelContext& ctx) const {
@@ -171,7 +172,8 @@ class CPUKernalMultiInputsTest : public OpKernel {

 REGISTER_OP(op_with_kernel, paddle::framework::OpWithKernelTest,
            paddle::framework::OpKernelTestProtoAndCheckerMaker);
-REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::CPUKernelTest);
+REGISTER_OP_CPU_KERNEL(op_with_kernel,
+                       paddle::framework::CPUKernelTest<float, float>);

 // test with single input
 TEST(OpKernel, all) {

--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -34,7 +34,7 @@ namespace framework {

 class Tensor {
 public:
-  Tensor() : numel_(0), offset_(0) {}
+  Tensor() : offset_(0) {}

  template <typename T>
  const T* data() const {
@@ -51,30 +51,26 @@ class Tensor {

  template <typename T>
  T* mutable_data(platform::Place place) {
-    PADDLE_ENFORCE(numel_ > 0,
-                   "Tensor::numel_ must be larger than zero to call "
+    PADDLE_ENFORCE(product(dims_) > 0,
+                   "Tensor's numel must be larger than zero to call "
                   "Tensor::mutable_data. Call Tensor::set_dim first.");
    if (holder_ == nullptr ||
        !(holder_->place() ==
          place) /* some versions of boost::variant don't have operator!= */
-        || holder_->size() < numel_ * sizeof(T) + offset_) {
+        || holder_->size() < product(dims_) * sizeof(T) + offset_) {
+      if (platform::is_cpu_place(place)) {
+        holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
+            boost::get<platform::CPUPlace>(place), product(dims_) * sizeof(T)));
+      } else if (platform::is_gpu_place(place)) {
 #ifdef __CUDACC__
-      switch (place.which()) {
-        case 0:
        holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
-              boost::get<platform::GPUPlace>(place), numel_ * sizeof(T)));
-          break;
-
-        case 1:
-          holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
-              boost::get<platform::CPUPlace>(place), numel_ * sizeof(T)));
-          break;
-      }
+            boost::get<platform::GPUPlace>(place), product(dims_) * sizeof(T)));
 #else
-      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), numel_ * sizeof(T)));
+        PADDLE_ENFORCE(false, "'GPUPlace' is not supported in CPU only device.");
 #endif
-
+      } else {
+        PADDLE_ENFORCE(false, "Unknown 'place'.");
+      }
      offset_ = 0;
    }
    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
@@ -95,7 +91,7 @@ class Tensor {
                       platform::is_cpu_place(dst_place),
                   "Tensor::CopyFrom only support CPU now.");
    src.CheckDims<T>();
-    size_t size = src.numel_ * sizeof(T);
+    size_t size = product(src.dims_) * sizeof(T);
    set_dims(src.dims());
    const void* src_ptr = static_cast<const void*>(src.data<T>());
    void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
@@ -129,7 +125,6 @@ class Tensor {
      return;
    }
    dims_ = dims;
-    numel_ = product(dims_);
  }

  DDim dims() const { return dims_; }
@@ -179,18 +174,17 @@ class Tensor {
  inline void CheckDims() const {
    PADDLE_ENFORCE(holder_ != nullptr,
                   "Tenosr holds no memory. Call Tensor::mutable_data first.");
-    PADDLE_ENFORCE(holder_->size() >= numel_ * sizeof(T) + offset_,
+    PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
                   "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
                   "first to re-allocate memory.");
  }

  std::shared_ptr<Placeholder> holder_;  // holds the memory block if allocated.
  DDim dims_;
-  size_t numel_;   // cache of `product(dims_)`
  size_t offset_;  // marks the begin of tensor data area.
  template <bool less, size_t i, typename... args>
  friend struct paddle::pybind::details::CastToPyBufferImpl;
-};  // namespace framework
+};

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -11,7 +11,6 @@ if(WITH_GPU)
 endif()

 if(USE_NNPACK)
-  include(nnpack/nnpack.cmake)
  list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
  if(WITH_TESTING)
    add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)

--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/function/ConvOp.h"

 DEFINE_bool(nnpack_allocate_outside,
-            false,
+            true,
            "Allocate and free workspace memory outside the NNPACK interface.");
 DEFINE_int32(nnpack_num_threads,
             0,
@@ -58,18 +58,10 @@ public:
    workspaceBuffer_ = nullptr;
    workspaceSize_ = 0;

-    threadpool_ = nullptr;
-    if (FLAGS_nnpack_num_threads) {
-      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
-      VLOG(3) << "Number of threads "
-              << pthreadpool_get_threads_count(threadpool_);
-    }
+    create_nnpack_threadpool();
  }

  ~NNPACKConvFunction() {
-    if (threadpool_) {
-      pthreadpool_destroy(threadpool_);
-    }
    if (workspaceBuffer_) {
      free(workspaceBuffer_);
    }
@@ -225,14 +217,25 @@ public:
    }
  }

+  static void create_nnpack_threadpool() {
+    // No-op when no thread count was requested or the static pool exists.
+    if (!FLAGS_nnpack_num_threads || threadpool_ != nullptr) return;
+    threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
+    VLOG(3) << "Number of threads "
+            << pthreadpool_get_threads_count(threadpool_);
+  }
+
 private:
  nnp_convolution_algorithm algorithm_;
  nnp_convolution_transform_strategy transform_strategy_;
  void* workspaceBuffer_;
  size_t workspaceSize_;
-  pthreadpool_t threadpool_;
+  static pthreadpool_t threadpool_;
 };

+template <DeviceType Device>
+pthreadpool_t NNPACKConvFunction<Device>::threadpool_ = nullptr;
+
 REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);

 }  // namespace paddle
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -42,3 +42,8 @@ endfunction()

 op_library(add_op SRCS add_op.cc add_op.cu)
 cc_test(add_op_test SRCS add_op_test.cc DEPS add_op)
+
+op_library(mul_op SRCS mul_op.cc mul_op.cu)
+op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)
+op_library(sigmoid_op SRCS sigmoid_op.cu sigmoid_op.cc)
+op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -31,8 +31,7 @@ protected:
        "Inputs/Outputs of AddOp must all be set");
    PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
                   "Two input of Add Op's dimension must be same.");
-    // Need set dims in Tensor
-    // outputs[0]->set_dims(inputs[0]->dims())
+    outputs[0]->set_dims(inputs[0]->dims());
  }
 };


--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <paddle/framework/op_registry.h>
+#include <paddle/framework/tensor.h>
+#include <paddle/operators/mul_op.h>
+
+namespace paddle {
+namespace operators {
+
+/**
+ * Operator class for `mul`.
+ *
+ * InferShape requires exactly two inputs, both with two dimensions, where the
+ * second dimension of input 0 equals the first dimension of input 1, and
+ * exactly one output. The output dims are set to
+ * {first dim of input 0, second dim of input 1}.
+ */
+class MulOp : public framework::OperatorWithKernel {
+protected:
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 2, "The mul op must take two inputs");
+
+    // Copies of the operand shapes; only read after the input count check.
+    auto x_dims = inputs[0]->dims();
+    auto y_dims = inputs[1]->dims();
+    PADDLE_ENFORCE(x_dims.size() == 2 && y_dims.size() == 2,
+                   "The input of mul op must be matrix");
+    PADDLE_ENFORCE(
+        x_dims[1] == y_dims[0],
+        "First matrix's width must be equal with second matrix's height.");
+
+    PADDLE_ENFORCE(outputs.size() == 1, "The mul op must take one output");
+    outputs[0]->set_dims({x_dims[0], y_dims[1]});
+  }
+};
+
+/**
+ * Proto maker for the `mul` op: declares inputs "X" and "Y", output "Out",
+ * and the user-facing comment text.
+ */
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // Operands first, then the single result.
+    AddInput("X", "The first input of mul op");
+    AddInput("Y", "The second input of mul op");
+    AddOutput("Out", "The output of mul op");
+
+    AddComment(R"DOC(
+Two Element Mul Operator.
+
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Register the `mul` op type with its operator class and proto maker, then
+// bind the CPUPlace instantiation of MulKernel as its CPU kernel.
+REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    mul, paddle::operators::MulKernel<paddle::platform::CPUPlace>);
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <paddle/operators/mul_op.h>
+#include <paddle/framework/op_registry.h>
+
+// Bind the GPUPlace instantiation of MulKernel as the GPU kernel of `mul`.
+REGISTER_OP_GPU_KERNEL(
+    mul, paddle::operators::MulKernel<paddle::platform::GPUPlace>);
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+
+namespace paddle {
+namespace operators {
+
+/**
+ * Kernel stub for the `mul` op.
+ *
+ * Compute performs no arithmetic; it only logs the implementation-defined
+ * type name of Place. The context argument is not used.
+ */
+template <typename Place>
+class MulKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::KernelContext &context) const override {
+    const char *place_name = typeid(Place).name();
+    LOG(INFO) << "Mul kernel in " << place_name;
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/rowwise_add_op.h>
+namespace paddle {
+namespace operators {
+
+/**
+ * Operator class for `rowwise_add`.
+ *
+ * InferShape requires two inputs (a two-dimensional tensor and a
+ * one-dimensional tensor whose length equals the second dimension of the
+ * first) and one output, and gives the output the dims of input 0.
+ */
+class RowWiseAddOp : public framework::OperatorWithKernel {
+protected:
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 2UL, "Two inputs is needed by rowwise add");
+
+    // Copies of both input shapes, taken once the input count is known.
+    auto x_dims = inputs[0]->dims();
+    auto b_dims = inputs[1]->dims();
+    PADDLE_ENFORCE(x_dims.size() == 2, "Input 0 must be matrix");
+    PADDLE_ENFORCE(b_dims.size() == 1, "The second input must be vector");
+    PADDLE_ENFORCE(x_dims[1] == b_dims[0],
+                   "The width of two input must be same");
+
+    PADDLE_ENFORCE(outputs.size() == 1, "The output size must be 1");
+    outputs[0]->set_dims(x_dims);
+  }
+};
+
+/**
+ * Proto maker for the `rowwise_add` op: declares inputs "X" and "b", output
+ * "Out", and the user-facing comment text.
+ */
+class RowWiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  RowWiseAddOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The left input of row-wise add op, must be matrix");
+    AddInput("b", "The right input of row-wise add op, must be vector");
+    AddOutput("Out", "The output of row-wise add op");
+    // The pseudo-code writes row i of Out; the output keeps the dims of X, so
+    // assigning each row to the whole of Out would misdescribe the op.
+    AddComment(R"DOC(Row-wise Add operator
+
+for i in xrange(X.shape[0]):
+  Out[i] = X[i] + b
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Register the `rowwise_add` op type with its operator class and proto maker,
+// then bind the CPUPlace instantiation of RowWiseAddKernel as its CPU kernel.
+REGISTER_OP(rowwise_add,
+            paddle::operators::RowWiseAddOp,
+            paddle::operators::RowWiseAddOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    rowwise_add,
+    paddle::operators::RowWiseAddKernel<paddle::platform::CPUPlace>);
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/rowwise_add_op.h>
+
+// Bind the GPUPlace instantiation of RowWiseAddKernel as the GPU kernel of
+// `rowwise_add`.
+REGISTER_OP_GPU_KERNEL(
+    rowwise_add,
+    paddle::operators::RowWiseAddKernel<paddle::platform::GPUPlace>);
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+
+namespace paddle {
+namespace operators {
+
+/**
+ * Kernel stub for the `rowwise_add` op.
+ *
+ * Compute performs no arithmetic; it only logs the implementation-defined
+ * type name of Place. The context argument is not used.
+ */
+template <typename Place>
+class RowWiseAddKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::KernelContext &context) const override {
+    const char *place_name = typeid(Place).name();
+    LOG(INFO) << "RowWiseAdd kernel in " << place_name;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/sigmoid_op.h>
+namespace paddle {
+namespace operators {
+
+/**
+ * Operator class for `sigmoid`.
+ *
+ * InferShape requires exactly one input and one output and gives the output
+ * the same dims as the input.
+ */
+class SigmoidOp : public framework::OperatorWithKernel {
+protected:
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only have one input");
+    PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only have one output");
+
+    // Read the input shape only after both size checks have passed.
+    auto in_dims = inputs[0]->dims();
+    outputs[0]->set_dims(in_dims);
+  }
+};
+
+/**
+ * Proto maker for the `sigmoid` op: declares input "X", output "Y", and the
+ * user-facing comment text.
+ */
+class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  SigmoidOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "sigmoid input");
+    // "Y" is the result, so it must be declared as an output; SigmoidOp's
+    // InferShape expects one input and one output.
+    AddOutput("Y", "sigmoid output");
+    AddComment("Sigmoid function");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Register the `sigmoid` op type with its operator class and proto maker,
+// then bind the CPUPlace instantiation of SigmoidKernel as its CPU kernel.
+REGISTER_OP(sigmoid,
+            paddle::operators::SigmoidOp,
+            paddle::operators::SigmoidOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    sigmoid, paddle::operators::SigmoidKernel<paddle::platform::CPUPlace>);
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
+#include <paddle/operators/sigmoid_op.h>
+#include <paddle/framework/op_registry.h>
+
+// Bind the GPUPlace instantiation of SigmoidKernel as the GPU kernel of
+// `sigmoid`.
+REGISTER_OP_GPU_KERNEL(
+    sigmoid, paddle::operators::SigmoidKernel<paddle::platform::GPUPlace>);
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+
+namespace paddle {
+namespace operators {
+
+/**
+ * Kernel stub for the `sigmoid` op.
+ *
+ * Compute performs no arithmetic; it only logs the implementation-defined
+ * type name of Place. The context argument is not used.
+ */
+template <typename Place>
+class SigmoidKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::KernelContext &context) const override {
+    const char *place_name = typeid(Place).name();
+    LOG(INFO) << "Sigmoid kernel in " << place_name;
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/softmax_op.h>
+
+namespace paddle {
+namespace operators {
+
+/**
+ * Operator class for `softmax`.
+ *
+ * InferShape requires exactly one input and one output and gives the output
+ * the same dims as the input.
+ */
+class SoftmaxOp : public framework::OperatorWithKernel {
+protected:
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 1, "Only one input is need for softmax");
+    PADDLE_ENFORCE(outputs.size() == 1, "Only one output is need for softmax");
+
+    // Read the input shape only after both size checks have passed.
+    auto in_dims = inputs[0]->dims();
+    outputs[0]->set_dims(in_dims);
+  }
+};
+
+/**
+ * Proto maker for the `softmax` op: declares input "X", output "Y", and the
+ * user-facing comment text.
+ */
+class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  SoftmaxOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    // One operand, one result.
+    AddInput("X", "input of softmax");
+    AddOutput("Y", "output of softmax");
+
+    AddComment("Softmax Op");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Short alias so the registration lines below stay within the line limit.
+namespace ops = paddle::operators;
+
+// Register the `softmax` op type with its operator class and proto maker,
+// then bind the CPUPlace instantiation of SoftmaxKernel as its CPU kernel.
+REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
+REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<paddle::platform::CPUPlace>);
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/softmax_op.h>
+
+// Bind the GPUPlace instantiation of SoftmaxKernel as the GPU kernel of
+// `softmax`.
+REGISTER_OP_GPU_KERNEL(
+    softmax, paddle::operators::SoftmaxKernel<paddle::platform::GPUPlace>);
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+
+namespace paddle {
+namespace operators {
+
+/**
+ * Kernel stub for the `softmax` op.
+ *
+ * Compute performs no arithmetic; it only logs the implementation-defined
+ * type name of Place. The context argument is not used.
+ */
+template <typename Place>
+class SoftmaxKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::KernelContext &context) const override {
+    const char *place_name = typeid(Place).name();
+    LOG(INFO) << "Softmax kernel in " << place_name;
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
-cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python add_op)
+# Shared library built from pybind.cc; besides pybind and python it depends on
+# the add, mul, rowwise_add, sigmoid and softmax operator libraries.
+cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python
+        add_op mul_op rowwise_add_op sigmoid_op softmax_op)
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -26,6 +26,10 @@ namespace py = pybind11;
 namespace pd = paddle::framework;

 USE_OP(add_two);
+USE_OP(softmax);
+USE_OP(mul);
+USE_OP(rowwise_add);
+USE_OP(sigmoid);

 PYBIND11_PLUGIN(core) {
  py::module m("core", "C++ core of Paddle Paddle");

--- a/paddle/scripts/travis/check_style.sh
+++ b/paddle/scripts/travis/check_style.sh
@@ -13,6 +13,11 @@ export PATH=/usr/bin:$PATH
 pre-commit install
 clang-format --version

+# set up go environment for running gometalinter (glide runs in a subshell, only if cd succeeds)
+mkdir -p "$GOPATH/src/github.com/PaddlePaddle/"
+ln -sf "$TRAVIS_BUILD_DIR" "$GOPATH/src/github.com/PaddlePaddle/Paddle"
+(cd "$GOPATH/src/github.com/PaddlePaddle/Paddle/go" && glide install)
+
 if ! pre-commit run -a ; then
  git diff  --exit-code
 fi