diff --git a/doc/design/cluster_train/pserver_client.md b/doc/design/cluster_train/pserver_client.md new file mode 100644 index 0000000000000000000000000000000000000000..62edd349f3f982805ec89d1a20af27226504c1be --- /dev/null +++ b/doc/design/cluster_train/pserver_client.md @@ -0,0 +1,95 @@ +# Design Doc: The Client Library of Parameter Server + +For an overview of trainer's role, please refer to [distributed training design doc](README.md). In this design doc, we will discuss the parameter server's client library, which will manage communication with parameter servers. The library will be implemented in [Go](https://golang.org/) and made available as a static or dynamic library with a C header file. + +## C Interface + +```c +#define PADDLE_ELEMENT_TYPE_INT32 0 +#define PADDLE_ELEMENT_TYPE_UINT32 1 +#define PADDLE_ELEMENT_TYPE_INT64 2 +#define PADDLE_ELEMENT_TYPE_UINT64 3 +#define PADDLE_ELEMENT_TYPE_FLOAT32 4 +#define PADDLE_ELEMENT_TYPE_FLOAT64 5 + +typedef struct paddle_pserver_client paddle_pserver_client; + +/** + * @brief paddle_new_pserver_client creates a new parameter server + * client. + */ +paddle_pserver_client* paddle_new_pserver_client(); + +/** + * @brief paddle_pserver_client_release releases the parameter server + * client. + */ +void paddle_pserver_client_release(paddle_pserver_client* client); + +/** + * @brief paddle_begin_init_param begins to initialize parameters + * on parameter servers. + * + * paddle_begin_init_param will be called from multiple trainers, only + * one trainer will be selected to initialize the parameters on + * parameter servers. Other trainers will be blocked until the + * initialization is done, and they need to get the initialized + * parameters from parameter servers using @paddle_get_param. + * + * @return 1 if trainer is selected to initialize parameter + * servers, otherwise 0. 
+ */ +int paddle_begin_init_param(paddle_pserver_client* client); + +/** + * @brief paddle_init_param initializes the parameter on parameter + * servers. + * + * @return 0 if successful, otherwise -1. On failure the trainer needs + * to restart the entire initialization process starting from + * paddle_begin_init_param. Or simply exit the program and wait for + * cluster management system to restart trainer. + */ +int paddle_init_param(paddle_pserver_client* client, const char* name, int element_type, const void* content); + +/** + * @brief paddle_finish_init_param tells parameter servers that the client has + * sent all parameters to parameter servers as initialization. + * + * @return 0 if successful, otherwise -1. On failure the trainer needs + * to restart the entire initialization process starting from + * paddle_begin_init_param. Or simply exit the program and wait for + * cluster management system to restart trainer. + */ +int paddle_finish_init_param(paddle_pserver_client* client); + +/** + * @brief paddle_send_grad sends gradients to parameter servers for + * updating parameters. + * + * @return 0 if successful, otherwise -1. + */ +int paddle_send_grad(paddle_pserver_client* client, const char* name, int element_type, const void* content); + +/** + * @brief paddle_set_param sets a parameter on parameter servers. + * + * @return 0 if successful, otherwise -1. + */ +int paddle_set_param(paddle_pserver_client* client, const char* name, int element_type, const void* content); + +/** + * @brief paddle_get_param gets the parameter from parameter servers. + * + * @return 0 if successful, otherwise -1. + */ +int paddle_get_param(paddle_pserver_client* client, const char* name, void** dst, int* dstLen); + +/** + * @brief paddle_save_model instructs parameter servers to save the parameters + * to the given path. + * + * @return 0 if successful, otherwise -1. 
+ */ +int paddle_save_model(paddle_pserver_client* client, const char* path); +``` diff --git a/doc/design/cluster_train/trainer.md b/doc/design/cluster_train/trainer.md deleted file mode 100644 index bcb4a9c09dccc2961edb480c399fd41be761bf02..0000000000000000000000000000000000000000 --- a/doc/design/cluster_train/trainer.md +++ /dev/null @@ -1,82 +0,0 @@ -# Design Doc: Trainer Communication Library - -For an overview of trainer's role, please refer to [distributed training design doc](README.md). In this design doc, we will discuss the trainer's communication library, which will manage communication with parameter servers and the [master server](master_server.md). The library will be implemented in [Go](https://golang.org/) and made available as a static or dynamic library with a C header file. - -## Go Interface - -The Go interface is the basic abstraction of communications with the master server and parameter servers. We will add another layer on top (add retry logic, polish interface with C idiom) before exposing the library with a [C interface](#c-interface). - -```go -// MasterClient is the client to the master server. -type MasterClient struct {} - -// GetTask gets a new task by telling the master server the finished task. -// Use nil as the finished task when getting the task for the first time. -func (*MasterClient) GetTask(finished master.Task) (master.Task, error) - -// ElementType is the type of elements of a Parameter. -type ElementType int - -// Different element types. -const ( - Int32 ElementType = iota - UInt32 - Int64 - UInt64 - Float32 - Float64 -) - -// Parameter is a piece of data to sync with the parameter server. -type Parameter struct { - Name string - ElementType ElementType - Buffer []byte -} - -// Gradient is the gradient of the parameter. -type Gradient Parameter - -// PServerClient is the client to parameter servers. -type PServerClient struct {} - -// UpdateRule specifies the rule for updating parameters with gradients. 
-type UpdateRule struct { - UpdateMethod pserver.UpdateMethod - LearningRate float32 -} - -// ParamInitChans returns a send channel for parameter initialization. -// -// ParamInitChans will be called from multiple trainers, only one trainer should -// initialize the parameters on parameter servers, other trainers will instead -// get the initialized parameters from parameter servers using GetParam. -// -// If send channel is not nil, the trainer is selected to do the initialization, -// the trainer needs to signal for finishing initializing the parameters by -// closing the send channel. -func (*PServerClient) ParamInitChan() (send chan<- Parameter, err error) - -// SendGrad sends gradients to parameter servers for updating parameters. -func (*PServerClient) SendGrad(method UpdateMethod, grads []Gradient) error - -// SetParam sets parameters. -// -// SetParam can be used for the parameters that are not suitable for updating -// using gradients. -func (*PServerClient) SetParam(params []Paramter) error - -// GetParam gets parameters from parameter servers. -func (*PServerClient) GetParam(names []string) ([]Parameter, error) - -// Save indicates parameters to save the parameter to the given path. -// -// Path needs to be the path to a distributed file system which is visible -// to all parameter servers. -func (*PServerClient) Save(path string) error -``` -Please see [master server design doc](master_server.md) for the definition of `master.Task`. - -## C Interface - -TODO