Multi Graphics Card
Created by: tonyyang-svail
This issue demonstrates the difficulties in implementing multi-GPU training.
Background
Parallelism: Data Parallel
Communication pattern: Ring-Based Allreduce (a minimal simulation is sketched below). http://research.baidu.com/bringing-hpc-techniques-deep-learning/
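To make the communication pattern concrete, here is a small, framework-free simulation of ring-based allreduce among N workers as two phases, scatter-reduce followed by allgather, over plain Python lists. `ring_allreduce` and the list-of-lists representation are illustrative inventions for this sketch; they are not part of Fluid or of the Baidu allreduce library.

```python
def ring_allreduce(grads):
    """Toy ring allreduce: `grads` holds one equal-length gradient list per
    simulated worker; afterwards every worker holds the element-wise sum."""
    n = len(grads)
    if n <= 1:
        return grads
    length = len(grads[0])

    def chunk(c):
        # index range of the c-th chunk of a gradient vector
        return range(c * length // n, (c + 1) * length // n)

    # Phase 1: scatter-reduce. In step s, worker r sends chunk (r - s) to
    # worker (r + 1), which adds it into its own copy of that chunk.
    for s in range(n - 1):
        sends = [(r, (r - s) % n, [grads[r][k] for k in chunk((r - s) % n)])
                 for r in range(n)]  # snapshot all sends so the step is synchronous
        for r, c, payload in sends:
            for k, v in zip(chunk(c), payload):
                grads[(r + 1) % n][k] += v

    # Phase 2: allgather. Worker r now owns the fully reduced chunk (r + 1);
    # circulate the reduced chunks so every worker ends up with all of them.
    for s in range(n - 1):
        sends = [(r, (r + 1 - s) % n, [grads[r][k] for k in chunk((r + 1 - s) % n)])
                 for r in range(n)]
        for r, c, payload in sends:
            for k, v in zip(chunk(c), payload):
                grads[(r + 1) % n][k] = v
    return grads
```

Calling `ring_allreduce([[1, 2, 3], [4, 5, 6]])` leaves both simulated workers holding `[5, 7, 9]`. The point of the ring pattern is that each step only exchanges one chunk with a neighbor, so the bandwidth used per worker stays roughly constant as the number of GPUs grows.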
Python Example
```python
data = layer.data()
label = layer.data()
places = layer.get_places(all_gpu=True)
data_array = split_data(data, places)
label_array = split_data(label, places)

with parallel_for(places) as p_for:
    h1 = layer.fc(input=read_from_array(data_array, p_for.i))  # h1 = w1 * data
    h2 = layer.fc(h1)                                          # h2 = w2 * h1
    loss = layer.softmax(h2, read_from_array(label_array, p_for.i))
    append_backward(loss)

with parallel_for(places) as p_for:
    append_optimization(loss, Adam())

exe = Executor(CPUPlace)
exe.run(fluid.default_startup_program())
avg_loss_value, = exe.run(fluid.default_main_program())  # TBD: how to aggregate loss
```
ParallelDoOp
```c++
/* ParallelDoOp
 * Input:
 *   places       vector<Place>
 *   Input        Variable
 * Output:
 *   par_scopes   vector<Scope*>
 * Attr:
 *   block        BlockDescBind
 */
class ParallelDoOp : public OperatorBase {
  ...
  void Run(const framework::Scope &scope,
           const platform::DeviceContext &dev_ctx) const override {
    vector<thread> threads;
    auto &block = attr("block");
    auto &par_scopes = output("par_scopes");
    for (auto &place : input("places")) {
      // Capture `place` by value so each thread sees its own place,
      // not the shared loop variable.
      threads.push_back(thread([&, place] {
        auto p_scope = scope.NewScope();
        par_scopes.push_back(p_scope);
        auto exe = Executor(place);
        exe.run(p_scope, block->program, block->id);
      }));
    }
    join_all_threads();
  }
};
```
```c++
/* ParallelDoGradOp
 * Input:
 *   places       vector<Place>
 *   Input        Variable
 *   par_scopes   vector<Scope*>
 * Output:
 *   Input_Grad   Variable
 * Attr:
 *   block        BlockDescBind   (note: this is the backward block)
 */
class ParallelDoGradOp : public OperatorBase {
  ...
  void Run(const framework::Scope &scope,
           const platform::DeviceContext &dev_ctx) const override {
    vector<thread> threads;
    auto &block = attr("block");
    auto &places = input("places");
    auto &par_scopes = input("par_scopes");
    // Run the backward block on each place, reusing the scope created by
    // the corresponding forward ParallelDoOp.
    for (size_t i = 0; i < places.size(); ++i) {
      auto &place = places[i];
      auto p_scope = par_scopes[i];
      threads.push_back(thread([&, place, p_scope] {
        auto exe = Executor(place);
        exe.run(p_scope, block->program, block->id);
      }));
    }
    join_all_threads();
  }
};
```
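The answers in the Problems section below say that `parallel_do` accumulates the per-place gradients onto the host place itself, rather than relying on backward to insert `allreduce`. Below is a minimal sketch of that accumulation step, with plain Python dicts standing in for scopes; `accumulate_grads`, the dict-based scopes, and the `@GRAD` names are illustrative, not the real framework objects.

```python
def accumulate_grads(host_scope, par_scopes, grad_names):
    """Sum each named gradient across all per-place scopes into the host scope,
    mimicking what ParallelDoGradOp would do after the per-place backward
    blocks finish, so the optimizer only runs once on the host place."""
    for name in grad_names:
        summed = [0.0] * len(par_scopes[0][name])
        for scope in par_scopes:
            for i, v in enumerate(scope[name]):
                summed[i] += v
        host_scope[name] = summed


# Example: two GPUs produced partial gradients for w1.
host = {}
gpu_scopes = [{"w1@GRAD": [0.1, 0.2]}, {"w1@GRAD": [0.3, 0.4]}]
accumulate_grads(host, gpu_scopes, ["w1@GRAD"])
print(host["w1@GRAD"])  # the per-GPU gradients summed: approximately [0.4, 0.6]
```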
ProgramDesc
```
# start_program will be run by executor(CPUPlace); all of w1, w2 will be allocated on CPU
start_program
{
  vars: w1, w2
  ops: init(w1), init(w2)
}

main_program
{
  block0 {
    vars: data, places, w1, w2
    ops: data, get_place,
         parallel_do(block1),
         parallel_do_grad(block2),   # append_backward
         parallel_do(block3)         # append_optimization
  }

  block1 {
    vars: data, h1, h2, loss         # TBD: need to add w1, w2 here?
    ops: fc, fc, softmax
  }

  block2 {
    vars: data_grad, h1_grad, h2_grad, loss_grad, w1_grad, w2_grad
    ops: softmax_grad,
         fc_grad, allreduce(places, scopes, w1_grad),  # TBD: who adds allreduce?
         fc_grad, allreduce(places, scopes, w2_grad)
  }

  block3 {
    vars: lr                         # TBD: need to add w1, w2 here?
    ops: sgd(w2, w2_grad),
         sgd(w1, w1_grad)
  }
}
```
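To see how an executor would walk this layout, here is a toy model of the nested-block structure: a program is a dict from block id to a list of ops, and a `parallel_do`-style op records the sub-block it runs once per place. Everything here (`Op`, `run_block`, the string place names) is invented for illustration; it is not the real ProgramDesc or Executor API.

```python
from collections import namedtuple

# A toy ProgramDesc: each block is a list of ops; parallel_do / parallel_do_grad
# ops name the sub-block they execute once per place.
Op = namedtuple("Op", ["type", "sub_block"])

program = {
    0: [Op("data", None), Op("get_places", None),
        Op("parallel_do", 1),        # forward
        Op("parallel_do_grad", 2),   # append_backward
        Op("parallel_do", 3)],       # append_optimization
    1: [Op("fc", None), Op("fc", None), Op("softmax", None)],
    2: [Op("softmax_grad", None), Op("fc_grad", None), Op("fc_grad", None)],
    3: [Op("sgd", None), Op("sgd", None)],
}

def run_block(program, block_id, places, depth=0):
    """Walk one block; on a parallel_do-style op, run its sub-block once per
    place (sequentially here; threads/streams in the real design)."""
    for op in program[block_id]:
        print("  " * depth + op.type)
        if op.sub_block is not None:
            for place in places:
                print("  " * (depth + 1) + "on " + place + ":")
                run_block(program, op.sub_block, [place], depth + 2)

run_block(program, 0, ["GPU:0", "GPU:1"])
```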
Problems
- At the first iteration, who will copy the initialized parameters (note that some parameters don't need to be copied) to the different GPUs? In later iterations, how do we avoid this copy?
  - Answer: we copy on every iteration. However, we allow parameter sharing if the place is the same. (A minimal sketch of this copy-or-share step follows this list.)
- Who will add `allreduce`? Will backward support this?
  - Answer: `parallel_do` will manually accumulate the gradients across all places.
- Who will add `parallel_do(block3)`?
  - Answer: `parallel_do` outputs the gradients to the host place; all the `param += grad` updates happen only on the host place.
- How do we save the model?
  - Answer: all the parameters will be at the host place.
- How does optimization access the forward/backward scope?
  - Answer: all the updates happen on `w` and `w_grad` at the host place.
- How do we aggregate `target`?
  - Answer: `parallel_do` will aggregate its output.
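As referenced in the first answer above, here is a minimal sketch of the per-iteration parameter copy with place-based sharing, in plain Python; `copy_params_to_places`, the dict-based parameter tables, and the string place names are invented for illustration.

```python
def copy_params_to_places(host_params, host_place, places):
    """Build each place's parameter table before the forward pass.

    Parameters are re-copied on every iteration, but if the target place is
    the same as the host place the tensor is shared instead of copied.
    """
    per_place = []
    for place in places:
        table = {}
        for name, value in host_params.items():
            if place == host_place:
                table[name] = value        # share: same place, no copy needed
            else:
                table[name] = list(value)  # copy: simulate a host -> GPU transfer
        per_place.append(table)
    return per_place


host = {"w1": [0.5, -0.5], "w2": [1.0]}
tables = copy_params_to_places(host, "CPU", ["CPU", "GPU:0", "GPU:1"])
print(tables[0]["w1"] is host["w1"])   # True  (shared on the host place)
print(tables[1]["w1"] is host["w1"])   # False (copied to GPU:0)
```

Sharing when the place matches avoids a redundant transfer for the device that already holds the master copy; every other place pays the copy on each iteration, as stated in the answer above.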