diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index 878e53058567500aeb9fe854a1a65ed5380572a8..c8a4292932dfaddb4ea73a0d1c8ff6bda02ce1c0 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -38,7 +38,7 @@ class ConcatKernel : public framework::OpKernel<T> {
       auto in_stride = framework::stride_numel(in->dims());
       StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
                                   out->data<T>() + output_offset, out_stride,
-                                  in->data<T>(), in_stride);
+                                  in->data<T>(), in_stride, in_stride[axis]);
       output_offset += in_stride[axis];
     }
   }
@@ -59,7 +59,7 @@ class ConcatGradKernel : public framework::OpKernel<T> {
       auto out_stride = framework::stride_numel(out->dims());
       StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
                                   out_stride, in->data<T>() + input_offset,
-                                  in_stride);
+                                  in_stride, out_stride[axis]);
       input_offset += out_stride[axis];
     }
   }
diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h
index 06bcf82620bec57346c30b029d23ad8417252248..54420e1bf6ec982545715dc847b0b3e138cf2045 100644
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
@@ -38,7 +38,7 @@ class SplitOpKernel : public framework::OpKernel<T> {
       auto out_stride = framework::stride_numel(out->dims());
       StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
                                   out_stride, in->data<T>() + input_offset,
-                                  in_stride);
+                                  in_stride, out_stride[axis]);
       input_offset += out_stride[axis];
     }
   }
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index 4036d1091db927de68c288df78f159b9e593ff25..4c7b90693a2f9ba62d9c30bb601ea4aaebeaf4b5 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -54,11 +54,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
                                      int64_t axis, T* dst,
                                      const framework::DDim& dst_stride_numel,
                                      const T* src,
-                                     const framework::DDim& src_stride_numel) {
+                                     const framework::DDim& src_stride_numel,
+                                     int64_t size) {
   int64_t before = dst_stride_numel[0] / dst_stride_numel[axis];
   int64_t src_after = src_stride_numel[axis];
   int64_t dst_after = dst_stride_numel[axis];
-  int64_t copy_size = std::min(src_after, dst_after);
   auto place = ctx.GetPlace();
 
   PADDLE_ENFORCE_EQ(src_stride_numel.size(), dst_stride_numel.size(),
@@ -83,15 +83,14 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
     if (platform::is_cpu_place(place)) {
       auto& cpu_place = boost::get<platform::CPUPlace>(place);
       memory::Copy(cpu_place, dst + i * dst_after, cpu_place,
-                   src + i * src_after, sizeof(T) * copy_size);
+                   src + i * src_after, sizeof(T) * size);
     } else {
 #ifdef PADDLE_WITH_CUDA
       auto& gpu_place = boost::get<platform::CUDAPlace>(place);
       auto& cuda_ctx =
           reinterpret_cast<const platform::CUDADeviceContext&>(ctx);
       memory::Copy(gpu_place, dst + i * dst_after, gpu_place,
-                   src + i * src_after, sizeof(T) * copy_size,
-                   cuda_ctx.stream());
+                   src + i * src_after, sizeof(T) * size, cuda_ctx.stream());
 #else
       PADDLE_THROW("Paddle is not compiled with GPU");
 #endif
diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py
index f84481adf7a23263bfbc5ca5da66e96aa822bdc3..689920af0c4fb85d11c3492d83da2d22d9c4fa6e 100644
--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -121,6 +121,7 @@ def split_dense_variable(var_list,
             block_size += dim1 - remains
         # update split_count after aligning
         split_count = int(math.ceil(var_numel / float(block_size)))
+        print("###split var ", var.name, var.shape, block_size, split_count)
         for block_id in xrange(split_count):
             curr_block_size = min(block_size, var_numel - (
                 (block_id) * block_size))
@@ -255,6 +256,7 @@ class DistributeTranspiler:
                 splited_shape = [rows]
                 if len(orig_shape) >= 2:
                     splited_shape.extend(orig_shape[1:])
+                print("###splited: ", size, rows, splited_shape)
                 var = program.global_block().create_var(
                     name="%s.block%d" % (varname, i),
                     psersistable=False,
@@ -262,6 +264,7 @@ class DistributeTranspiler:
                     type=orig_var.type,
                     shape=splited_shape)  # flattend splited var
                 var_mapping[varname].append(var)
+                print("###created split var ", var)
         return var_mapping
 
     def _clone_var(self, block, var):
@@ -528,6 +531,8 @@ class DistributeTranspiler:
         """
         # step5
         pserver_program = Program()
+        print("param mapping on pserver: #### ",
+              self.param_grad_ep_mapping[endpoint]["params"])
         for v in self.param_grad_ep_mapping[endpoint]["params"]:
             self._clone_var(pserver_program.global_block(), v)
         for v in self.param_grad_ep_mapping[endpoint]["grads"]:
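Note (not part of the patch): `StridedNumelCopyWithAxis` used to derive the per-row copy size internally as `std::min(src_after, dst_after)`; with this change the caller passes it explicitly, so `ConcatKernel` passes `in_stride[axis]` while `ConcatGradKernel` and `SplitOpKernel` pass `out_stride[axis]`. The sketch below is a minimal standalone model of that calling pattern, using a hypothetical `StridedCopyWithAxisSketch` helper and plain `std::vector` buffers; it is not Paddle's API, just an illustration of the explicit-size copy loop.

```cpp
// Minimal standalone sketch (hypothetical names, not Paddle's API): models the
// copy loop of StridedNumelCopyWithAxis after this patch, where the caller
// passes the number of elements to copy along the axis (`size`) instead of the
// function deriving it from std::min(src_after, dst_after).
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// `dst_after` / `src_after` stand in for stride_numel[axis] of the destination
// and source; `before` is the product of the dimensions before `axis`.
void StridedCopyWithAxisSketch(float* dst, int64_t dst_after, const float* src,
                               int64_t src_after, int64_t before,
                               int64_t size) {
  for (int64_t i = 0; i < before; ++i) {
    // Copy exactly `size` elements per outer step, as the new signature does.
    std::memcpy(dst + i * dst_after, src + i * src_after, sizeof(float) * size);
  }
}

int main() {
  // Concatenate a 2x3 and a 2x2 matrix along axis 1 into a 2x5 output,
  // mirroring how ConcatKernel now passes in_stride[axis] as the copy size.
  std::vector<float> a = {1, 2, 3, 4, 5, 6};  // 2x3 input
  std::vector<float> b = {7, 8, 9, 10};       // 2x2 input
  std::vector<float> out(10, 0.f);            // 2x5 output

  int64_t before = 2;     // product of the dimensions before the concat axis
  int64_t out_after = 5;  // out_stride[axis] of the 2x5 output
  int64_t output_offset = 0;

  // First input: copy 3 elements per row (its in_stride[axis]).
  StridedCopyWithAxisSketch(out.data() + output_offset, out_after, a.data(),
                            /*src_after=*/3, before, /*size=*/3);
  output_offset += 3;  // advance by in_stride[axis], as ConcatKernel does
  // Second input: copy 2 elements per row.
  StridedCopyWithAxisSketch(out.data() + output_offset, out_after, b.data(),
                            /*src_after=*/2, before, /*size=*/2);

  for (float v : out) std::cout << v << " ";  // 1 2 3 7 8 4 5 6 9 10
  std::cout << "\n";
  return 0;
}
```

Making the copy size an argument keeps the amount copied under the caller's control instead of silently taking the minimum of the two strides along the axis.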