From 6863e2ae4f4e2bc6e4ebc33d19d0eed89caf909e Mon Sep 17 00:00:00 2001
From: JZ-LIANG
Date: Tue, 27 Jun 2023 14:16:47 +0800
Subject: [PATCH] [Semi-Auto] SPMD Parallel Rule Base (#53863)

* base rule

* add sharding merge

* add sharding axis merge

* define unified data class for inferencing dist_attr

* test wrap DistTensorSpec in dygraph mode

* matmul main logic done

* define unified data class for inferencing dist_attr

---------

Co-authored-by: Yichen Zhang
---
 .../distributed/auto_parallel/CMakeLists.txt  |   3 +
 .../auto_parallel/spmd_rules/CMakeLists.txt   |   4 +
 .../auto_parallel/spmd_rules/common.cc        | 213 ++++++++++++++++
 .../auto_parallel/spmd_rules/common.h         | 161 +++++++++++++
 .../spmd_rules/dist_tensor_spec.cc            |  87 +++++++
 .../spmd_rules/dist_tensor_spec.h             |  76 ++++++
 .../spmd_rules/matmul_spmd_rule.cc            | 228 ++++++++++++++++++
 .../spmd_rules/matmul_spmd_rule.h             |  47 ++++
 .../auto_parallel/spmd_rules/rules.h          |  30 +++
 .../auto_parallel/test/CMakeLists.txt         |   7 +-
 .../auto_parallel/test/spmd_rule_test.cc      | 206 ++++++++++++++++
 paddle/fluid/pybind/CMakeLists.txt            |   3 +-
 paddle/fluid/pybind/auto_parallel_py.cc       |  36 +++
 paddle/phi/api/yaml/generator/api_base.py     |  13 +
 paddle/phi/api/yaml/generator/api_gen.py      |   2 +
 .../core/distributed/auto_parallel/utils.h    |   1 +
 .../auto_parallel/static/completion.py        |   1 +
 .../auto_parallel/static/dist_attribute.py    |   1 +
 .../distributed/auto_parallel/static/utils.py |  65 ++++-
 test/auto_parallel/spmd_rules/CMakeLists.txt  |  10 +
 .../spmd_rules/test_matmul_rule.py            | 225 +++++++++++++++++
 21 files changed, 1413 insertions(+), 6 deletions(-)
 create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/CMakeLists.txt
 create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc
 create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/common.h
 create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.cc
 create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h
 create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.cc
 create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.h
 create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
 create mode 100644 paddle/fluid/distributed/auto_parallel/test/spmd_rule_test.cc
 create mode 100644 test/auto_parallel/spmd_rules/CMakeLists.txt
 create mode 100644 test/auto_parallel/spmd_rules/test_matmul_rule.py

diff --git a/paddle/fluid/distributed/auto_parallel/CMakeLists.txt b/paddle/fluid/distributed/auto_parallel/CMakeLists.txt
index a0806fa1a64..9bffd1a7fb0 100644
--- a/paddle/fluid/distributed/auto_parallel/CMakeLists.txt
+++ b/paddle/fluid/distributed/auto_parallel/CMakeLists.txt
@@ -3,4 +3,7 @@ cc_library(
   SRCS dist_attr.cc
   DEPS phi auto_parallel_proto proto_desc)
 
+cc_library(auto_parallel DEPS op_dist_attr spmd_rule)
+
 add_subdirectory(test)
+add_subdirectory(spmd_rules)
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/CMakeLists.txt b/paddle/fluid/distributed/auto_parallel/spmd_rules/CMakeLists.txt
new file mode 100644
index 00000000000..8411669a3fe
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/CMakeLists.txt
@@ -0,0 +1,4 @@
+cc_library(
+  spmd_rule
+  SRCS common.cc dist_tensor_spec.cc matmul_spmd_rule.cc
+  DEPS phi)
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc
new file mode 100644
index 00000000000..c948acd715b
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc
@@ -0,0 +1,213 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"
+
+#include
+
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+SPMDRuleBase::InferForward(const std::vector<DistTensorSpec>& input_specs,
+                           const paddle::framework::AttributeMap& attrs) {
+  PADDLE_THROW(
+      phi::errors::Unimplemented("InferForward should be called from a "
+                                 "derived class of SPMDRuleBase!"));
+}
+
+std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+SPMDRuleBase::InferBackward(const std::vector<DistTensorSpec>& output_specs,
+                            const paddle::framework::AttributeMap& attrs) {
+  PADDLE_THROW(
+      phi::errors::Unimplemented("InferBackward should be called from a "
+                                 "derived class of SPMDRuleBase!"));
+}
+
+std::unordered_map<std::string, int64_t> ShardingMergeForTensors(
+    const std::vector<std::pair<std::string, std::vector<int64_t>>>&
+        tensor_axes_to_dim_pairs) {
+  std::unordered_map<std::string, int64_t> axis_to_dim_map;
+  std::unordered_map<int64_t, std::string> dim_to_axis_map;
+  int64_t merge_dim;
+
+  for (auto& pair : tensor_axes_to_dim_pairs) {
+    for (size_t i = 0; i < pair.second.size(); ++i) {
+      auto tensor_axis = pair.first.substr(i, 1);
+      auto mesh_dim = pair.second[i];
+
+      if (axis_to_dim_map.count(tensor_axis) == 0) {
+        merge_dim = mesh_dim;
+      } else {
+        merge_dim = ShardingMergeForAxis(
+            tensor_axis, mesh_dim, axis_to_dim_map[tensor_axis]);
+      }
+      axis_to_dim_map[tensor_axis] = merge_dim;
+      if (merge_dim != -1) {
+        if (dim_to_axis_map.count(merge_dim) == 0) {
+          dim_to_axis_map.insert({merge_dim, tensor_axis});
+        } else if (dim_to_axis_map[merge_dim].find(tensor_axis) ==
+                   std::string::npos) {
+          dim_to_axis_map[merge_dim] += tensor_axis;
+        }
+      }
+    }
+  }
+
+  // Resolve the "mesh_dim sharded by more than one tensor axis" conflict.
+  // For now we just naively pick the first axis.
+  // (TODO) use a local cost model to pick the axis with the lowest cost (in
+  // terms of memory, communication, or computation).
+  for (auto& it : dim_to_axis_map) {
+    if (it.second.size() > 1) {
+      VLOG(4) << "Sharding Conflict: Mesh_Dim [" << it.first
+              << "] is Sharding Multiple Tensor Axes: [" << it.second
+              << "]. The Axis: [" << it.second[0] << "] is Picked.";
+      for (size_t i = 1; i < it.second.size(); ++i) {
+        axis_to_dim_map[it.second.substr(i, 1)] = -1;
+      }
+    }
+  }
+
+  return axis_to_dim_map;
+}
+
+// Rule1: A replicated dimension could be merged by any sharded dimension.
+// Rule2: A tensor axis could at most be sharded by one mesh dimension.
+// (TODO trigger heuristics cost model and reshard to handle the case of an
+// axis sharded by multiple mesh dimensions.)
+int64_t ShardingMergeForAxis(const std::string& axis,
+                             const int64_t& mesh_dim1,
+                             const int64_t& mesh_dim2) {
+  if (mesh_dim1 != mesh_dim2) {
+    if (mesh_dim1 == -1) {
+      return mesh_dim2;
+    } else if (mesh_dim2 == -1) {
+      return mesh_dim1;
+    } else {
+      // (TODO) local cost model here.
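+      // e.g. the same axis mapped to mesh dim 0 by one tensor but to mesh
+      // dim 1 by another cannot be merged without resharding, so for now we
+      // throw below; a cost model could instead pick the cheaper placement.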
+      PADDLE_THROW(
+          phi::errors::Unimplemented("Tensor Axis[%s] is Sharded by two "
+                                     "different mesh dimensions [%d] and [%d].",
+                                     axis,
+                                     mesh_dim1,
+                                     mesh_dim2));
+    }
+
+  } else {
+    return mesh_dim1;
+  }
+}
+
+TensorDistAttr CopyTensorDistAttrForOutput(
+    const TensorDistAttr& src_dist_attr) {
+  TensorDistAttr new_dist_attr = TensorDistAttr();
+  new_dist_attr.set_process_mesh(src_dist_attr.process_mesh());
+  new_dist_attr.set_batch_dim(src_dist_attr.batch_dim());
+  new_dist_attr.set_dynamic_dims(src_dist_attr.dynamic_dims());
+  // new_dist_attr.set_annotated(false); TODO unset field is false by default.
+  return new_dist_attr;
+}
+
+std::vector<int64_t> ResoluteOutputPartialDimension(
+    const std::unordered_map<std::string, int64_t>& axis_to_dim_map,
+    const std::string& tensor_axes) {
+  std::vector<int64_t> partial_on_dims;
+
+  for (auto& it : axis_to_dim_map) {
+    if (tensor_axes.find(it.first) == std::string::npos) {
+      if (it.second > -1) {
+        partial_on_dims.push_back(it.second);
+      }
+    }
+  }
+  return partial_on_dims;
+}
+
+std::string GetBroadcastAxes(const int64_t& tenosr_ndim,
+                             const int64_t& broadcast_ndim,
+                             const std::string& alphabet) {
+  PADDLE_ENFORCE_GE(
+      alphabet.size(),
+      broadcast_ndim,
+      phi::errors::InvalidArgument(
+          "size of alphabet [%d] is less than broadcast ndim [%d]",
+          alphabet.size(),
+          broadcast_ndim));
+  PADDLE_ENFORCE_GE(broadcast_ndim,
+                    tenosr_ndim,
+                    phi::errors::InvalidArgument(
+                        "broadcast ndim [%d] is less than tensor ndim [%d]",
+                        broadcast_ndim,
+                        tenosr_ndim));
+  if (tenosr_ndim <= 0) {
+    return std::string();
+  }
+  return alphabet.substr(broadcast_ndim - tenosr_ndim, tenosr_ndim);
+}
+
+// SPMDRuleMap
+SPMDRuleMap& SPMDRuleMap::Instance() {
+  static SPMDRuleMap g_spmd_rule_map;
+  return g_spmd_rule_map;
+}
+
+// TODO: enable a default replicated spmd rule for ops that are NOT
+// registered, in which all input and output tensors would be replicated
+// across all ranks of the mesh.
+SPMDRuleBase* SPMDRuleMap::Get(const std::string& op_type) const {
+  auto rule_ptr = GetNullable(op_type);
+  if (rule_ptr == nullptr) {
+    std::string str;
+    for (const auto& item : map_) {
+      str += item.first + ", ";
+    }
+    VLOG(4) << "Size of current map [" << map_.size() << "]";
+    VLOG(4) << "Keys are [" << str << "]";
+  }
+  PADDLE_ENFORCE_NOT_NULL(
+      rule_ptr,
+      platform::errors::NotFound(
+          "NO SPMD Rule has been registered for Operator [%s].", op_type));
+  return rule_ptr;
+}
+
+SPMDRuleBase* SPMDRuleMap::GetNullable(const std::string& op_type) const {
+  auto it = map_.find(op_type);
+  if (it == map_.end()) {
+    return nullptr;
+  } else {
+    return it->second.get();
+  }
+}
+
+int SPMDRuleMap::Insert(const std::string& op_type,
+                        std::unique_ptr<SPMDRuleBase> rule) {
+  VLOG(4) << "Call SPMDRuleMap::Insert!";
+  PADDLE_ENFORCE_NE(
+      Has(op_type),
+      true,
+      platform::errors::AlreadyExists(
+          "SPMD Rule for Operator [%s] has been registered.", op_type));
+  map_.insert({op_type, std::move(rule)});
+
+  return 1;
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h
new file mode 100644
index 00000000000..9d7c7086d91
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h
@@ -0,0 +1,161 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h"
+#include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/type_defs.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/utils/flat_hash_map.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+using paddle::framework::Attribute;
+
+class SPMDRuleBase {
+ public:
+  virtual ~SPMDRuleBase() {}
+
+  // Based on the information of Input Tensors and Op Attribute:
+  // 1. Merge the Sharding (dims_mapping) among Input Tensors.
+  // 2. Infer the Sharding (dims_mapping) for Output Tensors.
+  // The info of input tensors (Shape and DistAttr) is wrapped as
+  // DistTensorSpec, and op attributes should be given as an AttributeMap.
+  // The output is a pair consisting of two vectors:
+  // 1. The first vector: the merged DistAttrs of the input tensors.
+  // 2. The second vector: the inferred DistAttrs of the output tensors.
+  // The merged DistAttrs might differ from the original input DistAttrs,
+  // which means that the corresponding input tensors need to be resharded.
+  virtual std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+  InferForward(const std::vector<DistTensorSpec>& input_specs,
+               const paddle::framework::AttributeMap& attrs);
+
+  // Based on the information of Output Tensors and Op Attribute:
+  // 1. Merge the Sharding (dims_mapping) among Output Tensors.
+  // 2. Infer the Sharding (dims_mapping) for Input Tensors.
+  // The info of output tensors (Shape and DistAttr) is wrapped as
+  // DistTensorSpec, and op attributes should be given as an AttributeMap.
+  // The output is a pair consisting of two vectors:
+  // 1. The first vector: the merged DistAttrs of the output tensors.
+  // 2. The second vector: the inferred DistAttrs of the input tensors.
+  virtual std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+  InferBackward(const std::vector<DistTensorSpec>& output_specs,
+                const paddle::framework::AttributeMap& attrs);
+
+  template <typename T>
+  inline const T ExtractAttr(
+      const std::string& name,
+      const paddle::framework::AttributeMap& attrs) const {
+    auto& attr = GetAttr(name, attrs);
+
+    // In order to get bool attr properly
+    framework::proto::AttrType attr_type =
+        static_cast<framework::proto::AttrType>(attr.index() - 1);
+    if (attr_type == framework::proto::AttrType::INT) {
+      if (std::is_same<T, bool>::value) {
+        return static_cast<T>(PADDLE_GET_CONST(int, attr));
+      }
+    }
+
+    return PADDLE_GET_CONST(T, attr);
+  }
+
+  const Attribute& GetAttr(const std::string& name,
+                           const paddle::framework::AttributeMap& attrs) const {
+    auto iter = attrs.find(name);
+    PADDLE_ENFORCE_NE(iter,
+                      attrs.end(),
+                      paddle::platform::errors::NotFound(
+                          "(%s) is not found in AttributeMap.", name));
+    return iter->second;
+  }
+};
+
+// Merge the sharding specifications (dims mapping) of the given tensors.
+// The same axes of different tensors will be merged.
+std::unordered_map<std::string, int64_t> ShardingMergeForTensors(
+    const std::vector<std::pair<std::string, std::vector<int64_t>>>&
+        tensor_axes_to_dim_pairs);
+
+// Merge the sharding specification (dims mapping) for one tensor axis.
+// Rule1: A replicated dimension could be merged by any sharded dimension.
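+// (e.g. merging a replicated mapping -1 with a sharded mapping 0 yields 0;
+// merging 0 with 0 keeps 0.)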
+// Rule2: A tensor axis could at most be sharded by one mesh dimension.
+// (TODO trigger heuristics cost model and reshard to handle the case of an
+// axis sharded by multiple mesh dimensions.)
+int64_t ShardingMergeForAxis(const std::string& axis,
+                             const int64_t& mesh_dim1,
+                             const int64_t& mesh_dim2);
+
+TensorDistAttr CopyTensorDistAttrForOutput(const TensorDistAttr& src_dist_attr);
+
+// Resolve the partial mesh dimensions of an output tensor, given the
+// merged sharding specification of the input tensors and the axis names of
+// the output tensor.
+std::vector<int64_t> ResoluteOutputPartialDimension(
+    const std::unordered_map<std::string, int64_t>& axis_to_dim_map,
+    const std::string& tensor_axes);
+
+// Generate the axis notation of a tensor for the einsum notation of a
+// broadcast operation (alignment starts from the rightmost axis).
+// tenosr_ndim: the rank of the tensor. broadcast_ndim: the maximum rank of
+// the tensors in this broadcast operation. alphabet: the characters used to
+// represent the tensor axes; the length of alphabet should be >=
+// broadcast_ndim.
+std::string GetBroadcastAxes(const int64_t& tenosr_ndim,
+                             const int64_t& broadcast_ndim,
+                             const std::string& alphabet);
+
+// The static map that stores and initializes all the registered SPMD rules.
+class SPMDRuleMap {
+ public:
+  ~SPMDRuleMap() = default;
+
+  // A singleton
+  static SPMDRuleMap& Instance();
+
+  // Returns the spmd rule for the given op_type
+  SPMDRuleBase* Get(const std::string& op_type) const;
+
+  // Returns the spmd rule by name, or nullptr if not registered
+  SPMDRuleBase* GetNullable(const std::string& op_type) const;
+
+  // Registers a spmd rule for an op_type.
+  int Insert(const std::string& op_type, std::unique_ptr<SPMDRuleBase> rule);
+
+  bool Has(const std::string& op_type) const {
+    return map_.find(op_type) != map_.end();
+  }
+
+ private:
+  SPMDRuleMap() = default;
+  paddle::flat_hash_map<std::string, std::unique_ptr<SPMDRuleBase>> map_;
+  DISABLE_COPY_AND_ASSIGN(SPMDRuleMap);
+};
+
+#define REGISTER_SPMD_RULE(op_type, rule_class, ...)                        \
+  UNUSED static int __spmd_rule_holder_##op_type =                          \
+      ::paddle::distributed::auto_parallel::SPMDRuleMap::Instance().Insert( \
+          #op_type, std::make_unique<rule_class>(__VA_ARGS__))
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.cc
new file mode 100644
index 00000000000..95e9a8d0321
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.cc
@@ -0,0 +1,87 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h"
+
+#include "paddle/phi/core/distributed/auto_parallel/utils.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+DistTensorSpec::DistTensorSpec(const std::vector<int64_t>& shape,
+                               const TensorDistAttr& dist_attr) {
+  shape_.assign(shape.begin(), shape.end());
+  // the new distributed attributes are merged with the original ones after
+  // inference, so we keep a copy of the original here
+  dist_attr_.copy_from(dist_attr);
+}
+
+DistTensorSpec::DistTensorSpec(const DistTensorSpec& spec) {
+  std::vector<int64_t> spec_shape = spec.shape();
+  shape_.assign(spec_shape.begin(), spec_shape.end());
+  dist_attr_.copy_from(spec.dist_attr());
+}
+
+DistTensorSpec::~DistTensorSpec() {}
+
+DistTensorSpec::DistTensorSpec(const Tensor& tensor) {
+  shape_ = tensor.shape();
+}
+
+DistTensorSpec& DistTensorSpec::operator=(const DistTensorSpec& spec) {
+  std::vector<int64_t> spec_shape = spec.shape();
+  shape_ = spec_shape;
+  dist_attr_.copy_from(spec.dist_attr());
+  return *this;
+}
+
+const std::vector<int64_t>& DistTensorSpec::dims_mapping() const {
+  return dist_attr_.dims_mapping();
+}
+
+void DistTensorSpec::set_dims_mapping(
+    const std::vector<int64_t>& dims_mapping) {
+  dist_attr_.set_dims_mapping(dims_mapping);
+}
+
+const ProcessMesh& DistTensorSpec::process_mesh() const {
+  return dist_attr_.process_mesh();
+}
+
+void DistTensorSpec::set_process_mesh(const ProcessMesh& process_mesh) {
+  dist_attr_.set_process_mesh(process_mesh);
+}
+
+const std::vector<int64_t>& DistTensorSpec::shape() const { return shape_; }
+
+void DistTensorSpec::set_shape(const std::vector<int64_t>& shape) {
+  shape_ = shape;
+}
+
+const TensorDistAttr& DistTensorSpec::dist_attr() const { return dist_attr_; }
+
+void DistTensorSpec::set_dist_attr(const TensorDistAttr& dist_attr) {
+  dist_attr_ = dist_attr;
+}
+
+std::string DistTensorSpec::to_string() const {
+  using phi::distributed::auto_parallel::str_join;
+  std::string spec_str = "{tensor_shape:[" + str_join(shape_) + "], ";
+  spec_str += "dist_attr:" + dist_attr_.to_string() + "}";
+  return spec_str;
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h
new file mode 100644
index 00000000000..f4f66d30630
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h
@@ -0,0 +1,76 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/api/include/tensor.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+using phi::distributed::auto_parallel::ProcessMesh;
+using phi::distributed::auto_parallel::TensorDistAttr;
+
+/**
+ * A unified data class for inferring distributed attributes
+ * in both dygraph mode and static mode
+ */
+class DistTensorSpec {
+ public:
+  DistTensorSpec() = default;
+
+  DistTensorSpec(const std::vector<int64_t>& shape,
+                 const TensorDistAttr& dist_attr);
+
+  DistTensorSpec(const DistTensorSpec& spec);
+
+  // temp function, only for test in dygraph mode
+  explicit DistTensorSpec(const Tensor& tensor);
+
+  ~DistTensorSpec();
+
+  DistTensorSpec& operator=(const DistTensorSpec& spec);
+
+  // get dims_mapping from dist_attr_
+  const std::vector<int64_t>& dims_mapping() const;
+
+  // set dims_mapping in dist_attr_
+  void set_dims_mapping(const std::vector<int64_t>& dims_mapping);
+
+  // get process_mesh from dist_attr_
+  const ProcessMesh& process_mesh() const;
+
+  // set process_mesh in dist_attr_
+  void set_process_mesh(const ProcessMesh& process_mesh);
+
+  const TensorDistAttr& dist_attr() const;
+
+  void set_dist_attr(const TensorDistAttr& dist_attr);
+
+  const std::vector<int64_t>& shape() const;
+
+  void set_shape(const std::vector<int64_t>& shape);
+
+  std::string to_string() const;
+
+ private:
+  std::vector<int64_t> shape_;
+  // distributed attributes of the corresponding tensor
+  TensorDistAttr dist_attr_;
+};
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.cc
new file mode 100644
index 00000000000..89d0083545d
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.cc
@@ -0,0 +1,228 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.h"
+
+#include "paddle/phi/core/distributed/auto_parallel/utils.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+using phi::distributed::auto_parallel::str_join;
+
+std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+MatmulSPMDRule::InferForward(const std::vector<DistTensorSpec>& input_specs,
+                             const paddle::framework::AttributeMap& attrs) {
+  // step0: verify input args based on matmul logic
+  auto input_specs_size = input_specs.size();
+  PADDLE_ENFORCE_EQ(
+      input_specs_size,
+      2,
+      phi::errors::InvalidArgument(
+          "The size of InputSpec of matmul should be 2, but got [%d].",
+          input_specs_size));
+  auto x_shape = input_specs[0].shape();
+  auto y_shape = input_specs[1].shape();
+  int x_ndim = x_shape.size();
+  int y_ndim = y_shape.size();
+  auto x_dist_attr_src = input_specs[0].dist_attr();
+  auto y_dist_attr_src = input_specs[1].dist_attr();
+  std::vector<int64_t> x_dims_mapping = x_dist_attr_src.dims_mapping();
+  std::vector<int64_t> y_dims_mapping = y_dist_attr_src.dims_mapping();
+  PADDLE_ENFORCE_EQ(
+      x_ndim,
+      x_dims_mapping.size(),
+      phi::errors::InvalidArgument(
+          "Mismatch of X's tensor rank: [%d] and X's dims_mapping size [%d].",
+          x_ndim,
+          x_dims_mapping.size()));
+  PADDLE_ENFORCE_EQ(
+      y_ndim,
+      y_dims_mapping.size(),
+      phi::errors::InvalidArgument(
+          "Mismatch of Y's tensor rank: [%d] and Y's dims_mapping size [%d].",
+          y_ndim,
+          y_dims_mapping.size()));
+
+  bool trans_x = ExtractAttr<bool>("trans_x", attrs);
+  bool trans_y = ExtractAttr<bool>("trans_y", attrs);
+
+  VLOG(4) << "MatmulSPMDRule InferForward Inputs: "
+          << "X shape: [" << str_join(x_shape) << "], x_dims_mapping: ["
+          << str_join(x_dims_mapping) << "]; Y shape: [" << str_join(y_shape)
+          << "], y_dims_mapping: [" << str_join(y_dims_mapping)
+          << "]; trans_x: "
+          << "[" << (trans_x ? "true" : "false") << "]; "
+          << "trans_y: "
+          << "[" << (trans_y ? "true" : "false") << "]; ";
"true" : "false") << "]; "; + + // step1: build Einsum Notation + + // reserve the char k, m, n for matrix product notation: mk,kn -> mn + int max_ndim = std::max(x_ndim, y_ndim); + std::string alphabet = "abcdefghijlopqrstuvwxyz"; + std::string x_axes; + std::string y_axes; + std::string out_axes; + + // Handle 4 different matmul cases in Paddle + // vector * vector = scala + if (x_ndim == 1 && y_ndim == 1) { + x_axes = "k"; + y_axes = "k"; + out_axes = ""; + // vector * batched matrix + } else if (x_ndim == 1 && y_ndim > 1) { + x_axes = "k"; + std::string y_broadcast_axes = + GetBroadcastAxes(y_ndim - 2, y_ndim - 2, alphabet); + y_axes = y_broadcast_axes + "kn"; + out_axes = y_broadcast_axes + "n"; + // batched matrix * vector + } else if (x_ndim > 1 && y_ndim == 1) { + y_axes = "k"; + std::string x_broadcast_axes = + GetBroadcastAxes(x_ndim - 2, x_ndim - 2, alphabet); + x_axes = x_broadcast_axes + "mk"; + out_axes = x_broadcast_axes + "m"; + // batched matrix * batched matrix + } else if (x_ndim > 1 && y_ndim > 1) { + std::string x_broadcast_axes = + GetBroadcastAxes(x_ndim - 2, max_ndim - 2, alphabet); + std::string y_broadcast_axes = + GetBroadcastAxes(y_ndim - 2, max_ndim - 2, alphabet); + x_axes = x_broadcast_axes + "mk"; + y_axes = y_broadcast_axes + "kn"; + + if (x_ndim > y_ndim) { + out_axes = x_broadcast_axes + "mn"; + } else { + out_axes = y_broadcast_axes + "mn"; + } + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "MatmulSPMDRule Receive Unsupported x_dim [%d] and y_dim [%d].", + x_ndim, + y_ndim)); + } + + VLOG(4) << "MatmulSPMDRule build Einsum notation: [" << x_axes << "," + << y_axes << " --> " << out_axes << "]."; + + // step2: Sharding Propogation + if (trans_x) { + PADDLE_ENFORCE_GE( + x_ndim, + 2, + phi::errors::InvalidArgument("When trans_x is True, the size of X " + "tensor should be 2, but got [%d].", + x_ndim)); + std::iter_swap(x_dims_mapping.end() - 2, x_dims_mapping.end() - 1); + } + if (trans_y) { + PADDLE_ENFORCE_GE( + y_ndim, + 2, + phi::errors::InvalidArgument("When trans_x is True, the size of X " + "tensor should be 2, but got [%d].", + y_ndim)); + std::iter_swap(y_dims_mapping.end() - 2, y_dims_mapping.end() - 1); + } + // step2.1: Sharding Merge + std::pair> x_pair(x_axes, x_dims_mapping); + std::pair> y_pair(y_axes, y_dims_mapping); + auto axis_to_dim_map = ShardingMergeForTensors({x_pair, y_pair}); + + // step2.2: Infer Output's Dims Mapping. + TensorDistAttr output_dist_attr_dst = + CopyTensorDistAttrForOutput(x_dist_attr_src); + std::vector out_dims_mapping; + out_dims_mapping.reserve(out_axes.size()); + for (size_t i = 0; i < out_axes.size(); ++i) { + out_dims_mapping.push_back(axis_to_dim_map[out_axes.substr(i, 1)]); + } + output_dist_attr_dst.set_dims_mapping(out_dims_mapping); + + // step2.3: Merge and get Inputs' New Dims Mapping. 
+  TensorDistAttr x_dist_attr_dst = GetInferedDistAttr(
+      x_dist_attr_src, x_shape, x_axes, axis_to_dim_map, trans_x);
+  TensorDistAttr y_dist_attr_dst = GetInferedDistAttr(
+      y_dist_attr_src, y_shape, y_axes, axis_to_dim_map, trans_y);
+
+  // step2.4: Handle Partial
+  // step2.4.1: Output Partial
+  std::vector<int64_t> partial_on_dims =
+      ResoluteOutputPartialDimension(axis_to_dim_map, out_axes);
+
+  // step2.4.2: handle input tensor partial (TODO)
+  VLOG(4) << "MatmulSPMDRule InferForward: "
+          << "X shape: [" << str_join(x_shape) << "], src_dims_mapping: ["
+          << str_join(x_dist_attr_src.dims_mapping())
+          << "], dst_dims_mapping: ["
+          << str_join(x_dist_attr_dst.dims_mapping()) << "]; Y shape: ["
+          << str_join(y_shape) << "], src_dims_mapping: ["
+          << str_join(y_dist_attr_src.dims_mapping())
+          << "], dst_dims_mapping: ["
+          << str_join(y_dist_attr_dst.dims_mapping())
+          << "]; Output dims_mapping: [" << str_join(out_dims_mapping)
+          << "], partial_on_dims: [" << str_join(partial_on_dims) << "]";
+
+  return {{x_dist_attr_dst, y_dist_attr_dst}, {output_dist_attr_dst}};
+}
+
+TensorDistAttr GetInferedDistAttr(
+    const TensorDistAttr& origin_dist_attr,
+    const std::vector<int64_t>& shape,
+    const std::string& tensor_axis,
+    const std::unordered_map<std::string, int64_t>& axis_to_dim_map,
+    const bool trans_axis) {
+  TensorDistAttr dist_attr_ = CopyTensorDistAttrForOutput(origin_dist_attr);
+  std::vector<int64_t> infered_dims_mapping;
+  infered_dims_mapping.reserve(tensor_axis.size());
+
+  for (size_t i = 0; i < tensor_axis.size(); ++i) {
+    if (shape.size() > i && shape[i] == 1) {
+      infered_dims_mapping.push_back(-1);
+    } else {
+      auto itr = axis_to_dim_map.find(tensor_axis.substr(i, 1));
+      if (itr == axis_to_dim_map.end()) {
+        PADDLE_THROW(phi::errors::InvalidArgument(
+            "Tensor axis [%s] is not in axis_to_dim_map.",
+            tensor_axis.substr(i, 1)));
+      }
+      infered_dims_mapping.push_back(itr->second);
+    }
+  }
+
+  if (trans_axis) {
+    std::iter_swap(infered_dims_mapping.end() - 2,
+                   infered_dims_mapping.end() - 1);
+  }
+
+  dist_attr_.set_dims_mapping(infered_dims_mapping);
+  return dist_attr_;
+}
+
+std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+MatmulSPMDRule::InferBackward(const std::vector<DistTensorSpec>& output_specs,
+                              const paddle::framework::AttributeMap& attrs) {
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "InferBackward of MatmulSPMDRule is NOT implemented yet."));
+
+  return {};
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.h
new file mode 100644
index 00000000000..6ce43a314d4
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+TensorDistAttr GetInferedDistAttr(
+    const TensorDistAttr& origin_dist_attr,
+    const std::vector<int64_t>& shape,
+    const std::string& tensor_axes,
+    const std::unordered_map<std::string, int64_t>& axis_to_dim_map,
+    const bool trans_axis);
+
+class MatmulSPMDRule : public SPMDRuleBase {
+ public:
+  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+  InferForward(const std::vector<DistTensorSpec>& input_specs,
+               const paddle::framework::AttributeMap& attrs) override;
+
+  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+  InferBackward(const std::vector<DistTensorSpec>& output_specs,
+                const paddle::framework::AttributeMap& attrs) override;
+};
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
new file mode 100644
index 00000000000..33472305941
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.h"
+
+// TODO(ljz) Automate this process in the cmake file.
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+// matmul rule
+REGISTER_SPMD_RULE(matmul, MatmulSPMDRule);
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/test/CMakeLists.txt b/paddle/fluid/distributed/auto_parallel/test/CMakeLists.txt
index 15c0ed63052..fc370f2a512 100644
--- a/paddle/fluid/distributed/auto_parallel/test/CMakeLists.txt
+++ b/paddle/fluid/distributed/auto_parallel/test/CMakeLists.txt
@@ -13,7 +13,6 @@ cc_test(
   SRCS dist_attr_test.cc
   DEPS phi proto_desc)
 
-cc_test(
-  dist_mapper_test
-  SRCS dist_mapper_test.cc
-  DEPS phi)
+cc_test_old(dist_mapper_test SRCS dist_mapper_test.cc DEPS phi)
+
+cc_test_old(spmd_rule_test SRCS spmd_rule_test.cc DEPS spmd_rule)
diff --git a/paddle/fluid/distributed/auto_parallel/test/spmd_rule_test.cc b/paddle/fluid/distributed/auto_parallel/test/spmd_rule_test.cc
new file mode 100644
index 00000000000..8d1516568f4
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/test/spmd_rule_test.cc
@@ -0,0 +1,206 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/distributed/auto_parallel/process_mesh.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+TEST(MatmulSPMDRule, Ctor) {
+  // build input data class
+  std::vector<int64_t> x_shape = {64, 32};
+  std::vector<int64_t> y_shape = {32, 48};
+
+  std::vector<int64_t> mesh_shape = {2, 3};
+  std::vector<int64_t> process_ids = {0, 1, 2, 3, 4, 5};
+  std::vector<std::string> dim_names = {"x", "y"};
+  ProcessMesh process_mesh(mesh_shape, process_ids, dim_names);
+
+  TensorDistAttr x_dist_attr = TensorDistAttr();
+  x_dist_attr.set_process_mesh(process_mesh);
+  x_dist_attr.set_dims_mapping(std::vector<int64_t>({1, -1}));
+  x_dist_attr.set_batch_dim(-1);
+  x_dist_attr.set_dynamic_dims(std::vector<bool>({false, false}));
+
+  TensorDistAttr y_dist_attr = TensorDistAttr();
+  y_dist_attr.set_process_mesh(process_mesh);
+  y_dist_attr.set_dims_mapping(std::vector<int64_t>({-1, -1}));
+  y_dist_attr.set_batch_dim(-1);
+  y_dist_attr.set_dynamic_dims(std::vector<bool>({false, false}));
+
+  DistTensorSpec x_dist_tensor_spec = DistTensorSpec(x_shape, x_dist_attr);
+  DistTensorSpec y_dist_tensor_spec = DistTensorSpec(y_shape, y_dist_attr);
+
+  paddle::framework::AttributeMap attrs;
+  attrs["trans_x"] = false;
+  attrs["trans_y"] = false;
+
+  SPMDRuleBase* matmul_rule = SPMDRuleMap::Instance().Get("matmul");
+
+  // mk[1, -1],kn[-1, -1] --> mk[1, -1],kn[-1, -1] = mn[1, -1] partial[]
+  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+      infered_dist_attrs = matmul_rule->InferForward(
+          {x_dist_tensor_spec, y_dist_tensor_spec}, attrs);
+
+  size_t input_size = 2;
+  size_t output_size = 1;
+  EXPECT_EQ(infered_dist_attrs.first.size(), input_size);
+  EXPECT_EQ(infered_dist_attrs.second.size(), output_size);
+
+  EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(),
+            std::vector<int64_t>({1, -1}));
+  EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(),
+            std::vector<int64_t>({-1, -1}));
+  EXPECT_EQ(infered_dist_attrs.second[0].dims_mapping(),
+            std::vector<int64_t>({1, -1}));
+  VLOG(4) << "test1 done." << std::endl << std::endl << std::endl;
+
+  // mk[-1,-1],kn[-1,0] --> mk[-1,-1],kn[-1,0] = mn[-1,0] partial[]
+  x_dist_tensor_spec.set_dims_mapping({-1, -1});
+  y_dist_tensor_spec.set_dims_mapping({-1, 0});
+  infered_dist_attrs = matmul_rule->InferForward(
+      {x_dist_tensor_spec, y_dist_tensor_spec}, attrs);
+  EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(),
+            std::vector<int64_t>({-1, -1}));
+  EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(),
+            std::vector<int64_t>({-1, 0}));
+  EXPECT_EQ(infered_dist_attrs.second[0].dims_mapping(),
+            std::vector<int64_t>({-1, 0}));
+  VLOG(4) << "test2 done." << std::endl << std::endl << std::endl;
+
+  // mk[1, 0],kn[-1,-1] --> mk[1, 0],kn[0, -1] = mn[1, -1] partial[0]: done
+  x_dist_tensor_spec.set_dims_mapping({1, 0});
+  y_dist_tensor_spec.set_dims_mapping({-1, -1});
+  infered_dist_attrs = matmul_rule->InferForward(
+      {x_dist_tensor_spec, y_dist_tensor_spec}, attrs);
+  EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(),
+            std::vector<int64_t>({1, 0}));
+  EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(),
+            std::vector<int64_t>({0, -1}));
+  EXPECT_EQ(infered_dist_attrs.second[0].dims_mapping(),
+            std::vector<int64_t>({1, -1}));
+  VLOG(4) << "test3 done." << std::endl << std::endl << std::endl;
+
+  // mk[-1,-1],kn[1,0] --> mk[-1, 1],kn[1, 0] = mn[-1, 0] partial[1]: done
+  x_dist_tensor_spec.set_dims_mapping({-1, -1});
+  y_dist_tensor_spec.set_dims_mapping({1, 0});
+  infered_dist_attrs = matmul_rule->InferForward(
+      {x_dist_tensor_spec, y_dist_tensor_spec}, attrs);
+  EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(),
+            std::vector<int64_t>({-1, 1}));
+  EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(),
+            std::vector<int64_t>({1, 0}));
+  EXPECT_EQ(infered_dist_attrs.second[0].dims_mapping(),
+            std::vector<int64_t>({-1, 0}));
+  VLOG(4) << "test4 done." << std::endl << std::endl << std::endl;
+
+  // abcmk[0, 1, -1, -1],kn[-1, -1] --> abcmk[0, 1, -1, -1],kn[-1, -1] =
+  // abcmn[0, 1, -1, -1] partial[]: done
+  x_dist_tensor_spec.set_shape({512, 48, 64, 32});
+  x_dist_tensor_spec.set_dims_mapping({0, 1, -1, -1});
+  y_dist_tensor_spec.set_dims_mapping({-1, -1});
+  infered_dist_attrs = matmul_rule->InferForward(
+      {x_dist_tensor_spec, y_dist_tensor_spec}, attrs);
+  EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(),
+            std::vector<int64_t>({0, 1, -1, -1}));
+  EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(),
+            std::vector<int64_t>({-1, -1}));
+  EXPECT_EQ(infered_dist_attrs.second[0].dims_mapping(),
+            std::vector<int64_t>({0, 1, -1, -1}));
+  VLOG(4) << "test5 done." << std::endl << std::endl << std::endl;
+
+  // abcmk[1, -1, -1, 0],kn[-1, -1] --> abcmk[1, -1, -1, 0],kn[0, -1] =
+  // abcmn[1, -1, -1, -1] partial[0]: done
+  x_dist_tensor_spec.set_dims_mapping({1, -1, -1, 0});
+  y_dist_tensor_spec.set_dims_mapping({-1, -1});
+  infered_dist_attrs = matmul_rule->InferForward(
+      {x_dist_tensor_spec, y_dist_tensor_spec}, attrs);
+  EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(),
+            std::vector<int64_t>({1, -1, -1, 0}));
+  EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(),
+            std::vector<int64_t>({0, -1}));
+  EXPECT_EQ(infered_dist_attrs.second[0].dims_mapping(),
+            std::vector<int64_t>({1, -1, -1, -1}));
+  VLOG(4) << "test6 done." << std::endl << std::endl << std::endl;
+
+  // trans_x: abcmk[1, -1, -1, 0], kn[-1, -1] --> abcmk[1, -1, -1, 0],
+  // kn[-1, -1] = abcmn[1, -1, 0, -1] partial[]: done
+  x_dist_tensor_spec.set_dims_mapping({1, -1, -1, 0});
+  y_dist_tensor_spec.set_dims_mapping({-1, -1});
+  attrs["trans_x"] = true;
+  infered_dist_attrs = matmul_rule->InferForward(
+      {x_dist_tensor_spec, y_dist_tensor_spec}, attrs);
+  EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(),
+            std::vector<int64_t>({1, -1, -1, 0}));
+  EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(),
+            std::vector<int64_t>({-1, -1}));
+  EXPECT_EQ(infered_dist_attrs.second[0].dims_mapping(),
+            std::vector<int64_t>({1, -1, 0, -1}));
+  VLOG(4) << "test7 done." << std::endl << std::endl << std::endl;
+
+  // trans_y: abcmk[-1, -1, -1, -1], kn[1, 0] --> abcmk[-1, -1, -1, 0],
+  // kn[1, 0] = abcmn[-1, -1, -1, 1] partial[0]: done
+  x_dist_tensor_spec.set_dims_mapping({-1, -1, -1, -1});
+  y_dist_tensor_spec.set_dims_mapping({1, 0});
+  attrs["trans_x"] = false;
+  attrs["trans_y"] = true;
+  infered_dist_attrs = matmul_rule->InferForward(
+      {x_dist_tensor_spec, y_dist_tensor_spec}, attrs);
+  EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(),
+            std::vector<int64_t>({-1, -1, -1, 0}));
+  EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(),
+            std::vector<int64_t>({1, 0}));
+  EXPECT_EQ(infered_dist_attrs.second[0].dims_mapping(),
+            std::vector<int64_t>({-1, -1, -1, 1}));
+  VLOG(4) << "test8 done." << std::endl << std::endl << std::endl;
+
+  // trans_x/trans_y: abcmk[-1, -1, 0, 1], kn[1, 0] --> abcmk[-1, -1, 0, 1],
+  // kn[-1, 0] = abcmn[-1, -1, 1, -1] partial[0]: done
+  x_dist_tensor_spec.set_dims_mapping({-1, -1, 0, 1});
+  y_dist_tensor_spec.set_dims_mapping({1, 0});
+  attrs["trans_y"] = true;
+  attrs["trans_x"] = true;
+  infered_dist_attrs = matmul_rule->InferForward(
+      {x_dist_tensor_spec, y_dist_tensor_spec}, attrs);
+  EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(),
+            std::vector<int64_t>({-1, -1, 0, 1}));
+  EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(),
+            std::vector<int64_t>({-1, 0}));
+  EXPECT_EQ(infered_dist_attrs.second[0].dims_mapping(),
+            std::vector<int64_t>({-1, -1, 1, -1}));
+  VLOG(4) << "test9 done." << std::endl << std::endl << std::endl;
+
+  // abcmk[-1, -1, 1, 0], kn[1, 0] with trans_x/trans_y --> error: the axis k
+  // would be sharded by two different mesh dimensions
+  x_dist_tensor_spec.set_dims_mapping({-1, -1, 1, 0});
+  y_dist_tensor_spec.set_dims_mapping({1, 0});
+  attrs["trans_y"] = true;
+  attrs["trans_x"] = true;
+  EXPECT_ANY_THROW(infered_dist_attrs = matmul_rule->InferForward(
+                       {x_dist_tensor_spec, y_dist_tensor_spec}, attrs));
+  VLOG(4) << "test10 done." << std::endl << std::endl << std::endl;
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index f1b553a3db0..d16b413bf18 100755
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -47,7 +47,8 @@ set(PYBIND_DEPS
     jit_property
     prim_utils
     static_tensor_operants
-    type_info)
+    type_info
+    auto_parallel)
 
 if(WITH_PSCORE)
   set(PYBIND_DEPS ${PYBIND_DEPS} ps_service)
diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc
index 1b78d7bd257..bdb8a763a91 100644
--- a/paddle/fluid/pybind/auto_parallel_py.cc
+++ b/paddle/fluid/pybind/auto_parallel_py.cc
@@ -24,12 +24,18 @@
 #include "paddle/phi/core/distributed/auto_parallel/process_mesh.h"
 #include "paddle/utils/optional.h"
 
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h"
+
 namespace py = pybind11;
 
 namespace paddle {
 namespace pybind {
 
+using paddle::distributed::auto_parallel::DistTensorSpec;
 using paddle::distributed::auto_parallel::OperatorDistAttr;
+using paddle::distributed::auto_parallel::SPMDRuleBase;
+using paddle::distributed::auto_parallel::SPMDRuleMap;
 using paddle::framework::OpDesc;
 using paddle::framework::VarDesc;
 using phi::distributed::auto_parallel::Device;
@@ -281,6 +287,29 @@ void BindAutoParallel(py::module *m) {
           py::arg("memo"))
       .def("__str__", &TensorDistAttr::to_string);
 
+  py::class_<SPMDRuleBase>(*m, "SPMDRuleBase")
+      .def("infer_forward", &SPMDRuleBase::InferForward)
+      .def("infer_backward", &SPMDRuleBase::InferBackward);
+
+  py::class_<DistTensorSpec>(*m, "DistTensorSpec")
+      .def(py::init<>())
+      .def(py::init())
+      .def(py::init<const std::vector<int64_t> &, const TensorDistAttr &>())
+      .def("dims_mapping", &DistTensorSpec::dims_mapping)
+      .def("set_dims_mapping", &DistTensorSpec::set_dims_mapping)
+      .def("process_mesh", &DistTensorSpec::process_mesh)
+      .def("set_process_mesh", &DistTensorSpec::set_process_mesh)
+      .def_property("shape", &DistTensorSpec::shape, &DistTensorSpec::set_shape)
+      .def("__str__", &DistTensorSpec::to_string)
+      .def("__copy__",
+           [](const DistTensorSpec &self) { return DistTensorSpec(self); })
+      .def(
+          "__deepcopy__",
+          [](const DistTensorSpec &self, py::dict) {
+            return DistTensorSpec(self);
+          },
+          py::arg("memo"));
+
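+  // (illustrative) from Python these bindings are driven roughly as:
+  //   rule = paddle.fluid.core.get_spmd_rule("matmul")
+  //   merged_inputs, inferred_outputs = rule.infer_forward(specs, attrs)
+  // see test/auto_parallel/spmd_rules/test_matmul_rule.py for real usage.
+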
   py::class_<OperatorDistAttr>(*m, "OperatorDistAttr")
       .def(py::init<>())
       .def(py::init())
@@ -384,6 +413,13 @@ void BindAutoParallel(py::module *m) {
           py::arg("memo"))
       .def("__str__", &OperatorDistAttr::to_string);
 
+  m->def(
+      "get_spmd_rule",
+      [](const std::string op_type) {
+        return SPMDRuleMap::Instance().Get(op_type);
+      },
+      py::return_value_policy::reference);
+
   // TODO(liuzhenhai): DistributedMapper is not used for now, but
   // dist_mapper_test needs the symbols for DistributedMapper to be linked;
   // remove it later
diff --git a/paddle/phi/api/yaml/generator/api_base.py b/paddle/phi/api/yaml/generator/api_base.py
index 24f77bce383..c15dded80da 100644
--- a/paddle/phi/api/yaml/generator/api_base.py
+++ b/paddle/phi/api/yaml/generator/api_base.py
@@ -1278,6 +1278,17 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
 }}
 """
 
+    def gen_dist_tensor_code(self):
+        # define the DistTensorSpec vector for input and output tensors
+        api_code = " \n std::vector<paddle::distributed::auto_parallel::DistTensorSpec> input_specs;\n"
+
+        # get DistTensorSpec for each input tensor
+        for tensor_name in self.inputs['names']:
+            api_code += f"    input_specs.emplace_back(paddle::distributed::auto_parallel::DistTensorSpec({tensor_name}));\n"
+        api_code += "\n"
+
+        return api_code
+
     def gene_base_api_code(self, inplace_flag=False):
         api_func_name = self.get_api_func_name()
         if inplace_flag and api_func_name[-1] != '_':
@@ -1286,6 +1297,8 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
 PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag)}) {{
 {self.gene_kernel_select()}
 """
+        # if api_func_name == 'matmul':
+        #     api_code += self.gen_dist_tensor_code()
 
         if len(self.kernel['func']) > 1:
             kernel_dispatch_code = ''
diff --git a/paddle/phi/api/yaml/generator/api_gen.py b/paddle/phi/api/yaml/generator/api_gen.py
index 71285de7b24..7c7109859e0 100644
--- a/paddle/phi/api/yaml/generator/api_gen.py
+++ b/paddle/phi/api/yaml/generator/api_gen.py
@@ -379,6 +379,8 @@ def source_include(header_file_path):
 #include "paddle/phi/api/profiler/event_tracing.h"
 #include "paddle/phi/api/profiler/supplement_tracing.h"
 
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h"
+
 DECLARE_bool(conv2d_disable_cudnn);
 DECLARE_int32(low_precision_op_list);
 """
diff --git a/paddle/phi/core/distributed/auto_parallel/utils.h b/paddle/phi/core/distributed/auto_parallel/utils.h
index 63036c9b7e9..c9e69dd550a 100644
--- a/paddle/phi/core/distributed/auto_parallel/utils.h
+++ b/paddle/phi/core/distributed/auto_parallel/utils.h
@@ -14,6 +14,7 @@ limitations under the License.
 */
 #pragma once
 #include
+#include
 #include
 #include
 #include
diff --git a/python/paddle/distributed/auto_parallel/static/completion.py b/python/paddle/distributed/auto_parallel/static/completion.py
index e59db23ceeb..d8636153ccf 100644
--- a/python/paddle/distributed/auto_parallel/static/completion.py
+++ b/python/paddle/distributed/auto_parallel/static/completion.py
@@ -16,6 +16,7 @@ import copy
 import logging
 
 from paddle.distributed.fleet.meta_optimizers.common import OpRole
+from paddle.fluid.core import get_spmd_rule  # noqa: F401
 from paddle.framework import core
 
 from ..process_mesh import ProcessMesh, compute_compatible_process_mesh
diff --git a/python/paddle/distributed/auto_parallel/static/dist_attribute.py b/python/paddle/distributed/auto_parallel/static/dist_attribute.py
index 5c7fadf2e20..d31df134d6b 100644
--- a/python/paddle/distributed/auto_parallel/static/dist_attribute.py
+++ b/python/paddle/distributed/auto_parallel/static/dist_attribute.py
@@ -12,5 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
+from paddle.fluid.core import DistTensorSpec  # noqa: F401
 from paddle.fluid.core import OperatorDistAttr  # noqa: F401
 from paddle.fluid.core import TensorDistAttr  # noqa: F401
diff --git a/python/paddle/distributed/auto_parallel/static/utils.py b/python/paddle/distributed/auto_parallel/static/utils.py
index cfd5e9b844c..130098ac9d9 100644
--- a/python/paddle/distributed/auto_parallel/static/utils.py
+++ b/python/paddle/distributed/auto_parallel/static/utils.py
@@ -28,7 +28,7 @@ from paddle.framework.io_utils import is_belong_to_optimizer, is_parameter
 from paddle.static import Variable
 
 from ..process_mesh import ProcessMesh
-from .dist_attribute import OperatorDistAttr, TensorDistAttr
+from .dist_attribute import DistTensorSpec, OperatorDistAttr, TensorDistAttr
 
 OpRole = core.op_proto_and_checker_maker.OpRole
 OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName()
@@ -2380,3 +2380,66 @@ def use_new_executor():
         'True',
         'true',
     ]
+
+
+def wrap_data_for_completion(
+    dist_op, input_names: list, output_names: list, attr_names: list
+):
+    """
+    Get the data used for inferring distributed attributes, including:
+      1. DistTensorSpec for each input and output tensor of this dist_op.
+      2. Operator attributes of this dist_op, e.g. trans_x in matmul op.
+
+    Args:
+        dist_op: the DistributedOperator
+        input_names: list, name of the dist_op's input tensors
+        output_names: list, name of the dist_op's output tensors
+        attr_names: list, attribute name of the dist_op's corresponding serial op
+
+    Returns:
+        input_specs: list, DistTensorSpec for each input tensor of the dist_op
+        output_specs: list, DistTensorSpec for each output tensor of the dist_op
+        attrs: dict, attribute map of the dist op
+
+    Usage:
+        op_desc = dist_op.serial_op.desc
+        input_name_list = []
+        output_name_list = []
+        input_name_list.append(op_desc.input('X')[0])  # 'X' is the arg name for op
+        input_name_list.append(op_desc.input('Y')[0])
+        output_name_list.append(op_desc.output('Out')[0])
+        attr_name_list = ['trans_x', 'trans_y']
+        input_specs, output_specs, attrs = wrap_data_for_completion(
+            dist_op,
+            input_name_list,
+            output_name_list,
+            attr_name_list)
+
+    """
+
+    input_specs = []
+    output_specs = []
+    attrs = {}
+
+    serial_op = dist_op.serial_op
+
+    # Construct each input tensor's DistTensorSpec with shape and dist_attr
+    for name in input_names:
+        tensor_dist_attr = dist_op.dist_attr.get_input_dist_attr(name)
+        var = serial_op.block._var_recursive(name)
+        tensor_shape = var.shape
+        dist_spec = DistTensorSpec(tensor_shape, tensor_dist_attr)
+        input_specs.append(dist_spec)
+
+    # Construct each output tensor's DistTensorSpec with shape and dist_attr
+    for name in output_names:
+        tensor_dist_attr = dist_op.dist_attr.get_output_dist_attr(name)
+        var = serial_op.block._var_recursive(name)
+        tensor_shape = var.shape
+        dist_spec = DistTensorSpec(tensor_shape, tensor_dist_attr)
+        output_specs.append(dist_spec)
+
+    for attr_name in attr_names:
+        attrs[attr_name] = serial_op.desc.attr(attr_name)
+
+    return input_specs, output_specs, attrs
diff --git a/test/auto_parallel/spmd_rules/CMakeLists.txt b/test/auto_parallel/spmd_rules/CMakeLists.txt
new file mode 100644
index 00000000000..f103971401e
--- /dev/null
+++ b/test/auto_parallel/spmd_rules/CMakeLists.txt
@@ -0,0 +1,10 @@
+# file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+# string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+if(WITH_DISTRIBUTE AND WITH_GPU)
+
+  # NOTE(zyl): unittests WITH single card and WITHOUT timeout
+  py_test_modules(test_matmul_rule MODULES test_matmul_rule)
+  # End of unittests WITH single card WITHOUT timeout
+
+endif()
diff --git a/test/auto_parallel/spmd_rules/test_matmul_rule.py b/test/auto_parallel/spmd_rules/test_matmul_rule.py
new file mode 100644
index 00000000000..85195ca4fd9
--- /dev/null
+++ b/test/auto_parallel/spmd_rules/test_matmul_rule.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
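+
+# NOTE: the cases below exercise the matmul SPMD rule end to end: each one
+# sets dims_mapping on the input DistTensorSpecs, calls rule.infer_forward,
+# and checks the merged input dims_mappings and the inferred output
+# dims_mapping.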
+
+import unittest
+
+from paddle.distributed.auto_parallel.static.completion import get_spmd_rule
+from paddle.distributed.auto_parallel.static.dist_attribute import (
+    DistTensorSpec,
+    TensorDistAttr,
+)
+from paddle.distributed.fleet import auto
+
+
+class TestMatmulSPMDRule(unittest.TestCase):
+    def setUp(self):
+        self.rule = get_spmd_rule("matmul")
+
+        x_shape = [64, 32]
+        y_shape = [32, 48]
+        process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2], [3, 4, 5]])
+
+        x_tensor_dist_attr = TensorDistAttr()
+        x_tensor_dist_attr.dims_mapping = [1, 0]
+        x_tensor_dist_attr.process_mesh = process_mesh
+        self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr)
+
+        y_tensor_dist_attr = TensorDistAttr()
+        y_tensor_dist_attr.dims_mapping = [0, -1]
+        y_tensor_dist_attr.process_mesh = process_mesh
+        self.y_dist_tensor_spec = DistTensorSpec(y_shape, y_tensor_dist_attr)
+
+        self.attrs = {
+            'trans_x': False,
+            'trans_y': False,
+        }
+
+    def test_matmul_infer_forward(self):
+        # TODO test partial: mk[1, 0],kn[0, -1] --> mk[1, 0],kn[0, -1] = mn[1, -1] partial[0]
+        result_dist_attrs = self.rule.infer_forward(
+            [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+
+        self.assertEqual(len(result_dist_attrs), 2)
+        self.assertEqual(len(infered_input_dist_attrs), 2)
+        self.assertEqual(len(infered_output_dist_attrs), 1)
+
+        self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [1, 0])
+        self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [0, -1])
+        self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [1, -1])
+
+        # test row parallel: mk[1, -1],kn[-1, -1] --> mk[1, -1],kn[-1, -1] = mn[1, -1] partial[]
+        self.x_dist_tensor_spec.set_dims_mapping([1, -1])
+        self.y_dist_tensor_spec.set_dims_mapping([-1, -1])
+        result_dist_attrs = self.rule.infer_forward(
+            [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [1, -1])
+        self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [-1, -1])
+        self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [1, -1])
+
+        # test n parallel: mk[-1, -1],kn[-1, 0] --> mk[-1, -1],kn[-1, 0] = mn[-1, 0] partial[]
+        self.x_dist_tensor_spec.set_dims_mapping([-1, -1])
+        self.y_dist_tensor_spec.set_dims_mapping([-1, 0])
+        result_dist_attrs = self.rule.infer_forward(
+            [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [-1, -1])
+        self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [-1, 0])
+        self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, 0])
+
+        # test partial with propagation: mk[1, 0],kn[-1,-1] --> mk[1, 0],kn[0, -1] = mn[1, -1] partial[0]
+        self.x_dist_tensor_spec.set_dims_mapping([1, 0])
+        self.y_dist_tensor_spec.set_dims_mapping([-1, -1])
+        result_dist_attrs = self.rule.infer_forward(
+            [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [1, 0])
+        self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [0, -1])
+        self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [1, -1])
+
+        # mk[-1,-1],kn[1,0] --> mk[-1, 1],kn[1, 0] = mn[-1, 0] partial[1]:
+        self.x_dist_tensor_spec.set_dims_mapping([-1, -1])
+        self.y_dist_tensor_spec.set_dims_mapping([1, 0])
+        result_dist_attrs = self.rule.infer_forward(
+            [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [-1, 1])
+        self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [1, 0])
+        self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, 0])
+
+        # abcmk[0, 1, -1, -1],kn[-1, -1] --> abcmk[0, 1, -1, -1],kn[-1, -1] = abcmn[0, 1, -1, -1] partial[]: done
+        self.x_dist_tensor_spec.shape = [512, 48, 64, 32]
+        self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1, -1])
+        self.y_dist_tensor_spec.set_dims_mapping([-1, -1])
+        result_dist_attrs = self.rule.infer_forward(
+            [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(
+            infered_input_dist_attrs[0].dims_mapping, [0, 1, -1, -1]
+        )
+        self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [-1, -1])
+        self.assertEqual(
+            infered_output_dist_attrs[0].dims_mapping, [0, 1, -1, -1]
+        )
+
+        # abcmk[1, -1, -1, 0],kn[-1, -1] --> abcmk[1, -1, -1, 0],kn[0, -1] = abcmn[1,-1, -1, -1] partial[0]
+        self.x_dist_tensor_spec.set_dims_mapping([1, -1, -1, 0])
+        self.y_dist_tensor_spec.set_dims_mapping([-1, -1])
+        result_dist_attrs = self.rule.infer_forward(
+            [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(
+            infered_input_dist_attrs[0].dims_mapping, [1, -1, -1, 0]
+        )
+        self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [0, -1])
+        self.assertEqual(
+            infered_output_dist_attrs[0].dims_mapping, [1, -1, -1, -1]
+        )
+
+        # trans_x = True, abcmk[1, -1, -1, 0], kn[-1, -1] --> abcmk[1, -1, -1, 0],kn[-1, -1] = abcmn[1, -1, 0, -1] partial[]
+        self.x_dist_tensor_spec.set_dims_mapping([1, -1, -1, 0])
+        self.y_dist_tensor_spec.set_dims_mapping([-1, -1])
+        self.attrs['trans_x'] = True
+        result_dist_attrs = self.rule.infer_forward(
+            [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(
+            infered_input_dist_attrs[0].dims_mapping, [1, -1, -1, 0]
+        )
+        self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [-1, -1])
+        self.assertEqual(
+            infered_output_dist_attrs[0].dims_mapping, [1, -1, 0, -1]
+        )
+
+        # trans_y = True, abcmk[-1, -1, -1, -1], kn[1, 0] --> abcmk[-1, -1, -1, 0],kn[1, 0] = abcmn[-1, -1, -1, 1] partial[0]: done
+        self.x_dist_tensor_spec.set_dims_mapping([-1, -1, -1, -1])
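+        # trans_y=True swaps kn[1, 0] to [0, 1] before merging, so k aligns
+        # with X's last axis and Y's inferred mapping stays [1, 0].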
+        self.y_dist_tensor_spec.set_dims_mapping([1, 0])
+        self.attrs['trans_x'] = False
+        self.attrs['trans_y'] = True
+        result_dist_attrs = self.rule.infer_forward(
+            [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(
+            infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, 0]
+        )
+        self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [1, 0])
+        self.assertEqual(
+            infered_output_dist_attrs[0].dims_mapping, [-1, -1, -1, 1]
+        )
+
+        # trans_y = True, trans_x = True, abcmk[-1, -1, 0, 1], kn[1, 0] --> abcmk[-1, -1, 0, 1],kn[-1, 0] = abcmn[-1, -1, 1, -1] partial[0]
+        # one mesh dim shards multiple tensor axes (resolved by picking the first)
+        self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0, 1])
+        self.y_dist_tensor_spec.set_dims_mapping([1, 0])
+        self.attrs['trans_x'] = True
+        self.attrs['trans_y'] = True
+        result_dist_attrs = self.rule.infer_forward(
+            [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(
+            infered_input_dist_attrs[0].dims_mapping, [-1, -1, 0, 1]
+        )
+        self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [-1, 0])
+        self.assertEqual(
+            infered_output_dist_attrs[0].dims_mapping, [-1, -1, 1, -1]
+        )
+
+        # trans_y = True, trans_x = True, abcmk[-1, -1, 1, 0], kn[1, 0] --> error:
+        # one tensor axis (k) would be sharded by two different mesh dims
+        self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 1, 0])
+        self.y_dist_tensor_spec.set_dims_mapping([1, 0])
+        self.attrs['trans_x'] = True
+        self.attrs['trans_y'] = True
+        with self.assertRaises(NotImplementedError):
+            self.rule.infer_forward(
+                [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab