Unverified commit bc72be16, authored by cheng cheng, committed by GitHub

NNGraph RunLazyJob check static input tensor meta (#6243)

* NNGraph input/output validity determined from registered tensors

* LazyJobInstructionType skips sending Push/PullCB based on NNGraph io validity

* Add test script

* NNGraph RunLazyJob check static input tensor meta

* Fix bug when static vec is empty

* Use tensor meta string instead of TensorMeta

* Refine error message

* Fix clang compilation bug

* Fix graph/test_to_consistent.py

* Add nd_sbp check
Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>
Parent d9ace84f
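In outline: the graph caches a canonical meta string for every input/output tensor at registration time, and each RunLazyNNGraph call re-derives those strings and compares them against the cache, so any drift in shape, dtype, placement/nd_sbp, or device fails fast with a readable error. The following is a minimal, self-contained sketch of that register-then-verify pattern; TensorMeta, Graph, RegisterInputs, and CheckInputs are hypothetical stand-ins, not OneFlow's real types.

#include <cstdint>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical stand-in for a tensor's static metadata.
struct TensorMeta {
  std::vector<int64_t> shape;
  std::string dtype;

  // Canonical string form, analogous to GetTensorMetaString in the diff.
  std::string ToString() const {
    std::ostringstream oss;
    oss << "shape=(";
    for (size_t i = 0; i < shape.size(); ++i) {
      oss << shape[i] << (i + 1 < shape.size() ? "," : "");
    }
    oss << "), dtype=" << dtype;
    return oss.str();
  }
};

class Graph {
 public:
  // Cache the canonical meta strings once, at registration time.
  void RegisterInputs(const std::vector<TensorMeta>& inputs) {
    for (const auto& meta : inputs) { inputs_meta_str_.push_back(meta.ToString()); }
  }

  // On every run, re-derive the strings and compare against the cache.
  void CheckInputs(const std::vector<TensorMeta>& inputs) const {
    for (size_t i = 0; i < inputs.size(); ++i) {
      const std::string actual = inputs[i].ToString();
      if (actual != inputs_meta_str_.at(i)) {
        throw std::runtime_error("static input tensor meta mismatch: expected ("
                                 + inputs_meta_str_.at(i) + ") but got (" + actual + ")");
      }
    }
  }

 private:
  std::vector<std::string> inputs_meta_str_;
};

int main() {
  Graph graph;
  graph.RegisterInputs({{{8, 3}, "float32"}});
  graph.CheckInputs({{{8, 3}, "float32"}});    // same meta as registration: passes
  try {
    graph.CheckInputs({{{4, 3}, "float32"}});  // shape changed: throws
  } catch (const std::exception& e) {
    std::cerr << e.what() << std::endl;        // carries the expected/actual meta strings
  }
  return 0;
}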
@@ -51,6 +51,17 @@ Maybe<bool> GetTensorValidInCurRank(const std::shared_ptr<one::Tensor>& tensor)
  }
}
Maybe<std::string> GetTensorMetaString(const std::shared_ptr<one::Tensor>& tensor) {
  std::string ret = "shape=" + tensor->shape()->ToString() + ", dtype=" + tensor->dtype()->name();
  if (tensor->is_consistent()) {
    ret += ", placement=" + *JUST(PlacementToString(JUST(tensor->parallel_desc())));
    ret += ", nd_sbp=" + *JUST(NdSbpToString(JUST(tensor->nd_sbp())));
  } else {
    ret += ", device=" + JUST(tensor->device())->ToString();
  }
  return ret;
}
} // namespace
NNGraph::~NNGraph() {
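For a sense of what these meta strings look like, the toy program below assembles both flavors from hardcoded example values, mirroring the two branches of GetTensorMetaString. The concrete renderings of shape, placement, and nd_sbp are assumptions for illustration only, since they depend on Shape::ToString, PlacementToString, and NdSbpToString.

#include <iostream>
#include <string>

int main() {
  const bool is_consistent = true;  // toggle to see the local-tensor variant
  std::string ret = "shape=(8,3,224,224), dtype=oneflow.float32";
  if (is_consistent) {
    // Consistent (global) tensors record placement and nd_sbp
    // (illustrative renderings, not necessarily verbatim).
    ret += ", placement=oneflow.placement(\"cuda\", {0: [0, 1]})";
    ret += ", nd_sbp=(oneflow.sbp.split(0),)";
  } else {
    // Local tensors record only the device.
    ret += ", device=cuda:0";
  }
  std::cout << ret << std::endl;
  return 0;
}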
@@ -78,6 +89,14 @@ const std::vector<bool>& NNGraph::inputs_valid() const { return input_tensors_va
const std::vector<bool>& NNGraph::outputs_valid() const { return output_tensors_valid_; }
const std::vector<std::string>& NNGraph::inputs_tensor_meta_str() const {
  return inputs_tensor_meta_str_;
}
const std::vector<std::string>& NNGraph::outputs_tensor_meta_str() const {
  return outputs_tensor_meta_str_;
}
int64_t NNGraph::variable_op_size() const { return variable_op_name2eager_blob_.size(); }
Maybe<void> NNGraph::RegisterInputOpNamesAndTensors(
@@ -87,10 +106,13 @@ Maybe<void> NNGraph::RegisterInputOpNamesAndTensors(
  CHECK_OR_RETURN(input_op_names_.empty())
      << " The input tensors of nn.Graph " << name_ << " are registered repeatedly.";
  CHECK_OR_RETURN(input_tensors_valid_.empty());
  CHECK_OR_RETURN(inputs_tensor_meta_str_.empty());
  input_op_names_.assign(input_op_names.begin(), input_op_names.end());
  input_tensors_valid_.reserve(input_tensors.size());
  inputs_tensor_meta_str_.reserve(input_tensors.size());
  for (const auto& input_tensor : input_tensors) {
    input_tensors_valid_.push_back(JUST(GetTensorValidInCurRank(input_tensor)));
    inputs_tensor_meta_str_.push_back(*JUST(GetTensorMetaString(input_tensor)));
  }
  CHECK_EQ_OR_RETURN(input_tensors_valid_.size(), input_tensors.size());
  return Maybe<void>::Ok();
@@ -103,10 +125,13 @@ Maybe<void> NNGraph::RegisterOutputOpNamesAndTensors(
  CHECK_OR_RETURN(output_op_names_.empty())
      << " The output tensors of nn.Graph " << name_ << " are registered repeatedly.";
  CHECK_OR_RETURN(output_tensors_valid_.empty());
  CHECK_OR_RETURN(outputs_tensor_meta_str_.empty());
  output_op_names_.assign(output_op_names.begin(), output_op_names.end());
  output_tensors_valid_.reserve(output_tensors.size());
  outputs_tensor_meta_str_.reserve(output_tensors.size());
  for (const auto& output_tensor : output_tensors) {
    output_tensors_valid_.push_back(JUST(GetTensorValidInCurRank(output_tensor)));
    outputs_tensor_meta_str_.push_back(*JUST(GetTensorMetaString(output_tensor)));
  }
  CHECK_EQ_OR_RETURN(output_tensors_valid_.size(), output_tensors.size());
  return Maybe<void>::Ok();
@@ -331,6 +356,21 @@ Maybe<void> RunLazyNNGraph(const one::TensorTuple& inputs, const one::TensorTupl
  // The arg `parameters` holds all variable tensors owned by nn.Graph,
  // but NNGraph::variable_op_size() may also count FreeEagerTensor as a special variable op.
  CHECK_LE_OR_RETURN(parameters.size(), nn_graph->variable_op_size());
  for (int i = 0; i < inputs.size(); ++i) {
    // TODO(chengcheng, liufengwei):
    //   use TensorMeta::to_string and equality check instead of string comparison.
    std::string tensor_meta_str = *JUST(GetTensorMetaString(inputs.at(i)));
    const std::string& static_meta_str = nn_graph->inputs_tensor_meta_str().at(i);
    CHECK_OR_RETURN(static_meta_str == tensor_meta_str)
        << "\n nn.Graph ONLY accepts input tensors with static meta, please check that the input "
        << "tensor meta at each step is the same as that of the first graph call. \n The expected "
        << "tensor meta is : ( \n " << static_meta_str
        << " \n) , but the actual tensor meta is : ( \n " << tensor_meta_str << " \n)";
  }
  for (int i = 0; i < outputs.size(); ++i) {
    CHECK_OR_RETURN(nn_graph->outputs_tensor_meta_str().at(i)
                    == *JUST(GetTensorMetaString(outputs.at(i))));
  }
  std::vector<std::shared_ptr<vm::EagerBlobObject>> input_blobs;
  std::vector<std::shared_ptr<vm::EagerBlobObject>> output_blobs;
  std::vector<std::shared_ptr<vm::EagerBlobObject>> var_blobs;
......
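When a per-step input meta differs from what was cached at the first call, the CHECK_OR_RETURN above fails and its streamed text becomes part of the returned error. The snippet below rebuilds that error text for a hypothetical shape mismatch; both meta strings are made-up examples, and the wording follows the CHECK message in the diff.

#include <iostream>
#include <sstream>
#include <string>

int main() {
  // Cached at the first graph call vs. derived from the current step's input.
  const std::string static_meta_str = "shape=(8,3), dtype=oneflow.float32, device=cuda:0";
  const std::string tensor_meta_str = "shape=(4,3), dtype=oneflow.float32, device=cuda:0";
  if (static_meta_str != tensor_meta_str) {
    std::ostringstream err;
    err << "\n nn.Graph ONLY accepts input tensors with static meta, please check that the input "
        << "tensor meta at each step is the same as that of the first graph call. \n The expected "
        << "tensor meta is : ( \n " << static_meta_str
        << " \n) , but the actual tensor meta is : ( \n " << tensor_meta_str << " \n)";
    std::cerr << err.str() << std::endl;
  }
  return 0;
}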
@@ -38,6 +38,8 @@ class NNGraph final : public NNGraphIf {
  const std::vector<std::string>& outputs_op_names() const override;
  const std::vector<bool>& inputs_valid() const override;
  const std::vector<bool>& outputs_valid() const override;
  const std::vector<std::string>& inputs_tensor_meta_str() const;
  const std::vector<std::string>& outputs_tensor_meta_str() const;
  int64_t variable_op_size() const;
  Maybe<void> RegisterInputOpNamesAndTensors(
@@ -64,6 +66,8 @@ class NNGraph final : public NNGraphIf {
  std::vector<std::string> output_op_names_;
  std::vector<bool> input_tensors_valid_;
  std::vector<bool> output_tensors_valid_;
  std::vector<std::string> inputs_tensor_meta_str_;
  std::vector<std::string> outputs_tensor_meta_str_;
  HashMap<std::string, Blob*> variable_op_name2eager_blob_;
  HashSet<std::string> variable_op_names_;
  Job job_;
......