提交 f04b23ad 编写于 作者: T tangwei12

add checkpoint_load, update checkpoint save

上级 3c820064
...@@ -243,6 +243,7 @@ op_library(load_op DEPS lod_tensor) ...@@ -243,6 +243,7 @@ op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor)
op_library(checkpoint_save_op DEPS lod_tensor) op_library(checkpoint_save_op DEPS lod_tensor)
op_library(checkpoint_load_op DEPS lod_tensor)
op_library(concat_op DEPS concat) op_library(concat_op DEPS concat)
# FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency # FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency
...@@ -278,6 +279,6 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea ...@@ -278,6 +279,6 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
cc_test(checkpoint_save_op_test SRCS checkpoint_save_op_test.cc DEPS checkpoint_save_op) cc_test(checkpoint_op_test SRCS checkpoint_op_test.cc DEPS checkpoint_save_op checkpoint_load_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <fstream>
#include <numeric>
#include <sstream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
// Character that separates the components of a path; DirName() searches for
// its last occurrence.
constexpr char kSEP{'/'};
// File name "_SUCCESS". Per the original note, an empty file with this name
// is written; nothing in the visible load-op code references it.
constexpr char SUCCESS[] = "_SUCCESS";
// Reports whether `filepath` names something that stat(2) can resolve.
// Returns true when the stat() call succeeds (returns 0) and false for any
// failure of that call.
static bool FileExists(const std::string &filepath) {
  struct stat file_info;
  const int status = stat(filepath.c_str(), &file_info);
  return status == 0;
}
// Returns the part of `filepath` that precedes its last kSEP separator.
// The separator itself is not included. When `filepath` contains no
// separator at all, an empty string is returned.
static std::string DirName(const std::string &filepath) {
  const auto last_sep = filepath.rfind(kSEP);
  return last_sep == std::string::npos ? std::string()
                                       : filepath.substr(0, last_sep);
}
// Operator registered under the type name "checkpoint_load".
// NOTE(review): presumably intended to restore variables from a checkpoint
// directory; the visible implementation only checks that the directory
// exists and performs no loading yet.
class CheckpointLoadOp : public framework::OperatorBase {
 public:
  // Forwards the operator type, input/output name maps and attributes to
  // the OperatorBase constructor unchanged.
  CheckpointLoadOp(const std::string &type,
                   const framework::VariableNameMap &inputs,
                   const framework::VariableNameMap &outputs,
                   const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

 private:
  // Reads the "dir" attribute and returns immediately when FileExists()
  // reports that the path is absent. `scope` and `place` are not used by
  // the current body.
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
    const std::string checkpoint_dir = Attr<std::string>("dir");
    if (!FileExists(checkpoint_dir)) {
      return;
    }
    // UPDATE LATER ...
  }
};
// Declares the proto (operator description and attributes) for the
// checkpoint_load operator.
class CheckpointLoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  // Fills in the operator comment and the single "dir" attribute on `proto`,
  // registering the attribute's checker through `op_checker`.
  CheckpointLoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // Operator-level description, mirroring what the checkpoint_save proto
    // maker provides.
    // NOTE(review): OpProto appears to require a comment for the operator
    // registration to succeed -- confirm against framework.proto.
    AddComment(R"DOC(
CheckpointLoad operator

This operator checks whether the checkpoint directory given by the "dir"
attribute exists. Loading the saved variables is not implemented yet.
)DOC");
    // The description previously talked about "file_path" and saving, which
    // was copied from the save operator; this op reads from "dir".
    AddAttr<std::string>(
        "dir",
        "(string)"
        "The \"dir\" where the checkpoint files will be loaded.")
        .AddCustomChecker(
            // Reject an empty directory path.
            [](const std::string &path) { return !path.empty(); });
  }
};
} // namespace operators
} // namespace paddle
// Short alias for the operator namespace, used by the registration below.
namespace ops = paddle::operators;
// Registers the operator type "checkpoint_load", pairing the CheckpointLoadOp
// implementation with the CheckpointLoadOpProtoMaker that describes it.
REGISTER_OPERATOR(checkpoint_load, ops::CheckpointLoadOp,
ops::CheckpointLoadOpProtoMaker);
...@@ -27,8 +27,6 @@ limitations under the License. */ ...@@ -27,8 +27,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
// TODO(sidgoyal78): These function are needed by other files (save_op), move
// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
constexpr char kSEP = '/'; constexpr char kSEP = '/';
// write empty file named _SUCCESS // write empty file named _SUCCESS
const char SUCCESS[] = "_SUCCESS"; const char SUCCESS[] = "_SUCCESS";
...@@ -82,7 +80,14 @@ class CheckpointSaveOp : public framework::OperatorBase { ...@@ -82,7 +80,14 @@ class CheckpointSaveOp : public framework::OperatorBase {
// overwrite=false", // overwrite=false",
// dir, overwrite); // dir, overwrite);
} }
MkDirRecursively(dir.c_str());
auto serial_var_name = Output("Serial");
auto *serial_var = scope.FindVar(serial_var_name);
std::string *serial_num = serial_var->GetMutable<std::string>();
serial_num->append("0");
dir.append("/");
dir.append(serial_num);
MkDirRecursively(dir.c_str()); MkDirRecursively(dir.c_str());
auto inp_var_names = Inputs("X"); auto inp_var_names = Inputs("X");
...@@ -93,6 +98,7 @@ class CheckpointSaveOp : public framework::OperatorBase { ...@@ -93,6 +98,7 @@ class CheckpointSaveOp : public framework::OperatorBase {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
// TODO(tangwei): make this asynchronous
for (size_t i = 0; i < inp_var_names.size(); i++) { for (size_t i = 0; i < inp_var_names.size(); i++) {
auto *var = scope.FindVar(inp_var_names[i]); auto *var = scope.FindVar(inp_var_names[i]);
std::string var_file; std::string var_file;
...@@ -132,19 +138,20 @@ class CheckpointSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -132,19 +138,20 @@ class CheckpointSaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
"X", "X",
"(vector) Input LoDTensors that need to be saved together in a file.") "(vector) Input LoDTensors that need to be saved together in a file.")
.AsDuplicable(); .AsDuplicable();
AddOutput("Serial", "the serial number");
AddComment(R"DOC( AddComment(R"DOC(
SaveCombine operator CheckpointSave operator
This operator will serialize and write a list of input LoDTensor variables This operator will serialize and write a list of input LoDTensor variables
to a file on disk. to a file on disk.
)DOC"); )DOC");
AddAttr<bool>("overwrite", AddAttr<bool>("overwrite",
"(boolean, default true)" "(boolean, default false)"
"Overwrite the output file if it exists.") "Delete the output dir if it exists.")
.SetDefault(true); .SetDefault(false);
AddAttr<std::string>( AddAttr<std::string>(
"file_path", "dir",
"(string)" "(string)"
"The \"file_path\" where the LoDTensor variables will be saved.") "The \"file_path\" where the LoDTensor variables will be saved.")
.AddCustomChecker( .AddCustomChecker(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册