提交 3231c4ab 编写于 作者: L lizhenyu

Add Wide&Deep standalone training script for GPU in model zoo

上级 c74b1685
...@@ -37,6 +37,7 @@ To train and evaluate the model, command as follows: ...@@ -37,6 +37,7 @@ To train and evaluate the model, command as follows:
python train_and_eval.py python train_and_eval.py
``` ```
Arguments: Arguments:
* `--device_target`: Device on which the code will run (Default: Ascend).
* `--data_path`: This should be set to the same directory given to the data_download's data_dir argument. * `--data_path`: This should be set to the same directory given to the data_download's data_dir argument.
* `--epochs`: Total train epochs. * `--epochs`: Total train epochs.
* `--batch_size`: Training batch size. * `--batch_size`: Training batch size.
...@@ -57,6 +58,7 @@ To train the model in one device, command as follows: ...@@ -57,6 +58,7 @@ To train the model in one device, command as follows:
python train.py python train.py
``` ```
Arguments: Arguments:
* `--device_target`: Device on which the code will run (Default: Ascend).
* `--data_path`: This should be set to the same directory given to the data_download's data_dir argument. * `--data_path`: This should be set to the same directory given to the data_download's data_dir argument.
* `--epochs`: Total train epochs. * `--epochs`: Total train epochs.
* `--batch_size`: Training batch size. * `--batch_size`: Training batch size.
...@@ -87,6 +89,7 @@ To evaluate the model, command as follows: ...@@ -87,6 +89,7 @@ To evaluate the model, command as follows:
python eval.py python eval.py
``` ```
Arguments: Arguments:
* `--device_target`: Device on which the code will run (Default: Ascend).
* `--data_path`: This should be set to the same directory given to the data_download's data_dir argument. * `--data_path`: This should be set to the same directory given to the data_download's data_dir argument.
* `--epochs`: Total train epochs. * `--epochs`: Total train epochs.
* `--batch_size`: Training batch size. * `--batch_size`: Training batch size.
......
...@@ -26,11 +26,11 @@ from src.datasets import create_dataset ...@@ -26,11 +26,11 @@ from src.datasets import create_dataset
from src.metrics import AUCMetric from src.metrics import AUCMetric
from src.config import WideDeepConfig from src.config import WideDeepConfig
context.set_context(mode=context.GRAPH_MODE, device_target="Davinci",
save_graphs=True)
def get_WideDeep_net(config): def get_WideDeep_net(config):
"""
Get network of wide&deep model.
"""
WideDeep_net = WideDeepModel(config) WideDeep_net = WideDeepModel(config)
loss_net = NetWithLossClass(WideDeep_net, config) loss_net = NetWithLossClass(WideDeep_net, config)
...@@ -91,4 +91,5 @@ if __name__ == "__main__": ...@@ -91,4 +91,5 @@ if __name__ == "__main__":
widedeep_config = WideDeepConfig() widedeep_config = WideDeepConfig()
widedeep_config.argparse_init() widedeep_config.argparse_init()
context.set_context(mode=context.GRAPH_MODE, device_target=widedeep_config.device_target)
test_eval(widedeep_config) test_eval(widedeep_config)
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
# limitations under the License. # limitations under the License.
# ============================================================================ # ============================================================================
# bash run_multigpu_train.sh # bash run_multigpu_train.sh RANK_SIZE EPOCH_SIZE DATASET
script_self=$(readlink -f "$0") script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}") self_path=$(dirname "${script_self}")
RANK_SIZE=$1 RANK_SIZE=$1
...@@ -25,4 +25,5 @@ mpirun --allow-run-as-root -n $RANK_SIZE \ ...@@ -25,4 +25,5 @@ mpirun --allow-run-as-root -n $RANK_SIZE \
python -s ${self_path}/../train_and_eval_distribute.py \ python -s ${self_path}/../train_and_eval_distribute.py \
--device_target="GPU" \ --device_target="GPU" \
--data_path=$DATASET \ --data_path=$DATASET \
--batch_size=8000 \
--epochs=$EPOCH_SIZE > log.txt 2>&1 & --epochs=$EPOCH_SIZE > log.txt 2>&1 &
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Usage: bash run_standalone_train_for_gpu.sh EPOCH_SIZE DATASET
#   EPOCH_SIZE  value forwarded to train_and_eval.py as --epochs
#   DATASET     value forwarded to train_and_eval.py as --data_path
# The training process is started in the background; its stdout and stderr
# are redirected to log.txt in the current working directory.

# Fail fast on missing arguments instead of launching a background job that
# would only report the problem inside log.txt.
if [ "$#" -lt 2 ]; then
    echo "Usage: bash run_standalone_train_for_gpu.sh EPOCH_SIZE DATASET" >&2
    exit 1
fi

# Resolve the directory that holds this script so train_and_eval.py is
# located relative to it, independent of the caller's working directory.
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
EPOCH_SIZE=$1
DATASET=$2

# All expansions are quoted so that paths containing spaces or glob
# characters are passed through as single, unmodified arguments.
python -s "${self_path}/../train_and_eval.py" \
    --device_target="GPU" \
    --data_path="${DATASET}" \
    --batch_size=16000 \
    --epochs="${EPOCH_SIZE}" > log.txt 2>&1 &
...@@ -15,16 +15,16 @@ ...@@ -15,16 +15,16 @@
import os import os
from mindspore import Model, context from mindspore import Model, context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
from src.callbacks import LossCallBack from src.callbacks import LossCallBack
from src.datasets import create_dataset from src.datasets import create_dataset
from src.config import WideDeepConfig from src.config import WideDeepConfig
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True)
def get_WideDeep_net(configure): def get_WideDeep_net(configure):
"""
Get network of wide&deep model.
"""
WideDeep_net = WideDeepModel(configure) WideDeep_net = WideDeepModel(configure)
loss_net = NetWithLossClass(WideDeep_net, configure) loss_net = NetWithLossClass(WideDeep_net, configure)
...@@ -72,7 +72,7 @@ def test_train(configure): ...@@ -72,7 +72,7 @@ def test_train(configure):
model = Model(train_net) model = Model(train_net)
callback = LossCallBack(config=configure) callback = LossCallBack(config=configure)
ckptconfig = CheckpointConfig(save_checkpoint_steps=1, ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
keep_checkpoint_max=5) keep_checkpoint_max=5)
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=configure.ckpt_path, config=ckptconfig) ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=configure.ckpt_path, config=ckptconfig)
model.train(epochs, ds_train, callbacks=[TimeMonitor(ds_train.get_dataset_size()), callback, ckpoint_cb]) model.train(epochs, ds_train, callbacks=[TimeMonitor(ds_train.get_dataset_size()), callback, ckpoint_cb])
...@@ -82,4 +82,5 @@ if __name__ == "__main__": ...@@ -82,4 +82,5 @@ if __name__ == "__main__":
config = WideDeepConfig() config = WideDeepConfig()
config.argparse_init() config.argparse_init()
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
test_train(config) test_train(config)
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
import os import os
from mindspore import Model, context from mindspore import Model, context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
from src.callbacks import LossCallBack, EvalCallBack from src.callbacks import LossCallBack, EvalCallBack
...@@ -23,10 +23,11 @@ from src.datasets import create_dataset ...@@ -23,10 +23,11 @@ from src.datasets import create_dataset
from src.metrics import AUCMetric from src.metrics import AUCMetric
from src.config import WideDeepConfig from src.config import WideDeepConfig
context.set_context(mode=context.GRAPH_MODE, device_target="Davinci")
def get_WideDeep_net(config): def get_WideDeep_net(config):
"""
Get network of wide&deep model.
"""
WideDeep_net = WideDeepModel(config) WideDeep_net = WideDeepModel(config)
loss_net = NetWithLossClass(WideDeep_net, config) loss_net = NetWithLossClass(WideDeep_net, config)
...@@ -87,11 +88,13 @@ def test_train_eval(config): ...@@ -87,11 +88,13 @@ def test_train_eval(config):
out = model.eval(ds_eval) out = model.eval(ds_eval)
print("=====" * 5 + "model.eval() initialized: {}".format(out)) print("=====" * 5 + "model.eval() initialized: {}".format(out))
model.train(epochs, ds_train, callbacks=[eval_callback, callback, ckpoint_cb]) model.train(epochs, ds_train,
callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb])
if __name__ == "__main__": if __name__ == "__main__":
wide_deep_config = WideDeepConfig() wide_deep_config = WideDeepConfig()
wide_deep_config.argparse_init() wide_deep_config.argparse_init()
context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target)
test_train_eval(wide_deep_config) test_train_eval(wide_deep_config)
...@@ -40,6 +40,9 @@ init() ...@@ -40,6 +40,9 @@ init()
def get_WideDeep_net(config): def get_WideDeep_net(config):
"""
Get network of wide&deep model.
"""
WideDeep_net = WideDeepModel(config) WideDeep_net = WideDeepModel(config)
loss_net = NetWithLossClass(WideDeep_net, config) loss_net = NetWithLossClass(WideDeep_net, config)
loss_net = VirtualDatasetCellTriple(loss_net) loss_net = VirtualDatasetCellTriple(loss_net)
......
...@@ -33,6 +33,9 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) ...@@ -33,6 +33,9 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def get_WideDeep_net(config): def get_WideDeep_net(config):
"""
Get network of wide&deep model.
"""
WideDeep_net = WideDeepModel(config) WideDeep_net = WideDeepModel(config)
loss_net = NetWithLossClass(WideDeep_net, config) loss_net = NetWithLossClass(WideDeep_net, config)
train_net = TrainStepWrap(loss_net) train_net = TrainStepWrap(loss_net)
...@@ -90,8 +93,12 @@ def train_and_eval(config): ...@@ -90,8 +93,12 @@ def train_and_eval(config):
callback = LossCallBack(config=config) callback = LossCallBack(config=config)
ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5) ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', if config.device_target == "Ascend":
directory=config.ckpt_path, config=ckptconfig) ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
directory=config.ckpt_path, config=ckptconfig)
elif config.device_target == "GPU":
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train_' + str(get_rank()),
directory=config.ckpt_path, config=ckptconfig)
out = model.eval(ds_eval) out = model.eval(ds_eval)
print("=====" * 5 + "model.eval() initialized: {}".format(out)) print("=====" * 5 + "model.eval() initialized: {}".format(out))
model.train(epochs, ds_train, model.train(epochs, ds_train,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册