Unverified commit b76594a0, authored on Feb 02, 2023 by wangzhen38 and committed via GitHub on Feb 02, 2023.
[Add GPUPS CI] GPUBox unittest (#50130)
Parent: 494431c6
Showing 7 changed files with 465 additions and 0 deletions (+465, -0).
python/paddle/fluid/tests/unittests/ps/CMakeLists.txt               +4    -0
python/paddle/fluid/tests/unittests/ps/config_gpubox.yaml           +55   -0
python/paddle/fluid/tests/unittests/ps/dataset_generator_criteo.py  +86   -0
python/paddle/fluid/tests/unittests/ps/download_criteo_data.sh      +17   -0
python/paddle/fluid/tests/unittests/ps/gpubox_run.sh                +60   -0
python/paddle/fluid/tests/unittests/ps/static_gpubox_trainer.py     +208  -0
python/paddle/fluid/tests/unittests/ps/test_gpubox_ps.py            +35   -0
python/paddle/fluid/tests/unittests/ps/CMakeLists.txt

@@ -9,3 +9,7 @@ foreach(TEST_OP ${TEST_OPS})
  list(APPEND TEST_OPS ${TEST_OP})
  set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 50)
endforeach()
if(WITH_PSCORE)
  set_tests_properties(test_gpubox_ps PROPERTIES LABELS "RUN_TYPE=GPUPS")
endif()
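With WITH_PSCORE enabled, the new LABELS property lets a CI job select only this unit test through CTest's label filter. A minimal sketch of such an invocation, assuming a build tree in a hypothetical build/ directory and using CTest's standard -L label-regex option (not a command dictated by this commit):

import subprocess

# Run only tests labeled RUN_TYPE=GPUPS; "build" is a hypothetical build directory.
subprocess.run(
    ["ctest", "-L", "RUN_TYPE=GPUPS", "--output-on-failure"],
    cwd="build",
    check=True,  # raise CalledProcessError if the GPUPS test fails
)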
python/paddle/fluid/tests/unittests/ps/config_gpubox.yaml (new file, mode 100755)

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# workspace
#workspace: "models/rank/dnn"

runner:
  train_data_dir: "data/sample_data/train"
  train_reader_path: "criteo_reader"  # importlib format
  use_gpu: True
  use_auc: False
  train_batch_size: 32
  epochs: 3
  print_interval: 10
  model_save_path: "output_model_dnn_queue"
  sync_mode: "gpubox"
  thread_num: 30
  reader_type: "InmemoryDataset"  # DataLoader / QueueDataset / RecDataset / InmemoryDataset
  pipe_command: "python3.7 dataset_generator_criteo.py"
  dataset_debug: False
  split_file_list: False

  infer_batch_size: 2
  infer_reader_path: "criteo_reader"  # importlib format
  test_data_dir: "data/sample_data/train"
  infer_load_path: "output_model_dnn_queue"
  infer_start_epoch: 0
  infer_end_epoch: 3

# hyper parameters of user-defined network
hyper_parameters:
  # optimizer config
  optimizer:
    class: Adam
    learning_rate: 0.001
    strategy: async
  # user-defined <key, value> pairs
  sparse_inputs_slots: 27
  sparse_feature_number: 1024
  sparse_feature_dim: 9
  dense_input_dim: 13
  fc_sizes: [512, 256, 128, 32]
  distributed_embedding: 0
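The trainer added below loads this file through YamlHelper from ps_dnn_trainer and looks values up by dotted keys such as config.get("runner.use_gpu"). A minimal sketch of that access pattern, assuming PyYAML and a hypothetical flatten() helper rather than the actual YamlHelper implementation:

import yaml  # PyYAML, assumed available


def flatten(d, prefix=""):
    """Flatten nested mappings into dotted keys, e.g. {"runner.use_gpu": True}."""
    flat = {}
    for k, v in d.items():
        key = prefix + str(k)
        if isinstance(v, dict):
            flat.update(flatten(v, key + "."))
        else:
            flat[key] = v
    return flat


with open("config_gpubox.yaml") as f:
    config = flatten(yaml.safe_load(f))

print(config.get("runner.sync_mode"))           # "gpubox"
print(config.get("hyper_parameters.fc_sizes"))  # [512, 256, 128, 32]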
python/paddle/fluid/tests/unittests/ps/dataset_generator_criteo.py (new file, mode 100644)

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

import paddle.distributed.fleet as fleet

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO
)
logger = logging.getLogger(__name__)


class Reader(fleet.MultiSlotDataGenerator):
    def init(self):
        padding = 0
        sparse_slots = "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
        self.sparse_slots = sparse_slots.strip().split(" ")
        self.dense_slots = ["dense_feature"]
        self.dense_slots_shape = [13]
        self.slots = self.sparse_slots + self.dense_slots
        self.slot2index = {}
        self.visit = {}
        for i in range(len(self.slots)):
            self.slot2index[self.slots[i]] = i
            self.visit[self.slots[i]] = False
        self.padding = padding
        logger.info("pipe init success")

    def line_process(self, line):
        line = line.strip().split(" ")
        output = [(i, []) for i in self.slots]
        for i in line:
            slot_feasign = i.split(":")
            slot = slot_feasign[0]
            if slot not in self.slots:
                continue
            if slot in self.sparse_slots:
                feasign = int(slot_feasign[1])
            else:
                feasign = float(slot_feasign[1])
            output[self.slot2index[slot]][1].append(feasign)
            self.visit[slot] = True
        for i in self.visit:
            slot = i
            if not self.visit[slot]:
                if i in self.dense_slots:
                    output[self.slot2index[i]][1].extend(
                        [self.padding]
                        * self.dense_slots_shape[self.slot2index[i]]
                    )
                else:
                    output[self.slot2index[i]][1].extend([self.padding])
            else:
                self.visit[slot] = False
        return output
        # return [label] + sparse_feature + [dense_feature]

    def generate_sample(self, line):
        r"Dataset Generator"

        def reader():
            output_dict = self.line_process(line)
            # {key, value} dict format: {'labels': [1], 'sparse_slot1': [2, 3], 'sparse_slot2': [4, 5, 6, 8], 'dense_slot': [1,2,3,4]}
            # dict must match static_model.create_feed()
            yield output_dict

        return reader


if __name__ == "__main__":
    r = Reader()
    r.init()
    r.run_from_stdin()
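The generator consumes one training example per stdin line in slot:feasign form: a click label, 26 sparse slots named "1" through "26", and 13 dense values all carried by the "dense_feature" slot; slots missing from a line are padded with 0 by line_process(). An illustrative check on a shortened sample line (hypothetical values; assumes it is run from the ps/ directory so the module imports):

from dataset_generator_criteo import Reader

r = Reader()
r.init()
# A real Criteo line carries 13 dense_feature values and all 26 sparse slots;
# this shortened sample only exercises a few of them.
sample = "click:0 dense_feature:0.05 dense_feature:0.006 1:573 2:120"
parsed = dict(r.line_process(sample))  # line_process returns [(slot, values), ...]
print(parsed["click"])          # [0]
print(parsed["1"])              # [573]
print(parsed["dense_feature"])  # [0.05, 0.006]
print(parsed["3"])              # [0]  (unvisited sparse slot, padded)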
python/paddle/fluid/tests/unittests/ps/download_criteo_data.sh (new file, mode 100755)

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

wget --no-check-certificate https://paddlerec.bj.bcebos.com/benchmark/sample_train.txt
mkdir train_data
mv sample_train.txt train_data/
python/paddle/fluid/tests/unittests/ps/gpubox_run.sh (new file, mode 100644)

# !/bin/bash
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if [ ! -d "./log" ]; then
    mkdir ./log
    echo "Create log floder for store running log"
fi

export FLAGS_LAUNCH_BARRIER=0
export PADDLE_TRAINER_ID=0
export PADDLE_PSERVER_NUMS=1
export PADDLE_TRAINERS=1
export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS}
export POD_IP=127.0.0.1

# set free port if 29011 is occupied
export PADDLE_PSERVERS_IP_PORT_LIST="127.0.0.1:29011"
export PADDLE_PSERVER_PORT_ARRAY=(29011)

# set gpu numbers according to your device
export FLAGS_selected_gpus="0,1,2,3,4,5,6,7"
#export FLAGS_selected_gpus="0,1"

# set your model yaml
#SC="gpubox_ps_trainer.py"
SC="static_gpubox_trainer.py"

# run pserver
export TRAINING_ROLE=PSERVER
for ((i = 0; i < $PADDLE_PSERVER_NUMS; i++)); do
    cur_port=${PADDLE_PSERVER_PORT_ARRAY[$i]}
    echo "PADDLE WILL START PSERVER "$cur_port
    export PADDLE_PORT=${cur_port}
    python3.7 -u $SC &> ./log/pserver.$i.log &
done

# run trainer
export TRAINING_ROLE=TRAINER
for ((i = 0; i < $PADDLE_TRAINERS; i++)); do
    echo "PADDLE WILL START Trainer "$i
    export PADDLE_TRAINER_ID=$i
    python3.7 -u $SC &> ./log/worker.$i.log
done

echo "Training log stored in ./log/"
python/paddle/fluid/tests/unittests/ps/static_gpubox_trainer.py (new file, mode 100755)

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import logging
import os
import sys
import time

import paddle
import paddle.distributed.fleet as fleet
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil

fleet_util = FleetUtil()

__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO
)
logger = logging.getLogger(__name__)


def get_dataset(inputs, config):
    dataset = paddle.distributed.InMemoryDataset()
    dataset._set_use_ps_gpu(config.get('runner.use_gpu'))
    pipe_cmd = config.get('runner.pipe_command')
    dataset.init(
        use_var=inputs,
        pipe_command=pipe_cmd,
        batch_size=32,
        thread_num=int(config.get('runner.thread_num')),
        fs_name=config.get("runner.fs_name", ""),
        fs_ugi=config.get("runner.fs_ugi", ""),
    )
    dataset.set_filelist(["train_data/sample_train.txt"])
    dataset.update_settings(
        parse_ins_id=config.get("runner.parse_ins_id", False),
        parse_content=config.get("runner.parse_content", False),
    )
    return dataset


class Main(object):
    def __init__(self):
        self.metrics = {}
        self.input_data = None
        self.reader = None
        self.exe = None
        self.model = None
        self.PSGPU = None
        self.train_result_dict = {}
        self.train_result_dict["speed"] = []
        self.train_result_dict["auc"] = []

    def run(self):
        from ps_dnn_trainer import YamlHelper

        yaml_helper = YamlHelper()
        config_yaml_path = 'config_gpubox.yaml'
        self.config = yaml_helper.load_yaml(config_yaml_path)

        os.environ["CPU_NUM"] = str(self.config.get("runner.thread_num"))
        fleet.init()
        self.network()
        if fleet.is_server():
            self.run_server()
        elif fleet.is_worker():
            self.run_worker()
            fleet.stop_worker()
            logger.info("Run Success, Exit.")
        logger.info("-" * 100)

    def network(self):
        from ps_dnn_trainer import StaticModel, get_user_defined_strategy

        # self.model = get_model(self.config)
        self.model = StaticModel(self.config)
        self.input_data = self.model.create_feeds()
        self.init_reader()
        self.metrics = self.model.net(self.input_data)
        self.inference_target_var = self.model.inference_target_var
        logger.info("cpu_num: {}".format(os.getenv("CPU_NUM")))
        # self.model.create_optimizer(get_strategy(self.config)
        user_defined_strategy = get_user_defined_strategy(self.config)
        optimizer = paddle.optimizer.Adam(0.01, lazy_mode=True)
        optimizer = fleet.distributed_optimizer(optimizer, user_defined_strategy)
        optimizer.minimize(self.model._cost)
        logger.info("end network.....")

    def run_server(self):
        logger.info("Run Server Begin")
        fleet.init_server(self.config.get("runner.warmup_model_path"))
        fleet.run_server()

    def run_worker(self):
        logger.info("Run Worker Begin")
        use_cuda = int(self.config.get("runner.use_gpu"))
        use_auc = self.config.get("runner.use_auc", False)
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        '''
        with open("./{}_worker_main_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_main_program()))
        with open("./{}_worker_startup_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_startup_program()))
        '''

        self.exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        '''
        save_model_path = self.config.get("runner.model_save_path")
        if save_model_path and (not os.path.exists(save_model_path)):
            os.makedirs(save_model_path)
        '''

        reader_type = self.config.get("runner.reader_type", None)
        epochs = int(self.config.get("runner.epochs"))
        sync_mode = self.config.get("runner.sync_mode")
        gpus_env = os.getenv("FLAGS_selected_gpus")

        self.PSGPU = paddle.framework.core.PSGPU()
        gpuslot = [int(i) for i in range(1, self.model.sparse_inputs_slots)]
        gpu_mf_sizes = [self.model.sparse_feature_dim - 1] * (
            self.model.sparse_inputs_slots - 1
        )
        self.PSGPU.set_slot_vector(gpuslot)
        self.PSGPU.set_slot_dim_vector(gpu_mf_sizes)
        self.PSGPU.init_gpu_ps([int(s) for s in gpus_env.split(",")])
        gpu_num = len(gpus_env.split(","))
        opt_info = paddle.static.default_main_program()._fleet_opt
        if use_auc is True:
            opt_info['stat_var_names'] = [
                self.model.stat_pos.name,
                self.model.stat_neg.name,
            ]
        else:
            opt_info['stat_var_names'] = []

        for epoch in range(epochs):
            epoch_start_time = time.time()
            self.dataset_train_loop(epoch)
            epoch_time = time.time() - epoch_start_time

            self.PSGPU.end_pass()
            fleet.barrier_worker()
            self.reader.release_memory()
            logger.info("finish {} epoch training....".format(epoch))
        self.PSGPU.finalize()

    def init_reader(self):
        if fleet.is_server():
            return
        # self.reader, self.file_list = get_reader(self.input_data, config)
        self.reader = get_dataset(self.input_data, self.config)

    def dataset_train_loop(self, epoch):
        start_time = time.time()
        self.reader.load_into_memory()
        print(
            "self.reader.load_into_memory cost :{} seconds".format(
                time.time() - start_time
            )
        )

        begin_pass_time = time.time()
        self.PSGPU.begin_pass()
        print("begin_pass cost:{} seconds".format(time.time() - begin_pass_time))

        logger.info("Epoch: {}, Running Dataset Begin.".format(epoch))
        fetch_info = [
            "Epoch {} Var {}".format(epoch, var_name) for var_name in self.metrics
        ]
        fetch_vars = [var for _, var in self.metrics.items()]
        print_step = int(self.config.get("runner.print_interval"))
        self.exe.train_from_dataset(
            program=paddle.static.default_main_program(),
            dataset=self.reader,
            debug=self.config.get("runner.dataset_debug"),
        )


if __name__ == "__main__":
    paddle.enable_static()
    benchmark_main = Main()
    benchmark_main.run()
python/paddle/fluid/tests/unittests/ps/test_gpubox_ps.py (new file, mode 100755)

#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import shlex  # noqa: F401
import unittest


class GpuBoxTest(unittest.TestCase):
    def test_gpubox(self):
        exitcode = os.system('sh gpubox_run.sh')
        os.system('rm *_train_desc.prototxt')
        if os.path.exists('./train_data'):
            os.system('rm -rf train_data')
        if os.path.exists('./log'):
            os.system('rm -rf log')


if __name__ == '__main__':
    if not os.path.exists('./train_data'):
        os.system('sh download_criteo_data.sh')
    unittest.main()