Commit f1a8f7ac authored by helinwang, committed by GitHub

Merge pull request #1227 from helinwang/k8s_aws

paddle on aws with kubernetes tutorial now works
Binary image doc/howto/usage/k8s/src/create_efs.png changed: 244.5 KB → 236.1 KB.
The Kubernetes Job manifest for the distributed training job:

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: paddle-cluster-job
spec:
  parallelism: 3
  completions: 3
  template:
    metadata:
      name: paddle-cluster-job
    spec:
      volumes:
      - name: jobpath
        hostPath:
          path: /home/work/paddle_output
      containers:
      - name: trainer
        image: registry.baidu.com/public/paddle:mypaddle
        command: ["/bin/bash", "-c", "/root/start.sh"]
        env:
        - name: JOB_NAME
          value: paddle-cluster-job
        - name: JOB_PATH
          value: /home/jobpath
        - name: JOB_NAMESPACE
          value: default
        - name: TRAIN_CONFIG_DIR
          value: recommendation
        - name: CONF_PADDLE_NIC
          value: eth0
        - name: CONF_PADDLE_PORT
          value: "7164"
        - name: CONF_PADDLE_PORTS_NUM
          value: "2"
        - name: CONF_PADDLE_PORTS_NUM_SPARSE
          value: "2"
        - name: CONF_PADDLE_GRADIENT_NUM
          value: "3"
        volumeMounts:
        - name: jobpath
          mountPath: /home/jobpath
      restartPolicy: Never
```
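With the manifest saved locally (the file name `job.yaml` below is illustrative, not from the tutorial), the job can be submitted and watched with standard kubectl commands:

```bash
# Submit the training Job to the cluster.
kubectl create -f job.yaml

# Watch the trainer pods; training starts once all of them are Running.
kubectl get pods -l job-name=paddle-cluster-job --watch
```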
The Dockerfile for the data preparation image:

```dockerfile
FROM alpine

RUN apk update && apk upgrade && apk add coreutils
ADD quick_start /quick_start
ADD get_data.sh /bin/
RUN chmod +x /bin/get_data.sh
ENTRYPOINT ["/bin/get_data.sh"]
```
To build the PaddlePaddle data preparation image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following commands:
```bash
cp -r ../../../../../../demo/quick_start .
docker build . -t prepare-data-image-name
```
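A minimal sketch of running the resulting image to populate the shared volume; the image name matches the build command above, while the `/efs` mount point and the shard count are illustrative assumptions:

```bash
# OUT_DIR and SPLIT_COUNT are consumed by get_data.sh (shown below);
# /efs is assumed to be where the EFS volume is mounted on the host.
docker run --rm \
    -e OUT_DIR=/efs/paddle-cluster-job \
    -e SPLIT_COUNT=3 \
    -v /efs:/efs \
    prepare-data-image-name
```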
get_data.sh, the entrypoint of the data preparation image:

```sh
#!/bin/sh

out_dir=$OUT_DIR
split_count=$SPLIT_COUNT

set -e

# Copy the training config and download the preprocessed quick start
# data into the directory for trainer 0.
mkdir -p $out_dir
cp -r /quick_start $out_dir/

mkdir -p $out_dir/0/data
cd $out_dir/0/data
wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
tar zxvf preprocessed_data.tar.gz
rm preprocessed_data.tar.gz

# Split train.txt into $split_count shards by line count; trainer 0
# keeps the first shard as its train.txt.
split -d --number=l/$split_count -a 5 train.txt train.
mv train.00000 train.txt

# Give every other trainer a copy of the data, with its own shard
# renamed to train.txt.
cd $out_dir
end=$(expr $split_count - 1)
for i in $(seq 1 $end); do
    mkdir -p $i/data
    cp -r 0/data/* $i/data
    mv $i/data/train.`printf %05d $i` $i/data/train.txt
done
```
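For `SPLIT_COUNT=3` the script leaves a layout like the following under `$OUT_DIR` (abridged; each trainer directory also receives a copy of the rest of the extracted data):

```
$OUT_DIR/
├── quick_start/          # training configs, copied once
├── 0/data/train.txt      # shard 0, renamed from train.00000
├── 1/data/train.txt      # shard 1, renamed from train.00001
└── 2/data/train.txt      # shard 2, renamed from train.00002
```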
The Dockerfile for the training image:

```dockerfile
FROM paddledev/paddle:cpu-latest

COPY start.sh /root/
COPY start_paddle.py /root/
RUN chmod +x /root/start.sh
CMD ["bash", "-c", "/root/start.sh"]
```
To build the PaddlePaddle training image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following command:
```bash
docker build . -t train-image-name
```
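The Job manifest above pulls `registry.baidu.com/public/paddle:mypaddle`, so the image built here must be pushed under that name to a registry the cluster can reach; a sketch, assuming push access to that registry path:

```bash
docker tag train-image-name registry.baidu.com/public/paddle:mypaddle
docker push registry.baidu.com/public/paddle:mypaddle
```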
start.sh, the container entrypoint, after this change:

```sh
#!/bin/sh

set -eu

jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR}
cd /root
cp -rf $jobconfig/* .
cd $TRAIN_CONFIG_DIR

python /root/start_paddle.py \
    --dot_period=10 \
    --ports_num=$CONF_PADDLE_PORTS_NUM \
    --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM_SPARSE \
    --log_period=50 \
    --num_passes=10 \
    --trainer_count=$TRAINER_COUNT \
    --saving_period=1 \
    --local=0 \
    --config=trainer_config.lr.py \
    --use_gpu=0
```
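Note that the updated script reads `TRAINER_COUNT`, which does not appear in the Job manifest shown earlier, so it must be supplied through the container environment (for example, as another `env` entry in the manifest). The variables the script consumes directly, with illustrative values:

```bash
# Variables read by start.sh; TRAINER_COUNT is the one the Job
# manifest above does not yet set.
export JOB_PATH=/home/jobpath
export JOB_NAME=paddle-cluster-job
export TRAIN_CONFIG_DIR=recommendation
export CONF_PADDLE_PORTS_NUM=2
export CONF_PADDLE_PORTS_NUM_SPARSE=2
export TRAINER_COUNT=4
```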
The corresponding hunks of start_paddle.py after this change (the file is Python 2; elided context is marked with `# ...`):

```python
# ...
API = "/api/v1/namespaces/"
JOBSELECTOR = "labelSelector=job-name="
JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
JOB_PATH_OUTPUT = JOB_PATH + "/output"
JOBNAME = os.getenv("JOB_NAME")
NAMESPACE = os.getenv("JOB_NAMESPACE")
# ...
PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")

tokenpath = '/var/run/secrets/kubernetes.io/serviceaccount/token'


def refine_unknown_args(cmd_args):
    '''
# ...

def isPodAllRunning(podlist):
# ...
    for pod in podlist["items"]:
        if pod["status"]["phase"] == "Running":
            running += 1
    print "waiting for pods running, require:", require, "running:", running
    if require == running:
        return True
    return False


def getPodList():
# ...
    pod = API + NAMESPACE + "/pods?"
    job = JOBNAME
    if os.path.isfile(tokenpath):
        tokenfile = open(tokenpath, mode='r')
        token = tokenfile.read()
        Bearer = "Bearer " + token
        headers = {"Authorization": Bearer}
        return requests.get(apiserver + pod + JOBSELECTOR + job,
                            headers=headers,
                            verify=False).json()
    else:
        return requests.get(apiserver + pod + JOBSELECTOR + job,
                            verify=False).json()


def startPaddle(idMap={}, train_args_dict=None):
# ...
    if not os.path.exists(JOB_PATH_OUTPUT):
        os.makedirs(JOB_PATH_OUTPUT)
    os.mkdir(logDir)
    copyCommand = 'cp -rf ' + JOB_PATH + \
        "/" + str(trainerId) + "/data/*" + " ./data/"
    os.system(copyCommand)
    startPserver = 'nohup paddle pserver' + \
        " --port=" + str(PADDLE_PORT) + \
# ...
    print startPserver
    os.system(startPserver)
    # wait until pservers completely start
    time.sleep(20)
    startTrainer = program + args + " 2>&1 | tee " + \
        logDir + "/train.log"
    print startTrainer
    os.system(startTrainer)
# ...

if __name__ == '__main__':
# ...
    podlist = getPodList()
    # need to wait until all pods are running
    while not isPodAllRunning(podlist):
        time.sleep(20)
        podlist = getPodList()
    idMap = getIdMap(podlist)
    startPaddle(idMap, train_args_dict)
```
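The token-handling branch added to `getPodList` is the standard in-cluster service-account pattern; the same query can be reproduced from inside a pod with curl (the API server address and job name here are illustrative):

```bash
TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
# -k mirrors verify=False in the Python code.
curl -k -H "Authorization: Bearer $TOKEN" \
    "https://kubernetes.default.svc/api/v1/namespaces/default/pods?labelSelector=job-name=paddle-cluster-job"
```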