diff --git a/benchmark/cluster/v2/Dockerfile b/benchmark/cluster/v2/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..c52acd51a203194bc10afe263c6856afbd67cca1 --- /dev/null +++ b/benchmark/cluster/v2/Dockerfile @@ -0,0 +1,4 @@ +FROM registry.baidu.com/paddlepaddle/rawjob +RUN mkdir -p /workspace && mkdir -p /root/.cache/paddle/dataset/flowers/ +ADD vgg16.py reader.py /workspace/ +ADD 102flowers.tgz imagelabels.mat setid.mat /root/.cache/paddle/dataset/flowers/ diff --git a/benchmark/cluster/v2/pserver.yaml b/benchmark/cluster/v2/pserver.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed1671bbbd458a3bdc82bbd44dce108182500c6a --- /dev/null +++ b/benchmark/cluster/v2/pserver.yaml @@ -0,0 +1,64 @@ +apiVersion: extensions/v1beta1 +kind: ReplicaSet +metadata: + name: vgg16job-pserver +spec: + replicas: 10 + template: + metadata: + labels: + paddle-job-pserver: vgg16job + spec: + hostNetwork: true + imagePullSecrets: + - name: job-registry-secret + containers: + - name: pserver + image: "registry.baidu.com/paddlepaddle/rawjob:vgg16" + imagePullPolicy: Always + ports: + - name: jobport-30236 + containerPort: 30236 + env: + - name: PADDLE_JOB_NAME + value: vgg16job + - name: TRAINERS + value: "20" + - name: PSERVERS + value: "10" + - name: TOPOLOGY + value: "" + - name: ENTRY + value: "python train.py" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: PADDLE_INIT_PORT + value: "30236" + - name: PADDLE_INIT_NICS + value: "xgbe0" + - name: PADDLE_INIT_TRAINER_COUNT + value: "1" + - name: PADDLE_INIT_PORTS_NUM + value: "1" + - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE + value: "1" + - name: PADDLE_INIT_NUM_GRADIENT_SERVERS + value: "20" + - name: PADDLE_INIT_NUM_PASSES + value: "1" + - name: PADDLE_INIT_USE_GPU + value: "0" + - name: LD_LIBRARY_PATH + value: "/usr/local/nvidia/lib64" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + command: ["paddle_k8s", "start_pserver"] + resources: + requests: + memory: 10Gi + cpu: 4 + limits: + memory: 10Gi + cpu: 4 diff --git a/benchmark/cluster/v2/reader.py b/benchmark/cluster/v2/reader.py new file mode 100644 index 0000000000000000000000000000000000000000..a5a2d54841c84e07cf0150217d8ad672ffd7ffb8 --- /dev/null +++ b/benchmark/cluster/v2/reader.py @@ -0,0 +1,56 @@ +import random +from paddle.v2.image import load_and_transform +import paddle.v2 as paddle +from multiprocessing import cpu_count + + +def train_mapper(sample): + ''' + map image path to type needed by model input layer for the training set + ''' + img, label = sample + img = paddle.image.load_image(img) + img = paddle.image.simple_transform(img, 256, 224, True) + return img.flatten().astype('float32'), label + + +def test_mapper(sample): + ''' + map image path to type needed by model input layer for the test set + ''' + img, label = sample + img = paddle.image.load_image(img) + img = paddle.image.simple_transform(img, 256, 224, True) + return img.flatten().astype('float32'), label + + +def train_reader(train_list, buffered_size=1024): + def reader(): + with open(train_list, 'r') as f: + lines = [line.strip() for line in f] + for line in lines: + img_path, lab = line.strip().split('\t') + yield img_path, int(lab) + + return paddle.reader.xmap_readers(train_mapper, reader, + cpu_count(), buffered_size) + + +def test_reader(test_list, buffered_size=1024): + def reader(): + with open(test_list, 'r') as f: + lines = [line.strip() for line in f] + for line in lines: + img_path, lab = line.strip().split('\t') + yield img_path, int(lab) + + return paddle.reader.xmap_readers(test_mapper, reader, + cpu_count(), buffered_size) + + +if __name__ == '__main__': + #for im in train_reader('train.list'): + # print len(im[0]) + #for im in train_reader('test.list'): + # print len(im[0]) + paddle.dataset.flowers.train() diff --git a/benchmark/cluster/v2/trainer.yaml b/benchmark/cluster/v2/trainer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..33c95df365ac3b88b2a32c1c7f28f49d0957bacd --- /dev/null +++ b/benchmark/cluster/v2/trainer.yaml @@ -0,0 +1,63 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: vgg16job-trainer +spec: + parallelism: 20 + completions: 20 + template: + metadata: + labels: + paddle-job: vgg16job + spec: + imagePullSecrets: + - name: job-registry-secret + hostNetwork: true + containers: + - name: trainer + image: "registry.baidu.com/paddlepaddle/rawjob:vgg16" + imagePullPolicy: Always + command: ["paddle_k8s", "start_trainer", "v2"] + env: + - name: PADDLE_JOB_NAME + value: vgg16job + - name: TRAINERS + value: "20" + - name: PSERVERS + value: "10" + - name: TOPOLOGY + value: "" + - name: ENTRY + value: "cd /workspace && python /workspace/vgg16.py" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: PADDLE_INIT_PORT + value: "30236" + - name: PADDLE_INIT_NICS + value: "xgbe0" + - name: PADDLE_INIT_TRAINER_COUNT + value: "1" + - name: PADDLE_INIT_PORTS_NUM + value: "1" + - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE + value: "1" + - name: PADDLE_INIT_NUM_GRADIENT_SERVERS + value: "20" + - name: PADDLE_INIT_NUM_PASSES + value: "1" + - name: PADDLE_INIT_USE_GPU + value: "0" + - name: LD_LIBRARY_PATH + value: "/usr/local/nvidia/lib64" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + resources: + requests: + memory: 40Gi + cpu: 2 + limits: + memory: 40Gi + cpu: 2 + restartPolicy: Never diff --git a/benchmark/cluster/v2/vgg16.py b/benchmark/cluster/v2/vgg16.py new file mode 100644 index 0000000000000000000000000000000000000000..699fc07628f6a430089c96893f995b52e8c99510 --- /dev/null +++ b/benchmark/cluster/v2/vgg16.py @@ -0,0 +1,125 @@ +import gzip + +import paddle.v2.dataset.flowers as flowers +import paddle.v2 as paddle +import reader + +DATA_DIM = 3 * 224 * 224 # Use 3 * 331 * 331 or 3 * 299 * 299 for Inception-ResNet-v2. +CLASS_DIM = 102 +BATCH_SIZE = 128 + + +def vgg(input, nums, class_dim): + def conv_block(input, num_filter, groups, num_channels=None): + return paddle.networks.img_conv_group( + input=input, + num_channels=num_channels, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act=paddle.activation.Relu(), + pool_type=paddle.pooling.Max()) + + assert len(nums) == 5 + # the channel of input feature is 3 + conv1 = conv_block(input, 64, nums[0], 3) + conv2 = conv_block(conv1, 128, nums[1]) + conv3 = conv_block(conv2, 256, nums[2]) + conv4 = conv_block(conv3, 512, nums[3]) + conv5 = conv_block(conv4, 512, nums[4]) + + fc_dim = 4096 + fc1 = paddle.layer.fc(input=conv5, + size=fc_dim, + act=paddle.activation.Relu(), + layer_attr=paddle.attr.Extra(drop_rate=0.5)) + fc2 = paddle.layer.fc(input=fc1, + size=fc_dim, + act=paddle.activation.Relu(), + layer_attr=paddle.attr.Extra(drop_rate=0.5)) + out = paddle.layer.fc(input=fc2, + size=class_dim, + act=paddle.activation.Softmax()) + return out + + +def vgg13(input, class_dim): + nums = [2, 2, 2, 2, 2] + return vgg(input, nums, class_dim) + + +def vgg16(input, class_dim): + nums = [2, 2, 3, 3, 3] + return vgg(input, nums, class_dim) + + +def vgg19(input, class_dim): + nums = [2, 2, 4, 4, 4] + return vgg(input, nums, class_dim) + + +def main(): + paddle.init(use_gpu=True, trainer_count=1) + image = paddle.layer.data( + name="image", type=paddle.data_type.dense_vector(DATA_DIM)) + lbl = paddle.layer.data( + name="label", type=paddle.data_type.integer_value(CLASS_DIM)) + + extra_layers = None + learning_rate = 0.01 + out = vgg16(image, class_dim=CLASS_DIM) + cost = paddle.layer.classification_cost(input=out, label=lbl) + + # Create parameters + parameters = paddle.parameters.create(cost) + + # Create optimizer + optimizer = paddle.optimizer.Momentum( + momentum=0.9, + regularization=paddle.optimizer.L2Regularization(rate=0.0005 * + BATCH_SIZE), + learning_rate=learning_rate / BATCH_SIZE, + learning_rate_decay_a=0.1, + learning_rate_decay_b=128000 * 35, + learning_rate_schedule="discexp", ) + + train_reader = paddle.batch( + paddle.reader.shuffle( + flowers.train(), + # To use other data, replace the above line with: + # reader.train_reader('train.list'), + buf_size=1000), + batch_size=BATCH_SIZE) + test_reader = paddle.batch( + flowers.valid(), + # To use other data, replace the above line with: + # reader.test_reader('val.list'), + batch_size=BATCH_SIZE) + + # Create trainer + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer, + extra_layers=extra_layers, + is_local=False) + + # End batch and end pass event handler + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 1 == 0: + print "\nPass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + if isinstance(event, paddle.event.EndPass): + with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + + result = trainer.test(reader=test_reader) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) + + trainer.train( + reader=train_reader, num_passes=200, event_handler=event_handler) + + +if __name__ == '__main__': + main()