/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// This schema is written in proto2 syntax (required/optional labels and
// explicit defaults are used below).
syntax = "proto2";

package paddle;

// Options controlling how files are loaded by the file-loading threads.
message FileGroupConf {
  // Capacity of the queue; defaults to 1.
  // NOTE(review): presumably the queue that the loading threads fill --
  // confirm against the data provider implementation.
  optional uint32 queue_capacity = 1 [default = 1];

  // Number of files that a single file-loading thread loads.
  optional int32 load_file_count = 2 [default = 1];

  // Number of threads used to load files.
  // A value of 5~10 is appropriate when files are loaded through hadoop vfs.
  optional int32 load_thread_num = 3 [default = 1];
}

// Configuration of one data source. A DataConfig may nest further
// DataConfig messages through sub_data_configs (used by MultiDataProvider).
//
// Field numbers 10, 11, 13, 17, 18 and 19 have been deprecated; do not reuse
// them for new fields.
message DataConfig {
  // NOTE(review): presumably selects which data provider implementation
  // consumes this config -- confirm against the provider registry.
  required string type = 1;

  // Name of a text file that contains a list of file names, one per line.
  optional string files = 3;

  // Feature dimension of one frame.
  optional int32 feat_dim = 4;

  // Feature slot dims.
  repeated int32 slot_dims = 5;

  // Maximum number of neighbouring frames.
  optional int32 context_len = 6;

  // Buffer capacity, expressed as a number of samples.
  optional uint64 buffer_capacity = 7;

  // Part of the data used in training.
  // If not -1, only part of the train data is used in training.
  optional int64 train_sample_num = 8 [default = -1];

  // The number of documents processed at once.
  optional int32 file_load_num = 9 [default = -1];

  // NOTE(review): presumably enables asynchronous data loading -- confirm.
  optional bool async_load_data = 12 [default = false];

  // Note: the field numbers 10, 11 and 13 have been deprecated.

  // Whether this data is for test.
  optional bool for_test = 14 [default = false];

  // File-loading options; see FileGroupConf.
  optional FileGroupConf file_group_conf = 15;

  repeated int32 float_slot_dims = 16;

  // Note: the field numbers 17, 18 and 19 have been deprecated.

  // A list of values which will be used to create additional one dimensional
  // float value slots. These one dimensional slots can be used as the weight
  // input for cost layers.
  // Currently this is only supported by ProtoDataProvider.
  repeated double constant_slots = 20;

  // For PyDataProvider.
  // Specify the load data script module name, object name and user args.
  optional string load_data_module = 21;
  optional string load_data_object = 22;
  optional string load_data_args = 23;

  // For MultiDataProvider: the sub dataproviders.
  repeated DataConfig sub_data_configs = 24;

  // The ratio of each sub dataprovider:
  // e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100,
  // then each mini-batch is combined from 10 instances from A and 90
  // instances from B.
  optional int32 data_ratio = 25;

  // If one of the sub dataproviders is running out of data, then
  // (1) if it is "main data", finish the current pass;
  // (2) if it is not "main data", reset it and try getNextBatch again.
  optional bool is_main_data = 26 [default = true];

  // The usage ratio of instances. Setting to 1.0 means the use of all
  // instances.
  optional double usage_ratio = 27 [default = 1.0];
}