diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/gserver/layers/MultinomialSampler.h index 6e50f8738e316a0bbc3dac1b715af1ec1288145f..677b047029305549084770bdb5eadfeaafbfac8a 100644 --- a/paddle/gserver/layers/MultinomialSampler.h +++ b/paddle/gserver/layers/MultinomialSampler.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once +#include #include - #include "paddle/utils/TypeDefs.h" namespace paddle { @@ -32,6 +32,17 @@ class MultinomialSampler { public: MultinomialSampler(const real* prob, int size); + //! protobuf always using double. + static MultinomialSampler* create(const double* prob, int size) { +#ifdef PADDLE_TYPE_DOUBLE + return new MultinomialSampler(prob, size); +#else + std::unique_ptr tmp(new real[size]); + std::copy(prob, prob + size, tmp.get()); + return new MultinomialSampler(tmp.get(), size); +#endif + } + /** * @brief Generate a random sample. * @param g is a random number engine. See . diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp index 540db46545ef03010d6138954070283ec32e5577..5ab765247f63dfe6e6651ca4d27dc7183a9f33e1 100644 --- a/paddle/gserver/layers/NCELayer.cpp +++ b/paddle/gserver/layers/NCELayer.cpp @@ -99,8 +99,8 @@ public: if (config_.neg_sampling_dist_size()) { CHECK_EQ(numClasses_, config_.neg_sampling_dist_size()); - sampler_.reset(new MultinomialSampler(config_.neg_sampling_dist().data(), - numClasses_)); + sampler_.reset(MultinomialSampler::create( + config_.neg_sampling_dist().data(), numClasses_)); } return true; diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp index 84d965a66a16dfc597e728e11dab966cc194dfb1..887168075e307b4056ea4bc500665446d30228fe 100644 --- a/paddle/pserver/ParameterClient2.cpp +++ b/paddle/pserver/ParameterClient2.cpp @@ -25,24 +25,17 @@ P_DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send"); namespace paddle { -template -void copyToRepeatedField(google::protobuf::RepeatedField* dest, - const T* src, +template +void copyToRepeatedField(google::protobuf::RepeatedField* dest, + const T2* src, size_t size) { dest->Clear(); dest->Reserve(size); - for (size_t i = 0; i < size; ++i) { dest->AddAlreadyReserved(src[i]); } } -template -void copyToRepeatedField(const std::vector& src, - google::protobuf::RepeatedField* dest) { - copyToRepeatedField(dest, &src[0], src.size()); -} - ParameterClient2::ParameterClient2(bool separate, int port, int numPorts) : BaseClient(separate, numPorts), port_(port) { #ifndef PADDLE_DISABLE_TIMER @@ -618,6 +611,11 @@ void PreparedOperations::addOperationHelper(Operation* op, CpuMatrixPtr mat) { pmat.mutable_values(), mat->getData(), pmat.num_cols() * pmat.num_rows()); } +template +static inline auto add(T1 a, T2 b) -> decltype(a + b) { + return a + b; +} + void ParameterClient2::doOperation(PreparedOperations& ops, bool waitForGradient, bool sendBackGradient, @@ -682,8 +680,11 @@ void ParameterClient2::doOperation(PreparedOperations& ops, CpuVectorPtr rvec = resultVectors[i]; if (!rvec) continue; CHECK_EQ(rvec->getSize(), (size_t)vec.dim()); - CpuVector avec(rvec->getSize(), const_cast(vec.values().data())); - rvec->add(avec); + std::transform(rvec->getData(), + rvec->getData() + rvec->getSize(), + vec.values().data(), + rvec->getData(), + add); } CHECK_EQ(resultMatrices.size(), (size_t)result.matrices_size()); @@ -693,11 +694,12 @@ void ParameterClient2::doOperation(PreparedOperations& ops, if (!rmat) continue; CHECK_EQ(rmat->getHeight(), (size_t)mat.num_rows()); CHECK_EQ(rmat->getWidth(), (size_t)mat.num_cols()); - CpuMatrixPtr amat = - std::make_shared(const_cast(mat.values().data()), - rmat->getHeight(), - rmat->getWidth()); - rmat->add(*amat); + + std::transform(rmat->getData(), + rmat->getData() + rmat->getElementCnt(), + mat.values().data(), + rmat->getData(), + add); } } } diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt index d7f523bc8d9bce00ba72c41284d2b3eb3cde6529..2c40070eca44d8656d7ce82157a1b840092b9965 100644 --- a/proto/CMakeLists.txt +++ b/proto/CMakeLists.txt @@ -6,25 +6,6 @@ set(proto_filenames ParameterService.proto TrainerConfig.proto) -set(real_proto_files) - -# TODO(yuyang18): Some internal proto will also be depended on. -# Find a way to automatically calculate all depends. -foreach(filename ${proto_filenames}) - set(PROTOBUF_3_FLAGS "") - if (PROTOBUF_3) - set(PROTOBUF_3_FLAGS "-Dproto3") - endif() - add_custom_command(OUTPUT ${filename} - COMMAND ${M4_EXECUTABLE} -Dreal=${ACCURACY} ${PROTOBUF_3_FLAGS} -I '${INTERNAL_PROTO_PATH}' - ${PROJ_ROOT}/proto/${filename}.m4 > ${filename} - DEPENDS ${PROJ_ROOT}/proto/${filename}.m4 - COMMENT "Generate ${filename}") -endforeach() - -add_custom_target(proto_accuracy ALL - DEPENDS ${proto_filenames}) - set(PROTO_GEN) set(PROTO_GEN_PY) @@ -39,9 +20,8 @@ foreach(filename ${proto_filenames}) add_custom_command(OUTPUT ${CUR_PROTO_GEN} COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --cpp_out ${CMAKE_CURRENT_BINARY_DIR} - --proto_path ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/${filename} - DEPENDS proto_accuracy - ${PROJ_ROOT}/proto/${filename}.m4) + --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename} + DEPENDS ${filename}) set(CUR_PROTO_GEN_PY ${PROJ_ROOT}/paddle/python/paddle/proto/${base_filename}_pb2.py) @@ -50,9 +30,8 @@ foreach(filename ${proto_filenames}) ${PROTO_GEN_PY}) add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY} COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto - --proto_path ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/${filename} - DEPENDS proto_accuracy - ${PROJ_ROOT}/proto/${filename}.m4) + --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename} + DEPENDS ${filename}) endforeach() include_directories(${CMAKE_CURRENT_BINARY_DIR}/proto) @@ -61,5 +40,4 @@ add_custom_target(gen_proto_cpp ALL DEPENDS ${PROTO_GEN}) add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY}) add_library(paddle_proto STATIC ${PROTO_GEN}) -add_dependencies(paddle_proto proto_accuracy) target_include_directories(paddle_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/proto/DataConfig.proto.m4 b/proto/DataConfig.proto similarity index 93% rename from proto/DataConfig.proto.m4 rename to proto/DataConfig.proto index 1f8e3f4f3e523447b69bfd2dbce9c99dc22571d1..e895c184d9f95dba1449e6467a2566712837600b 100644 --- a/proto/DataConfig.proto.m4 +++ b/proto/DataConfig.proto @@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -ifdef(`proto3', `syntax = "proto2";') +syntax = "proto2"; package paddle; -sinclude(`DataConfigExt.proto.m4') + message FileGroupConf { optional uint32 queue_capacity = 1 [default = 1]; // how many files to load for a load file thread @@ -26,7 +26,7 @@ message FileGroupConf { }; message DataConfig { -sinclude(`DataConfigInter.proto.m4') + required string type = 1; // name of a text file which contains a list of file names at each line @@ -51,11 +51,11 @@ sinclude(`DataConfigInter.proto.m4') /// Note the field number 17, 18 and 19 have been deprecated. - // a list of values which will be used to create additional one dimensional real + // a list of values which will be used to create additional one dimensional float // values slots. These one dimensional slots can be used as the weight input // for cost layers. // Currently this is only supported by ProtoDataProvider. - repeated real constant_slots = 20; + repeated double constant_slots = 20; // for PyDataProvider. // Specify the load data script module name, object name and user args @@ -80,6 +80,6 @@ sinclude(`DataConfigInter.proto.m4') optional bool is_main_data = 26 [default = true]; // the usage ratio of instances. Setting to 1.0 means the use of all instances. - optional real usage_ratio = 27 [default = 1.0]; + optional double usage_ratio = 27 [default = 1.0]; }; diff --git a/proto/DataFormat.proto.m4 b/proto/DataFormat.proto similarity index 98% rename from proto/DataFormat.proto.m4 rename to proto/DataFormat.proto index 54e9fd008e485d24c21c58d543be6b311378905b..19b1499b0281a1b92028cc8944c27ee4d56b8dd2 100644 --- a/proto/DataFormat.proto.m4 +++ b/proto/DataFormat.proto @@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -ifdef(`proto3', `syntax = "proto2";') +syntax = "proto2"; package paddle; diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto similarity index 95% rename from proto/ModelConfig.proto.m4 rename to proto/ModelConfig.proto index ccad69a3c2209542d2be855ddf3f75def9e8d729..b34e1ebdedab104f7c16dbf9e1a264f3665115ce 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto @@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -ifdef(`proto3', `syntax = "proto2";') +syntax = "proto2"; import "ParameterConfig.proto"; @@ -20,7 +20,7 @@ package paddle; /** * Various structs for the configuration of a neural network */ -sinclude(`ModelConfigExt.proto.m4') + message ExternalConfig { repeated string layer_names = 1; @@ -146,8 +146,8 @@ message NormConfig { // the parameters for normalization // u = u / (1+scale*sum(u^2 in window))^pow - required real scale = 4; - required real pow = 5; + required double scale = 4; + required double pow = 5; // The size of output feature map. required uint32 output_x = 6; @@ -223,7 +223,7 @@ message OperatorConfig { required uint64 output_size = 4; // For DotMulOperator - optional real dotmul_scale = 5 [default = 1.0]; + optional double dotmul_scale = 5 [default = 1.0]; // For ConvOperator optional ConvConfig conv_conf = 6; @@ -266,7 +266,7 @@ message LayerInputConfig { } message LayerConfig { -sinclude(`ModelConfigLayer.proto.m4') + required string name = 1; required string type = 2; optional uint64 size = 3; @@ -293,7 +293,7 @@ sinclude(`ModelConfigLayer.proto.m4') optional uint32 partial_sum = 9; // for dropout - optional real drop_rate = 10; + optional double drop_rate = 10; // for HierarchicalSoftmaxLayer and NCELayer // the number of classes @@ -317,17 +317,17 @@ sinclude(`ModelConfigLayer.proto.m4') // For NCELayer // The distribution for generating the random negative labels. // A uniform distribution will be used if not provided - repeated real neg_sampling_dist = 17 [packed = true]; + repeated double neg_sampling_dist = 17 [packed = true]; // For MaxLayer // default: output VALUE of MaxLayer. set this flag to true for output INDEX - // INDEX will be put in Argument::value as real values. + // INDEX will be put in Argument::value as double values. optional bool output_max_index = 19 [default = false]; /// The filed number 20 have been deprecated. // For self-normalized estimation - optional real softmax_selfnorm_alpha = 21 [default = 0.1]; + optional double softmax_selfnorm_alpha = 21 [default = 0.1]; /// The filed numbers 22 and 23 have been deprecated. @@ -338,14 +338,14 @@ sinclude(`ModelConfigLayer.proto.m4') optional bool norm_by_times = 25; // for CostLayers - optional real coeff = 26 [default = 1.0]; + optional double coeff = 26 [default = 1.0]; // for AverageLayer // can be set to: 'average', 'sum' or 'squarerootn' optional string average_strategy = 27; // for error clipping - optional real error_clipping_threshold = 28 [default = 0.0]; + optional double error_clipping_threshold = 28 [default = 0.0]; // for operators used by mixed layer repeated OperatorConfig operator_confs = 29; @@ -355,11 +355,11 @@ sinclude(`ModelConfigLayer.proto.m4') optional int32 max_sort_size = 31; // for SlopeInterceptLayer - optional real slope = 32; - optional real intercept = 33; + optional double slope = 32; + optional double intercept = 33; // for CosSimVecMatLayer and CosSimLayer - optional real cos_scale = 34; + optional double cos_scale = 34; // for DataNormLayer // can be set to: 'z-score', 'min-max' or 'decimal-scaling' @@ -394,7 +394,7 @@ sinclude(`ModelConfigLayer.proto.m4') // if number of the selected columns is less than // sample number * selective_fc output size * selective_fc_mull_mull_ratio // sparse multiplication is used, otherwise, using full multiplication. - optional real selective_fc_full_mul_ratio = 44 [default = 0.02]; + optional double selective_fc_full_mul_ratio = 44 [default = 0.02]; // to indicate how many threads selective_fc use to to accelate // the plain_mul period @@ -406,7 +406,7 @@ sinclude(`ModelConfigLayer.proto.m4') optional bool use_global_stats = 46; // use to compute moving mean and variance. - optional real moving_average_fraction = 47 [default = 0.9]; + optional double moving_average_fraction = 47 [default = 0.9]; // bias size optional uint32 bias_size = 48 [default = 0]; @@ -438,7 +438,7 @@ message EvaluatorConfig { // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator // For multi binary labels: true if output > classification_threshold - optional real classification_threshold = 6 [default = 0.5]; + optional double classification_threshold = 6 [default = 0.5]; // The positive label. -1 means average precision and recall optional int32 positive_label = 7 [default = -1]; diff --git a/proto/ParameterConfig.proto.m4 b/proto/ParameterConfig.proto similarity index 87% rename from proto/ParameterConfig.proto.m4 rename to proto/ParameterConfig.proto index b5c0fea6c373307dc0af2e29c0f1ff5362823411..cbcd0af598df22c36c66767fdeb7add2aa49e87d 100644 --- a/proto/ParameterConfig.proto.m4 +++ b/proto/ParameterConfig.proto @@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -ifdef(`proto3', `syntax = "proto2";') +syntax = "proto2"; package paddle; @@ -32,14 +32,14 @@ message ParameterUpdaterHookConfig { message ParameterConfig { required string name = 1; required uint64 size = 2; - optional real learning_rate = 3 [default = 1.0]; - optional real momentum = 4 [default = 0.0]; - optional real initial_mean = 5 [default = 0.0]; - optional real initial_std = 6 [default = 0.01]; + optional double learning_rate = 3 [default = 1.0]; + optional double momentum = 4 [default = 0.0]; + optional double initial_mean = 5 [default = 0.0]; + optional double initial_std = 6 [default = 0.01]; // use L2-regularization if decay_rate set and decay_rate_l1 not set - optional real decay_rate = 7 [default = 0.0]; + optional double decay_rate = 7 [default = 0.0]; // use L1-regularization if decay_rate_l1 set - optional real decay_rate_l1 = 8 [default = 0.0]; + optional double decay_rate_l1 = 8 [default = 0.0]; // dims of Parameter, e.g. dims[0] as height, dims[1] as width.. repeated uint64 dims = 9; // the gpu device which the parameter in. @@ -60,7 +60,7 @@ message ParameterConfig { // sparse remote update or not optional bool sparse_remote_update = 16 [default = false]; // gradient clipping threshold, no clipping by default - optional real gradient_clipping_threshold = 17 [default = 0.0]; + optional double gradient_clipping_threshold = 17 [default = 0.0]; // static parameters are fixed when training optional bool is_static = 18 [default = false]; // para_id should NOT be set by config_parser. It is for diff --git a/proto/ParameterService.proto.m4 b/proto/ParameterService.proto similarity index 97% rename from proto/ParameterService.proto.m4 rename to proto/ParameterService.proto index 25b0991583ec128aeeca1ca775a574f81500d6e5..c1c04d8cc5bdedd09173d5dfa10b82c7ee7ed6a4 100644 --- a/proto/ParameterService.proto.m4 +++ b/proto/ParameterService.proto @@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -ifdef(`proto3', `syntax = "proto2";') +syntax = "proto2"; import "ParameterConfig.proto"; import "TrainerConfig.proto"; @@ -73,7 +73,7 @@ message SendParameterRequest { optional int64 num_samples = 4; // cost will be used to calculate global objective value - optional real cost = 5; + optional double cost = 5; required BatchStatus batch_status = 6; @@ -245,13 +245,13 @@ enum MatrixVectorOperation { message ProtoVector { required int64 dim = 1; - repeated real values = 2 [packed = true]; + repeated double values = 2 [packed = true]; } message ProtoMatrix { required int64 num_rows = 1; required int64 num_cols = 2; - repeated real values = 3 [packed = true]; + repeated double values = 3 [packed = true]; } message Operation { @@ -263,7 +263,7 @@ message Operation { // matrix handles created on the pserver repeated int64 pmatrices = 3; // A, B, C - repeated real scalars = 4; // a, b, c + repeated double scalars = 4; // a, b, c repeated ProtoVector vectors = 5; // x, y, z repeated ProtoMatrix matrices = 6; // X, Y, Z } @@ -272,7 +272,7 @@ message OperationResult { // error message. Empty if success optional string return_message = 1; // - repeated real scalars = 2; // d, e, f + repeated double scalars = 2; // d, e, f repeated ProtoVector vectors = 3; // p, q, r repeated ProtoMatrix matrices = 4; // P, Q, R } diff --git a/proto/TrainerConfig.proto.m4 b/proto/TrainerConfig.proto similarity index 87% rename from proto/TrainerConfig.proto.m4 rename to proto/TrainerConfig.proto index 4684203b03e3297f60629ff6929729c3daffd8c6..a334e07b6282a6ff9867482e0c3a299df2a78d1d 100644 --- a/proto/TrainerConfig.proto.m4 +++ b/proto/TrainerConfig.proto @@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -ifdef(`proto3', `syntax = "proto2";') +syntax = "proto2"; import "DataConfig.proto"; import "ModelConfig.proto"; @@ -24,9 +24,9 @@ message OptimizationConfig { optional int32 num_batches_per_send_parameter = 5 [default = 1]; optional int32 num_batches_per_get_parameter = 6 [default = 1]; - required real learning_rate = 7; - optional real learning_rate_decay_a = 8 [default = 0]; - optional real learning_rate_decay_b = 9 [default = 0]; + required double learning_rate = 7; + optional double learning_rate_decay_a = 8 [default = 0]; + optional double learning_rate_decay_b = 9 [default = 0]; optional string learning_rate_schedule = 27 [default = "constant"]; // learning rate will be scaled according to learning_rate_schedule // 1), constant: @@ -49,14 +49,14 @@ message OptimizationConfig { // owlqn related // L1-regularization - optional real l1weight = 10 [default = 0.1]; + optional double l1weight = 10 [default = 0.1]; // L2-regularization - optional real l2weight = 11 [default = 0]; + optional double l2weight = 11 [default = 0]; // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step) // then accept the step - optional real c1 = 12 [default = 0.0001]; + optional double c1 = 12 [default = 0.0001]; // multiply the step with "backoff", when wolfe condition doesn't satisfy - optional real backoff = 13 [default = 0.5]; + optional double backoff = 13 [default = 0.5]; // how many "s"s and "y"s are kept in owlqn optional int32 owlqn_steps = 14 [default = 10]; // accept the step if encountered "max_backoff" times of "reduce the step" @@ -82,15 +82,15 @@ message OptimizationConfig { // default learning method("momentum") use global decayed learning rate with momentum. // "adagrad", "adadelta" and "rmsprop" can set momentum too. optional string learning_method = 23 [default = "momentum"]; - optional real ada_epsilon = 24 [default = 1e-6]; - optional real ada_rou = 26 [default = 0.95]; + optional double ada_epsilon = 24 [default = 1e-6]; + optional double ada_rou = 26 [default = 0.95]; // Force to do average in cpu in order to save gpu memory usage optional bool do_average_in_cpu = 25 [default = false]; // delta add rate in pserver, used while num_batches_per_send_parameter>1 // will be divided by #machines automatically. - optional real delta_add_rate = 28 [default = 1.0]; + optional double delta_add_rate = 28 [default = 1.0]; // We split a large size into smaller mini-batches, whose sizes are // determined by mini_batch_size. It only takes effect when there is @@ -108,14 +108,14 @@ message OptimizationConfig { // shrink sparse parameter value // only works if parameter is remote sparse update and has L1 decay rate - optional real shrink_parameter_value = 32 [default = 0]; + optional double shrink_parameter_value = 32 [default = 0]; //////////////////////////// // Options Adam Optimizer // //////////////////////////// - optional real adam_beta1 = 33 [default = 0.9]; - optional real adam_beta2 = 34 [default = 0.999]; - optional real adam_epsilon = 35 [default = 1e-8]; + optional double adam_beta1 = 33 [default = 0.9]; + optional double adam_beta2 = 34 [default = 0.999]; + optional double adam_epsilon = 35 [default = 1e-8]; // arguments for learning rate scheduler // Format: num1:rate1,num2:rate2,...,numK:rateK @@ -127,7 +127,7 @@ message OptimizationConfig { // for async sgd gradient commit control. // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed, // current async gradient will be discard silently. - optional real async_lagged_grad_discard_ratio = 37 [default = 1.5]; + optional double async_lagged_grad_discard_ratio = 37 [default = 1.5]; }; message TrainerConfig {