diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 22a6b2ab841af290bbe91e1ca8dc8dc12a8ba5c5..9a5901616f8a61d686929a11e09d278df3ee51d6 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -409,6 +409,11 @@ multi_binary_label_cross_entropy_cost
 .. autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
     :noindex:
 
+huber_regression_cost
+-------------------------
+.. autoclass:: paddle.v2.layer.huber_regression_cost
+    :noindex:
+
 huber_classification_cost
 -------------------------
 .. autoclass:: paddle.v2.layer.huber_classification_cost
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 69cf3932255153b42c59fd9024f72a562012bd90..91a742422e2f2efc1a0f4b6ec55963a3e9bdc254 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -594,6 +594,61 @@ void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
   }
 }
 
+//
+// Huber loss for robust regression.
+//
+REGISTER_LAYER(huber_regression, HuberRegressionLoss);
+
+bool HuberRegressionLoss::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  HuberCost::init(layerMap, parameterMap);
+  delta_ = config_.delta();
+  return true;
+}
+
+void HuberRegressionLoss::forwardImp(Matrix& output,
+                                     Argument& label,
+                                     Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
+  size_t numSamples = target.getHeight();
+  CHECK(label.value);
+  CHECK_EQ((*label.value).getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(output.getWidth(), (*label.value).getWidth());
+  CHECK_EQ(target.getWidth(), (size_t)1);
+
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  std::vector<real> cost(numSamples);
+  for (size_t i = 0; i < numSamples; ++i) {
+    real a = std::abs(lbl[i] - out[i]);
+    if (a <= delta_)
+      cost[i] = a * a / 2;
+    else
+      cost[i] = delta_ * (a - delta_ / 2);
+  }
+  target.copyFrom(cost.data(), numSamples);
+}
+
+void HuberRegressionLoss::backwardImp(Matrix& output,
+                                      Argument& label,
+                                      Matrix& outputG) {
+  size_t numSamples = output.getHeight();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    real a = lbl[i] - out[i];
+    if (std::abs(a) <= delta_)
+      grad[i] += -a;
+    else
+      grad[i] += a > 0 ? -delta_ : delta_;
+  }
+  if (useGpu_) outputG.copyFrom(grad, numSamples);
+}
+
 //
 // Huber loss for robust 2-classes classification
 //
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index c006dc81104b625ed7ec1503212c274157eca781..0ce72ef40a5ac23d20f485eb5b518186d1ec0686 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -321,6 +321,30 @@ public:
   void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
 };
 
+/**
+ * Huber loss for robust regression.
+ *
+ * Given output f(x), label y and delta, the loss is:
+ * Loss = 0.5 * (y - f)^2, if abs(y - f) <= delta \\
+ * Loss = delta * abs(y - f) - 0.5 * delta^2, otherwise
+ */
+class HuberRegressionLoss : public HuberCost {
+public:
+  explicit HuberRegressionLoss(const LayerConfig& config) : HuberCost(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
+
+protected:
+  real delta_;
+};
+
 /**
  * Huber loss for robust 2-classes classification.
  *
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 6d60250f6d8ec88f1344bea68ce19f32602f14f2..c522b20f0e7f0a9951bfdab488b40aab390a3068 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -828,6 +828,24 @@ TEST(Layer, square_error_weighted) {
   }
 }
 
+TEST(Layer, huber_regression_loss) {
+  TestConfig config;
+  config.layerConfig.set_type("huber_regression");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    for (auto delta : {1, 3, 5}) {
+      config.layerConfig.set_delta(delta);
+      testLayerGrad(config, "huber_regression", 100, /* trans */ false, useGpu);
+    }
+  }
+}
+
 TEST(Layer, huber_two_class) {
   TestConfig config;
   config.layerConfig.set_type("huber_classification");
@@ -839,7 +857,7 @@ TEST(Layer, huber_two_class) {
   config.layerConfig.add_inputs();
 
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "huber", 100, /* trans */ false, useGpu);
+    testLayerGrad(config, "huber_two_class", 100, /* trans */ false, useGpu);
   }
 }
 
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 4f3d5bf3f6cb96c97285f40e3a3d100c2af47ad5..e19e0f85f3d74d1f4c6697cb2a7644141bd53905 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -496,6 +496,9 @@ message LayerConfig {
   optional int32 axis = 54 [ default = 2 ];
   repeated uint32 offset = 55;
   repeated uint32 shape = 56;
+
+  // for HuberRegressionLoss
+  optional double delta = 57 [ default = 1.0 ];
 }
 
 message EvaluatorConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 248da9417f2f619a1e3042d2ad0579ed1ce1a0af..a3ca3f251099bfbcf3dfef74c10f744d5081f333 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2317,6 +2317,17 @@ class LambdaCost(LayerBase):
         self.config.max_sort_size = max_sort_size
 
 
+@config_layer('huber_regression')
+class HuberRegressionLoss(LayerBase):
+    def __init__(self, name, inputs, delta=1., coeff=1., device=None):
+        super(HuberRegressionLoss, self).__init__(
+            name, 'huber_regression', 1, inputs=inputs, device=device)
+        config_assert(
+            len(self.inputs) == 2, 'HuberRegression must have 2 inputs')
+        self.config.delta = delta
+        self.config.coeff = coeff
+
+
 @config_layer('nce')
 class NCELayer(LayerBase):
     def __init__(self,
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 20d96efe1529c6f2141eaca3e74ef2ece6dc2ea9..d61c94dc8278cf84b8b5e6c5c2e3e6ab875afcbf 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -108,6 +108,7 @@ __all__ = [
     'sum_cost',
     'rank_cost',
     'lambda_cost',
+    'huber_regression_cost',
     'huber_classification_cost',
     'block_expand_layer',
     'maxout_layer',
@@ -216,6 +217,7 @@ class LayerType(object):
     RANK_COST = 'rank-cost'
     LAMBDA_COST = 'lambda_cost'
+    HUBER_REGRESSION = 'huber_regression'
     HUBER_CLASSIFICATION = 'huber_classification'
 
     CROSS_ENTROPY = 'multi-class-cross-entropy'
     CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm'
@@ -5603,6 +5605,57 @@ def sum_cost(input, name=None, layer_attr=None):
     return LayerOutput(name, LayerType.SUM_COST, parents=[input], size=1)
 
 
+@wrap_name_default()
+@layer_support()
+def huber_regression_cost(input,
+                          label,
+                          name=None,
+                          delta=1.0,
+                          coeff=1.0,
+                          layer_attr=None):
+    """
+    In statistics, the Huber loss is a loss function used in robust regression
+    that is less sensitive to outliers in data than the squared error loss.
+    Given a prediction f(x), a label y and :math:`\delta`, the loss function
+    is defined as:
+
+    .. math::
+       loss = 0.5*\left ( y-f(x) \right )^2, \left | y-f(x) \right |\leq \delta
+       loss = \delta \left | y-f(x) \right |-0.5\delta ^2, otherwise
+
+    The example usage is:
+
+    .. code-block:: python
+
+       cost = huber_regression_cost(input=input_layer, label=label_layer)
+
+    :param input: The first input layer.
+    :type input: LayerOutput.
+    :param label: The input label.
+    :type label: LayerOutput.
+    :param name: The name of this layer. It is optional.
+    :type name: None|basestring.
+    :param delta: The threshold separating the quadratic and the linear loss.
+    :type delta: float.
+    :param coeff: The coefficient that scales the gradient in the backward pass.
+    :type coeff: float.
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput.
+ """ + assert isinstance(input, LayerOutput) + Layer( + name=name, + type=LayerType.HUBER_REGRESSION, + inputs=[input.name, label.name], + delta=delta, + coeff=coeff, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.HUBER_REGRESSION, parents=[input, label], size=1) + + @wrap_name_default() @layer_support() def huber_classification_cost(input, diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr index a64e5ea0ddbc015fa788cd8b995f6db59d4916e0..55ab464ddf88f55bfb7b93ec0a189d4e53633468 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr @@ -167,6 +167,20 @@ layers { softmax_selfnorm_alpha: 0.1 coeff: 1.0 } +layers { + name: "__huber_regression_cost_0__" + type: "huber_regression" + size: 1 + active_type: "" + inputs { + input_layer_name: "input" + } + inputs { + input_layer_name: "labels" + } + coeff: 1.0 + delta: 1.0 +} layers { name: "huber_probs" type: "data" @@ -300,6 +314,7 @@ output_layer_names: "__rank_cost_0__" output_layer_names: "__lambda_cost_0__" output_layer_names: "__cross_entropy_0__" output_layer_names: "__cross_entropy_with_selfnorm_0__" +output_layer_names: "__huber_regression_cost_0__" output_layer_names: "__huber_classification_cost_0__" output_layer_names: "__multi_binary_label_cross_entropy_0__" output_layer_names: "__sum_cost_0__" @@ -324,6 +339,7 @@ sub_models { layer_names: "__lambda_cost_0__" layer_names: "__cross_entropy_0__" layer_names: "__cross_entropy_with_selfnorm_0__" + layer_names: "__huber_regression_cost_0__" layer_names: "huber_probs" layer_names: "huber_label" layer_names: "__huber_classification_cost_0__" @@ -349,6 +365,7 @@ sub_models { output_layer_names: "__lambda_cost_0__" output_layer_names: "__cross_entropy_0__" output_layer_names: "__cross_entropy_with_selfnorm_0__" + output_layer_names: "__huber_regression_cost_0__" output_layer_names: "__huber_classification_cost_0__" output_layer_names: "__multi_binary_label_cross_entropy_0__" output_layer_names: "__sum_cost_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py index 98bf026d606b33a2ef71a4e86a30c091dbe3d4e8..7ce375c708af7b0b7ae1d700dedbdb6a4ce16c7f 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py @@ -33,6 +33,8 @@ outputs( input=probs, label=xe_label), cross_entropy_with_selfnorm( input=probs, label=xe_label), + huber_regression_cost( + input=seq_in, label=labels), huber_classification_cost( input=data_layer( name='huber_probs', size=1), diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py index 7373a55ce6f2e00532649b6f35d7d3ea32f3019f..783a0ca85dc61b9f00ac8126e03788884dfb44cb 100644 --- a/python/paddle/v2/tests/test_layer.py +++ b/python/paddle/v2/tests/test_layer.py @@ -141,12 +141,13 @@ class CostLayerTest(unittest.TestCase): cost8 = layer.rank_cost(left=score, right=score, label=score) cost9 = layer.lambda_cost(input=inference, score=score) cost10 = layer.sum_cost(input=inference) - cost11 = layer.huber_classification_cost(input=score, label=label) + cost11 = layer.huber_regression_cost(input=score, label=label) + cost12 = 
layer.huber_classification_cost(input=score, label=label) print layer.parse_network([cost1, cost2]) print layer.parse_network([cost3, cost4]) print layer.parse_network([cost5, cost6]) - print layer.parse_network([cost7, cost8, cost9, cost10, cost11]) + print layer.parse_network([cost7, cost8, cost9, cost10, cost11, cost12]) crf = layer.crf(input=inference, label=label) crf_decoding = layer.crf_decoding(input=inference, size=3)
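
Reviewer note: the snippet below is a minimal, hypothetical sketch of how the new layer could be wired into a paddle.v2 network once this patch is applied. The data names, the 10-to-1 shapes (mirroring the huber_regression case in test_LayerGrad.cpp), and the linear predictor are illustrative assumptions, not part of the patch.

```python
import paddle.v2 as paddle

paddle.init(use_gpu=False, trainer_count=1)

# Hypothetical regression setup: 10-dimensional input, scalar target.
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(10))
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))

# Any predictor whose width matches the label works; a plain linear fc
# layer is the simplest choice for a sketch.
prediction = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())

# The new cost: quadratic for residuals with |y - f(x)| <= delta, linear
# beyond, so outliers contribute less gradient than with square_error_cost.
cost = paddle.layer.huber_regression_cost(input=prediction, label=y, delta=1.0)
```

With delta = 1.0 (the proto default), residuals larger than delta contribute a constant-magnitude gradient of delta, which is what the test sweep over delta in {1, 3, 5} exercises.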