未验证 提交 1dad36f6 编写于 作者: X xuezhong 提交者: GitHub

Merge pull request #15609 from xuezhong/add_sample_logits_op

add sample_logits  and sampled_softmax_with_cross_entropy op
...@@ -121,6 +121,7 @@ paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs= ...@@ -121,6 +121,7 @@ paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)) paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
paddle.fluid.layers.sampled_softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)) paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)) paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
...@@ -66,7 +66,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) ...@@ -66,7 +66,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
...@@ -39,6 +39,7 @@ math_library(cross_entropy) ...@@ -39,6 +39,7 @@ math_library(cross_entropy)
math_library(cos_sim_functor) math_library(cos_sim_functor)
math_library(depthwise_conv DEPS cub) math_library(depthwise_conv DEPS cub)
math_library(im2col) math_library(im2col)
math_library(sampler) math_library(sampler)
math_library(gru_compute DEPS activation_functions math_function) math_library(gru_compute DEPS activation_functions math_function)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/sample_prob.h"
namespace paddle {
namespace operators {
namespace math {
// Explicit instantiations of the CPU sampler for float and double.
template class SampleWithProb<platform::CPUDeviceContext, float>;
template class SampleWithProb<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <thrust/random.h>
#include <thrust/sort.h>
#include <iostream>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/sampler.h"
namespace paddle {
namespace operators {
namespace math {
using Tensor = framework::Tensor;
// Device-side adjustment of a single-draw probability `prob`.
// When num_tries equals num_samples the result is prob * num_samples;
// otherwise it is 1 - (1 - prob)^num_tries, evaluated through expm1/log1p.
template <typename T>
__device__ T gpu_adjust_prob(const T prob, const int num_samples,
const int num_tries) {
if (num_samples == num_tries) {
return prob * num_samples;
} else {
// Same value as 1 - exp(num_tries * log(1 - prob)).
return -expm1(num_tries * log1p(-prob));
// Device-side log-uniform sampler. Both methods are const and receive the
// range parameters as arguments on every call.
class GPULogUniformSampler {
// Maps a random number to a sample id; see the out-of-class definition.
__device__ int64_t Sample(float random, const int range,
const float log_range) const;
// Returns the probability of `value`; see the out-of-class definition.
__device__ float Probability(int64_t value, const float log_range) const;
// Maps `random` to exp(random * log_range) - 1, truncated to int64_t, and
// reduces the result modulo `range`.
// NOTE(review): presumably `random` is uniform in [0, 1) and `log_range` is
// log(range + 1) - confirm against the caller.
__device__ int64_t GPULogUniformSampler::Sample(float random, const int range,
const float log_range) const {
// Obtain a log-uniform distribution from a uniform one with the
// inverse transform sampling method.
const int64_t value = static_cast<int64_t>(exp(random * log_range)) - 1;
// Mathematically, value should be <= range, but it might not be because of
// floating point roundoff, so we take it modulo range.
return value % range;
// Returns log((value + 2) / (value + 1)) / log_range, the probability mass
// assigned to the integer `value`.
__device__ float GPULogUniformSampler::Probability(
int64_t value, const float log_range) const {
// Given the density f(x) = 1 / [(x + 1) * log_range],
// the probability of `value` is the integral of f(x) from value to
// (value + 1).
return (log((value + 2.0) / (value + 1.0))) / log_range;
// Kernel that fills `samples_data` and `probabilities_data`, both indexed as
// flat arrays with rows of (num_true + num_samples) entries.
//   n            - number of flat elements to process
//   num_tries    - try count forwarded to gpu_adjust_prob
//   range        - not referenced in the lines visible here
//   log_range    - forwarded to GPULogUniformSampler::Probability
//   label_data   - read with rows of num_true entries
// NOTE(review): the closing braces of the if/else and of the loop are not
// visible in this extract, so the exact nesting of the probability update
// cannot be confirmed here.
template <typename T>
__global__ void SamplingCondidate(
const size_t n, const int num_tries, const int range, const float log_range,
const int num_true, const std::size_t num_samples,
const int64_t* label_data, int64_t* samples_data, T* probabilities_data) {
const int num_sampled_classes = num_true + num_samples;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
// NOTE(review): step_size is set to 0 and not used in the visible lines; the
// loop below advances by blockDim.x * gridDim.x directly.
int step_size = 0;
GPULogUniformSampler sampler;
for (; idx < n; idx += blockDim.x * gridDim.x) {
int col_idx = idx % num_sampled_classes;
int row_idx = idx / num_sampled_classes;
if (col_idx < num_true) {
// Leading num_true columns of each row hold that row's labels.
samples_data[idx] = label_data[row_idx * num_true + col_idx];
} else {
// Remaining columns are copied from the same column of the first row.
samples_data[idx] = samples_data[col_idx];
// Look up the probability of the stored sample id, then adjust it.
probabilities_data[idx] = sampler.Probability(samples_data[idx], log_range);
probabilities_data[idx] =
gpu_adjust_prob(probabilities_data[idx], num_samples, num_tries);
// Draws values from `sampler` into `samples_data`, using an unordered_set to
// detect repeated values, and returns `num_tries`.
// NOTE(review): no update of `j` or `num_tries`, and no body for the
// duplicate branch, is visible in this extract - confirm the loop bookkeeping
// against the full source.
template <typename T>
int UniqSampler(const Sampler& sampler, const std::size_t num_samples,
int64_t* samples_data) {
// sample num_samples unique samples for an example, note that they are not
// all negative samples
std::unordered_set<int64_t> tmp_samples;
int num_tries = 0;
int j = 0;
while (j < num_samples) {
auto v = sampler.Sample();
// insert().second is false when v is already in the set.
auto insert_ok = tmp_samples.insert(v).second;
if (!insert_ok) {
samples_data[j] = v;
return num_tries;
// Draws samples on the host, copies them into S on the device and launches
// SamplingCondidate to fill S and P.
//   context     - supplies the CUDA stream for the kernel launch
//   seed        - passed to math::LogUniformSampler
//   dict_size   - sampler range; also used as `range` for the kernel
//   uniq        - not referenced in the lines visible here
//   num_samples - number of values drawn by UniqSampler
//   L           - label tensor; dims()[0] is used as batch size and dims()[1]
//                 as the number of true labels per row
//   S, P        - accessed through data<>() rather than mutable_data<>()
//                 NOTE(review): presumably allocated by the caller - confirm
template <typename T>
void GPUSampleWithProb<T>::operator()(
const platform::CUDADeviceContext& context, const int seed,
const int dict_size, const bool uniq, const std::size_t num_samples,
const Tensor* L, Tensor* S, Tensor* P) {
// Dimensions derived from the label tensor.
const auto lbl_dim = L->dims();
const int batch_size = lbl_dim[0];
const int num_true = lbl_dim[1];
const int num_sampled_classes = num_true + num_samples;
framework::DDim ret_dim{batch_size, num_sampled_classes};
// Raw pointers to the tensor storage.
const int64_t* label_data = L->data<int64_t>();
int64_t* samples_data = S->data<int64_t>();
T* probabilities_data = P->data<T>();
// Host-side scratch tensor holding num_samples drawn ids.
int s_size = num_samples;
framework::DDim s_dim{s_size};
Tensor s;
int64_t* s_data = s.mutable_data<int64_t>(s_dim, platform::CPUPlace());
math::LogUniformSampler sampler(dict_size, seed);
int range = dict_size;
float log_range = log(range + 1);
int num_tries = UniqSampler<T>(sampler, num_samples, s_data);
VLOG(1) << "num_tries: " << num_tries;
// Copy the drawn ids just past the first num_true entries of samples_data.
// NOTE(review): the tail of this cudaMemcpy call is not visible here.
PADDLE_ENFORCE(cudaMemcpy(samples_data + num_true, s_data,
sizeof(int64_t) * num_samples,
int threads = 512;
const size_t size = batch_size * num_sampled_classes;
// Ceiling division: enough blocks of `threads` threads to cover `size`.
int grid = (batch_size * num_sampled_classes + threads - 1) / threads;
SamplingCondidate<T><<<grid, threads, 0, context.stream()>>>(
size, num_tries, range, log_range, num_true, num_samples, label_data,
samples_data, probabilities_data);
// Explicit instantiations of the GPU sampler for float and double.
template class GPUSampleWithProb<float>;
template class GPUSampleWithProb<double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/sampler.h"
namespace paddle {
namespace operators {
namespace math {
using Tensor = framework::Tensor;
/* Utility function that adjusts a single-draw probability for sampling.
When num_tries equals num_samples the result is prob * num_samples;
otherwise it is 1 - (1 - prob)^num_tries, evaluated through expm1/log1p. */
template <typename T>
static T adjust_prob(const T prob, const int num_samples, const int num_tries) {
if (num_samples == num_tries) {
return prob * num_samples;
} else {
// Same value as 1 - exp(num_tries * log(1 - prob)).
return -expm1(num_tries * log1p(-prob));
// Functor that fills S (sample ids) and P (probabilities) from the labels L
// and values drawn from `sampler`.
//   L - label tensor; dims()[0] is used as batch size and dims()[1] as the
//       number of true labels per row
//   S - allocated here as int64 [batch_size, num_true + num_samples]
//   P - allocated here as T with the same shape as S
// NOTE(review): closing braces and the updates of `j` / `num_tries` are not
// visible in this extract - confirm the loop bookkeeping against the full
// source.
template <typename DeviceContext, typename T>
class SampleWithProb {
void operator()(const DeviceContext& context, const Sampler& sampler,
const std::size_t num_samples, const Tensor* L, Tensor* S,
Tensor* P) {
// Dimensions derived from the label tensor.
const auto lbl_dim = L->dims();
const int batch_size = lbl_dim[0];
const int num_true = lbl_dim[1];
const int num_sampled_classes = num_true + num_samples;
framework::DDim ret_dim{batch_size, num_sampled_classes};
// Raw pointers to the tensor storage; S and P are allocated here.
const int64_t* label_data = L->data<int64_t>();
int64_t* samples_data =
S->mutable_data<int64_t>(ret_dim, context.GetPlace());
T* probabilities_data = P->mutable_data<T>(ret_dim, context.GetPlace());
// temp sets for unique sampling
std::unordered_set<int64_t> tmp_samples;
int j = 0; // column index
// add true labels, not that efficient
while (j < num_true) {
for (int i = 0; i < batch_size; ++i) {
auto samples_index = i * num_sampled_classes + j;
auto v = label_data[i * num_true + j];
samples_data[samples_index] = v;
probabilities_data[samples_index] = sampler.Probability(v);
// sample num_samples unique samples for an example, note that they are not
// all negative samples
int num_tries = 0;
while (j < num_sampled_classes) {
auto v = sampler.Sample();
// insert().second is false when v is already in the set.
auto insert_ok = tmp_samples.insert(v).second;
if (!insert_ok) {
auto p = sampler.Probability(v);
// The same drawn value and probability are written to column j of every
// row.
for (int i = 0; i < batch_size; ++i) {
auto samples_index = i * num_sampled_classes + j;
samples_data[samples_index] = v;
probabilities_data[samples_index] = p;
// compute Q(y|x), because of unique sampling, probabilities need to be
// adjusted
for (int k = 0; k < num_sampled_classes; ++k) {
for (int i = 0; i < batch_size; ++i) {
auto samples_index = i * num_sampled_classes + k;
probabilities_data[samples_index] = adjust_prob(
probabilities_data[samples_index], num_samples, num_tries);
// Declaration of the GPU sampling functor. Unlike SampleWithProb it takes a
// seed and dictionary size instead of a Sampler object; the definition is not
// part of this header.
template <typename T>
class GPUSampleWithProb {
void operator()(const platform::CUDADeviceContext& context, const int seed,
const int dict_size, const bool uniq,
const std::size_t num_samples, const Tensor* L, Tensor* S,
Tensor* P);
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sample_logits_op.h"
#include "paddle/fluid/operators/math/sample_prob.h"
namespace paddle {
namespace operators {
// Declares the proto of the sample_logits operator: the description strings
// for its inputs, outputs and attributes, and the operator comment.
// NOTE(review): the AddInput/AddOutput/AddAttr call heads for most of the
// description strings below, and the AddComment wrapper around the final two
// lines, are not visible in this extract - confirm the name each description
// belongs to against the full source.
class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override {
"(Tensor, default: Tensor<float>), The unscaled log probabilities "
"which is a 2-D tensor with shape [N x K]. N is the batch_size, "
"and K is the class number.");
"(Tensor) The ground truth which is a 2-D tensor. Labels is a "
"Tensor<int64> with shape [N x NT], where NT is the number of"
"true labels for each example.");
"(Tensor, default: Tensor<int64_t>), A 2-D tensor with shape [N, "
"NT + S],"
" where N is the batch size, NT is the number of true labels "
"and S is the number of negtive sample for each example."
"The first NT elements of each row should be the same with true "
"labels, "
"followed by S custom negtive samples. This tensor"
"is only used when use_customized_samples is true.")
"(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]."
"The tensor has the same shape with CustomSamples,"
"and each element represents probability of element in CustomSamples. "
"This "
"tensor is only used when use_customized_samples is true.")
"(Tensor, default: Tensor<int64_t>), A 2-D tensor with shape [N, "
"NT + S]."
"The outputs value of sampler, including NT true lables and S "
"negetive samples "
"for each example. This will be used in"
"backward calculation.")
"(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]."
"The probabilites of sampled positive and negtive labels.")
"(Tensor, default: Tensor<float>), A 2-D tensor with shape"
"[N, NT + S]. The outputs value of sampled logits, which will be"
"used in backward propagation.")
"(Tensor, default: Tensor<int64>), A 2-D tensor. The sampled labels"
"with shape [N, NT]. The tonsor contains hard labels as input to "
" softmax op, that is 0, 1, ..., NT-1 because of the first NT elements"
" of Sampels are positive lables.");
"An indicator whether to use customized samples with probabilities, if "
"the operator will use customized samples and customized probabilities"
"otherwise, the operator will generate them by itself.")
"An indicator whether to sample non-repetitive negtive labels, if True"
"the operator will sample negtive labels without replacement."
"Otherwise, the operator will sample negtive labels with replacement.")
"An indicator whether to remove accidental hits when samples hits true"
"labels, the removal is implemented by subtracting the corresponding"
"logits by float_max to subpress their softmax to be zero.")
AddAttr<int>("num_samples", "The number of negative samples.");
AddAttr<int>("seed", "Random seed for generating samples").SetDefault(0);
Computes sampled output training logits and labels suitable for implementing
sampled softmax.
// Forward operator definition: shape inference and kernel-type selection for
// sample_logits.
// NOTE(review): the PADDLE_ENFORCE heads that own the message strings below
// are not visible in this extract.
class SampleLogitsOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
// Checks that Logits and Labels are 2-D and sets the output shapes:
// Samples, Probabilities and SampledLogits are
// [logits_dims[0], labels_dims[1] + num_samples]; SampledLabels is
// [logits_dims[0], labels_dims[1]].
void InferShape(framework::InferShapeContext* ctx) const override {
"Input(Logits) should be not null.");
"Input(Labels) should be not null.");
"Output(Samples) should be not null.");
"Output(Probabilities) should be not null.");
"Output(SampledLogits) should be not null.");
"Output(SampledLabels) should be not null.");
auto logits_dims = ctx->GetInputDim("Logits");
auto labels_dims = ctx->GetInputDim("Labels");
logits_dims.size(), 2UL,
"The logits of softmax_with_cross_entropy should be a 2-D tensor.");
PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
"The labels should be a 2-D tensor.");
const int num_samples = ctx->Attrs().Get<int>("num_samples");
const int num_sampled_classes = labels_dims[1] + num_samples;
ctx->SetOutputDim("Samples", {logits_dims[0], num_sampled_classes});
ctx->SetOutputDim("Probabilities", {logits_dims[0], num_sampled_classes});
ctx->SetOutputDim("SampledLogits", {logits_dims[0], num_sampled_classes});
ctx->SetOutputDim("SampledLabels", {logits_dims[0], labels_dims[1]});
// The kernel data type follows the data type of the Logits input.
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Logits"));
framework::OpKernelType kt =
framework::OpKernelType(data_type, ctx.device_context());
return kt;
// Gradient operator definition: shape checks and kernel-type selection for
// sample_logits_grad.
// NOTE(review): the PADDLE_ENFORCE heads for the message strings, the output
// shape assignment, and the argument of GetDataTypeOfVar are not visible in
// this extract.
class SampleLogitsOpGrad : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
// Verifies that both Labels and Logits are 2-D tensors.
void InferShape(framework::InferShapeContext* ctx) const override {
"Input(Logits) should not be null.");
"Input(Labels) should be not null.");
"Input(Samples) should be not null.");
"Input(SampledLogits) should be not null.");
"Input(SampledLogits@Grad) should not be null.");
"Output(Logits@Grad) should be not null.");
auto logit_dims = ctx->GetInputDim("Logits");
auto label_dims = ctx->GetInputDim("Labels");
PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
"The label should be a 2-D tensor.");
PADDLE_ENFORCE_EQ(logit_dims.size(), 2UL,
"The logits should be a 2-D tensor.");
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = framework::GetDataTypeOfVar(
framework::OpKernelType kt =
framework::OpKernelType(data_type, ctx.device_context());
return kt;
// Builds the OpDesc of the gradient op: forwards the Logits and Labels
// inputs and the Samples and SampledLogits outputs of the forward op as
// inputs, and maps the gradient of Logits to the grad op's output.
// NOTE(review): no SetType call and no wiring of the SampledLogits gradient
// are visible in this extract - confirm against the full source.
class SampleLogitsGradMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
// Ownership of the raw pointer is handed to the returned unique_ptr.
auto* grad_op = new framework::OpDesc();
grad_op->SetInput("Logits", Input("Logits"));
grad_op->SetInput("Labels", Input("Labels"));
grad_op->SetInput("Samples", Output("Samples"));
grad_op->SetInput("SampledLogits", Output("SampledLogits"));
grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
return std::unique_ptr<framework::OpDesc>(grad_op);
} // namespace operators
} // namespace paddle
// Registers the sample_logits / sample_logits_grad operators and their CPU
// kernels.
// NOTE(review): the trailing arguments of three of these macro calls are not
// visible in this extract.
namespace ops = paddle::operators;
REGISTER_OPERATOR(sample_logits, ops::SampleLogitsOp, ops::SampleLogitsOpMaker,
REGISTER_OPERATOR(sample_logits_grad, ops::SampleLogitsOpGrad);
REGISTER_OP_CPU_KERNEL(sample_logits, ops::SampleLogitsKernel<float>,
REGISTER_OP_CPU_KERNEL(sample_logits_grad, ops::SampleLogitsGradKernel<float>,
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/sample_logits_op.h"
namespace paddle {
namespace operators {
// Gather kernel, similar to numpy's take_along_axis along dimension 1:
// for every flat position idx in the index array,
//   p_value[idx] = p_array[row * array_slice_size + p_index[idx]]
// where row = idx / idx_slice_size.
//   size             - number of flat index elements to process
//   batch_size       - not referenced in the lines visible here
//   array_slice_size - row length of p_array
//   idx_slice_size   - row length of p_index and p_value
template <typename T>
__global__ void GPUTakeAlongD1(size_t size, const int batch_size,
const int array_slice_size,
const int idx_slice_size, const T* p_array,
const int64_t* p_index, T* p_value) {
const auto value_slice_size = idx_slice_size;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
// Each thread advances by the total number of launched threads.
int step_size = blockDim.x * gridDim.x;
for (; idx < size; idx += step_size) {
int i = idx / idx_slice_size;
auto array_index = p_index[idx];
p_value[idx] = p_array[i * array_slice_size + array_index];
// Scatter-add kernel, similar to numpy's put_along_axis along dimension 1,
// except that values are accumulated with +=, so duplicate indices within a
// row add up.
//   size             - number of rows to process (one row per loop iteration)
//   batch_size       - not referenced in the lines visible here
//   array_slice_size - row length of p_array
//   idx_slice_size   - row length of p_index and p_value
template <typename T>
__global__ void GPUPutAlongD1(size_t size, const int batch_size,
const int array_slice_size,
const int idx_slice_size, T* p_array,
const int64_t* p_index, const T* p_value) {
const auto value_slice_size = idx_slice_size;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int step_size = blockDim.x * gridDim.x;
// size == batch_size
for (; idx < size; idx += step_size) {
int i = idx;
// A whole row is handled by one thread, which walks its indices in order.
for (int j = 0; j < idx_slice_size; ++j) {
auto array_index = p_index[i * idx_slice_size + j];
p_array[i * array_slice_size + array_index] +=
p_value[i * idx_slice_size + j];
// Fills p_array[idx] with idx % num_true for idx in [0, size), i.e. each run
// of num_true consecutive entries becomes 0, 1, ..., num_true - 1.
template <typename T>
__global__ void GPUSetLabel(size_t size, const int num_true, int64_t* p_array) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
// Each thread advances by the total number of launched threads.
int step_size = blockDim.x * gridDim.x;
for (; idx < size; idx += step_size) {
p_array[idx] = idx % num_true;
// Finds accidental hits: positions past the first num_true columns of a row
// whose index equals one of that row's first num_true indices. The matching
// value is reduced by 1e20, which stands in for "float max".
//   size           - number of flat elements to process
//   idx_slice_size - row length of p_index and p_value
// NOTE(review): whether the inner loop stops after the first match is not
// visible in this extract.
template <typename T>
__global__ void gpu_compute_remove_accidental_hits(const int size,
const int num_true,
const int idx_slice_size,
const int64_t* p_index,
T* p_value) {
const auto value_slice_size = idx_slice_size;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int step_size = blockDim.x * gridDim.x;
for (; idx < size; idx += step_size) {
int i = idx / idx_slice_size;
// The first num_true columns are the true labels themselves; skip them.
if (idx % idx_slice_size < num_true) continue;
for (int j = 0; j < num_true; ++j) {
const auto true_idx = i * idx_slice_size + j;
if (p_index[true_idx] == p_index[idx]) {
p_value[idx] -= 1e20;
// CUDA forward kernel of sample_logits. From the visible lines it:
//   1. zero-fills SampledLogits and allocates SampledLabels,
//   2. obtains Samples/Probabilities (customized inputs or GPUSampleWithProb),
//   3. gathers the logits at the sampled indices,
//   4. optionally removes accidental hits,
//   5. subtracts log(Probabilities) from the sampled logits.
// NOTE(review): the kernel names in front of the `T><<<...>>>` launches, the
// right-hand sides of several declarations and the customized-samples branch
// body are not visible in this extract; the step mapping above relies on the
// argument lists and surrounding comments - confirm against the full source.
template <typename T>
class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
using Tensor = framework::Tensor;
void Compute(const framework::ExecutionContext& context) const override {
// get necessary inputs
const Tensor* logits = context.Input<Tensor>("Logits");
const Tensor* labels = context.Input<Tensor>("Labels");
VLOG(3) << "Enter SampleLogitsCUDAKernel";
// get necessary outputs
Tensor* samples = context.Output<Tensor>("Samples");
Tensor* probabilities = context.Output<Tensor>("Probabilities");
Tensor* sampled_logits = context.Output<Tensor>("SampledLogits");
Tensor* sampled_labels = context.Output<Tensor>("SampledLabels");
// shapes
const auto batch_size = logits->dims()[0];
const auto num_classes = logits->dims()[1];
const auto labels_dim = labels->dims();
const auto num_true = labels_dim[1];
const auto samples_dim = samples->dims();
// attrs
const auto num_samples = context.Attr<int>("num_samples");
const bool use_customized_samples =
const bool uniq = context.Attr<bool>("uniq");
const bool remove_accidental_hits =
// device contexts
auto& dev_ctx = context.cuda_device_context();
// Allocate SampledLogits with the shape of Samples and zero-fill it.
sampled_logits->mutable_data<T>(samples_dim, context.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T> set_zero;
set_zero(dev_ctx, sampled_logits, static_cast<T>(0));
auto sampled_labels_data =
sampled_labels->mutable_data<int64_t>(labels_dim, context.GetPlace());
int threads = 512;
size_t size = batch_size * num_true;
// Ceiling division: enough blocks of `threads` threads to cover `size`.
int grid = (size + threads - 1) / threads;
T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
size, num_true, sampled_labels_data);
if (use_customized_samples) {
const Tensor* customized_samples =
const Tensor* customized_probabilities =
} else {
probabilities->mutable_data<T>(samples_dim, context.GetPlace());
// Draw samples and their probabilities on the GPU.
const auto seed = context.Attr<int>("seed");
auto sampler_with_prob = math::GPUSampleWithProb<T>();
sampler_with_prob(context.cuda_device_context(), seed, num_classes, uniq,
num_samples, labels, samples, probabilities);
// Gather sampled logits and remove accidental hits if needed.
const auto num_take = samples->dims()[1];
const auto array_dims = logits->dims();
const auto idx_dims = samples->dims();
const T* p_array = logits->data<T>();
const int64_t* p_index = samples->data<int64_t>();
T* p_value = sampled_logits->data<T>();
// src slice size
const auto array_slice_size = array_dims[1];
// index slice size
const auto idx_slice_size = idx_dims[1];
size = batch_size * num_take;
grid = (size + threads - 1) / threads;
T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
if (remove_accidental_hits) {
const size_t size = batch_size * (num_true + num_samples);
int grid = (size + threads - 1) / threads;
T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
size, num_true, idx_slice_size, p_index, p_value);
// Subtract logQ(y|x) from the sampled logits; TolerableValue is applied to
// the log before the subtraction.
auto probs = EigenMatrix<T>::From(*probabilities);
auto smp_logits = EigenMatrix<T>::From(*sampled_logits);
smp_logits.device(*dev_ctx.eigen_device()) =
(smp_logits - probs.log().unaryExpr(TolerableValue<T>()))
// CUDA backward kernel of sample_logits: zero-fills the Logits gradient and
// scatters the sampled-logits gradient into it at the positions given by
// Samples.
// NOTE(review): the right-hand side of `sampled_logits_grad`, any allocation
// of logits_grad, and the kernel name in front of the `T><<<...>>>` launch
// are not visible in this extract - confirm against the full source.
template <typename T>
class SampleLogitsGradCUDAKernel : public framework::OpKernel<T> {
using Tensor = framework::Tensor;
void Compute(const framework::ExecutionContext& context) const override {
auto logits_grad = context.Output<Tensor>(framework::GradVarName("Logits"));
const Tensor* samples = context.Input<Tensor>("Samples");
const Tensor* sampled_logits_grad =
auto& dev_ctx = context.cuda_device_context();
math::SetConstant<platform::CUDADeviceContext, T> set_zero;
set_zero(dev_ctx, logits_grad, static_cast<T>(0));
// Scatter the gradient back into logits_grad.
const auto batch_size = samples->dims()[0];
const auto num_put = samples->dims()[1];
const auto array_dims = logits_grad->dims();
const auto idx_dims = samples->dims();
T* p_array = logits_grad->data<T>();
const int64_t* p_index = samples->data<int64_t>();
const T* p_value = sampled_logits_grad->data<T>();
// src slice size
const auto array_slice_size = array_dims[1];
// index slice size
const auto idx_slice_size = idx_dims[1];
int threads = 128;
// The launch is sized by the number of rows, not the number of elements.
const size_t size = batch_size;
int grid = (size + threads - 1) / threads;
T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
} // namespace operators
} // namespace paddle
// Registers the CUDA kernels of sample_logits.
// NOTE(review): the remaining arguments of this macro call are not visible in
// this extract.
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(sample_logits, ops::SampleLogitsCUDAKernel<float>,
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
namespace paddle {
namespace operators {
// Local shorthands for the framework tensor type and its Eigen matrix view
// (row-major, Eigen::DenseIndex by default).
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
// Unary functor that clamps infinities to a large finite magnitude:
// +INFINITY becomes 1e20, -INFINITY becomes -1e20, and every other value is
// returned unchanged.
template <typename T>
struct TolerableValue {
HOSTDEVICE T operator()(const T& x) const {
// Finite stand-in for infinity.
const T kApproInf = 1e20;
if (x == INFINITY) return kApproInf;
if (x == -INFINITY) return -kApproInf;
return x;
// CPU gather, similar to numpy's take_along_axis along dimension 1:
//   value[i][j] = array[i][index[i][j]]
//   ctx   - not referenced in the lines visible here
//   array - 2-D source tensor
//   index - 2-D int64 index tensor with the same number of rows as `array`
//   value - output tensor; must already have the same dims as `index`
template <typename T>
static void CPUTakeAlongD1(const platform::DeviceContext& ctx,
const framework::Tensor& array,
const framework::Tensor& index,
framework::Tensor* value) {
// Check shapes: src (B, C), index (B, K), and out must also be (B, K).
PADDLE_ENFORCE(index.dims().size() == 2 && array.dims().size() == 2 &&
index.dims()[0] == array.dims()[0] &&
index.dims() == value->dims());
const auto batch_size = index.dims()[0];
const auto num_take = index.dims()[1];
const auto array_dims = array.dims();
const auto idx_dims = index.dims();
// No allocations here: `value` is accessed with data<T>(), not
// mutable_data<T>().
const T* p_array = array.data<T>();
const int64_t* p_index = index.data<int64_t>();
T* p_value = value->data<T>();
// src slice size
const auto array_slice_size = array_dims[1];
// index slice size
const auto idx_slice_size = idx_dims[1];
const auto value_slice_size = idx_slice_size;
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < num_take; ++j) {
auto array_index = p_index[i * idx_slice_size + j];
p_value[i * value_slice_size + j] =
p_array[i * array_slice_size + array_index];
// CPU scatter-add, similar to numpy's put_along_axis along dimension 1,
// except that values are accumulated with +=, so duplicate indices within a
// row add up:
//   array[i][index[i][j]] += value[i][j]
//   ctx   - not referenced in the lines visible here
//   array - 2-D destination tensor, updated in place
//   index - 2-D int64 index tensor with the same number of rows as `array`
//   value - source tensor; must have the same dims as `index`
template <typename T>
static void CPUPutAlongD1(const platform::DeviceContext& ctx,
framework::Tensor* array,
const framework::Tensor& index,
const framework::Tensor& value) {
// Check shapes: dst (B, C), index (B, K), and value must also be (B, K).
PADDLE_ENFORCE(index.dims().size() == 2 && array->dims().size() == 2 &&
index.dims()[0] == array->dims()[0] &&
index.dims() == value.dims());
const auto batch_size = index.dims()[0];
const auto num_put = index.dims()[1];
auto array_dims = array->dims();
auto idx_dims = index.dims();
// No allocations here: `array` is accessed with data<T>(), not
// mutable_data<T>().
T* p_array = array->data<T>();
const int64_t* p_index = index.data<int64_t>();
const T* p_value = value.data<T>();
// slice sizes
const auto array_slice_size = array_dims[1];
const auto idx_slice_size = idx_dims[1];
const auto value_slice_size = idx_slice_size;
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < num_put; ++j) {
auto array_index = p_index[i * idx_slice_size + j];
p_array[i * array_slice_size + array_index] +=
p_value[i * value_slice_size + j];
// Finds accidental hits in `samples` and reduces the corresponding entries
// of `sampled_logits` by 1e20, which stands in for "float max". For each row
// the first num_true sample ids are inserted into a set, and every later
// sample id found in the set has its logit reduced.
//   ctx            - not referenced in the lines visible here
//   sampled_logits - 2-D tensor, updated in place
//   samples        - int64 tensor indexed with the row length of
//                    `sampled_logits`
// NOTE(review): no per-row reset of tmp_true_labels is visible in this
// extract; without one, labels from earlier rows would remain in the set -
// confirm against the full source.
template <typename T>
static void compute_remove_accidental_hits(const platform::DeviceContext& ctx,
framework::Tensor* sampled_logits,
const framework::Tensor& samples,
const int num_true) {
const auto batch_size = sampled_logits->dims()[0];
const auto num_sampled_classes = sampled_logits->dims()[1];
T* sampled_logits_data = sampled_logits->data<T>();
const auto samples_data = samples.data<int64_t>();
std::unordered_set<int64_t> tmp_true_labels;
for (int i = 0; i < batch_size; ++i) {
// Insert the first num_true ids of row i.
tmp_true_labels.insert(samples_data + i * num_sampled_classes,
samples_data + i * num_sampled_classes + num_true);
for (int j = num_true; j < num_sampled_classes; ++j) {
const auto idx = i * num_sampled_classes + j;
if (tmp_true_labels.find(samples_data[idx]) != tmp_true_labels.end())
sampled_logits_data[idx] -= 1e20;
// Forward CPU kernel: obtains sample indices and their probabilities, gathers
// the logits at those indices into SampledLogits, optionally penalizes
// accidental hits, and subtracts log(probabilities) from the gathered logits.
// It also fills SampledLabels with per-row column positions.
template <typename T>
class SampleLogitsKernel : public framework::OpKernel<T> {
using Tensor = framework::Tensor;
void Compute(const framework::ExecutionContext& context) const override {
"This kernel only runs on CPU.");
VLOG(3) << "Enter SampleLogitsKernel";
// get necessary inputs
const Tensor* logits = context.Input<Tensor>("Logits");
const Tensor* labels = context.Input<Tensor>("Labels");
// get necessary outputs
Tensor* samples = context.Output<Tensor>("Samples");
Tensor* probabilities = context.Output<Tensor>("Probabilities");
Tensor* sampled_logits = context.Output<Tensor>("SampledLogits");
Tensor* sampled_labels = context.Output<Tensor>("SampledLabels");
// shapes
const auto batch_size = logits->dims()[0];
const auto num_classes = logits->dims()[1];
const auto labels_dim = labels->dims();
const auto num_true = labels_dim[1];
const auto samples_dim = samples->dims();
// attrs
const auto num_samples = context.Attr<int>("num_samples");
const bool use_customized_samples =
const bool remove_accidental_hits =
// device contexts
auto& dev_ctx =
context.template device_context<platform::CPUDeviceContext>();
// Allocate the output buffers: SampledLogits takes the dims of Samples,
// SampledLabels takes the dims of Labels.
sampled_logits->mutable_data<T>(samples_dim, context.GetPlace());
auto sampled_labels_data =
sampled_labels->mutable_data<int64_t>(labels_dim, context.GetPlace());
// Every row of SampledLabels becomes 0, 1, ..., num_true - 1 (presumably the
// positions of the true labels inside each row of Samples -- TODO confirm).
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < num_true; ++j) {
sampled_labels_data[i * num_true + j] = j;
if (use_customized_samples) {
const Tensor* customized_samples =
const Tensor* customized_probabilities =
} else {
probabilities->mutable_data<T>(samples_dim, context.GetPlace());
// Draw num_samples samples, and their probabilities, with a log-uniform
// sampler over num_classes seeded by the "seed" attribute.
const auto seed = context.Attr<int>("seed");
auto sampler_with_prob =
math::SampleWithProb<platform::CPUDeviceContext, T>();
sampler_with_prob(dev_ctx, math::LogUniformSampler(num_classes, seed),
num_samples, labels, samples, probabilities);
// Gather the logits at the sampled column indices, then remove accidental
// hits if requested.
CPUTakeAlongD1<T>(dev_ctx, *logits, *samples, sampled_logits);
if (remove_accidental_hits) {
compute_remove_accidental_hits<T>(dev_ctx, sampled_logits, *samples,
// Subtract logQ(y|x), i.e. log(probabilities), from the sampled logits;
// TolerableValue is applied to the result of the log.
auto probs = EigenMatrix<T>::From(*probabilities);
auto smp_logits = EigenMatrix<T>::From(*sampled_logits);
smp_logits.device(*dev_ctx.eigen_device()) =
(smp_logits - probs.log().unaryExpr(TolerableValue<T>()))
// Backward CPU kernel: zero-fills the gradient of Logits and then scatter-adds
// the gradient of the sampled logits into it at the column indices stored in
// Samples.
template <typename T>
class SampleLogitsGradKernel : public framework::OpKernel<T> {
using Tensor = framework::Tensor;
void Compute(const framework::ExecutionContext& context) const override {
auto logits_grad = context.Output<Tensor>(framework::GradVarName("Logits"));
const Tensor* samples = context.Input<Tensor>("Samples");
const Tensor* sampled_logits_grad =
auto& dev_ctx =
context.template device_context<platform::CPUDeviceContext>();
// Start from an all-zero gradient so that the scatter below only accumulates
// contributions of sampled positions.
math::SetConstant<platform::CPUDeviceContext, T> set_zero;
set_zero(dev_ctx, logits_grad, static_cast<T>(0));
// Scatter-add the sampled gradients back into logits_grad; duplicate indices
// in a row are summed by CPUPutAlongD1.
CPUPutAlongD1<T>(dev_ctx, logits_grad, *samples, *sampled_logits_grad);
} // namespace operators
} // namespace paddle
...@@ -87,6 +87,7 @@ __all__ = [ ...@@ -87,6 +87,7 @@ __all__ = [
'transpose', 'transpose',
'im2sequence', 'im2sequence',
'nce', 'nce',
'hsigmoid', 'hsigmoid',
'beam_search', 'beam_search',
'row_conv', 'row_conv',
...@@ -5840,6 +5841,124 @@ def softmax_with_cross_entropy(logits, ...@@ -5840,6 +5841,124 @@ def softmax_with_cross_entropy(logits,
return loss return loss
def sampled_softmax_with_cross_entropy(logits,
**Sampled Softmax With Cross Entropy Operator.**
Cross entropy loss with sampled softmax is widely used as the output layer
when the number of output classes is large. This operator draws a number of
samples for every example and computes the softmax-normalized values for each
row of the sampled tensor, after which the cross-entropy loss is computed.
Because this operator performs a softmax on logits internally, it expects
unscaled logits. This operator should not be used with the output of
softmax operator since that would produce incorrect results.
For examples with T true labels (T >= 1), we assume that each true label has
a probability of 1/T. For each example, S samples are generated using a
log uniform distribution. True labels are concatenated with these samples to
form T + S samples for each example. So, if the shape of logits is
[N x K], the shape of samples is [N x (T+S)]. For each sampled label, a
probability is calculated, which corresponds to the Q(y|x) in
[Jean et al., 2014](http://arxiv.org/abs/1412.2007).
Logits are sampled according to the sampled labels. Then, if
remove_accidental_hits is True and a sample[i, j] accidentally hits a true
label, 1e20 is subtracted from the corresponding sampled_logits[i, j] to
make its softmax result close to zero. Next, logQ(y|x) is subtracted from the
sampled logits, and these sampled logits and the re-indexed labels are used to
compute a softmax with cross entropy.
logits (Variable): The unscaled log probabilities, which is a 2-D tensor
with shape [N x K]. N is the batch_size, and K is the class number.
label (Variable): The ground truth which is a 2-D tensor. Label is a
Tensor<int64> with shape [N x T], where T is the number of true
labels per example.
num_samples (int): The number of samples drawn for each example. num_samples
should be less than the number of classes.
num_true(int): The number of target classes per training example.
remove_accidental_hits (bool): A flag indicating whether to remove
accidental hits when sampling. If True and a sample[i, j]
accidentally hits a true label, 1e20 is subtracted from the
corresponding sampled_logits[i, j] to make its softmax result
close to zero. Default is True.
use_customized_samples (bool): Whether to use custom samples and probabilities to sample
customized_samples (Variable): User defined samples, which is a 2-D tensor
with shape [N, T + S]. S is the num_samples, and T is the number of true
labels per example.
customized_probabilities (Variable): User defined probabilities of samples,
a 2-D tensor which has the same shape with customized_samples.
seed (int): The random seed for generating random number, which is used
in the process of sampling. Default is 0.
Variable: Return the cross entropy loss which is a 2-D tensor with shape
[N x 1].
.. code-block:: python
logits = fluid.layers.data(name='data', shape=[256], dtype='float32')
label = fluid.layers.data(name='label', shape=[5], dtype='int64')
fc = fluid.layers.fc(input=logits, size=100)
out = fluid.layers.sampled_softmax_with_cross_entropy(
logits=fc, label=label, num_samples=25)
helper = LayerHelper('sample_logits', **locals())
samples = helper.create_variable_for_type_inference(dtype='int64')
probabilities = helper.create_variable_for_type_inference(
sampled_logits \
= helper.create_variable_for_type_inference(dtype=logits.dtype)
sampled_label = helper.create_variable_for_type_inference(dtype='int64')
'Logits': logits,
'Labels': label,
'CustomizedSamples': customized_samples,
'CustomizedProbabilities': customized_probabilities
'Samples': samples,
'Probabilities': probabilities,
'SampledLabels': sampled_label,
'SampledLogits': sampled_logits
'use_customized_samples': use_customized_samples,
'uniq': True,
'remove_accidental_hits': remove_accidental_hits,
'num_samples': num_samples,
'seed': seed
loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
inputs={'Logits': sampled_logits,
'Label': sampled_label},
outputs={'Softmax': softmax,
'Loss': loss},
'soft_label': False,
'ignore_index': False,
'numeric_stable_mode': False
return loss / num_true
def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
""" """
This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`. This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`.
...@@ -374,6 +374,17 @@ class TestBook(unittest.TestCase): ...@@ -374,6 +374,17 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output) self.assertIsNotNone(output)
print(str(program)) print(str(program))
def test_sampled_softmax_with_cross_entropy(self):
program = Program()
with program_guard(program):
logits = layers.data(name='Logits', shape=[256], dtype='float64')
label = layers.data(name='Label', shape=[1], dtype='int64')
num_samples = 25
output = layers.sampled_softmax_with_cross_entropy(logits, label,
@decorators.prog_scope() @decorators.prog_scope()
def test_nce(self): def test_nce(self):
window_size = 5 window_size = 5
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
想要评论请 注册