diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index 01c61fd8053aa5f2377a02b9904fb2b8846583cd..ca21f9db88c2b3ceb96114f6ce8f717c7284018e 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -112,33 +112,6 @@ int UniqSampler(const Sampler& sampler, const std::size_t num_samples, } return num_tries; } -/* -template -void Print(Tensor & t, std::string name) { - if (!FLAGS_debug_print) { - return; - } - VLOG(1) << "qxz print "<< name; - VLOG(1) << name << "size = " << t.numel(); - size_t size = t.numel(); - type *d = t.data(); -#ifdef PADDLE_WITH_CUDA - std::vector vec; - platform::DeviceContextPool::Instance().Get(t.place())->Wait(); - if (platform::is_gpu_place(t.place())) { - vec.resize(size); - cudaMemcpy(vec.data(), d, sizeof(T) * size, cudaMemcpyDeviceToHost); - d = vec.data(); - } -#endif - VLOG(1) << name << " data_ptr = " << static_cast(d); - std::string out; - for (size_t i = 0; i < size; i++) { - out += std::to_string(d[i]); - out += ","; - } - VLOG(1) << out; -}*/ template void GPUSampleWithProb::operator()( diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc index 160eb066eabf467133d19d691b6590d08dd3e07d..22286ae87f9ecac3d0f9abd8c89a30b42da1b17b 100644 --- a/paddle/fluid/operators/sample_logits_op.cc +++ b/paddle/fluid/operators/sample_logits_op.cc @@ -64,12 +64,13 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddOutput("SampledLogits", "(Tensor, default: Tensor), A 2-D tensor with shape" - "[N x S+NT]. The outputs value of sampled softmax, which will be" + "[N x S+NT]. The outputs value of sample logits, which will be" "used in backward calculation.") .AsIntermediate(); - AddOutput("SampledLabel", - "(Tensor, default: Tensor), A 2-D tensor. The cross " - "entropy loss with shape [N x NT]."); + AddOutput( + "SampledLabel", + "(Tensor, default: Tensor), A 2-D tensor. The sampled label" + "with shape [N x S + NT]."); AddAttr( "use_custom_samples", "An indicator whether to use custom samples with probabilities, if True" @@ -81,7 +82,7 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker { "An indicator whether to sample non-repetitive negtive labels, if True" "the operator will sample negtive labels without replacement." "otherwise, the operator will sample negtive labels with replacement.") - .SetDefault(false); + .SetDefault(true); AddAttr( "remove_accidental_hits", "An indicator whether to remove accidental hits when samples hits true" @@ -92,35 +93,11 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("seed", "Random seed for generating samples").SetDefault(0); AddComment(R"DOC( -TODO(chenfeiyu): Write documentation for this Operator. -Sampled Softmax With Cross Entropy Operator. - -Cross entropy loss with sampled softmax is used as the output layer extensively. -This operator computes the softmax normalized values for each row of the input -tensor, after which cross-entropy loss is computed. This provides a more -numerically stable gradient. - -Because this operator performs a softmax on logits internally, it expects -unscaled logits. This operator should not be used with the output of -softmax operator since that would produce incorrect results. - -When the attribute soft_label is set false, this operators expects mutually -exclusive hard labels, each sample in a batch is in exactly one class with a -probability of 1.0. Each sample in the batch will have a single label. - -The equation is as follows: - -1) Hard label (one-hot label, so every sample has exactly one class) - -$$Loss_j = -\text{Logit}_{Label_j} + -\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right), -j = 1,..., K$$ - -2) Soft label (each sample can have a distribution over all classes) + """ + Computes sampled output training logits and labels suitable for implementing + sampled softmax. -$$Loss_j = -\sum_{i=0}^{K}\text{Label}_i \left(\text{Logit}_i - -\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right), -j = 1,...,K$$ + """ )DOC"); } diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu index 5b311bb67149b5d232137fe2d0d34298ff2fa1de..fe95542fd8f5d620094bf99d0e197480855f17e1 100644 --- a/paddle/fluid/operators/sample_logits_op.cu +++ b/paddle/fluid/operators/sample_logits_op.cu @@ -248,8 +248,7 @@ class SampleLogitsGradCUDAKernel : public framework::OpKernel { if (!FLAGS_debug_print) { return; } - VLOG(1) << "qxz print " << name; - VLOG(1) << name << "size = " << t.numel(); + VLOG(1) << name << " size = " << t.numel(); size_t size = t.numel(); const type* d = t.data(); #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h index 77d66a642e6dc2fed60ca6701d37df2e33a93001..139432178bdd90972a8cdd6934475854635cd645 100644 --- a/paddle/fluid/operators/sample_logits_op.h +++ b/paddle/fluid/operators/sample_logits_op.h @@ -207,37 +207,6 @@ class SampleLogitsKernel : public framework::OpKernel { num_true); } - /* Debug - const auto num_sampled_classes = samples_dim[1]; - std::cout << "Sampled Logits" << std::endl; - const auto sampled_logits_data = sampled_logits->data(); - for (int i = 0; i < sampled_logits->numel(); ++i) { - std::cout << sampled_logits_data[i] << ", "; - if ((i + 1) % num_sampled_classes == 0) - std::cout << std::endl; - } - std::cout << std::endl; - */ - /* Debug - std::cout << "Samples" << std::endl; - const auto samples_data = samples->data(); - for (int i = 0; i < samples->numel(); ++i) { - std::cout << samples_data[i] << ", "; - if ((i + 1) % num_sampled_classes == 0) - std::cout << std::endl; - } - std::cout << std::endl; - */ - /* Debug - std::cout << "Probabilities" << std::endl; - const auto probabilities_data = probabilities->data(); - for (int i = 0; i < probabilities->numel(); ++i) { - std::cout << probabilities_data[i] << ", "; - if ((i + 1) % num_sampled_classes == 0) - std::cout << std::endl; - } - std::cout << std::endl; - */ // subtracted sampled logits with logQ(y|x) auto probs = EigenMatrix::From(*probabilities); auto smp_logits = EigenMatrix::From(*sampled_logits); @@ -263,9 +232,6 @@ class SampleLogitsGradKernel : public framework::OpKernel { math::SetConstant set_zero; set_zero(dev_ctx, logits_grad, static_cast(0)); - // const bool remove_accidental_hits = - // context.Attr("remove_accidental_hits"); - // UNDERSTAND: scatter it back to logit_grad CPUPutAlongD1(dev_ctx, logits_grad, *samples, *sampled_logits_grad); }