diff --git a/paddle/fluid/operators/multinomial_op.cu b/paddle/fluid/operators/multinomial_op.cu
index 51399f7116ca43b04251e67015b66ddd31a59ce3..160e461e989e5f631cf090d079e11d7e02074db5 100644
--- a/paddle/fluid/operators/multinomial_op.cu
+++ b/paddle/fluid/operators/multinomial_op.cu
@@ -26,69 +26,17 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-/*
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-*/
-
-/*
-template <typename T>
-__global__ void SumArrayCUDAKernel(T **in, T *out, size_t in_size) {
-  int id = blockIdx.x * blockDim.x + threadIdx.x;
-  // T total(read_dst ? out[id] : static_cast<T>(0));
-  T total(static_cast<T>(0))
-  for (int i = 0; i < in_size; ++i) {
-    const T *tmp = in[i];
-    if (tmp) {
-      total += tmp[id];
-    }
-  }
-  out[id] = total;
-  id += blockDim.x * gridDim.x;
-}*/
-
-/*
-template <typename T>
-__global__ void NormalizeProbability(T* probs, int64_t rows, int64_t cols) {
-  extern __shared__ std::vector<T> sum_rows(rows);
-  T val;
-  for (int64_t i = blockId.x; i < rows; i += gridDim.x) {
-    T sum = static_cast<T>(0);
-    for (int64_t j = threadIdx.x; j < cols; j += blockDim.x) {
-      val = probs[i * cols + j];
-      sum += val;
-    }
-
-  }
-}*/
-
 template <typename T>
 __global__ void NormalizeProbability(T* norm_probs, const T* in_data,
                                      T* sum_rows) {
-  // int id = blockIdx.x * blockDim.x + threadIdx.x;
-  // int id = threadIdx.x;
   int id = threadIdx.x + blockIdx.x * blockDim.x +
            blockIdx.y * gridDim.x * blockDim.x;
   norm_probs[id] = in_data[id] / sum_rows[blockIdx.y];
 }
 
-template <typename T>
-__global__ void yokiFunc(const T* in_data, T* out) {
-  // int id = blockIdx.x * blockDim.x + threadIdx.x;
-  // int id = threadIdx.x;
-  int id = threadIdx.x + blockIdx.x * blockDim.x +
-           blockIdx.y * gridDim.x * blockDim.x;
-  out[id] = in_data[id];
-}
-
 template <typename T>
 __global__ void Cumsum(T* norm_probs_data, int64_t num_distributions,
                        int64_t num_categories, T* cumulative_probs) {
-  // int id = blockIdx.x;
   for (int id = blockIdx.x; id < num_distributions; id += gridDim.x) {
     thrust::inclusive_scan(thrust::device,
                            norm_probs_data + id * num_categories,
@@ -111,52 +59,43 @@ struct RandomGeneratorCudaFunctor {
   }
 };
 
-/*
-template <typename T>
-class MultinomialCudaFunctor(T* out_data, const T* in_data,
-                             const int64_t num_samples, const bool replacement,
-                             const int64_t num_categories,
-                             const int64_t num_distributions) {
-
-}*/
-
 template <typename T>
-__device__ int binarySearchForMultinomial(T* cumdist, T* dist, int size,
-                                          T val) {
-  int start = 0;
-  int end = size;
+__device__ int binarySearchFunctor(T* cumdist, T* dist, int size, T val) {
+  int left = 0;
+  int right = size;
 
   // cumdist[size - 1] = 0 => all zero prob dist
   // CUDA_KERNEL_ASSERT(cumdist[size - 1] > static_cast<T>(0));
 
-  while (end - start > 0) {
-    int mid = start + (end - start) / 2;
+  while (right - left > 0) {
+    int mid = left + (right - left) / 2;
 
     T midVal = cumdist[mid];
     if (midVal < val) {
-      start = mid + 1;
+      left = mid + 1;
     } else {
-      end = mid;
+      right = mid;
    }
  }
 
-  if (start == size) {
+  if (left == size) {
     // No probability mass or precision problems; just return the
-    // first non-zero element by setting start to size-1 here,
+    // first non-zero element by setting left to size-1 here,
     // the code below will move it to the last non-zero probability
     // this actually can happen when the random number is 1
     // (github pytorch issue #4858).
-    start = size - 1;
+    left = size - 1;
   }
 
-  while (start >= 1 && dist[start] == 0) start--;
-
-  return start;
+  while (left >= 1 && dist[left] == 0) left--;
+
+  return left;
 }
 
 template <typename T>
 __global__ void sampleMultinomialWithReplacement(
-    T* rng, const int64_t totalSamples, T* dest, const int64_t distributions,
-    const int64_t categories, T* normDistPrefixSum, T* normDist) {
+    T* rng_data, const int64_t num_samples, T* out_data,
+    const int64_t num_distributions, const int64_t num_categories,
+    T* cumulative_probs, T* norm_probs_data) {
   // At the moment, each warp computes one sample value in the binary
   // search due to divergence. It seems possible to compute multiple
   // values and limit divergence though later on.
@@ -170,22 +109,23 @@ __global__ void sampleMultinomialWithReplacement(
   int idx = threadIdx.x + blockIdx.x * blockDim.x +
             blockIdx.y * gridDim.x * blockDim.x;
 
-  for (int curDist = blockIdx.y; curDist < distributions;
+  for (int curDist = blockIdx.y; curDist < num_distributions;
        curDist += gridDim.y) {
     for (int sample = blockIdx.x * blockDim.x + threadIdx.x;
-         sample < totalSamples; sample += blockDim.x * gridDim.x) {
+         sample < num_samples; sample += blockDim.x * gridDim.x) {
       // we are losing 3 out of 4 generated numbers but it's ok
       // this kernel is not very efficient anyway
 
       // T uniform_random = dist(rng);
-      T uniform_random = rng[sample + curDist * totalSamples];
+      T uniform_random = rng_data[sample + curDist * num_samples];
 
       // Find the bucket that a uniform sample lies in
-      int choice = binarySearchForMultinomial(
-          normDistPrefixSum + curDist * categories,
-          normDist + curDist * categories, categories, uniform_random);
+      int choice =
+          binarySearchFunctor(cumulative_probs + curDist * num_categories,
+                              norm_probs_data + curDist * num_categories,
+                              num_categories, uniform_random);
 
-      dest[sample + curDist * totalSamples] = choice;
+      out_data[sample + curDist * num_samples] = choice;
     }
   }
 }
@@ -198,14 +138,11 @@ class MultinomialOpKernel
     const auto x = ctx.Input<framework::Tensor>("X");
     auto out = ctx.Output<framework::Tensor>("Out");
 
-    // auto yokiout = ctx.Output<framework::Tensor>("yokiOut");
-
     const int64_t num_samples = ctx.Attr<int>("num_samples");
     const bool replacement = ctx.Attr<bool>("replacement");
 
     auto* in_data = x->data<T>();
     auto* out_data = out->mutable_data<T>(ctx.GetPlace());
-    // auto* yokiout_data = yokiout->mutable_data<T>(ctx.GetPlace());
 
     auto in_dims = x->dims();
     int64_t in_rank = in_dims.size();
@@ -215,10 +152,6 @@ class MultinomialOpKernel
     if (!replacement) {
       int in_data_numel = x->numel();
       int out_data_numel = out->numel();
-      // std::vector<T> cpu_in_data(in_data_numel);
-      // std::vector<T> cpu_out_data(out_data_numel);
-      // T cpu_in_data[in_data_numel];
-      // T cpu_out_data[out_data_numel];
 
       T* cpu_in_data = new T[in_data_numel];
       T* cpu_out_data = new T[out_data_numel];
@@ -226,10 +159,6 @@ class MultinomialOpKernel
       cudaMemcpy(cpu_in_data, in_data, in_data_numel * sizeof(T),
                  cudaMemcpyDeviceToHost);
 
-      VLOG(3) << "Print cpu_in_data " << cpu_in_data[0] << "\n";
-      VLOG(3) << "Print in_data_numel " << in_data_numel << "\n";
-      VLOG(3) << "Print out_data_numel " << out_data_numel << "\n";
-
       MultinomialFunctor<T>(cpu_out_data, cpu_in_data, num_samples,
                             replacement, num_categories, num_distributions);
       cudaMemcpy(out_data, cpu_out_data, out_data_numel * sizeof(T),
@@ -240,21 +169,9 @@ class MultinomialOpKernel
       return;
     }
 
-    // std::vector<T> sum_rows(num_distributions);
-    // SumArrayCUDAKernel<T>(in_data, sum_rows,)
-
-    VLOG(3) << "Print num_distributions " << num_distributions << "\n";
-
-    VLOG(3) << "Print num_categories " << num_categories << "\n";
-
VLOG(3) << "Print in_rank " << in_rank << "\n"; - framework::Tensor sum_rows_t; auto* sum_rows_data = sum_rows_t.mutable_data({num_distributions}, ctx.GetPlace()); - // auto* sum_rows_data = - // sum_rows_t->mutable_data(framework::make_ddim({num_distributions}), - // ctx.GetPlace()); auto& place = *ctx.template device_context() .eigen_device(); @@ -262,58 +179,34 @@ class MultinomialOpKernel if (num_distributions == 1) { auto eigen_input = framework::EigenVector::Flatten(*x); auto eigen_sum_rows = framework::EigenVector::From(sum_rows_t); - // auto eigen_sum_rows = framework::EigenScalar::From(sum_rows_t); eigen_sum_rows.device(place) = eigen_input.sum(Eigen::DSizes(1)) .eval() .reshape(Eigen::DSizes(sum_rows_t.dims()[0])); } else { auto eigen_input = framework::EigenMatrix::From(*x); - // auto eigen_sum_rows = framework::EigenVector::From(sum_rows_t); auto eigen_sum_rows = framework::EigenVector::From(sum_rows_t); eigen_sum_rows.device(place) = eigen_input.sum(Eigen::DSizes(1)); - // .eval() - // .reshape(Eigen::DSizes(sum_rows_t.dims()[0])); - // eigen_sum_rows.device(place) = - // eigen_input.sum().eval().reshape(Eigen::DSizes(1)); } - // std::vector in_data_norm(num_categories); framework::Tensor norm_probs_t; auto* norm_probs_data = norm_probs_t.mutable_data( {num_distributions, num_categories}, ctx.GetPlace()); - // dim3 grid(num_distributions); - // dim3 block(num_categories); - dim3 block(num_categories < 512 ? num_categories : 512); dim3 grid((num_categories - 1) / block.x + 1, num_distributions); NormalizeProbability< T><<>>( norm_probs_data, in_data, sum_rows_data); - // num_distributions can only be 1. - // std::vector cumulative_probs(num_categories); framework::Tensor cumulative_probs_t; auto* cumulative_probs = cumulative_probs_t.mutable_data( {num_distributions, num_categories}, ctx.GetPlace()); - // T cumulative_probs[num_categories]; dim3 block1(1); dim3 grid1(num_distributions); Cumsum<<>>( norm_probs_data, num_distributions, num_categories, cumulative_probs); - /* - dim3 block2(num_categories < 512 ? 
-    dim3 block2(num_categories < 512 ? num_categories : 512);
-    dim3 grid2((num_categories - 1) / block2.x + 1, num_distributions);
-    yokiFunc<T><<<grid2, block2, 0, ctx.cuda_device_context().stream()>>>(
-        cumulative_probs, yokiout_data);*/
-
-    // int64_t size = num_categories;
-    thrust::inclusive_scan(thrust::device, norm_probs_data,
-                           norm_probs_data + num_categories,
-                           cumulative_probs);
-    VLOG(3) << "Print cumsum " << cumulative_probs << "\n";
 
     if (replacement) {
@@ -336,24 +229,11 @@ class MultinomialOpKernel
           index_sequence_begin + num_distributions * num_samples, rng_data,
           RandomGeneratorCudaFunctor<T>(seed));
 
-      VLOG(3) << "Print enter\n";
-      // VLOG(3) << "Print size in_data " <<
-      //     sizeof(in_data) / sizeof(in_data[num_categories - 1]) << "\n";
-      // VLOG(3) << "Print norm_probs_data0 " <<
-      //     sizeof(norm_probs_data[num_categories - 1]) << "\n";
-
       sampleMultinomialWithReplacement<
           T><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
           rng_data, num_samples, out_data, num_distributions, num_categories,
           cumulative_probs, norm_probs_data);
-
-      VLOG(3) << "Print end\n" << out_data;
     }
-
-    VLOG(3) << "Print final end\n";
-
-    // MultinomialCudaFunctor(out_data, in_data, num_samples, replacement,
-    //                        num_categories, num_distributions);
   }
 };
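Note on the with-replacement path above: it is a three-stage pipeline. NormalizeProbability divides each row of the input by its row sum, Cumsum turns each normalized row into an inclusive prefix sum, and sampleMultinomialWithReplacement maps one uniform draw per sample to a category index via binarySearchFunctor. A minimal NumPy sketch of the same contract follows; the function and variable names in it are illustrative only, not part of the patch:

import numpy as np


def multinomial_with_replacement(probs, num_samples, rng=None):
    # Illustrative reference for the CUDA pipeline above, not part of the patch.
    rng = rng or np.random.default_rng(0)
    probs = np.asarray(probs, dtype=np.float64)
    norm_probs = probs / probs.sum(axis=-1, keepdims=True)  # NormalizeProbability
    cumulative_probs = np.cumsum(norm_probs, axis=-1)       # Cumsum kernel
    uniform = rng.random((probs.shape[0], num_samples))     # RandomGeneratorCudaFunctor
    # side='left' returns the first bucket whose cumulative probability
    # reaches the draw, matching the kernel's `midVal < val` branch.
    return np.stack([
        np.searchsorted(cdf, u, side='left')
        for cdf, u in zip(cumulative_probs, uniform)
    ])


samples = multinomial_with_replacement([[0.2, 0.3, 0.5]], num_samples=8)

One subtlety the sketch glosses over: binarySearchFunctor additionally walks left past zero-probability categories when a draw of exactly 1 lands past the last non-zero bucket (the pytorch issue #4858 case handled above).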
diff --git a/python/paddle/fluid/tests/unittests/test_multinomial_op.py b/python/paddle/fluid/tests/unittests/test_multinomial_op.py
index 32d438eabf8777821327964828cd944a5a32b36a..b373af7c4b95ee8cab7827dd99286117270e8591 100644
--- a/python/paddle/fluid/tests/unittests/test_multinomial_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multinomial_op.py
@@ -126,6 +126,49 @@ class TestMultinomialApi(unittest.TestCase):
                 sample_prob, prob, rtol=0, atol=0.01),
             "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob))
 
+    def test_dygraph2(self):
+        paddle.disable_static()
+        x = paddle.rand([3, 4])
+        out = paddle.multinomial(x, num_samples=100000, replacement=True)
+        x_numpy = x.numpy()
+
+        out_list = np.split(out.numpy(), 3, axis=0)
+        count_array = [0] * 3
+        for i in range(3):
+            count_array[i] = np.unique(
+                out_list[i], return_counts=True)[1].astype("float32")
+        sample_prob = np.stack(count_array, axis=0)
+        sample_prob /= sample_prob.sum(axis=-1, keepdims=True)
+
+        prob = x_numpy / x_numpy.sum(axis=-1, keepdims=True)
+        self.assertTrue(
+            np.allclose(
+                sample_prob, prob, rtol=0, atol=0.01),
+            "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob))
+        paddle.enable_static()
+
+    def test_dygraph3(self):
+        paddle.disable_static()
+        x = paddle.rand([1000])
+        out = paddle.multinomial(x, num_samples=100, replacement=False)
+        x_numpy = x.numpy()
+
+        unique_out = np.unique(out.numpy())
+        self.assertEqual(
+            len(unique_out), 100,
+            "when replacement is False, categories cannot be sampled repeatedly")
+        paddle.enable_static()
+
+    """
+    def test_replacement_error(self):
+        def test_error():
+            paddle.disable_static()
+            x = paddle.rand([5])
+            out = paddle.multinomial(x, num_samples=10, replacement=False)
+
+        self.assertRaises(OutOfRangeError, test_error)  # not OutOfRangeError
+    """
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index 52b645c659a78709841ccaedde6b445489e76dae..49a63ded683c9584a1ddca1c7d3dcbf3ff3d3674 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define random functions 
+# TODO: define random functions
 
 from ..fluid import core
 from ..fluid.framework import in_dygraph_mode, Variable, convert_np_dtype_to_dtype_
@@ -40,18 +40,18 @@ def bernoulli(x, name=None):
     This OP returns a Tensor filled with random binary(0 or 1) number from a Bernoulli
     distribution. The input ``x`` is a tensor with probabilities for generating the
     random binary number. Each element in ``x`` should be in [0, 1], and the out is
     generated by:
-    
+
     .. math::
 
         out_i ~ Bernoulli (x_i)
 
     Args:
-        x(Tensor): A tensor with probabilities for generating the random binary number. The data type 
+        x(Tensor): A tensor with probabilities for generating the random binary number. The data type
             should be float32, float64.
         name(str, optional): The default value is None. Normally there is no
             need for user to set this property. For more information, please
             refer to :ref:`api_guide_Name`.
-    Returns: 
+    Returns:
         Tensor: A Tensor filled with random binary number with the same shape and dtype as ``x``.
 
     Examples:
@@ -80,7 +80,7 @@ def bernoulli(x, name=None):
 
     helper = LayerHelper("randint", **locals())
     out = helper.create_variable_for_type_inference(
-        dtype=x.dtype)  # maybe set out to int32 ? 
+        dtype=x.dtype)  # maybe set out to int32 ?
     helper.append_op(
         type='bernoulli', inputs={"X": x}, outputs={'Out': out}, attrs={})
     return out
@@ -88,8 +88,23 @@ def bernoulli(x, name=None):
 
 def multinomial(x, num_samples=1, replacement=False, name=None):
     """
+    This OP returns a Tensor filled with random values sampled from a Multinomial
+    distribution. The input ``x`` is a tensor with probabilities for generating the
+    random number. Each element in ``x`` should be greater than or equal to 0, and
+    the elements must not all be 0. If ``replacement`` is True, a category can be
+    sampled more than once; otherwise each category can be sampled at most once.
+
+    Args:
+        x(Tensor): A tensor with probabilities for generating the random number. The data type
+            should be float32, float64.
+        num_samples(int, optional): Number of samples, default is 1.
+        replacement(bool, optional): Whether to sample with replacement, default is False.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
+    Returns:
+        Tensor: A Tensor of sampled category indices, sampled ``num_samples`` times
+            for each distribution.
-
     Examples:
         .. code-block:: python
@@ -97,15 +112,24 @@ def multinomial(x, num_samples=1, replacement=False, name=None):
 
             paddle.disable_static()
 
-            x = paddle.rand([2, 3])
+            x = paddle.rand([2,4])
             print(x.numpy())
-            # [[0.11272584 0.3890902  0.7730957 ]
-            #  [0.10351662 0.8510418  0.63806665]]
+            # [[0.7713825  0.4055941  0.433339   0.70706886]
+            #  [0.9223313  0.8519825  0.04574518 0.16560672]]
 
-            out = paddle.bernoulli(x)
-            print(out.numpy())
-            # [[0. 0. 1.]
-            #  [0. 0. 1.]]
+            out1 = paddle.multinomial(x, num_samples=5, replacement=True)
+            print(out1.numpy())
+            # [[3. 3. 1. 1. 0.]
+            #  [0. 0. 0. 0. 1.]]
+
+            out2 = paddle.multinomial(x, num_samples=5)
+            # OutOfRangeError: When replacement is False, number of samples
+            #  should be less than non-zero categories
+
+            out3 = paddle.multinomial(x, num_samples=3)
+            print(out3.numpy())
+            # [[0. 2. 3.]
+            #  [0. 1. 3.]]
 
     """
@@ -152,7 +176,7 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None):
     Returns:
         Tensor: A Tensor filled with random values sampled from a Gaussian
-            distribution, with ``shape`` and ``dtype``. 
+            distribution, with ``shape`` and ``dtype``.
""" op_type_for_check = 'gaussian/standard_normal/randn/normal' seed = 0 @@ -393,7 +417,7 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): Examples: .. code-block:: python - + import paddle paddle.disable_static() @@ -481,7 +505,7 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - Returns: + Returns: Tensor: A Tensor filled with random integers from a discrete uniform distribution in the range [``low``, ``high``), with ``shape`` and ``dtype``. @@ -591,7 +615,7 @@ def randperm(n, dtype="int64", name=None): out2 = paddle.randperm(7, 'int32') # [1, 6, 2, 0, 4, 3, 5] # random - + """ if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype)