From ada787db757e34477de155040ea1c73362017c7b Mon Sep 17 00:00:00 2001
From: Chengmo
Date: Thu, 9 Apr 2020 12:47:50 +0800
Subject: [PATCH] Cherry-pick tdm_sampler op in Contrib (#23598)

* cherry-pick tdm_sampler
---
 paddle/fluid/operators/tdm_sampler_op.cc   | 137 ++++++++
 paddle/fluid/operators/tdm_sampler_op.h    | 329 ++++++++++++++++++
 python/paddle/fluid/contrib/layers/nn.py   | 211 +++++++++++
 .../tests/unittests/test_tdm_sampler_op.py | 291 ++++++++++++++++
 4 files changed, 968 insertions(+)
 create mode 100644 paddle/fluid/operators/tdm_sampler_op.cc
 create mode 100644 paddle/fluid/operators/tdm_sampler_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py

diff --git a/paddle/fluid/operators/tdm_sampler_op.cc b/paddle/fluid/operators/tdm_sampler_op.cc
new file mode 100644
index 00000000000..f91acc0420c
--- /dev/null
+++ b/paddle/fluid/operators/tdm_sampler_op.cc
@@ -0,0 +1,137 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/tdm_sampler_op.h"
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/sampler.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+
+class TDMSamplerOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X",
+             "X(Tensor), Input variable which"
+             " maps to the leaf node idx of the tdm tree,"
+             " dtype support int32/int64");
+    AddInput("Travel",
+             "Travel(Tensor), must have the same dtype as Layer."
+             " Contains path information of all leaf nodes to the root node,"
+             " dtype support int32/int64");
+    AddInput("Layer",
+             "Layer(Tensor), must have the same dtype as Travel."
+             " Indicates which nodes are in each layer");
+    AddAttr<bool>("output_positive",
+                  "output_positive(bool)"
+                  " Whether positive samples are included in the output")
+        .SetDefault(true);
+    AddAttr<std::vector<int>>(
+        "neg_samples_num_list",
+        "neg_samples_num_list(python:list[int], C++:vector<int>)"
+        " The num of negative samples in each layer")
+        .SetDefault({});
+    AddAttr<std::vector<int>>("layer_offset_lod",
+                              "offset lod information of Layer")
+        .SetDefault({});
+    AddAttr<int>("seed",
+                 "(int) The seed used in sampler. If it is 0, "
+                 "the sampler will generate a seed randomly.")
+        .SetDefault(0);
+    AddAttr<int>("dtype",
+                 "(int, default INT32) "
+                 "Output data type.")
+        .SetDefault(2);
+    AddOutput("Out",
+              "Sampling result LoDTensor, with shape [batch_size, layer_num, "
+              "neg_num_of_layer]");
+    AddOutput("Labels",
+              "Labels of sampling result, has the same shape as Out."
+              " Positive samples map to value 1, negative samples to value 0")
+        .AsDispensable();
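+    // Layout note (illustrative, not part of the op definition): the last
+    // dimension of Out/Labels/Mask is flattened over layers. For example,
+    // with neg_samples_num_list = [1, 2] and output_positive = true, each
+    // row holds (1 + 1) + (2 + 1) = 5 sampled node ids.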
+ "pos samples mapping value 1, neg sample mapping value 0") + .AsDispensable(); + AddOutput( + "Mask", + "Padding flag of Sampling result, if sampling res comes from padding," + "it will be 0, else 1, lodTensor, with shape [batch_size, " + "layer_num, neg_num_of_layer]"); + AddComment(R"DOC(" + **TDM Sampler** + According to the input positive samples at leaf node, do negative sampling layer by layer on the given tree.")DOC"); + } +}; + +class TDMSamplerOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + platform::errors::InvalidArgument( + "Inputs(Input) of TdmSampler should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Travel"), true, + platform::errors::InvalidArgument( + "Inputs(Travel) of TdmSampler should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Layer"), true, + platform::errors::InvalidArgument( + "Inputs(Layer) of TdmSampler should not be null.")); + auto neg_samples_num_vec = + ctx->Attrs().Get>("neg_samples_num_list"); + auto output_positive_flag = ctx->Attrs().Get("output_positive"); + + int64_t sample_res_length = 0; + for (auto sample_nums : neg_samples_num_vec) { + sample_res_length += sample_nums + (int64_t)output_positive_flag; + } + + auto input_dims = ctx->GetInputDim("X"); + auto ddim = framework::make_ddim({-1, sample_res_length}); + if (ctx->IsRuntime()) { + auto output_dims = framework::vectorize(input_dims); + auto batch_size = output_dims[0]; + ctx->SetOutputDim("Out", + framework::make_ddim({batch_size, sample_res_length})); + ctx->SetOutputDim("Labels", + framework::make_ddim({batch_size, sample_res_length})); + ctx->SetOutputDim("Mask", + framework::make_ddim({batch_size, sample_res_length})); + } else { + ctx->SetOutputDim("Out", ddim); + ctx->SetOutputDim("Labels", ddim); + ctx->SetOutputDim("Mask", ddim); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + tdm_sampler, ops::TDMSamplerOp, ops::TDMSamplerOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + tdm_sampler, ops::TDMSamplerKernel, + ops::TDMSamplerKernel, + ops::TDMSamplerKernel, + ops::TDMSamplerKernel); diff --git a/paddle/fluid/operators/tdm_sampler_op.h b/paddle/fluid/operators/tdm_sampler_op.h new file mode 100644 index 00000000000..76a1df96eac --- /dev/null +++ b/paddle/fluid/operators/tdm_sampler_op.h @@ -0,0 +1,329 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+template <typename T, typename OutT>
+void TDMSamplerInner(const framework::ExecutionContext &context,
+                     const LoDTensor &input_tensor,
+                     const LoDTensor &travel_lod_tensor,
+                     const LoDTensor &layer_lod_tensor, LoDTensor *out_tensor,
+                     LoDTensor *label_tensor, LoDTensor *mask_tensor) {
+  auto neg_samples_num_vec =
+      context.Attr<std::vector<int>>("neg_samples_num_list");
+  auto layer_offset_lod = context.Attr<std::vector<int>>("layer_offset_lod");
+  auto output_positive_flag = context.Attr<bool>("output_positive");
+
+  // get dimension
+  int input_ids_num = input_tensor.numel();
+  VLOG(3) << "TDM: input ids nums: " << input_ids_num;
+  auto layer_nums = neg_samples_num_vec.size();
+  VLOG(3) << "TDM: tree layer nums: " << layer_nums;
+
+  int sample_res_length = 0;
+  for (size_t layer_idx = 0; layer_idx < layer_nums; ++layer_idx) {
+    sample_res_length += (neg_samples_num_vec[layer_idx] +
+                          static_cast<int>(output_positive_flag));
+  }
+  VLOG(3) << "TDM: sample res length: " << sample_res_length;
+
+  auto travel_dim = travel_lod_tensor.dims();
+  auto total_sample_nums = input_ids_num * sample_res_length;
+
+  // get all data
+  auto *input_data = input_tensor.data<T>();
+  auto *travel_data = travel_lod_tensor.data<T>();
+  auto *layer_data = layer_lod_tensor.data<T>();
+
+  OutT zero = 0;
+  OutT one = 1;
+  std::vector<OutT> output_vec(total_sample_nums, zero);
+  std::vector<OutT> label_vec(total_sample_nums, zero);
+  std::vector<OutT> mask_vec(total_sample_nums, one);
+
+  VLOG(3) << "End get input & output data";
+  // generate uniform sampler
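+  // Each layer owns one uniform sampler over [0, layer_node_nums - 1]; the
+  // sampled value is an offset into that layer's slice of the Layer data,
+  // not a node id itself.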
Please check input " + "value.", + travel_dim[0], input_id)); + + VLOG(3) << "TDM: input id: " << input_id; + int start_offset = static_cast(input_id * layer_nums); + VLOG(3) << "TDM: Start offset(input_id * layer_nums): " << start_offset; + // nce sample, layer by layer + int offset = 0; + for (size_t layer_idx = 0; layer_idx < layer_nums; ++layer_idx) { + int sample_num = neg_samples_num_vec[layer_idx]; + VLOG(3) << "TDM: Sample num: " << sample_num; + + int node_nums = + layer_offset_lod[layer_idx + 1] - layer_offset_lod[layer_idx]; + VLOG(3) << "TDM: layer - " << layer_idx + 1 + << " - has node_nums: " << node_nums; + + PADDLE_ENFORCE_LE( + sample_num, node_nums - 1, + platform::errors::InvalidArgument( + "Neg sample nums id of OP(fluid.layers.tdm_sampler) at layer %ld " + "expected <= %ld - 1 (positive included), but got %ld. Please " + "check neg_samples_num_list.", + layer_idx, node_nums, sample_num)); + + int node_id_min = layer_offset_lod[layer_idx]; + int node_id_max = layer_offset_lod[layer_idx + 1]; + + OutT positive_node_id = + static_cast(travel_data[start_offset + layer_idx]); + + if (positive_node_id == 0) { + // skip padding + VLOG(3) << "TDM: Skip padding "; + for (int sample_index = 0; + sample_index < sample_num + static_cast(output_positive_flag); + sample_index++) { + output_vec[i * sample_res_length + offset] = 0; + label_vec[i * sample_res_length + offset] = 0; + mask_vec[i * sample_res_length + offset] = 0; + VLOG(3) << "TDM: Res append positive " + << output_vec[i * sample_res_length + offset] + << " Label append positive " + << label_vec[i * sample_res_length + offset] + << " Mask append value " + << mask_vec[i * sample_res_length + offset]; + offset += 1; + } + continue; + } + + PADDLE_ENFORCE_LE( + positive_node_id, node_id_max, + platform::errors::InvalidArgument( + "Positive node id of OP(fluid.layers.tdm_sampler) at layer %ld " + "expected >= %ld and <= %ld, but got %ld. Please check input " + "value.", + layer_idx, node_id_min, node_id_max, positive_node_id)); + PADDLE_ENFORCE_LE( + node_id_min, positive_node_id, + platform::errors::InvalidArgument( + "Positive node id of OP(fluid.layers.tdm_sampler) at layer %ld " + "expected >= %ld and <= %ld, but got %ld. 
Please check input " + "value.", + layer_idx, node_id_min, node_id_max, positive_node_id)); + + // If output positive, add itself + if (output_positive_flag) { + output_vec[i * sample_res_length + offset] = positive_node_id; + label_vec[i * sample_res_length + offset] = 1; + mask_vec[i * sample_res_length + offset] = 1; + VLOG(3) << "TDM: node id: " << positive_node_id << " Res append " + << output_vec[i * sample_res_length + offset] + << " Label append " + << label_vec[i * sample_res_length + offset] << " Mask append " + << mask_vec[i * sample_res_length + offset]; + offset += 1; + } + std::vector sample_res_vec{}; + // Sampling at layer, until samples enough + for (int sample_index = 0; sample_index < sample_num; ++sample_index) { + // Avoid sampling to positive samples + int sample_res = 0; + do { + sample_res = sampler_vec[layer_idx]->Sample(); + } while (positive_node_id == + layer_data[layer_offset_lod[layer_idx] + sample_res] || + find(sample_res_vec.begin(), sample_res_vec.end(), + sample_res) != sample_res_vec.end()); + sample_res_vec.push_back(sample_res); + + output_vec[i * sample_res_length + offset] = static_cast( + layer_data[layer_offset_lod[layer_idx] + sample_res]); + label_vec[i * sample_res_length + offset] = 0; + mask_vec[i * sample_res_length + offset] = 1; + VLOG(3) << "TDM: node id: " << travel_data[start_offset + layer_idx] + << " Res append negitive " + << output_vec[i * sample_res_length + offset] + << " Label append negitive " + << label_vec[i * sample_res_length + offset] + << " Mask append value " + << mask_vec[i * sample_res_length + offset]; + + PADDLE_ENFORCE_LE( + layer_data[layer_offset_lod[layer_idx] + sample_res], node_id_max, + platform::errors::InvalidArgument( + "Negative node id of OP(fluid.layers.tdm_sampler) at layer %ld" + "expected >= %ld and <= %ld, but got %ld. 
Please check input " + "tdm tree structure and tdm travel info.", + layer_idx, node_id_min, node_id_max, + layer_data[layer_offset_lod[layer_idx] + sample_res])); + + offset += 1; + } // end layer nce + } // end one input nce + } // end all input nce + + auto *output_data = out_tensor->mutable_data(context.GetPlace()); + auto *label_data = label_tensor->mutable_data(context.GetPlace()); + auto *mask_data = mask_tensor->mutable_data(context.GetPlace()); + + memcpy(output_data, &output_vec[0], sizeof(OutT) * total_sample_nums); + memcpy(label_data, &label_vec[0], sizeof(OutT) * total_sample_nums); + memcpy(mask_data, &mask_vec[0], sizeof(OutT) * total_sample_nums); + + for (size_t layer_index = 0; layer_index < layer_nums; layer_index++) { + delete sampler_vec[layer_index]; + } +} + +template +class TDMSamplerKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *input_var = context.InputVar("X"); + auto *travel_var = context.InputVar("Travel"); + auto *layer_var = context.InputVar("Layer"); + + // get all tensor + auto &input_tensor = input_var->Get(); + auto &travel_lod_tensor = travel_var->Get(); + auto &layer_lod_tensor = layer_var->Get(); + + const auto &input_type = input_tensor.type(); + bool input_type_match = input_type == framework::proto::VarType::INT32 || + input_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE_EQ(input_type_match, true, + platform::errors::InvalidArgument( + "Input(X) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString(input_type), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT32), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT64))); + + const auto &travel_type = travel_lod_tensor.type(); + bool travel_type_match = travel_type == framework::proto::VarType::INT32 || + travel_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE_EQ( + travel_type_match, true, + platform::errors::InvalidArgument( + "Input(Travel) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString(travel_type), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT32), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT64))); + + const auto &layer_type = layer_lod_tensor.type(); + bool layer_type_match = layer_type == framework::proto::VarType::INT32 || + layer_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE_EQ(layer_type_match, true, + platform::errors::InvalidArgument( + "Input(Layer) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString(layer_type), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT32), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT64))); + PADDLE_ENFORCE_EQ( + travel_type, layer_type, + platform::errors::InvalidArgument( + "Input(Travel) must holds the same type with " + "Input(Layer), but Travel holds %s, and Layer holds %s", + paddle::framework::DataTypeToString(travel_type), + paddle::framework::DataTypeToString(layer_type))); + + auto *out_var = context.OutputVar("Out"); + auto *label_var = context.OutputVar("Labels"); + auto *mask_var = context.OutputVar("Mask"); + auto *out_tensor = out_var->GetMutable(); + auto *label_tensor = label_var->GetMutable(); + auto *mask_tensor = mask_var->GetMutable(); + + auto output_type = static_cast( + context.Attr("dtype")); + + if (travel_type == 
+    if (travel_type == framework::proto::VarType::INT32 &&
+        output_type == framework::proto::VarType::INT32) {
+      TDMSamplerInner<int, int>(context, input_tensor, travel_lod_tensor,
+                                layer_lod_tensor, out_tensor, label_tensor,
+                                mask_tensor);
+    } else if (travel_type == framework::proto::VarType::INT64 &&
+               output_type == framework::proto::VarType::INT32) {
+      TDMSamplerInner<int64_t, int>(context, input_tensor, travel_lod_tensor,
+                                    layer_lod_tensor, out_tensor,
+                                    label_tensor, mask_tensor);
+    } else if (travel_type == framework::proto::VarType::INT32 &&
+               output_type == framework::proto::VarType::INT64) {
+      TDMSamplerInner<int, int64_t>(context, input_tensor, travel_lod_tensor,
+                                    layer_lod_tensor, out_tensor,
+                                    label_tensor, mask_tensor);
+    } else if (travel_type == framework::proto::VarType::INT64 &&
+               output_type == framework::proto::VarType::INT64) {
+      TDMSamplerInner<int64_t, int64_t>(
+          context, input_tensor, travel_lod_tensor, layer_lod_tensor,
+          out_tensor, label_tensor, mask_tensor);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py
index e9cb832e26b..066e48db20e 100644
--- a/python/paddle/fluid/contrib/layers/nn.py
+++ b/python/paddle/fluid/contrib/layers/nn.py
@@ -27,6 +27,7 @@ from ... import unique_name
 from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
 from paddle.fluid.data_feeder import check_type, check_dtype, convert_dtype
 from paddle.fluid.framework import Variable, convert_np_dtype_to_dtype_
+from paddle.fluid.layers import slice, reshape
 
 __all__ = [
     'fused_elemwise_activation',
@@ -39,6 +40,7 @@ __all__ = [
     'search_pyramid_hash',
     'shuffle_batch',
     'tdm_child',
+    'tdm_sampler',
 ]
 
 
@@ -897,3 +899,212 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'):
                 'dtype': c_dtype},
         stop_gradient=True)
     return (child, leaf_mask)
+
+
+def tdm_sampler(x,
+                neg_samples_num_list,
+                layer_node_num_list,
+                leaf_node_num,
+                tree_travel_attr=None,
+                tree_layer_attr=None,
+                output_positive=True,
+                output_list=True,
+                seed=0,
+                tree_dtype='int32',
+                dtype='int32'):
+    """
+    **Tdm Sampler**
+    According to the input positive samples at the leaf nodes (x), do negative sampling layer by layer on the given tree.
+
+    .. code-block:: text
+
+        Given:
+            tree = [[0], [1, 2], [3, 4], [5, 6]] # A binary tree with seven nodes
+            travel_list = [[1, 3], [1, 4], [2, 5], [2, 6]] # leaf node's travel path (excluding the root node)
+            layer_list = [[1, 2], [3, 4, 5, 6]] # two layers (excluding the root node)
+
+            x = [[0], [1], [2], [3]] # Corresponding to leaf node [[3], [4], [5], [6]]
+            neg_samples_num_list = [0, 0] # negative sample nums = 0
+            layer_node_num_list = [2, 4]
+            leaf_node_num = 4
+            output_list = False
+
+        we get:
+            out = [[1, 3], [1, 4], [2, 5], [2, 6]]
+            labels = [[1, 1], [1, 1], [1, 1], [1, 1]]
+            mask = [[1, 1], [1, 1], [1, 1], [1, 1]]
+
+    Args:
+        x (Variable): Variable that contains the item_id (corresponding to a leaf node) information, dtype support int32/int64.
+        neg_samples_num_list (list(int)): Number of negative samples per layer.
+        layer_node_num_list (list(int)): Number of nodes per layer, must have the same length as neg_samples_num_list.
+        leaf_node_num (int): Number of leaf nodes.
+        tree_travel_attr (ParamAttr): To specify the tdm-travel parameter property. Default: None, which means the
+            default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr`, should
+            have shape (leaf_node_num, len(layer_node_num_list)), dtype support int32/int64.
+        tree_layer_attr (ParamAttr): To specify the tdm-layer parameter property. Default: None, which means the
+            default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr`, should
+            have shape (node_num, 1), dtype support int32/int64.
+        output_positive (bool): Whether to output positive samples (including label and mask) at the same time.
+        output_list (bool): Whether to divide the output into layers and organize it into list format.
+        seed (int): The random seed.
+        tree_dtype (np.dtype|core.VarDesc.VarType|str): The dtype of tdm-travel and tdm-layer, support int32/int64.
+        dtype (np.dtype|core.VarDesc.VarType|str): The dtype of the output (sampling results, labels and masks).
+
+    Returns:
+        tuple: A tuple including sampling results, corresponding labels and masks. If output_positive = True, the sampling
+            result includes both positive and negative samples. A positive sample gets label 1, a negative sample gets
+            label 0. If the tree is unbalanced, in order to keep the shape of the sampling result consistent, a padding
+            sample gets mask = 0 and a real sample gets mask = 1.
+            If output_list = True, the result is organized into list format specified by the layer information.
+            Output variables have the same type as the tdm-travel and tdm-layer parameters (tree_dtype).
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import numpy as np
+            x = fluid.data(name="x", shape=[None, 1], dtype="int32", lod_level=1)
+            travel_list = [[1, 3], [1, 4], [2, 5], [2, 6]] # leaf node's travel path, shape(leaf_node_num, layer_num)
+            layer_list_flat = [[1], [2], [3], [4], [5], [6]] # shape(node_nums, 1)
+
+            neg_samples_num_list = [0, 0] # negative sample nums = 0
+            layer_node_num_list = [2, 4] # two layers (excluding the root node)
+            leaf_node_num = 4
+
+            travel_array = np.array(travel_list)
+            layer_array = np.array(layer_list_flat)
+
+            sample, label, mask = fluid.contrib.layers.tdm_sampler(
+                x,
+                neg_samples_num_list,
+                layer_node_num_list,
+                leaf_node_num,
+                tree_travel_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.NumpyArrayInitializer(
+                        travel_array)),
+                tree_layer_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.NumpyArrayInitializer(
+                        layer_array)),
+                output_positive=True,
+                output_list=True,
+                seed=0,
+                tree_dtype='int32')
+
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            xx = np.array([[0],[1]]).reshape((2,1)).astype("int32")
+
+            exe.run(feed={"x":xx})
+
+    """
+    helper = LayerHelper("tdm_sampler", **locals())
+    check_dtype(tree_dtype, 'tree_dtype', ['int32', 'int64'],
+                'fluid.contrib.layers.tdm_sampler')
+    check_dtype(dtype, 'dtype', ['int32', 'int64'],
+                'fluid.contrib.layers.tdm_sampler')
+    c_dtype = convert_np_dtype_to_dtype_(dtype)
+
+    if len(neg_samples_num_list) != len(layer_node_num_list):
+        raise ValueError(
+            "The length of the negative samples list must match the number of layers. "
+            "But received len of neg_samples_num_list: {}, "
+            "and len of layer_node_num_list: {}, please check your input.".
+            format(len(neg_samples_num_list), len(layer_node_num_list)))
+    assert leaf_node_num is not None, "leaf_node_num should not be None here."
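+
+    # Offset bookkeeping (explanatory note, not in the original source): with
+    # layer_node_num_list = [2, 4], tree_layer_offset_lod becomes [0, 2, 6],
+    # so layer i occupies rows [offset[i], offset[i + 1]) of the flattened
+    # tdm-layer parameter created below.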
+    layer_nums = 0
+    node_nums = 0
+    tree_layer_offset_lod = [0]
+    for layer_idx, layer_node_num in enumerate(layer_node_num_list):
+        layer_nums += 1
+        node_nums += layer_node_num
+        tree_layer_offset_lod.append(node_nums)
+        if neg_samples_num_list[layer_idx] >= layer_node_num_list[layer_idx]:
+            raise ValueError(
+                "The number of negative samples must be less than the number of nodes "
+                "in layer {}. But received negative nums {}, and num of node at layer {} "
+                "is {}, please check your input.".format(
+                    layer_idx, neg_samples_num_list[
+                        layer_idx], layer_idx, layer_node_num_list[layer_idx]))
+    assert leaf_node_num < node_nums, "leaf_node_num must be less than total node nums."
+
+    travel_shape = [leaf_node_num, layer_nums]
+    travel = helper.create_parameter(
+        attr=tree_travel_attr,
+        shape=travel_shape,
+        dtype=tree_dtype,
+        default_initializer=Constant(0))
+
+    layer_shape = [node_nums, 1]
+    layer = helper.create_parameter(
+        attr=tree_layer_attr,
+        shape=layer_shape,
+        dtype=tree_dtype,
+        default_initializer=Constant(0))
+
+    out = helper.create_variable_for_type_inference(dtype=dtype)
+    out.stop_gradient = True
+
+    labels = helper.create_variable_for_type_inference(dtype=dtype)
+    labels.stop_gradient = True
+
+    mask = helper.create_variable_for_type_inference(dtype=dtype)
+    mask.stop_gradient = True
+
+    helper.append_op(
+        type='tdm_sampler',
+        inputs={"X": x,
+                "Travel": travel,
+                "Layer": layer},
+        outputs={'Out': out,
+                 'Labels': labels,
+                 'Mask': mask},
+        attrs={
+            'neg_samples_num_list': neg_samples_num_list,
+            'output_positive': output_positive,
+            'layer_offset_lod': tree_layer_offset_lod,
+            'seed': seed,
+            'dtype': c_dtype
+        })
+
+    if output_list:
+        output_list = []
+        labels_list = []
+        mask_list = []
+        start_offset = 0
+        positive_flag = 1
+        if not output_positive:
+            positive_flag = 0
+
+        for layer_sample_num in neg_samples_num_list:
+            end_offset = start_offset + \
+                layer_sample_num + positive_flag
+            layer_samples = slice(
+                out, axes=[1], starts=[start_offset], ends=[end_offset])
+            layer_labels = slice(
+                labels, axes=[1], starts=[start_offset], ends=[end_offset])
+            layer_mask = slice(
+                mask, axes=[1], starts=[start_offset], ends=[end_offset])
+
+            layer_samples = reshape(layer_samples,
+                                    [-1, layer_sample_num + positive_flag, 1])
+            layer_samples.stop_gradient = True
+
+            layer_labels = reshape(layer_labels,
+                                   [-1, layer_sample_num + positive_flag, 1])
+            layer_labels.stop_gradient = True
+
+            layer_mask = reshape(layer_mask,
+                                 [-1, layer_sample_num + positive_flag, 1])
+            layer_mask.stop_gradient = True
+
+            output_list.append(layer_samples)
+            labels_list.append(layer_labels)
+            mask_list.append(layer_mask)
+            start_offset = end_offset
+
+        out = output_list
+        labels = labels_list
+        mask = mask_list
+
+    return (out, labels, mask)
diff --git a/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py b/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py
new file mode 100644
index 00000000000..e245529edc6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py
@@ -0,0 +1,291 @@
+# -*-coding:utf-8-*-
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle.fluid.layers as layers
+import paddle.fluid as fluid
+import random
+import six
+from sys import version_info
+
+
+def create_tdm_travel():
+    tree_travel = [[1, 3, 7, 14], [1, 3, 7, 15], [1, 3, 8, 16], [1, 3, 8, 17],
+                   [1, 4, 9, 18], [1, 4, 9, 19], [1, 4, 10, 20],
+                   [1, 4, 10, 21], [2, 5, 11, 22], [2, 5, 11, 23],
+                   [2, 5, 12, 24], [2, 5, 12, 25], [2, 6, 13, 0]]
+    return tree_travel
+
+
+def create_tdm_layer():
+    tree_layer = [[1, 2], [3, 4, 5, 6], [7, 8, 9, 10, 11, 12, 13],
+                  [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]]
+    return tree_layer
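+
+
+# Fixture note (comment added for readability): the tree above has 25 real
+# nodes in four layers below the root; the last leaf (travel row
+# [2, 6, 13, 0]) sits one level higher than the others, so its deepest
+# ancestor slot is the padding id 0.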
+type_dict = {
+    "int32": int(core.VarDesc.VarType.INT32),
+    "int64": int(core.VarDesc.VarType.INT64)
+}
+
+
+class TestTDMSamplerOp(OpTest):
+    def setUp(self):
+        self.__class__.op_type = "tdm_sampler"
+        self.config()
+
+        self.tree_travel = create_tdm_travel()
+        self.tree_layer = create_tdm_layer()
+
+        output_0 = self.x_shape[0]
+        output_1 = len(self.neg_samples_num_list) + \
+            np.sum(self.neg_samples_num_list)
+        self.output_shape = (output_0, output_1)
+        self.layer_sample_nums = [1 + i for i in self.neg_samples_num_list]
+
+        layer_node_num_list = [len(i) for i in self.tree_layer]
+        tree_layer_offset_lod = [0]
+        tree_layer_flat = []
+        node_nums = 0
+        for layer_idx, layer_node in enumerate(layer_node_num_list):
+            tree_layer_flat += self.tree_layer[layer_idx]
+            node_nums += layer_node
+            tree_layer_offset_lod.append(node_nums)
+
+        travel_np = np.array(self.tree_travel).astype(self.tree_dtype)
+        layer_np = np.array(tree_layer_flat).astype(self.tree_dtype)
+        layer_np = layer_np.reshape([-1, 1])
+
+        self.x_np = np.random.randint(
+            low=0, high=13, size=self.x_shape).astype(self.x_type)
+
+        out = np.random.random(self.output_shape).astype(self.out_dtype)
+        label = np.random.random(self.output_shape).astype(self.out_dtype)
+        mask = np.random.random(self.output_shape).astype(self.out_dtype)
+
+        self.attrs = {
+            'neg_samples_num_list': self.neg_samples_num_list,
+            'output_positive': True,
+            'layer_offset_lod': tree_layer_offset_lod,
+            'seed': 0,
+            'dtype': type_dict[self.out_dtype]
+        }
+        self.inputs = {'X': self.x_np, 'Travel': travel_np, 'Layer': layer_np}
+        self.outputs = {'Out': out, 'Labels': label, 'Mask': mask}
+
+    def config(self):
+        """set test shape & type"""
+        self.neg_samples_num_list = [0, 0, 0, 0]
+        self.x_shape = (10, 1)
+        self.x_type = 'int32'
+        self.tree_dtype = 'int32'
+        self.out_dtype = 'int32'
+
+    def test_check_output(self):
+        places = self._get_places()
+        for place in places:
+            outs, fetch_list = self._calc_output(place)
+            self.out = [np.array(out) for out in outs]
+
+            x_res = self.out[fetch_list.index('Out')]
+            label_res = self.out[fetch_list.index('Labels')]
+            mask_res = self.out[fetch_list.index('Mask')]
+
+            # check dtype
+            if self.out_dtype == 'int32':
+                assert x_res.dtype == np.int32
+                assert label_res.dtype == np.int32
+                assert mask_res.dtype == np.int32
+            elif self.out_dtype == 'int64':
+                assert x_res.dtype == np.int64
+                assert label_res.dtype == np.int64
+                assert mask_res.dtype == np.int64
+
+            x_res = x_res.reshape(self.output_shape)
+            label_res = label_res.reshape(self.output_shape)
+            mask_res = mask_res.reshape(self.output_shape)
+
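+            # Validation sketch (comment added for readability): walk each
+            # row layer by layer; every layer slice must contain unique ids
+            # unless padded, be drawn from that layer's node set, carry label
+            # 1 only at the positive position, and carry mask 0 on padding.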
+            layer_nums = len(self.neg_samples_num_list)
+            for batch_ids, x_batch in enumerate(x_res):
+                start_offset = 0
+                positive_travel = []
+                for layer_idx in range(layer_nums):
+                    end_offset = start_offset + \
+                        self.layer_sample_nums[layer_idx]
+                    sampling_res = x_batch[start_offset:end_offset]
+                    sampling_res_list = sampling_res.tolist()
+                    positive_travel.append(sampling_res_list[0])
+
+                    label_sampling_res = label_res[batch_ids][start_offset:
+                                                              end_offset]
+                    mask_sampling_res = mask_res[batch_ids][start_offset:
+                                                            end_offset]
+
+                    # check unique
+                    if sampling_res_list[0] != 0:
+                        assert len(set(sampling_res_list)) == len(
+                            sampling_res_list
+                        ), "len(set(sampling_res_list)): {}, len(sampling_res_list): {} , sample_res: {}, label_res:{}, mask_res: {}".format(
+                            len(set(sampling_res_list)),
+                            len(sampling_res_list), sampling_res,
+                            label_sampling_res, mask_sampling_res)
+                    # check legal
+                    layer_node = self.tree_layer[layer_idx]
+                    layer_node.append(0)
+                    for sample in sampling_res_list:
+                        assert (
+                            sample in layer_node
+                        ), "sample: {}, layer_node: {} , sample_res: {}, label_res: {}, mask_res:{}".format(
+                            sample, layer_node, sampling_res,
+                            label_sampling_res, mask_sampling_res)
+
+                    # check label
+                    label_flag = 1
+                    if sampling_res[0] == 0:
+                        label_flag = 0
+                    assert label_sampling_res[0] == label_flag
+                    # check mask
+                    padding_index = np.where(sampling_res == 0)
+                    assert not np.sum(
+                        mask_sampling_res[padding_index]
+                    ), "np.sum(mask_sampling_res[padding_index]): {} ".format(
+                        np.sum(mask_sampling_res[padding_index]))
+                    start_offset = end_offset
+                # check travel legal
+                assert self.tree_travel[int(self.x_np[
+                    batch_ids])] == positive_travel
+
+
+class TestCase1(TestTDMSamplerOp):
+    def config(self):
+        """test input int64"""
+        self.neg_samples_num_list = [0, 0, 0, 0]
+        self.x_shape = (10, 1)
+        self.x_type = 'int64'
+        self.tree_dtype = 'int64'
+        self.out_dtype = 'int32'
+
+
+class TestCase2(TestTDMSamplerOp):
+    def config(self):
+        """test dtype int64"""
+        self.neg_samples_num_list = [0, 0, 0, 0]
+        self.x_shape = (10, 1)
+        self.x_type = 'int32'
+        self.tree_dtype = 'int32'
+        self.out_dtype = 'int64'
+
+
+class TestCase3(TestTDMSamplerOp):
+    def config(self):
+        """test all dtype int64"""
+        self.neg_samples_num_list = [0, 0, 0, 0]
+        self.x_shape = (10, 1)
+        self.x_type = 'int64'
+        self.tree_dtype = 'int64'
+        self.out_dtype = 'int64'
+
+
+class TestCase4(TestTDMSamplerOp):
+    def config(self):
+        """test one neg"""
+        self.neg_samples_num_list = [1, 1, 1, 1]
+        self.x_shape = (10, 1)
+        self.x_type = 'int64'
+        self.tree_dtype = 'int32'
+        self.out_dtype = 'int64'
+
+
+class TestCase5(TestTDMSamplerOp):
+    def config(self):
+        """test normal neg"""
+        self.neg_samples_num_list = [1, 2, 3, 4]
+        self.x_shape = (10, 1)
+        self.x_type = 'int64'
+        self.tree_dtype = 'int32'
+        self.out_dtype = 'int64'
+
+
+class TestCase6(TestTDMSamplerOp):
+    def config(self):
+        """test huge batchsize"""
+        self.neg_samples_num_list = [1, 2, 3, 4]
+        self.x_shape = (100, 1)
+        self.x_type = 'int64'
+        self.tree_dtype = 'int32'
+        self.out_dtype = 'int64'
+
+
+class TestCase7(TestTDMSamplerOp):
+    def config(self):
+        """test full neg"""
+        self.neg_samples_num_list = [1, 3, 6, 11]
+        self.x_shape = (10, 1)
+        self.x_type = 'int64'
+        self.tree_dtype = 'int32'
+        self.out_dtype = 'int64'
+
+
+class TestTDMSamplerShape(unittest.TestCase):
+    def test_shape(self):
+        x = fluid.layers.data(name='x', shape=[1], dtype='int32', lod_level=1)
+        tdm_tree_travel = create_tdm_travel()
+        tdm_tree_layer = create_tdm_layer()
+        layer_node_num_list = [len(i) for i in tdm_tree_layer]
+
+        tree_layer_flat = []
+        for layer_idx, layer_node in enumerate(layer_node_num_list):
+            tree_layer_flat += tdm_tree_layer[layer_idx]
+
+        travel_array = np.array(tdm_tree_travel).astype('int32')
+        layer_array = np.array(tree_layer_flat).astype('int32')
+
+        neg_samples_num_list = [1, 2, 3, 4]
+        leaf_node_num = 13
+
+        sample, label, mask = fluid.contrib.layers.tdm_sampler(
+            x,
+            neg_samples_num_list,
+            layer_node_num_list,
+            leaf_node_num,
+            tree_travel_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.NumpyArrayInitializer(
+                    travel_array)),
+            tree_layer_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.NumpyArrayInitializer(
+                    layer_array)),
+            output_positive=True,
+            output_list=True,
+            seed=0,
+            tree_dtype='int32',
+            dtype='int32')
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place=place)
+        exe.run(fluid.default_startup_program())
+
+        feed = {
+            'x': np.array([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9],
+                           [10], [11], [12]]).astype('int32')
+        }
+        exe.run(feed=feed)
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab