Merge branch 'mlxsw-Add-support-for-non-equal-cost-multi-path'

Jiri Pirko says:

====================
mlxsw: Add support for non-equal-cost multi-path

Ido says:

In the device, nexthops are stored as adjacency entries in an array called the KVD linear (KVDL). When a multi-path route is hit the packet's headers are hashed and then converted to an index into KVDL based on the adjacency group's size and base index.

Up until now the driver ignored the `weight` parameter for multi-path routes and allocated only one adjacency entry for each nexthop with a limit of 32 nexthops in a group. This set makes the driver take the `weight` parameter into account when allocating adjacency entries.

First patch teaches dpipe to show the size of the adjacency group, so that users will be able to determine the actual weight of each nexthop. The second patch refactors the KVDL allocator, making it more receptive towards the addition of another partition later in the set.

Patches 3-5 introduce small changes towards the actual change in the sixth patch that populates the adjacency entries according to their relative weight.

Last two patches finally add another partition to the KVDL, which allows us to allocate more than 32 entries per-group and thus support more nexthops and also provide higher accuracy with regard to the requested weights.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

Merge branch 'mlxsw-Add-support-for-non-equal-cost-multi-path'
Jiri Pirko says:

====================
mlxsw: Add support for non-equal-cost multi-path

Ido says:

In the device, nexthops are stored as adjacency entries in an array called the KVD linear (KVDL). When a multi-path route is hit the packet's headers are hashed and then converted to an index into KVDL based on the adjacency group's size and base index.

Up until now the driver ignored the `weight` parameter for multi-path routes and allocated only one adjacency entry for each nexthop with a limit of 32 nexthops in a group. This set makes the driver take the `weight` parameter into account when allocating adjacency entries.

First patch teaches dpipe to show the size of the adjacency group, so that users will be able to determine the actual weight of each nexthop. The second patch refactors the KVDL allocator, making it more receptive towards the addition of another partition later in the set.

Patches 3-5 introduce small changes towards the actual change in the sixth patch that populates the adjacency entries according to their relative weight.

Last two patches finally add another partition to the KVDL, which allows us to allocate more than 32 entries per-group and thus support more nexthops and also provide higher accuracy with regard to the requested weights.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
fbd15f48 · David S. Miller · bc9db417 · 330e2cc6 · fbd15f48 · fbd15f48
6 changed files
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3726,10 +3726,16 @@ static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core,
 		return err;
 	}

+	err = mlxsw_sp_kvdl_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize KVDL\n");
+		return err;
+	}
+
 	err = mlxsw_sp_fids_init(mlxsw_sp);
 	if (err) {
 		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize FIDs\n");
-		return err;
+		goto err_fids_init;
 	}

 	err = mlxsw_sp_traps_init(mlxsw_sp);
@@ -3834,6 +3840,8 @@ static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core,
 	mlxsw_sp_traps_fini(mlxsw_sp);
 err_traps_init:
 	mlxsw_sp_fids_fini(mlxsw_sp);
+err_fids_init:
+	mlxsw_sp_kvdl_fini(mlxsw_sp);
 	return err;
 }

@@ -3854,6 +3862,7 @@ static void mlxsw_sp_fini(struct mlxsw_core *mlxsw_core)
 	mlxsw_sp_buffers_fini(mlxsw_sp);
 	mlxsw_sp_traps_fini(mlxsw_sp);
 	mlxsw_sp_fids_fini(mlxsw_sp);
+	mlxsw_sp_kvdl_fini(mlxsw_sp);
 }

 static const struct mlxsw_config_profile mlxsw_sp_config_profile = {
@@ -3876,8 +3885,8 @@ static const struct mlxsw_config_profile mlxsw_sp_config_profile = {
 	.max_pkey			= 0,
 	.used_kvd_split_data		= 1,
 	.kvd_hash_granularity		= MLXSW_SP_KVD_GRANULARITY,
-	.kvd_hash_single_parts		= 2,
-	.kvd_hash_double_parts		= 1,
+	.kvd_hash_single_parts		= 59,
+	.kvd_hash_double_parts		= 41,
 	.kvd_linear_size		= MLXSW_SP_KVD_LINEAR_SIZE,
 	.swid_config			= {
 		{

--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -62,7 +62,7 @@

 #define MLXSW_SP_PORT_BASE_SPEED 25000	/* Mb/s */

-#define MLXSW_SP_KVD_LINEAR_SIZE 65536 /* entries */
+#define MLXSW_SP_KVD_LINEAR_SIZE 98304 /* entries */
 #define MLXSW_SP_KVD_GRANULARITY 128

 struct mlxsw_sp_port;
@@ -143,6 +143,7 @@ struct mlxsw_sp_mr;
 struct mlxsw_sp_acl;
 struct mlxsw_sp_counter_pool;
 struct mlxsw_sp_fid_core;
+struct mlxsw_sp_kvdl;

 struct mlxsw_sp {
 	struct mlxsw_sp_port **ports;
@@ -158,9 +159,7 @@ struct mlxsw_sp {
 	struct mlxsw_afa *afa;
 	struct mlxsw_sp_acl *acl;
 	struct mlxsw_sp_fid_core *fid_core;
-	struct {
-		DECLARE_BITMAP(usage, MLXSW_SP_KVD_LINEAR_SIZE);
-	} kvdl;
+	struct mlxsw_sp_kvdl *kvdl;
 	struct notifier_block netdevice_nb;

 	struct mlxsw_sp_counter_pool *counter_pool;
@@ -411,9 +410,14 @@ mlxsw_sp_port_vlan_router_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan);
 void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif);

 /* spectrum_kvdl.c */
+int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_kvdl_fini(struct mlxsw_sp *mlxsw_sp);
 int mlxsw_sp_kvdl_alloc(struct mlxsw_sp *mlxsw_sp, unsigned int entry_count,
 			u32 *p_entry_index);
 void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp, int entry_index);
+int mlxsw_sp_kvdl_alloc_size_query(struct mlxsw_sp *mlxsw_sp,
+				   unsigned int entry_count,
+				   unsigned int *p_alloc_size);

 struct mlxsw_sp_acl_rule_info {
 	unsigned int priority;

--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
@@ -44,6 +44,7 @@ enum mlxsw_sp_field_metadata_id {
 	MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD,
 	MLXSW_SP_DPIPE_FIELD_METADATA_L3_DROP,
 	MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX,
+	MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_SIZE,
 	MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX,
 };

@@ -69,6 +70,11 @@ static struct devlink_dpipe_field mlxsw_sp_dpipe_fields_metadata[] = {
 		.id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX,
 		.bitwidth = 32,
 	},
+	{
+		.name = "adj_size",
+		.id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_SIZE,
+		.bitwidth = 32,
+	},
 	{
 		.name = "adj_hash_index",
 		.id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX,
@@ -851,6 +857,14 @@ static int mlxsw_sp_dpipe_table_adj_matches_dump(void *priv,
 	match.header = &mlxsw_sp_dpipe_header_metadata;
 	match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX;

+	err = devlink_dpipe_match_put(skb, &match);
+	if (err)
+		return err;
+
+	match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match.header = &mlxsw_sp_dpipe_header_metadata;
+	match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_SIZE;
+
 	err = devlink_dpipe_match_put(skb, &match);
 	if (err)
 		return err;
@@ -897,6 +911,7 @@ static u64 mlxsw_sp_dpipe_table_adj_size(struct mlxsw_sp *mlxsw_sp)

 enum mlxsw_sp_dpipe_table_adj_match {
 	MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX,
+	MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE,
 	MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX,
 	MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT,
 };
@@ -919,6 +934,11 @@ mlxsw_sp_dpipe_table_adj_match_action_prepare(struct devlink_dpipe_match *matche
 	match->header = &mlxsw_sp_dpipe_header_metadata;
 	match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX;

+	match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE];
+	match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match->header = &mlxsw_sp_dpipe_header_metadata;
+	match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_SIZE;
+
 	match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX];
 	match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
 	match->header = &mlxsw_sp_dpipe_header_metadata;
@@ -955,6 +975,15 @@ mlxsw_sp_dpipe_table_adj_entry_prepare(struct devlink_dpipe_entry *entry,
 	match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX];
 	match_value = &match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX];

+	match_value->match = match;
+	match_value->value_size = sizeof(u32);
+	match_value->value = kmalloc(match_value->value_size, GFP_KERNEL);
+	if (!match_value->value)
+		return -ENOMEM;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE];
+	match_value = &match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE];
+
 	match_value->match = match;
 	match_value->value_size = sizeof(u32);
 	match_value->value = kmalloc(match_value->value_size, GFP_KERNEL);
@@ -993,8 +1022,8 @@ mlxsw_sp_dpipe_table_adj_entry_prepare(struct devlink_dpipe_entry *entry,

 static void
 __mlxsw_sp_dpipe_table_adj_entry_fill(struct devlink_dpipe_entry *entry,
-				      u32 adj_index, u32 adj_hash_index,
-				      unsigned char *ha,
+				      u32 adj_index, u32 adj_size,
+				      u32 adj_hash_index, unsigned char *ha,
 				      struct mlxsw_sp_rif *rif)
 {
 	struct devlink_dpipe_value *value;
@@ -1005,6 +1034,10 @@ __mlxsw_sp_dpipe_table_adj_entry_fill(struct devlink_dpipe_entry *entry,
 	p_index = value->value;
 	*p_index = adj_index;

+	value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE];
+	p_index = value->value;
+	*p_index = adj_size;
+
 	value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX];
 	p_index = value->value;
 	*p_index = adj_hash_index;
@@ -1027,10 +1060,11 @@ static void mlxsw_sp_dpipe_table_adj_entry_fill(struct mlxsw_sp *mlxsw_sp,
 	unsigned char *ha = mlxsw_sp_nexthop_ha(nh);
 	u32 adj_hash_index = 0;
 	u32 adj_index = 0;
+	u32 adj_size = 0;
 	int err;

-	mlxsw_sp_nexthop_indexes(nh, &adj_index, &adj_hash_index);
-	__mlxsw_sp_dpipe_table_adj_entry_fill(entry, adj_index,
+	mlxsw_sp_nexthop_indexes(nh, &adj_index, &adj_size, &adj_hash_index);
+	__mlxsw_sp_dpipe_table_adj_entry_fill(entry, adj_index, adj_size,
 					      adj_hash_index, ha, rif);
 	err = mlxsw_sp_nexthop_counter_get(mlxsw_sp, nh, &entry->counter);
 	if (!err)
@@ -1138,13 +1172,15 @@ static int mlxsw_sp_dpipe_table_adj_counters_update(void *priv, bool enable)
 	struct mlxsw_sp_nexthop *nh;
 	u32 adj_hash_index = 0;
 	u32 adj_index = 0;
+	u32 adj_size = 0;

 	mlxsw_sp_nexthop_for_each(nh, mlxsw_sp->router) {
 		if (!mlxsw_sp_nexthop_offload(nh) ||
 		    mlxsw_sp_nexthop_group_has_ipip(nh))
 			continue;

-		mlxsw_sp_nexthop_indexes(nh, &adj_index, &adj_hash_index);
+		mlxsw_sp_nexthop_indexes(nh, &adj_index, &adj_size,
+					 &adj_hash_index);
 		if (enable)
 			mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh);
 		else

--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
@@ -39,55 +39,276 @@

 #define MLXSW_SP_KVDL_SINGLE_BASE 0
 #define MLXSW_SP_KVDL_SINGLE_SIZE 16384
+#define MLXSW_SP_KVDL_SINGLE_END \
+	(MLXSW_SP_KVDL_SINGLE_SIZE + MLXSW_SP_KVDL_SINGLE_BASE - 1)
+
 #define MLXSW_SP_KVDL_CHUNKS_BASE \
 	(MLXSW_SP_KVDL_SINGLE_BASE + MLXSW_SP_KVDL_SINGLE_SIZE)
-#define MLXSW_SP_KVDL_CHUNKS_SIZE \
-	(MLXSW_SP_KVD_LINEAR_SIZE - MLXSW_SP_KVDL_CHUNKS_BASE)
+#define MLXSW_SP_KVDL_CHUNKS_SIZE 49152
+#define MLXSW_SP_KVDL_CHUNKS_END \
+	(MLXSW_SP_KVDL_CHUNKS_SIZE + MLXSW_SP_KVDL_CHUNKS_BASE - 1)
+
+#define MLXSW_SP_KVDL_LARGE_CHUNKS_BASE \
+	(MLXSW_SP_KVDL_CHUNKS_BASE + MLXSW_SP_KVDL_CHUNKS_SIZE)
+#define MLXSW_SP_KVDL_LARGE_CHUNKS_SIZE \
+	(MLXSW_SP_KVD_LINEAR_SIZE - MLXSW_SP_KVDL_LARGE_CHUNKS_BASE)
+#define MLXSW_SP_KVDL_LARGE_CHUNKS_END \
+	(MLXSW_SP_KVDL_LARGE_CHUNKS_SIZE + MLXSW_SP_KVDL_LARGE_CHUNKS_BASE - 1)
+
 #define MLXSW_SP_CHUNK_MAX 32
+#define MLXSW_SP_LARGE_CHUNK_MAX 512
+
+struct mlxsw_sp_kvdl_part_info {
+	unsigned int part_index;
+	unsigned int start_index;
+	unsigned int end_index;
+	unsigned int alloc_size;
+};
+
+struct mlxsw_sp_kvdl_part {
+	struct list_head list;
+	const struct mlxsw_sp_kvdl_part_info *info;
+	unsigned long usage[0];	/* Entries */
+};
+
+struct mlxsw_sp_kvdl {
+	struct list_head parts_list;
+};
+
+static struct mlxsw_sp_kvdl_part *
+mlxsw_sp_kvdl_alloc_size_part(struct mlxsw_sp_kvdl *kvdl,
+			      unsigned int alloc_size)
+{
+	struct mlxsw_sp_kvdl_part *part, *min_part = NULL;
+
+	list_for_each_entry(part, &kvdl->parts_list, list) {
+		if (alloc_size <= part->info->alloc_size &&
+		    (!min_part ||
+		     part->info->alloc_size <= min_part->info->alloc_size))
+			min_part = part;
+	}
+
+	return min_part ?: ERR_PTR(-ENOBUFS);
+}
+
+static struct mlxsw_sp_kvdl_part *
+mlxsw_sp_kvdl_index_part(struct mlxsw_sp_kvdl *kvdl, u32 kvdl_index)
+{
+	struct mlxsw_sp_kvdl_part *part;
+
+	list_for_each_entry(part, &kvdl->parts_list, list) {
+		if (kvdl_index >= part->info->start_index &&
+		    kvdl_index <= part->info->end_index)
+			return part;
+	}
+
+	return ERR_PTR(-EINVAL);
+}
+
+static u32
+mlxsw_sp_entry_index_kvdl_index(const struct mlxsw_sp_kvdl_part_info *info,
+				unsigned int entry_index)
+{
+	return info->start_index + entry_index * info->alloc_size;
+}
+
+static unsigned int
+mlxsw_sp_kvdl_index_entry_index(const struct mlxsw_sp_kvdl_part_info *info,
+				u32 kvdl_index)
+{
+	return (kvdl_index - info->start_index) / info->alloc_size;
+}
+
+static int mlxsw_sp_kvdl_part_alloc(struct mlxsw_sp_kvdl_part *part,
+				    u32 *p_kvdl_index)
+{
+	const struct mlxsw_sp_kvdl_part_info *info = part->info;
+	unsigned int entry_index, nr_entries;
+
+	nr_entries = (info->end_index - info->start_index + 1) /
+		     info->alloc_size;
+	entry_index = find_first_zero_bit(part->usage, nr_entries);
+	if (entry_index == nr_entries)
+		return -ENOBUFS;
+	__set_bit(entry_index, part->usage);
+
+	*p_kvdl_index = mlxsw_sp_entry_index_kvdl_index(part->info,
+							entry_index);
+
+	return 0;
+}
+
+static void mlxsw_sp_kvdl_part_free(struct mlxsw_sp_kvdl_part *part,
+				    u32 kvdl_index)
+{
+	unsigned int entry_index;
+
+	entry_index = mlxsw_sp_kvdl_index_entry_index(part->info,
+						      kvdl_index);
+	__clear_bit(entry_index, part->usage);
+}

 int mlxsw_sp_kvdl_alloc(struct mlxsw_sp *mlxsw_sp, unsigned int entry_count,
 			u32 *p_entry_index)
 {
-	int entry_index;
-	int size;
-	int type_base;
-	int type_size;
-	int type_entries;
-
-	if (entry_count == 0 || entry_count > MLXSW_SP_CHUNK_MAX) {
-		return -EINVAL;
-	} else if (entry_count == 1) {
-		type_base = MLXSW_SP_KVDL_SINGLE_BASE;
-		type_size = MLXSW_SP_KVDL_SINGLE_SIZE;
-		type_entries = 1;
-	} else {
-		type_base = MLXSW_SP_KVDL_CHUNKS_BASE;
-		type_size = MLXSW_SP_KVDL_CHUNKS_SIZE;
-		type_entries = MLXSW_SP_CHUNK_MAX;
+	struct mlxsw_sp_kvdl_part *part;
+
+	/* Find partition with smallest allocation size satisfying the
+	 * requested size.
+	 */
+	part = mlxsw_sp_kvdl_alloc_size_part(mlxsw_sp->kvdl, entry_count);
+	if (IS_ERR(part))
+		return PTR_ERR(part);
+
+	return mlxsw_sp_kvdl_part_alloc(part, p_entry_index);
+}
+
+void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp, int entry_index)
+{
+	struct mlxsw_sp_kvdl_part *part;
+
+	part = mlxsw_sp_kvdl_index_part(mlxsw_sp->kvdl, entry_index);
+	if (IS_ERR(part))
+		return;
+	mlxsw_sp_kvdl_part_free(part, entry_index);
+}
+
+int mlxsw_sp_kvdl_alloc_size_query(struct mlxsw_sp *mlxsw_sp,
+				   unsigned int entry_count,
+				   unsigned int *p_alloc_size)
+{
+	struct mlxsw_sp_kvdl_part *part;
+
+	part = mlxsw_sp_kvdl_alloc_size_part(mlxsw_sp->kvdl, entry_count);
+	if (IS_ERR(part))
+		return PTR_ERR(part);
+
+	*p_alloc_size = part->info->alloc_size;
+
+	return 0;
+}
+
+static const struct mlxsw_sp_kvdl_part_info kvdl_parts_info[] = {
+	{
+		.part_index	= 0,
+		.start_index	= MLXSW_SP_KVDL_SINGLE_BASE,
+		.end_index	= MLXSW_SP_KVDL_SINGLE_END,
+		.alloc_size	= 1,
+	},
+	{
+		.part_index	= 1,
+		.start_index	= MLXSW_SP_KVDL_CHUNKS_BASE,
+		.end_index	= MLXSW_SP_KVDL_CHUNKS_END,
+		.alloc_size	= MLXSW_SP_CHUNK_MAX,
+	},
+	{
+		.part_index	= 2,
+		.start_index	= MLXSW_SP_KVDL_LARGE_CHUNKS_BASE,
+		.end_index	= MLXSW_SP_KVDL_LARGE_CHUNKS_END,
+		.alloc_size	= MLXSW_SP_LARGE_CHUNK_MAX,
+	},
+};
+
+static struct mlxsw_sp_kvdl_part *
+mlxsw_sp_kvdl_part_find(struct mlxsw_sp *mlxsw_sp, unsigned int part_index)
+{
+	struct mlxsw_sp_kvdl_part *part;
+
+	list_for_each_entry(part, &mlxsw_sp->kvdl->parts_list, list) {
+		if (part->info->part_index == part_index)
+			return part;
 	}

-	entry_index = type_base;
-	size = type_base + type_size;
-	for_each_clear_bit_from(entry_index, mlxsw_sp->kvdl.usage, size) {
-		int i;
+	return NULL;
+}
+
+static int mlxsw_sp_kvdl_part_init(struct mlxsw_sp *mlxsw_sp,
+				   unsigned int part_index)
+{
+	const struct mlxsw_sp_kvdl_part_info *info;
+	struct mlxsw_sp_kvdl_part *part;
+	unsigned int nr_entries;
+	size_t usage_size;
+
+	info = &kvdl_parts_info[part_index];
+
+	nr_entries = (info->end_index - info->start_index + 1) /
+		     info->alloc_size;
+	usage_size = BITS_TO_LONGS(nr_entries) * sizeof(unsigned long);
+	part = kzalloc(sizeof(*part) + usage_size, GFP_KERNEL);
+	if (!part)
+		return -ENOMEM;
+
+	part->info = info;
+	list_add(&part->list, &mlxsw_sp->kvdl->parts_list);

-		for (i = 0; i < type_entries; i++)
-			set_bit(entry_index + i, mlxsw_sp->kvdl.usage);
-		*p_entry_index = entry_index;
 	return 0;
+}
+
+static void mlxsw_sp_kvdl_part_fini(struct mlxsw_sp *mlxsw_sp,
+				    unsigned int part_index)
+{
+	struct mlxsw_sp_kvdl_part *part;
+
+	part = mlxsw_sp_kvdl_part_find(mlxsw_sp, part_index);
+	if (!part)
+		return;
+
+	list_del(&part->list);
+	kfree(part);
+}
+
+static int mlxsw_sp_kvdl_parts_init(struct mlxsw_sp *mlxsw_sp)
+{
+	int err, i;
+
+	INIT_LIST_HEAD(&mlxsw_sp->kvdl->parts_list);
+
+	for (i = 0; i < ARRAY_SIZE(kvdl_parts_info); i++) {
+		err = mlxsw_sp_kvdl_part_init(mlxsw_sp, i);
+		if (err)
+			goto err_kvdl_part_init;
 	}
-	return -ENOBUFS;
+
+	return 0;
+
+err_kvdl_part_init:
+	for (i--; i >= 0; i--)
+		mlxsw_sp_kvdl_part_fini(mlxsw_sp, i);
+	return err;
 }

-void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp, int entry_index)
+static void mlxsw_sp_kvdl_parts_fini(struct mlxsw_sp *mlxsw_sp)
 {
-	int type_entries;
 	int i;

-	if (entry_index < MLXSW_SP_KVDL_CHUNKS_BASE)
-		type_entries = 1;
-	else
-		type_entries = MLXSW_SP_CHUNK_MAX;
-	for (i = 0; i < type_entries; i++)
-		clear_bit(entry_index + i, mlxsw_sp->kvdl.usage);
+	for (i = ARRAY_SIZE(kvdl_parts_info) - 1; i >= 0; i--)
+		mlxsw_sp_kvdl_part_fini(mlxsw_sp, i);
+}
+
+int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_kvdl *kvdl;
+	int err;
+
+	kvdl = kzalloc(sizeof(*mlxsw_sp->kvdl), GFP_KERNEL);
+	if (!kvdl)
+		return -ENOMEM;
+	mlxsw_sp->kvdl = kvdl;
+
+	err = mlxsw_sp_kvdl_parts_init(mlxsw_sp);
+	if (err)
+		goto err_kvdl_parts_init;
+
+	return 0;
+
+err_kvdl_parts_init:
+	kfree(mlxsw_sp->kvdl);
+	return err;
+}
+
+void mlxsw_sp_kvdl_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	mlxsw_sp_kvdl_parts_fini(mlxsw_sp);
+	kfree(mlxsw_sp->kvdl);
 }
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -46,6 +46,7 @@
 #include <linux/if_bridge.h>
 #include <linux/socket.h>
 #include <linux/route.h>
+#include <linux/gcd.h>
 #include <net/netevent.h>
 #include <net/neighbour.h>
 #include <net/arp.h>
@@ -2203,6 +2204,9 @@ struct mlxsw_sp_nexthop {
 	struct mlxsw_sp_nexthop_key key;
 	unsigned char gw_addr[sizeof(struct in6_addr)];
 	int ifindex;
+	int nh_weight;
+	int norm_nh_weight;
+	int num_adj_entries;
 	struct mlxsw_sp_rif *rif;
 	u8 should_offload:1, /* set indicates this neigh is connected and
 			      * should be put to KVD linear area of this group.
@@ -2232,6 +2236,7 @@ struct mlxsw_sp_nexthop_group {
 	u32 adj_index;
 	u16 ecmp_size;
 	u16 count;
+	int sum_norm_weight;
 	struct mlxsw_sp_nexthop nexthops[0];
 #define nh_rif	nexthops[0].rif
 };
@@ -2299,7 +2304,7 @@ unsigned char *mlxsw_sp_nexthop_ha(struct mlxsw_sp_nexthop *nh)
 }

 int mlxsw_sp_nexthop_indexes(struct mlxsw_sp_nexthop *nh, u32 *p_adj_index,
-			     u32 *p_adj_hash_index)
+			     u32 *p_adj_size, u32 *p_adj_hash_index)
 {
 	struct mlxsw_sp_nexthop_group *nh_grp = nh->nh_grp;
 	u32 adj_hash_index = 0;
@@ -2309,6 +2314,7 @@ int mlxsw_sp_nexthop_indexes(struct mlxsw_sp_nexthop *nh, u32 *p_adj_index,
 		return -EINVAL;

 	*p_adj_index = nh_grp->adj_index;
+	*p_adj_size = nh_grp->ecmp_size;

 	for (i = 0; i < nh_grp->count; i++) {
 		struct mlxsw_sp_nexthop *nh_iter = &nh_grp->nexthops[i];
@@ -2316,7 +2322,7 @@ int mlxsw_sp_nexthop_indexes(struct mlxsw_sp_nexthop *nh, u32 *p_adj_index,
 		if (nh_iter == nh)
 			break;
 		if (nh_iter->offloaded)
-			adj_hash_index++;
+			adj_hash_index += nh_iter->num_adj_entries;
 	}

 	*p_adj_hash_index = adj_hash_index;
@@ -2599,7 +2605,7 @@ static int mlxsw_sp_adj_index_mass_update(struct mlxsw_sp *mlxsw_sp,
 	return 0;
 }

-int mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
+static int __mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
 				     struct mlxsw_sp_nexthop *nh)
 {
 	struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry;
@@ -2617,7 +2623,23 @@ int mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
 	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ratr), ratr_pl);
 }

-static int mlxsw_sp_nexthop_ipip_update(struct mlxsw_sp *mlxsw_sp,
+int mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
+			    struct mlxsw_sp_nexthop *nh)
+{
+	int i;
+
+	for (i = 0; i < nh->num_adj_entries; i++) {
+		int err;
+
+		err = __mlxsw_sp_nexthop_update(mlxsw_sp, adj_index + i, nh);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int __mlxsw_sp_nexthop_ipip_update(struct mlxsw_sp *mlxsw_sp,
 					  u32 adj_index,
 					  struct mlxsw_sp_nexthop *nh)
 {
@@ -2627,6 +2649,24 @@ static int mlxsw_sp_nexthop_ipip_update(struct mlxsw_sp *mlxsw_sp,
 	return ipip_ops->nexthop_update(mlxsw_sp, adj_index, nh->ipip_entry);
 }

+static int mlxsw_sp_nexthop_ipip_update(struct mlxsw_sp *mlxsw_sp,
+					u32 adj_index,
+					struct mlxsw_sp_nexthop *nh)
+{
+	int i;
+
+	for (i = 0; i < nh->num_adj_entries; i++) {
+		int err;
+
+		err = __mlxsw_sp_nexthop_ipip_update(mlxsw_sp, adj_index + i,
+						     nh);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static int
 mlxsw_sp_nexthop_group_update(struct mlxsw_sp *mlxsw_sp,
 			      struct mlxsw_sp_nexthop_group *nh_grp,
@@ -2661,7 +2701,7 @@ mlxsw_sp_nexthop_group_update(struct mlxsw_sp *mlxsw_sp,
 			nh->update = 0;
 			nh->offloaded = 1;
 		}
-		adj_index++;
+		adj_index += nh->num_adj_entries;
 	}
 	return 0;
 }
@@ -2706,17 +2746,118 @@ mlxsw_sp_nexthop_fib_entries_refresh(struct mlxsw_sp_nexthop_group *nh_grp)
 	}
 }

+static void mlxsw_sp_adj_grp_size_round_up(u16 *p_adj_grp_size)
+{
+	/* Valid sizes for an adjacency group are:
+	 * 1-64, 512, 1024, 2048 and 4096.
+	 */
+	if (*p_adj_grp_size <= 64)
+		return;
+	else if (*p_adj_grp_size <= 512)
+		*p_adj_grp_size = 512;
+	else if (*p_adj_grp_size <= 1024)
+		*p_adj_grp_size = 1024;
+	else if (*p_adj_grp_size <= 2048)
+		*p_adj_grp_size = 2048;
+	else
+		*p_adj_grp_size = 4096;
+}
+
+static void mlxsw_sp_adj_grp_size_round_down(u16 *p_adj_grp_size,
+					     unsigned int alloc_size)
+{
+	if (alloc_size >= 4096)
+		*p_adj_grp_size = 4096;
+	else if (alloc_size >= 2048)
+		*p_adj_grp_size = 2048;
+	else if (alloc_size >= 1024)
+		*p_adj_grp_size = 1024;
+	else if (alloc_size >= 512)
+		*p_adj_grp_size = 512;
+}
+
+static int mlxsw_sp_fix_adj_grp_size(struct mlxsw_sp *mlxsw_sp,
+				     u16 *p_adj_grp_size)
+{
+	unsigned int alloc_size;
+	int err;
+
+	/* Round up the requested group size to the next size supported
+	 * by the device and make sure the request can be satisfied.
+	 */
+	mlxsw_sp_adj_grp_size_round_up(p_adj_grp_size);
+	err = mlxsw_sp_kvdl_alloc_size_query(mlxsw_sp, *p_adj_grp_size,
+					     &alloc_size);
+	if (err)
+		return err;
+	/* It is possible the allocation results in more allocated
+	 * entries than requested. Try to use as much of them as
+	 * possible.
+	 */
+	mlxsw_sp_adj_grp_size_round_down(p_adj_grp_size, alloc_size);
+
+	return 0;
+}
+
+static void
+mlxsw_sp_nexthop_group_normalize(struct mlxsw_sp_nexthop_group *nh_grp)
+{
+	int i, g = 0, sum_norm_weight = 0;
+	struct mlxsw_sp_nexthop *nh;
+
+	for (i = 0; i < nh_grp->count; i++) {
+		nh = &nh_grp->nexthops[i];
+
+		if (!nh->should_offload)
+			continue;
+		if (g > 0)
+			g = gcd(nh->nh_weight, g);
+		else
+			g = nh->nh_weight;
+	}
+
+	for (i = 0; i < nh_grp->count; i++) {
+		nh = &nh_grp->nexthops[i];
+
+		if (!nh->should_offload)
+			continue;
+		nh->norm_nh_weight = nh->nh_weight / g;
+		sum_norm_weight += nh->norm_nh_weight;
+	}
+
+	nh_grp->sum_norm_weight = sum_norm_weight;
+}
+
+static void
+mlxsw_sp_nexthop_group_rebalance(struct mlxsw_sp_nexthop_group *nh_grp)
+{
+	int total = nh_grp->sum_norm_weight;
+	u16 ecmp_size = nh_grp->ecmp_size;
+	int i, weight = 0, lower_bound = 0;
+
+	for (i = 0; i < nh_grp->count; i++) {
+		struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i];
+		int upper_bound;
+
+		if (!nh->should_offload)
+			continue;
+		weight += nh->norm_nh_weight;
+		upper_bound = DIV_ROUND_CLOSEST(ecmp_size * weight, total);
+		nh->num_adj_entries = upper_bound - lower_bound;
+		lower_bound = upper_bound;
+	}
+}
+
 static void
 mlxsw_sp_nexthop_group_refresh(struct mlxsw_sp *mlxsw_sp,
 			       struct mlxsw_sp_nexthop_group *nh_grp)
 {
+	u16 ecmp_size, old_ecmp_size;
 	struct mlxsw_sp_nexthop *nh;
 	bool offload_change = false;
 	u32 adj_index;
-	u16 ecmp_size = 0;
 	bool old_adj_index_valid;
 	u32 old_adj_index;
-	u16 old_ecmp_size;
 	int i;
 	int err;

@@ -2733,8 +2874,6 @@ mlxsw_sp_nexthop_group_refresh(struct mlxsw_sp *mlxsw_sp,
 			if (nh->should_offload)
 				nh->update = 1;
 		}
-		if (nh->should_offload)
-			ecmp_size++;
 	}
 	if (!offload_change) {
 		/* Nothing was added or removed, so no need to reallocate. Just
@@ -2747,12 +2886,19 @@ mlxsw_sp_nexthop_group_refresh(struct mlxsw_sp *mlxsw_sp,
 		}
 		return;
 	}
-	if (!ecmp_size)
+	mlxsw_sp_nexthop_group_normalize(nh_grp);
+	if (!nh_grp->sum_norm_weight)
 		/* No neigh of this group is connected so we just set
 		 * the trap and let everthing flow through kernel.
 		 */
 		goto set_trap;

+	ecmp_size = nh_grp->sum_norm_weight;
+	err = mlxsw_sp_fix_adj_grp_size(mlxsw_sp, &ecmp_size);
+	if (err)
+		/* No valid allocation size available. */
+		goto set_trap;
+
 	err = mlxsw_sp_kvdl_alloc(mlxsw_sp, ecmp_size, &adj_index);
 	if (err) {
 		/* We ran out of KVD linear space, just set the
@@ -2767,6 +2913,7 @@ mlxsw_sp_nexthop_group_refresh(struct mlxsw_sp *mlxsw_sp,
 	nh_grp->adj_index_valid = 1;
 	nh_grp->adj_index = adj_index;
 	nh_grp->ecmp_size = ecmp_size;
+	mlxsw_sp_nexthop_group_rebalance(nh_grp);
 	err = mlxsw_sp_nexthop_group_update(mlxsw_sp, nh_grp, true);
 	if (err) {
 		dev_warn(mlxsw_sp->bus_info->dev, "Failed to update neigh MAC in adjacency table.\n");
@@ -3044,6 +3191,11 @@ static int mlxsw_sp_nexthop4_init(struct mlxsw_sp *mlxsw_sp,

 	nh->nh_grp = nh_grp;
 	nh->key.fib_nh = fib_nh;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	nh->nh_weight = fib_nh->nh_weight;
+#else
+	nh->nh_weight = 1;
+#endif
 	memcpy(&nh->gw_addr, &fib_nh->nh_gw, sizeof(fib_nh->nh_gw));
 	err = mlxsw_sp_nexthop_insert(mlxsw_sp, nh);
 	if (err)
@@ -4303,6 +4455,7 @@ static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp,
 	struct net_device *dev = rt->dst.dev;

 	nh->nh_grp = nh_grp;
+	nh->nh_weight = 1;
 	memcpy(&nh->gw_addr, &rt->rt6i_gateway, sizeof(nh->gw_addr));
 	mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh);


--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
@@ -115,7 +115,7 @@ struct mlxsw_sp_nexthop *mlxsw_sp_nexthop_next(struct mlxsw_sp_router *router,
 bool mlxsw_sp_nexthop_offload(struct mlxsw_sp_nexthop *nh);
 unsigned char *mlxsw_sp_nexthop_ha(struct mlxsw_sp_nexthop *nh);
 int mlxsw_sp_nexthop_indexes(struct mlxsw_sp_nexthop *nh, u32 *p_adj_index,
-			     u32 *p_adj_hash_index);
+			     u32 *p_adj_size, u32 *p_adj_hash_index);
 struct mlxsw_sp_rif *mlxsw_sp_nexthop_rif(struct mlxsw_sp_nexthop *nh);
 bool mlxsw_sp_nexthop_group_has_ipip(struct mlxsw_sp_nexthop *nh);
 #define mlxsw_sp_nexthop_for_each(nh, router)				\