Commit cedf90c0 authored by David S. Miller

Merge tag 'mlx5-updates-2017-04-30' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux

mlx5-updates-2017-04-30

Or says:
================
mlx5 neigh update

This series (code name 'neigh update') from Hadar enhances the
mlx5 TC IP tunnel offloads to deal with changes to the tunnel destination
neighbours used in offloaded flows that involve encapsulation.

In order to keep track of the validity state of such neighbours, we register
a netevent notifier callback and act on NEIGH_UPDATE events: when a neighbour
becomes valid we offload the related flows to HW (and un-offload them when it
becomes invalid), and we do likewise when a neighbour's MAC address changes.
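
As a rough illustration, the notifier hookup looks roughly like the sketch
below. This is not the driver's exact code: handle_neigh_update() and
update_refresh_interval() are hypothetical stand-ins for the real en_rep.c
handlers.

#include <linux/notifier.h>
#include <net/netevent.h>
#include <net/neighbour.h>

/* hypothetical handlers, standing in for the real en_rep.c logic */
static void handle_neigh_update(struct neighbour *n, bool neigh_valid);
static void update_refresh_interval(unsigned long delay_probe_time);

static int neigh_netevent_cb(struct notifier_block *nb,
                             unsigned long event, void *ptr)
{
        switch (event) {
        case NETEVENT_NEIGH_UPDATE: {
                struct neighbour *n = ptr;
                bool neigh_valid;

                read_lock_bh(&n->lock);
                neigh_valid = !!(n->nud_state & NUD_VALID);
                read_unlock_bh(&n->lock);

                /* offload the encap flows behind this neigh when it becomes
                 * valid, un-offload them when it turns invalid, and refresh
                 * the cached MAC if it changed
                 */
                handle_neigh_update(n, neigh_valid);
                break;
        }
        case NETEVENT_DELAY_PROBE_TIME_UPDATE: {
                struct neigh_parms *p = ptr;

                /* keep the 'used' refresh period <= DELAY_PROBE_TIME */
                update_refresh_interval(NEIGH_VAR(p, DELAY_PROBE_TIME));
                break;
        }
        }
        return NOTIFY_DONE;
}

static struct notifier_block neigh_netevent_nb = {
        .notifier_call = neigh_netevent_cb,
};

/* registered once, e.g. when the uplink representor netdev is loaded */
static int neigh_update_init(void)
{
        return register_netevent_notifier(&neigh_netevent_nb);
}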

Since this traffic is offloaded away from the host OS, the neighbour for the IP
tunnel destination can mistakenly become STALE and be deleted by the kernel,
because its 'used' value is never updated. To address that, we proactively
update the neighbour's 'used' value every DELAY_PROBE_TIME period, using the
time stamps already generated by the existing driver code for HW flow counters.
The DELAY_PROBE_TIME_UPDATE netevent is used to adjust the frequency of these updates.
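
The refresh decision itself boils down to comparing the flow counter's cached
'lastuse' time stamp with the last one we reported, and poking the neighbour
if traffic was seen since. A condensed sketch follows; the full version is
mlx5e_tc_update_neigh_used_value() further down in this diff, the per-encap /
per-flow iteration is omitted, and refresh_neigh_used() is a hypothetical name:

static void refresh_neigh_used(struct mlx5e_neigh_hash_entry *nhe,
                               struct mlx5_fc *counter,
                               struct neighbour *n)
{
        u64 bytes, packets, lastuse;

        /* lastuse is maintained by the periodic HW counter query work */
        mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);

        if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) {
                nhe->reported_lastuse = jiffies;
                /* refreshes n->used so the kernel won't mark it STALE */
                neigh_event_send(n, NULL);
        }
}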

Prior to the core of the series, there's a patch from Saeed that introduces an
extendable vport representor implementation scheme. It separates the
eswitch-related aspects of the representors from the netdev-related ones.
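
For reference, the per-vport registration pattern looks roughly as follows.
This is a condensed stand-in based on the mlx5e_register_vport_rep() code this
diff removes from en_main.c (the same pattern now lives behind
mlx5e_register_vport_reps() in en_rep.c); register_vport_reps_sketch() is not
an actual function name:

static void register_vport_reps_sketch(struct mlx5_core_dev *mdev)
{
        struct mlx5_eswitch *esw = mdev->priv.eswitch;
        int total_vfs = MLX5_TOTAL_VPORTS(mdev);
        u8 mac[ETH_ALEN];
        int vport;

        mlx5_query_nic_vport_mac_address(mdev, 0, mac);

        for (vport = 1; vport < total_vfs; vport++) {
                struct mlx5_eswitch_rep rep;

                rep.load = mlx5e_vport_rep_load;     /* creates the rep netdev */
                rep.unload = mlx5e_vport_rep_unload; /* tears it down */
                rep.vport = vport;
                ether_addr_copy(rep.hw_id, mac);
                mlx5_eswitch_register_vport_rep(esw, vport, &rep);
        }
}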

We would like to thank Ido Schimmel and Ilya Lesokhin for their coaching and advice
throughout the long design and review cycles, while we struggled to understand and
(hopefully correctly) implement the locking around the different driver flows.

- Or.
=================

Misc Updates:

From Tariq:
Some small performance and trivial code optimizations for the mlx5 netdev driver:
- Optimize ICOSQ completion queue polling
- Use prefetchw when a write is to follow
- Use u8 as ownership type in mlx5e_get_cqe()

From Eran:
- Disable LRO by default on specific setups

From Eli:
- Small cleanup for E-Switch to avoid redundant allocation
Signed-off-by: David S. Miller <davem@davemloft.net>
......@@ -991,20 +991,6 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev);
void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev);
int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb);
struct mlx5_eswitch_rep;
int mlx5e_vport_rep_load(struct mlx5_eswitch *esw,
struct mlx5_eswitch_rep *rep);
void mlx5e_vport_rep_unload(struct mlx5_eswitch *esw,
struct mlx5_eswitch_rep *rep);
int mlx5e_nic_rep_load(struct mlx5_eswitch *esw, struct mlx5_eswitch_rep *rep);
void mlx5e_nic_rep_unload(struct mlx5_eswitch *esw,
struct mlx5_eswitch_rep *rep);
int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv);
void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv);
int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr);
void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
void mlx5e_update_hw_rep_counters(struct mlx5e_priv *priv);
/* common netdev helpers */
int mlx5e_create_indirect_rqt(struct mlx5e_priv *priv);
......@@ -1031,12 +1017,6 @@ int mlx5e_open(struct net_device *netdev);
void mlx5e_update_stats_work(struct work_struct *work);
u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout);
int mlx5e_get_offload_stats(int attr_id, const struct net_device *dev,
void *sp);
bool mlx5e_has_offload_stats(const struct net_device *dev, int attr_id);
bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv);
/* mlx5e generic netdev management API */
struct net_device*
mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile,
......
......@@ -35,9 +35,10 @@
#include <linux/mlx5/fs.h>
#include <net/vxlan.h>
#include <linux/bpf.h>
#include "eswitch.h"
#include "en.h"
#include "en_tc.h"
#include "eswitch.h"
#include "en_rep.h"
#include "vxlan.h"
struct mlx5e_rq_param {
......@@ -3784,6 +3785,12 @@ static bool cqe_compress_heuristic(u32 link_speed, u32 pci_bw)
(pci_bw < 40000) && (pci_bw < link_speed));
}
static bool hw_lro_heuristic(u32 link_speed, u32 pci_bw)
{
return !(link_speed && pci_bw &&
(pci_bw <= 16000) && (pci_bw < link_speed));
}
void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
{
params->rx_cq_period_mode = cq_period_mode;
......@@ -3828,6 +3835,11 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
params->num_channels = max_channels;
params->num_tc = 1;
mlx5e_get_max_linkspeed(mdev, &link_speed);
mlx5e_get_pci_bw(mdev, &pci_bw);
mlx5_core_dbg(mdev, "Max link speed = %d, PCI BW = %d\n",
link_speed, pci_bw);
/* SQ */
params->log_sq_size = is_kdump_kernel() ?
MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE :
......@@ -3836,13 +3848,9 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
/* set CQE compression */
params->rx_cqe_compress_def = false;
if (MLX5_CAP_GEN(mdev, cqe_compression) &&
MLX5_CAP_GEN(mdev, vport_group_manager)) {
mlx5e_get_max_linkspeed(mdev, &link_speed);
mlx5e_get_pci_bw(mdev, &pci_bw);
mlx5_core_dbg(mdev, "Max link speed = %d, PCI BW = %d\n",
link_speed, pci_bw);
MLX5_CAP_GEN(mdev, vport_group_manager))
params->rx_cqe_compress_def = cqe_compress_heuristic(link_speed, pci_bw);
}
MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS, params->rx_cqe_compress_def);
/* RQ */
......@@ -3851,7 +3859,7 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
/* HW LRO */
/* TODO: && MLX5_CAP_ETH(mdev, lro_cap) */
if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
params->lro_en = true;
params->lro_en = hw_lro_heuristic(link_speed, pci_bw);
params->lro_timeout = mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_LRO_TIMEOUT);
/* CQ moderation params */
......@@ -4123,48 +4131,10 @@ static int mlx5e_init_nic_tx(struct mlx5e_priv *priv)
return 0;
}
static void mlx5e_register_vport_rep(struct mlx5_core_dev *mdev)
{
struct mlx5_eswitch *esw = mdev->priv.eswitch;
int total_vfs = MLX5_TOTAL_VPORTS(mdev);
int vport;
u8 mac[ETH_ALEN];
if (!MLX5_CAP_GEN(mdev, vport_group_manager))
return;
mlx5_query_nic_vport_mac_address(mdev, 0, mac);
for (vport = 1; vport < total_vfs; vport++) {
struct mlx5_eswitch_rep rep;
rep.load = mlx5e_vport_rep_load;
rep.unload = mlx5e_vport_rep_unload;
rep.vport = vport;
ether_addr_copy(rep.hw_id, mac);
mlx5_eswitch_register_vport_rep(esw, vport, &rep);
}
}
static void mlx5e_unregister_vport_rep(struct mlx5_core_dev *mdev)
{
struct mlx5_eswitch *esw = mdev->priv.eswitch;
int total_vfs = MLX5_TOTAL_VPORTS(mdev);
int vport;
if (!MLX5_CAP_GEN(mdev, vport_group_manager))
return;
for (vport = 1; vport < total_vfs; vport++)
mlx5_eswitch_unregister_vport_rep(esw, vport);
}
static void mlx5e_nic_enable(struct mlx5e_priv *priv)
{
struct net_device *netdev = priv->netdev;
struct mlx5_core_dev *mdev = priv->mdev;
struct mlx5_eswitch *esw = mdev->priv.eswitch;
struct mlx5_eswitch_rep rep;
u16 max_mtu;
mlx5e_init_l2_addr(priv);
......@@ -4179,16 +4149,8 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv)
mlx5e_enable_async_events(priv);
if (MLX5_CAP_GEN(mdev, vport_group_manager)) {
mlx5_query_nic_vport_mac_address(mdev, 0, rep.hw_id);
rep.load = mlx5e_nic_rep_load;
rep.unload = mlx5e_nic_rep_unload;
rep.vport = FDB_UPLINK_VPORT;
rep.netdev = netdev;
mlx5_eswitch_register_vport_rep(esw, 0, &rep);
}
mlx5e_register_vport_rep(mdev);
if (MLX5_CAP_GEN(mdev, vport_group_manager))
mlx5e_register_vport_reps(priv);
if (netdev->reg_state != NETREG_REGISTERED)
return;
......@@ -4212,7 +4174,6 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv)
static void mlx5e_nic_disable(struct mlx5e_priv *priv)
{
struct mlx5_core_dev *mdev = priv->mdev;
struct mlx5_eswitch *esw = mdev->priv.eswitch;
rtnl_lock();
if (netif_running(priv->netdev))
......@@ -4221,9 +4182,10 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv)
rtnl_unlock();
queue_work(priv->wq, &priv->set_rx_mode_work);
mlx5e_unregister_vport_rep(mdev);
if (MLX5_CAP_GEN(mdev, vport_group_manager))
mlx5_eswitch_unregister_vport_rep(esw, 0);
mlx5e_unregister_vport_reps(priv);
mlx5e_disable_async_events(priv);
mlx5_lag_remove(mdev);
}
......@@ -4394,7 +4356,7 @@ static void *mlx5e_add(struct mlx5_core_dev *mdev)
{
struct mlx5_eswitch *esw = mdev->priv.eswitch;
int total_vfs = MLX5_TOTAL_VPORTS(mdev);
void *ppriv = NULL;
struct mlx5e_rep_priv *rpriv = NULL;
void *priv;
int vport;
int err;
......@@ -4404,10 +4366,17 @@ static void *mlx5e_add(struct mlx5_core_dev *mdev)
if (err)
return NULL;
if (MLX5_CAP_GEN(mdev, vport_group_manager))
ppriv = &esw->offloads.vport_reps[0];
if (MLX5_CAP_GEN(mdev, vport_group_manager)) {
rpriv = kzalloc(sizeof(*rpriv), GFP_KERNEL);
if (!rpriv) {
mlx5_core_warn(mdev,
"Not creating net device, Failed to alloc rep priv data\n");
return NULL;
}
rpriv->rep = &esw->offloads.vport_reps[0];
}
netdev = mlx5e_create_netdev(mdev, &mlx5e_nic_profile, ppriv);
netdev = mlx5e_create_netdev(mdev, &mlx5e_nic_profile, rpriv);
if (!netdev) {
mlx5_core_err(mdev, "mlx5e_create_netdev failed\n");
goto err_unregister_reps;
......@@ -4439,16 +4408,19 @@ static void *mlx5e_add(struct mlx5_core_dev *mdev)
for (vport = 1; vport < total_vfs; vport++)
mlx5_eswitch_unregister_vport_rep(esw, vport);
kfree(rpriv);
return NULL;
}
static void mlx5e_remove(struct mlx5_core_dev *mdev, void *vpriv)
{
struct mlx5e_priv *priv = vpriv;
void *ppriv = priv->ppriv;
unregister_netdev(priv->netdev);
mlx5e_detach(mdev, vpriv);
mlx5e_destroy_netdev(priv);
kfree(ppriv);
}
static void *mlx5e_get_netdev(void *vpriv)
......
/*
* Copyright (c) 2017, Mellanox Technologies. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef __MLX5E_REP_H__
#define __MLX5E_REP_H__
#include <net/ip_tunnels.h>
#include <linux/rhashtable.h>
#include "eswitch.h"
#include "en.h"
struct mlx5e_neigh_update_table {
struct rhashtable neigh_ht;
/* Save the neigh hash entries in a list in addition to the hash table
* (neigh_ht). In order to iterate easily over the neigh entries.
* Used for stats query.
*/
struct list_head neigh_list;
/* protect lookup/remove operations */
spinlock_t encap_lock;
struct notifier_block netevent_nb;
struct delayed_work neigh_stats_work;
unsigned long min_interval; /* jiffies */
};
struct mlx5e_rep_priv {
struct mlx5_eswitch_rep *rep;
struct mlx5e_neigh_update_table neigh_update;
};
struct mlx5e_neigh {
struct net_device *dev;
union {
__be32 v4;
struct in6_addr v6;
} dst_ip;
int family;
};
struct mlx5e_neigh_hash_entry {
struct rhash_head rhash_node;
struct mlx5e_neigh m_neigh;
/* Save the neigh hash entry in a list on the representor in
* addition to the hash table. In order to iterate easily over the
* neighbour entries. Used for stats query.
*/
struct list_head neigh_list;
/* encap list sharing the same neigh */
struct list_head encap_list;
/* valid only when the neigh reference is taken during
* neigh_update_work workqueue callback.
*/
struct neighbour *n;
struct work_struct neigh_update_work;
/* neigh hash entry can be deleted only when the refcount is zero.
* refcount is needed to avoid neigh hash entry removal by TC, while
* it's used by the neigh notification call.
*/
refcount_t refcnt;
/* Save the last reported time at which offloaded traffic passed over one of
* the neigh hash entry flows. Use it to periodically update the neigh
* 'used' value and avoid neigh deletion by the kernel.
*/
unsigned long reported_lastuse;
};
enum {
/* set when the encap entry is successfully offloaded into HW */
MLX5_ENCAP_ENTRY_VALID = BIT(0),
};
struct mlx5e_encap_entry {
/* neigh hash entry list of encaps sharing the same neigh */
struct list_head encap_list;
struct mlx5e_neigh m_neigh;
/* a node of the eswitch encap hash table which keeps all the encap
* entries
*/
struct hlist_node encap_hlist;
struct list_head flows;
u32 encap_id;
struct ip_tunnel_info tun_info;
unsigned char h_dest[ETH_ALEN]; /* destination eth addr */
struct net_device *out_dev;
int tunnel_type;
u8 flags;
char *encap_header;
int encap_size;
};
void mlx5e_register_vport_reps(struct mlx5e_priv *priv);
void mlx5e_unregister_vport_reps(struct mlx5e_priv *priv);
bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv);
int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv);
void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv);
int mlx5e_get_offload_stats(int attr_id, const struct net_device *dev, void *sp);
bool mlx5e_has_offload_stats(const struct net_device *dev, int attr_id);
int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr);
void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
struct mlx5e_encap_entry *e);
void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
struct mlx5e_encap_entry *e);
void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv);
#endif /* __MLX5E_REP_H__ */
......@@ -39,6 +39,7 @@
#include "en.h"
#include "en_tc.h"
#include "eswitch.h"
#include "en_rep.h"
#include "ipoib.h"
static inline bool mlx5e_rx_hw_stamp(struct mlx5e_tstamp *tstamp)
......@@ -809,7 +810,8 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
{
struct net_device *netdev = rq->netdev;
struct mlx5e_priv *priv = netdev_priv(netdev);
struct mlx5_eswitch_rep *rep = priv->ppriv;
struct mlx5e_rep_priv *rpriv = priv->ppriv;
struct mlx5_eswitch_rep *rep = rpriv->rep;
struct mlx5e_rx_wqe *wqe;
struct sk_buff *skb;
__be16 wqe_counter_be;
......@@ -904,7 +906,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
goto mpwrq_cqe_out;
}
prefetch(skb->data);
prefetchw(skb->data);
cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe);
mlx5e_mpwqe_fill_rx_skb(rq, cqe, wi, cqe_bcnt, skb);
......
......@@ -44,7 +44,9 @@
#include <net/tc_act/tc_tunnel_key.h>
#include <net/tc_act/tc_pedit.h>
#include <net/vxlan.h>
#include <net/arp.h>
#include "en.h"
#include "en_rep.h"
#include "en_tc.h"
#include "eswitch.h"
#include "vxlan.h"
......@@ -58,6 +60,7 @@ struct mlx5_nic_flow_attr {
enum {
MLX5E_TC_FLOW_ESWITCH = BIT(0),
MLX5E_TC_FLOW_NIC = BIT(1),
MLX5E_TC_FLOW_OFFLOADED = BIT(2),
};
struct mlx5e_tc_flow {
......@@ -244,18 +247,128 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
mlx5_eswitch_del_offloaded_rule(esw, flow->rule, flow->esw_attr);
if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
mlx5_eswitch_del_offloaded_rule(esw, flow->rule, flow->esw_attr);
}
mlx5_eswitch_del_vlan_action(esw, flow->esw_attr);
if (flow->esw_attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP)
if (flow->esw_attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) {
mlx5e_detach_encap(priv, flow);
kvfree(flow->esw_attr->parse_attr);
}
if (flow->esw_attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
mlx5_modify_header_dealloc(priv->mdev,
attr->mod_hdr_id);
}
void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
struct mlx5e_encap_entry *e)
{
struct mlx5e_tc_flow *flow;
int err;
err = mlx5_encap_alloc(priv->mdev, e->tunnel_type,
e->encap_size, e->encap_header,
&e->encap_id);
if (err) {
mlx5_core_warn(priv->mdev, "Failed to offload cached encapsulation header, %d\n",
err);
return;
}
e->flags |= MLX5_ENCAP_ENTRY_VALID;
mlx5e_rep_queue_neigh_stats_work(priv);
list_for_each_entry(flow, &e->flows, encap) {
flow->esw_attr->encap_id = e->encap_id;
flow->rule = mlx5e_tc_add_fdb_flow(priv,
flow->esw_attr->parse_attr,
flow);
if (IS_ERR(flow->rule)) {
err = PTR_ERR(flow->rule);
mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n",
err);
continue;
}
flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
}
}
void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
struct mlx5e_encap_entry *e)
{
struct mlx5e_tc_flow *flow;
struct mlx5_fc *counter;
list_for_each_entry(flow, &e->flows, encap) {
if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
counter = mlx5_flow_rule_counter(flow->rule);
mlx5_del_flow_rules(flow->rule);
mlx5_fc_destroy(priv->mdev, counter);
}
}
if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
e->flags &= ~MLX5_ENCAP_ENTRY_VALID;
mlx5_encap_dealloc(priv->mdev, e->encap_id);
}
}
void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
{
struct mlx5e_neigh *m_neigh = &nhe->m_neigh;
u64 bytes, packets, lastuse = 0;
struct mlx5e_tc_flow *flow;
struct mlx5e_encap_entry *e;
struct mlx5_fc *counter;
struct neigh_table *tbl;
bool neigh_used = false;
struct neighbour *n;
if (m_neigh->family == AF_INET)
tbl = &arp_tbl;
#if IS_ENABLED(CONFIG_IPV6)
else if (m_neigh->family == AF_INET6)
tbl = ipv6_stub->nd_tbl;
#endif
else
return;
list_for_each_entry(e, &nhe->encap_list, encap_list) {
if (!(e->flags & MLX5_ENCAP_ENTRY_VALID))
continue;
list_for_each_entry(flow, &e->flows, encap) {
if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
counter = mlx5_flow_rule_counter(flow->rule);
mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) {
neigh_used = true;
break;
}
}
}
}
if (neigh_used) {
nhe->reported_lastuse = jiffies;
/* find the relevant neigh according to the cached device and
* dst ip pair
*/
n = neigh_lookup(tbl, &m_neigh->dst_ip, m_neigh->dev);
if (!n) {
WARN(1, "The neighbour already freed\n");
return;
}
neigh_event_send(n, NULL);
neigh_release(n);
}
}
static void mlx5e_detach_encap(struct mlx5e_priv *priv,
struct mlx5e_tc_flow *flow)
{
......@@ -263,14 +376,16 @@ static void mlx5e_detach_encap(struct mlx5e_priv *priv,
list_del(&flow->encap);
if (list_empty(next)) {
struct mlx5_encap_entry *e;
struct mlx5e_encap_entry *e;
e = list_entry(next, struct mlx5e_encap_entry, flows);
mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
e = list_entry(next, struct mlx5_encap_entry, flows);
if (e->n) {
if (e->flags & MLX5_ENCAP_ENTRY_VALID)
mlx5_encap_dealloc(priv->mdev, e->encap_id);
neigh_release(e->n);
}
hlist_del_rcu(&e->encap_hlist);
kfree(e->encap_header);
kfree(e);
}
}
......@@ -702,16 +817,18 @@ static int parse_cls_flower(struct mlx5e_priv *priv,
{
struct mlx5_core_dev *dev = priv->mdev;
struct mlx5_eswitch *esw = dev->priv.eswitch;
struct mlx5_eswitch_rep *rep = priv->ppriv;
struct mlx5e_rep_priv *rpriv = priv->ppriv;
struct mlx5_eswitch_rep *rep;
u8 min_inline;
int err;
err = __parse_cls_flower(priv, spec, f, &min_inline);
if (!err && (flow->flags & MLX5E_TC_FLOW_ESWITCH) &&
rep->vport != FDB_UPLINK_VPORT) {
if (esw->offloads.inline_mode != MLX5_INLINE_MODE_NONE &&
esw->offloads.inline_mode < min_inline) {
if (!err && (flow->flags & MLX5E_TC_FLOW_ESWITCH)) {
rep = rpriv->rep;
if (rep->vport != FDB_UPLINK_VPORT &&
(esw->offloads.inline_mode != MLX5_INLINE_MODE_NONE &&
esw->offloads.inline_mode < min_inline)) {
netdev_warn(priv->netdev,
"Flow is not offloaded due to min inline setting, required %d actual %d\n",
min_inline, esw->offloads.inline_mode);
......@@ -1208,16 +1325,17 @@ static void gen_vxlan_header_ipv6(struct net_device *out_dev,
static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
struct net_device *mirred_dev,
struct mlx5_encap_entry *e,
struct net_device **out_dev)
struct mlx5e_encap_entry *e)
{
int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
int ipv4_encap_size = ETH_HLEN + sizeof(struct iphdr) + VXLAN_HLEN;
struct ip_tunnel_key *tun_key = &e->tun_info.key;
struct net_device *out_dev;
struct neighbour *n = NULL;
struct flowi4 fl4 = {};
char *encap_header;
int ttl, err;
u8 nud_state;
if (max_encap_size < ipv4_encap_size) {
mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n",
......@@ -1242,25 +1360,36 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
fl4.daddr = tun_key->u.ipv4.dst;
fl4.saddr = tun_key->u.ipv4.src;
err = mlx5e_route_lookup_ipv4(priv, mirred_dev, out_dev,
err = mlx5e_route_lookup_ipv4(priv, mirred_dev, &out_dev,
&fl4, &n, &ttl);
if (err)
goto out;
if (!(n->nud_state & NUD_VALID)) {
pr_warn("%s: can't offload, neighbour to %pI4 invalid\n", __func__, &fl4.daddr);
err = -EOPNOTSUPP;
/* used by mlx5e_detach_encap to lookup a neigh hash table
* entry in the neigh hash table when a user deletes a rule
*/
e->m_neigh.dev = n->dev;
e->m_neigh.family = n->ops->family;
memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
e->out_dev = out_dev;
/* It's important to add the neigh to the hash table before checking
* the neigh validity state, so that if we get a notification when the
* neigh changes its validity state, we will find the relevant neigh
* in the hash.
*/
err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e);
if (err)
goto out;
}
e->n = n;
e->out_dev = *out_dev;
neigh_ha_snapshot(e->h_dest, n, *out_dev);
read_lock_bh(&n->lock);
nud_state = n->nud_state;
ether_addr_copy(e->h_dest, n->ha);
read_unlock_bh(&n->lock);
switch (e->tunnel_type) {
case MLX5_HEADER_TYPE_VXLAN:
gen_vxlan_header_ipv4(*out_dev, encap_header,
gen_vxlan_header_ipv4(out_dev, encap_header,
ipv4_encap_size, e->h_dest, ttl,
fl4.daddr,
fl4.saddr, tun_key->tp_dst,
......@@ -1268,31 +1397,49 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
break;
default:
err = -EOPNOTSUPP;
goto out;
goto destroy_neigh_entry;
}
e->encap_size = ipv4_encap_size;
e->encap_header = encap_header;
if (!(nud_state & NUD_VALID)) {
neigh_event_send(n, NULL);
neigh_release(n);
return -EAGAIN;
}
err = mlx5_encap_alloc(priv->mdev, e->tunnel_type,
ipv4_encap_size, encap_header, &e->encap_id);
if (err)
goto destroy_neigh_entry;
e->flags |= MLX5_ENCAP_ENTRY_VALID;
mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev));
neigh_release(n);
return err;
destroy_neigh_entry:
mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
out:
if (err && n)
neigh_release(n);
kfree(encap_header);
if (n)
neigh_release(n);
return err;
}
static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
struct net_device *mirred_dev,
struct mlx5_encap_entry *e,
struct net_device **out_dev)
struct mlx5e_encap_entry *e)
{
int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
int ipv6_encap_size = ETH_HLEN + sizeof(struct ipv6hdr) + VXLAN_HLEN;
struct ip_tunnel_key *tun_key = &e->tun_info.key;
struct net_device *out_dev;
struct neighbour *n = NULL;
struct flowi6 fl6 = {};
char *encap_header;
int err, ttl = 0;
u8 nud_state;
if (max_encap_size < ipv6_encap_size) {
mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n",
......@@ -1318,25 +1465,36 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
fl6.daddr = tun_key->u.ipv6.dst;
fl6.saddr = tun_key->u.ipv6.src;
err = mlx5e_route_lookup_ipv6(priv, mirred_dev, out_dev,
err = mlx5e_route_lookup_ipv6(priv, mirred_dev, &out_dev,
&fl6, &n, &ttl);
if (err)
goto out;
if (!(n->nud_state & NUD_VALID)) {
pr_warn("%s: can't offload, neighbour to %pI6 invalid\n", __func__, &fl6.daddr);
err = -EOPNOTSUPP;
/* used by mlx5e_detach_encap to lookup a neigh hash table
* entry in the neigh hash table when a user deletes a rule
*/
e->m_neigh.dev = n->dev;
e->m_neigh.family = n->ops->family;
memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
e->out_dev = out_dev;
/* It's important to add the neigh to the hash table before checking
* the neigh validity state, so that if we get a notification when the
* neigh changes its validity state, we will find the relevant neigh
* in the hash.
*/
err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e);
if (err)
goto out;
}
e->n = n;
e->out_dev = *out_dev;
neigh_ha_snapshot(e->h_dest, n, *out_dev);
read_lock_bh(&n->lock);
nud_state = n->nud_state;
ether_addr_copy(e->h_dest, n->ha);
read_unlock_bh(&n->lock);
switch (e->tunnel_type) {
case MLX5_HEADER_TYPE_VXLAN:
gen_vxlan_header_ipv6(*out_dev, encap_header,
gen_vxlan_header_ipv6(out_dev, encap_header,
ipv6_encap_size, e->h_dest, ttl,
&fl6.daddr,
&fl6.saddr, tun_key->tp_dst,
......@@ -1344,31 +1502,51 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
break;
default:
err = -EOPNOTSUPP;
goto out;
goto destroy_neigh_entry;
}
e->encap_size = ipv6_encap_size;
e->encap_header = encap_header;
if (!(nud_state & NUD_VALID)) {
neigh_event_send(n, NULL);
neigh_release(n);
return -EAGAIN;
}
err = mlx5_encap_alloc(priv->mdev, e->tunnel_type,
ipv6_encap_size, encap_header, &e->encap_id);
if (err)
goto destroy_neigh_entry;
e->flags |= MLX5_ENCAP_ENTRY_VALID;
mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev));
neigh_release(n);
return err;
destroy_neigh_entry:
mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
out:
if (err && n)
neigh_release(n);
kfree(encap_header);
if (n)
neigh_release(n);
return err;
}
static int mlx5e_attach_encap(struct mlx5e_priv *priv,
struct ip_tunnel_info *tun_info,
struct net_device *mirred_dev,
struct mlx5_esw_flow_attr *attr)
struct net_device **encap_dev,
struct mlx5e_tc_flow *flow)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct net_device *up_dev = mlx5_eswitch_get_uplink_netdev(esw);
struct mlx5e_priv *up_priv = netdev_priv(up_dev);
unsigned short family = ip_tunnel_info_af(tun_info);
struct mlx5e_priv *up_priv = netdev_priv(up_dev);
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
struct ip_tunnel_key *key = &tun_info->key;
struct mlx5_encap_entry *e;
struct net_device *out_dev;
int tunnel_type, err = -EOPNOTSUPP;
struct mlx5e_encap_entry *e;
int tunnel_type, err = 0;
uintptr_t hash_key;
bool found = false;
......@@ -1403,10 +1581,8 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
}
}
if (found) {
attr->encap = e;
return 0;
}
if (found)
goto attach_flow;
e = kzalloc(sizeof(*e), GFP_KERNEL);
if (!e)
......@@ -1417,16 +1593,21 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
INIT_LIST_HEAD(&e->flows);
if (family == AF_INET)
err = mlx5e_create_encap_header_ipv4(priv, mirred_dev, e, &out_dev);
err = mlx5e_create_encap_header_ipv4(priv, mirred_dev, e);
else if (family == AF_INET6)
err = mlx5e_create_encap_header_ipv6(priv, mirred_dev, e, &out_dev);
err = mlx5e_create_encap_header_ipv6(priv, mirred_dev, e);
if (err)
if (err && err != -EAGAIN)
goto out_err;
attr->encap = e;
hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key);
attach_flow:
list_add(&flow->encap, &e->flows);
*encap_dev = e->out_dev;
if (e->flags & MLX5_ENCAP_ENTRY_VALID)
attr->encap_id = e->encap_id;
return err;
out_err:
......@@ -1439,17 +1620,18 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
struct mlx5e_tc_flow *flow)
{
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
struct mlx5e_rep_priv *rpriv = priv->ppriv;
struct ip_tunnel_info *info = NULL;
const struct tc_action *a;
LIST_HEAD(actions);
bool encap = false;
int err;
int err = 0;
if (tc_no_actions(exts))
return -EINVAL;
memset(attr, 0, sizeof(*attr));
attr->in_rep = priv->ppriv;
attr->in_rep = rpriv->rep;
tcf_exts_to_list(exts, &actions);
list_for_each_entry(a, &actions, list) {
......@@ -1471,7 +1653,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
if (is_tcf_mirred_egress_redirect(a)) {
int ifindex = tcf_mirred_ifindex(a);
struct net_device *out_dev;
struct net_device *out_dev, *encap_dev = NULL;
struct mlx5e_priv *out_priv;
out_dev = __dev_get_by_index(dev_net(priv->netdev), ifindex);
......@@ -1481,18 +1663,20 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
MLX5_FLOW_CONTEXT_ACTION_COUNT;
out_priv = netdev_priv(out_dev);
attr->out_rep = out_priv->ppriv;
rpriv = out_priv->ppriv;
attr->out_rep = rpriv->rep;
} else if (encap) {
err = mlx5e_attach_encap(priv, info,
out_dev, attr);
if (err)
out_dev, &encap_dev, flow);
if (err && err != -EAGAIN)
return err;
list_add(&flow->encap, &attr->encap->flows);
attr->action |= MLX5_FLOW_CONTEXT_ACTION_ENCAP |
MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
MLX5_FLOW_CONTEXT_ACTION_COUNT;
out_priv = netdev_priv(attr->encap->out_dev);
attr->out_rep = out_priv->ppriv;
out_priv = netdev_priv(encap_dev);
rpriv = out_priv->ppriv;
attr->out_rep = rpriv->rep;
attr->parse_attr = parse_attr;
} else {
pr_err("devices %s %s not on same switch HW, can't offload forwarding\n",
priv->netdev->name, out_dev->name);
......@@ -1532,7 +1716,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
return -EINVAL;
}
return 0;
return err;
}
int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol,
......@@ -1570,7 +1754,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol,
if (flow->flags & MLX5E_TC_FLOW_ESWITCH) {
err = parse_tc_fdb_actions(priv, f->exts, parse_attr, flow);
if (err < 0)
goto err_free;
goto err_handle_encap_flow;
flow->rule = mlx5e_tc_add_fdb_flow(priv, parse_attr, flow);
} else {
err = parse_tc_nic_actions(priv, f->exts, parse_attr, flow);
......@@ -1584,20 +1768,33 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol,
goto err_free;
}
flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
err = rhashtable_insert_fast(&tc->ht, &flow->node,
tc->ht_params);
if (err)
goto err_del_rule;
goto out;
if (flow->flags & MLX5E_TC_FLOW_ESWITCH &&
!(flow->esw_attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP))
kvfree(parse_attr);
return err;
err_del_rule:
mlx5e_tc_del_flow(priv, flow);
err_handle_encap_flow:
if (err == -EAGAIN) {
err = rhashtable_insert_fast(&tc->ht, &flow->node,
tc->ht_params);
if (err)
mlx5e_tc_del_flow(priv, flow);
else
return 0;
}
err_free:
kfree(flow);
out:
kvfree(parse_attr);
kfree(flow);
return err;
}
......@@ -1616,7 +1813,6 @@ int mlx5e_delete_flower(struct mlx5e_priv *priv,
mlx5e_tc_del_flow(priv, flow);
kfree(flow);
return 0;
......@@ -1639,6 +1835,9 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv,
if (!flow)
return -EINVAL;
if (!(flow->flags & MLX5E_TC_FLOW_OFFLOADED))
return 0;
counter = mlx5_flow_rule_counter(flow->rule);
if (!counter)
return 0;
......
......@@ -46,6 +46,15 @@ int mlx5e_delete_flower(struct mlx5e_priv *priv,
int mlx5e_stats_flower(struct mlx5e_priv *priv,
struct tc_cls_flower_offload *f);
struct mlx5e_encap_entry;
void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
struct mlx5e_encap_entry *e);
void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
struct mlx5e_encap_entry *e);
struct mlx5e_neigh_hash_entry;
void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe);
static inline int mlx5e_tc_num_filters(struct mlx5e_priv *priv)
{
return atomic_read(&priv->fs.tc.ht.nelems);
......
......@@ -37,8 +37,8 @@ struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq)
struct mlx5_cqwq *wq = &cq->wq;
u32 ci = mlx5_cqwq_get_ci(wq);
struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(wq, ci);
int cqe_ownership_bit = cqe->op_own & MLX5_CQE_OWNER_MASK;
int sw_ownership_val = mlx5_cqwq_get_wrap_cnt(wq) & 1;
u8 cqe_ownership_bit = cqe->op_own & MLX5_CQE_OWNER_MASK;
u8 sw_ownership_val = mlx5_cqwq_get_wrap_cnt(wq) & 1;
if (cqe_ownership_bit != sw_ownership_val)
return NULL;
......@@ -49,10 +49,40 @@ struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq)
return cqe;
}
static inline void mlx5e_poll_ico_single_cqe(struct mlx5e_cq *cq,
struct mlx5e_icosq *sq,
struct mlx5_cqe64 *cqe,
u16 *sqcc)
{
struct mlx5_wq_cyc *wq = &sq->wq;
u16 ci = be16_to_cpu(cqe->wqe_counter) & wq->sz_m1;
struct mlx5e_sq_wqe_info *icowi = &sq->db.ico_wqe[ci];
struct mlx5e_rq *rq = &sq->channel->rq;
prefetch(rq);
mlx5_cqwq_pop(&cq->wq);
*sqcc += icowi->num_wqebbs;
if (unlikely((cqe->op_own >> 4) != MLX5_CQE_REQ)) {
WARN_ONCE(true, "mlx5e: Bad OP in ICOSQ CQE: 0x%x\n",
cqe->op_own);
return;
}
if (likely(icowi->opcode == MLX5_OPCODE_UMR)) {
mlx5e_post_rx_mpwqe(rq);
return;
}
if (unlikely(icowi->opcode != MLX5_OPCODE_NOP))
WARN_ONCE(true,
"mlx5e: Bad OPCODE in ICOSQ WQE info: 0x%x\n",
icowi->opcode);
}
static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq)
{
struct mlx5e_icosq *sq = container_of(cq, struct mlx5e_icosq, cq);
struct mlx5_wq_cyc *wq;
struct mlx5_cqe64 *cqe;
u16 sqcc;
......@@ -63,39 +93,13 @@ static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq)
if (likely(!cqe))
return;
wq = &sq->wq;
/* sq->cc must be updated only after mlx5_cqwq_update_db_record(),
* otherwise a cq overrun may occur
*/
sqcc = sq->cc;
do {
u16 ci = be16_to_cpu(cqe->wqe_counter) & wq->sz_m1;
struct mlx5e_sq_wqe_info *icowi = &sq->db.ico_wqe[ci];
mlx5_cqwq_pop(&cq->wq);
sqcc += icowi->num_wqebbs;
if (unlikely((cqe->op_own >> 4) != MLX5_CQE_REQ)) {
WARN_ONCE(true, "mlx5e: Bad OP in ICOSQ CQE: 0x%x\n",
cqe->op_own);
break;
}
switch (icowi->opcode) {
case MLX5_OPCODE_NOP:
break;
case MLX5_OPCODE_UMR:
mlx5e_post_rx_mpwqe(&sq->channel->rq);
break;
default:
WARN_ONCE(true,
"mlx5e: Bad OPCODE in ICOSQ WQE info: 0x%x\n",
icowi->opcode);
}
} while ((cqe = mlx5e_get_cqe(cq)));
/* by design, there's only a single cqe */
mlx5e_poll_ico_single_cqe(cq, sq, cqe, &sqcc);
mlx5_cqwq_update_db_record(&cq->wq);
......
......@@ -53,13 +53,6 @@ struct esw_uc_addr {
u32 vport;
};
/* E-Switch MC FDB table hash node */
struct esw_mc_addr { /* SRIOV only */
struct l2addr_node node;
struct mlx5_flow_handle *uplink_rule; /* Forward to uplink rule */
u32 refcnt;
};
/* Vport UC/MC hash node */
struct vport_addr {
struct l2addr_node node;
......@@ -817,7 +810,7 @@ static void esw_update_vport_mc_promisc(struct mlx5_eswitch *esw, u32 vport_num)
static void esw_apply_vport_rx_mode(struct mlx5_eswitch *esw, u32 vport_num,
bool promisc, bool mc_promisc)
{
struct esw_mc_addr *allmulti_addr = esw->mc_promisc;
struct esw_mc_addr *allmulti_addr = &esw->mc_promisc;
struct mlx5_vport *vport = &esw->vports[vport_num];
if (IS_ERR_OR_NULL(vport->allmulti_rule) != mc_promisc)
......@@ -1688,7 +1681,7 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
esw_info(esw->dev, "disable SRIOV: active vports(%d) mode(%d)\n",
esw->enabled_vports, esw->mode);
mc_promisc = esw->mc_promisc;
mc_promisc = &esw->mc_promisc;
nvports = esw->enabled_vports;
for (i = 0; i < esw->total_vports; i++)
......@@ -1732,7 +1725,6 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
{
int l2_table_size = 1 << MLX5_CAP_GEN(dev, log_max_l2_table);
int total_vports = MLX5_TOTAL_VPORTS(dev);
struct esw_mc_addr *mc_promisc;
struct mlx5_eswitch *esw;
int vport_num;
int err;
......@@ -1761,13 +1753,6 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
}
esw->l2_table.size = l2_table_size;
mc_promisc = kzalloc(sizeof(*mc_promisc), GFP_KERNEL);
if (!mc_promisc) {
err = -ENOMEM;
goto abort;
}
esw->mc_promisc = mc_promisc;
esw->work_queue = create_singlethread_workqueue("mlx5_esw_wq");
if (!esw->work_queue) {
err = -ENOMEM;
......@@ -1835,7 +1820,6 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw)
esw->dev->priv.eswitch = NULL;
destroy_workqueue(esw->work_queue);
kfree(esw->l2_table.bitmap);
kfree(esw->mc_promisc);
kfree(esw->offloads.vport_reps);
kfree(esw->vports);
kfree(esw);
......
......@@ -36,7 +36,6 @@
#include <linux/if_ether.h>
#include <linux/if_link.h>
#include <net/devlink.h>
#include <net/ip_tunnels.h>
#include <linux/mlx5/device.h>
#define MLX5_MAX_UC_PER_VPORT(dev) \
......@@ -213,6 +212,13 @@ struct mlx5_esw_offload {
u8 encap;
};
/* E-Switch MC FDB table hash node */
struct esw_mc_addr { /* SRIOV only */
struct l2addr_node node;
struct mlx5_flow_handle *uplink_rule; /* Forward to uplink rule */
u32 refcnt;
};
struct mlx5_eswitch {
struct mlx5_core_dev *dev;
struct mlx5_l2_table l2_table;
......@@ -226,7 +232,7 @@ struct mlx5_eswitch {
* and async SRIOV admin state changes
*/
struct mutex state_lock;
struct esw_mc_addr *mc_promisc;
struct esw_mc_addr mc_promisc;
struct {
bool enabled;
......@@ -289,18 +295,6 @@ enum {
#define MLX5_FLOW_CONTEXT_ACTION_VLAN_POP 0x4000
#define MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH 0x8000
struct mlx5_encap_entry {
struct hlist_node encap_hlist;
struct list_head flows;
u32 encap_id;
struct neighbour *n;
struct ip_tunnel_info tun_info;
unsigned char h_dest[ETH_ALEN]; /* destination eth addr */
struct net_device *out_dev;
int tunnel_type;
};
struct mlx5_esw_flow_attr {
struct mlx5_eswitch_rep *in_rep;
struct mlx5_eswitch_rep *out_rep;
......@@ -308,8 +302,9 @@ struct mlx5_esw_flow_attr {
int action;
u16 vlan;
bool vlan_handled;
struct mlx5_encap_entry *encap;
u32 encap_id;
u32 mod_hdr_id;
struct mlx5e_tc_flow_parse_attr *parse_attr;
};
int mlx5_eswitch_sqs2vport_start(struct mlx5_eswitch *esw,
......
......@@ -92,7 +92,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
flow_act.modify_id = attr->mod_hdr_id;
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP)
flow_act.encap_id = attr->encap->encap_id;
flow_act.encap_id = attr->encap_id;
rule = mlx5_add_flow_rules((struct mlx5_flow_table *)esw->fdb_table.fdb,
spec, &flow_act, dest, i);
......
......@@ -199,6 +199,11 @@ struct mlx5_flow_root_namespace {
int mlx5_init_fc_stats(struct mlx5_core_dev *dev);
void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev);
void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
struct delayed_work *dwork,
unsigned long delay);
void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev,
unsigned long interval);
int mlx5_init_fs(struct mlx5_core_dev *dev);
void mlx5_cleanup_fs(struct mlx5_core_dev *dev);
......
......@@ -165,7 +165,8 @@ static void mlx5_fc_stats_work(struct work_struct *work)
list_splice_tail_init(&fc_stats->addlist, &tmplist);
if (!list_empty(&tmplist) || !RB_EMPTY_ROOT(&fc_stats->counters))
queue_delayed_work(fc_stats->wq, &fc_stats->work, MLX5_FC_STATS_PERIOD);
queue_delayed_work(fc_stats->wq, &fc_stats->work,
fc_stats->sampling_interval);
spin_unlock(&fc_stats->addlist_lock);
......@@ -200,7 +201,7 @@ static void mlx5_fc_stats_work(struct work_struct *work)
node = mlx5_fc_stats_query(dev, counter, last->id);
}
fc_stats->next_query = now + MLX5_FC_STATS_PERIOD;
fc_stats->next_query = now + fc_stats->sampling_interval;
}
struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
......@@ -265,6 +266,7 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
if (!fc_stats->wq)
return -ENOMEM;
fc_stats->sampling_interval = MLX5_FC_STATS_PERIOD;
INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work);
return 0;
......@@ -317,3 +319,21 @@ void mlx5_fc_query_cached(struct mlx5_fc *counter,
counter->lastbytes = c.bytes;
counter->lastpackets = c.packets;
}
void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
struct delayed_work *dwork,
unsigned long delay)
{
struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
queue_delayed_work(fc_stats->wq, dwork, delay);
}
void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev,
unsigned long interval)
{
struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
fc_stats->sampling_interval = min_t(unsigned long, interval,
fc_stats->sampling_interval);
}
......@@ -540,6 +540,7 @@ struct mlx5_fc_stats {
struct workqueue_struct *wq;
struct delayed_work work;
unsigned long next_query;
unsigned long sampling_interval; /* jiffies */
};
struct mlx5_eswitch;
......