diff --git a/Documentation/networking/netvsc.txt b/Documentation/networking/netvsc.txt index 92f5b31392fac508f4a465b64de130de4ab1f4ee..3bfa635bbbd598d0cf53689be706ac6679e0e53a 100644 --- a/Documentation/networking/netvsc.txt +++ b/Documentation/networking/netvsc.txt @@ -45,6 +45,15 @@ Features like packets and significantly reduces CPU usage under heavy Rx load. + Large Receive Offload (LRO), or Receive Side Coalescing (RSC) + ------------------------------------------------------------- + The driver supports LRO/RSC in the vSwitch feature. It reduces the per packet + processing overhead by coalescing multiple TCP segments when possible. The + feature is enabled by default on VMs running on Windows Server 2019 and + later. It may be changed by ethtool command: + ethtool -K eth0 lro on + ethtool -K eth0 lro off + SR-IOV support -------------- Hyper-V supports SR-IOV as a hardware acceleration option. If SR-IOV diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index a32ded5b4f416f662e2a820356f16c9bfbef00db..ef6f766f6389380894d855494d371035fc26325e 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -185,7 +185,9 @@ struct rndis_device { /* Interface */ struct rndis_message; +struct ndis_offload_params; struct netvsc_device; +struct netvsc_channel; struct net_device_context; extern u32 netvsc_ring_bytes; @@ -203,10 +205,7 @@ void netvsc_linkstatus_callback(struct net_device *net, struct rndis_message *resp); int netvsc_recv_callback(struct net_device *net, struct netvsc_device *nvdev, - struct vmbus_channel *channel, - void *data, u32 len, - const struct ndis_tcp_ip_checksum_info *csum_info, - const struct ndis_pkt_8021q_info *vlan); + struct netvsc_channel *nvchan); void netvsc_channel_cb(void *context); int netvsc_poll(struct napi_struct *napi, int budget); @@ -220,9 +219,12 @@ void rndis_filter_device_remove(struct hv_device *dev, struct netvsc_device *nvdev); int rndis_filter_set_rss_param(struct rndis_device *rdev, const u8 *key); +int rndis_filter_set_offload_params(struct net_device *ndev, + struct netvsc_device *nvdev, + struct ndis_offload_params *req_offloads); int rndis_filter_receive(struct net_device *ndev, struct netvsc_device *net_dev, - struct vmbus_channel *channel, + struct netvsc_channel *nvchan, void *data, u32 buflen); int rndis_filter_set_device_mac(struct netvsc_device *ndev, @@ -524,6 +526,8 @@ struct nvsp_2_vsc_capability { u64 ieee8021q:1; u64 correlation_id:1; u64 teaming:1; + u64 vsubnetid:1; + u64 rsc:1; }; }; } __packed; @@ -826,7 +830,7 @@ struct nvsp_message { #define NETVSC_SUPPORTED_HW_FEATURES (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | \ NETIF_F_TSO | NETIF_F_IPV6_CSUM | \ - NETIF_F_TSO6) + NETIF_F_TSO6 | NETIF_F_LRO) #define VRSS_SEND_TAB_SIZE 16 /* must be power of 2 */ #define VRSS_CHANNEL_MAX 64 @@ -852,6 +856,18 @@ struct multi_recv_comp { u32 next; /* next entry for writing */ }; +#define NVSP_RSC_MAX 562 /* Max #RSC frags in a vmbus xfer page pkt */ + +struct nvsc_rsc { + const struct ndis_pkt_8021q_info *vlan; + const struct ndis_tcp_ip_checksum_info *csum_info; + u8 is_last; /* last RNDIS msg in a vmtransfer_page */ + u32 cnt; /* #fragments in an RSC packet */ + u32 pktlen; /* Full packet length */ + void *data[NVSP_RSC_MAX]; + u32 len[NVSP_RSC_MAX]; +}; + struct netvsc_stats { u64 packets; u64 bytes; @@ -955,6 +971,7 @@ struct netvsc_channel { struct multi_send_data msd; struct multi_recv_comp mrc; atomic_t queue_sends; + struct nvsc_rsc rsc; struct netvsc_stats tx_stats; struct netvsc_stats rx_stats; @@ -1136,7 +1153,8 @@ struct rndis_oobd { /* Packet extension field contents associated with a Data message. */ struct rndis_per_packet_info { u32 size; - u32 type; + u32 type:31; + u32 internal:1; u32 ppi_offset; }; @@ -1157,6 +1175,25 @@ enum ndis_per_pkt_info_type { MAX_PER_PKT_INFO }; +enum rndis_per_pkt_info_interal_type { + RNDIS_PKTINFO_ID = 1, + /* Add more memebers here */ + + RNDIS_PKTINFO_MAX +}; + +#define RNDIS_PKTINFO_SUBALLOC BIT(0) +#define RNDIS_PKTINFO_1ST_FRAG BIT(1) +#define RNDIS_PKTINFO_LAST_FRAG BIT(2) + +#define RNDIS_PKTINFO_ID_V1 1 + +struct rndis_pktinfo_id { + u8 ver; + u8 flag; + u16 pkt_id; +}; + struct ndis_pkt_8021q_info { union { struct { diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index fe01e141c8f87d50e42a5cb2670ff5ba4921744a..922054c1d5448bf6406742c52a94a7bc4226575c 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -542,6 +542,9 @@ static int negotiate_nvsp_ver(struct hv_device *device, init_packet->msg.v2_msg.send_ndis_config.capability.teaming = 1; } + if (nvsp_ver >= NVSP_PROTOCOL_VERSION_61) + init_packet->msg.v2_msg.send_ndis_config.capability.rsc = 1; + trace_nvsp_send(ndev, init_packet); ret = vmbus_sendpacket(device->channel, init_packet, @@ -1111,11 +1114,12 @@ static void enq_receive_complete(struct net_device *ndev, static int netvsc_receive(struct net_device *ndev, struct netvsc_device *net_device, - struct vmbus_channel *channel, + struct netvsc_channel *nvchan, const struct vmpacket_descriptor *desc, const struct nvsp_message *nvsp) { struct net_device_context *net_device_ctx = netdev_priv(ndev); + struct vmbus_channel *channel = nvchan->channel; const struct vmtransfer_page_packet_header *vmxferpage_packet = container_of(desc, const struct vmtransfer_page_packet_header, d); u16 q_idx = channel->offermsg.offer.sub_channel_index; @@ -1150,6 +1154,7 @@ static int netvsc_receive(struct net_device *ndev, int ret; if (unlikely(offset + buflen > net_device->recv_buf_size)) { + nvchan->rsc.cnt = 0; status = NVSP_STAT_FAIL; netif_err(net_device_ctx, rx_err, ndev, "Packet offset:%u + len:%u too big\n", @@ -1160,11 +1165,13 @@ static int netvsc_receive(struct net_device *ndev, data = recv_buf + offset; + nvchan->rsc.is_last = (i == count - 1); + trace_rndis_recv(ndev, q_idx, data); /* Pass it to the upper layer */ ret = rndis_filter_receive(ndev, net_device, - channel, data, buflen); + nvchan, data, buflen); if (unlikely(ret != NVSP_STAT_SUCCESS)) status = NVSP_STAT_FAIL; @@ -1223,12 +1230,13 @@ static void netvsc_receive_inband(struct net_device *ndev, } static int netvsc_process_raw_pkt(struct hv_device *device, - struct vmbus_channel *channel, + struct netvsc_channel *nvchan, struct netvsc_device *net_device, struct net_device *ndev, const struct vmpacket_descriptor *desc, int budget) { + struct vmbus_channel *channel = nvchan->channel; const struct nvsp_message *nvmsg = hv_pkt_data(desc); trace_nvsp_recv(ndev, channel, nvmsg); @@ -1240,7 +1248,7 @@ static int netvsc_process_raw_pkt(struct hv_device *device, break; case VM_PKT_DATA_USING_XFER_PAGES: - return netvsc_receive(ndev, net_device, channel, + return netvsc_receive(ndev, net_device, nvchan, desc, nvmsg); break; @@ -1284,7 +1292,7 @@ int netvsc_poll(struct napi_struct *napi, int budget) nvchan->desc = hv_pkt_iter_first(channel); while (nvchan->desc && work_done < budget) { - work_done += netvsc_process_raw_pkt(device, channel, net_device, + work_done += netvsc_process_raw_pkt(device, nvchan, net_device, ndev, nvchan->desc, budget); nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc); } diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 3af6d8d15233756e411500a8753d2789d544f076..ec699741170b6488c2a19a23df3931fcee627d88 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -744,14 +744,16 @@ void netvsc_linkstatus_callback(struct net_device *net, } static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net, - struct napi_struct *napi, - const struct ndis_tcp_ip_checksum_info *csum_info, - const struct ndis_pkt_8021q_info *vlan, - void *data, u32 buflen) + struct netvsc_channel *nvchan) { + struct napi_struct *napi = &nvchan->napi; + const struct ndis_pkt_8021q_info *vlan = nvchan->rsc.vlan; + const struct ndis_tcp_ip_checksum_info *csum_info = + nvchan->rsc.csum_info; struct sk_buff *skb; + int i; - skb = napi_alloc_skb(napi, buflen); + skb = napi_alloc_skb(napi, nvchan->rsc.pktlen); if (!skb) return skb; @@ -759,7 +761,8 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net, * Copy to skb. This copy is needed here since the memory pointed by * hv_netvsc_packet cannot be deallocated */ - skb_put_data(skb, data, buflen); + for (i = 0; i < nvchan->rsc.cnt; i++) + skb_put_data(skb, nvchan->rsc.data[i], nvchan->rsc.len[i]); skb->protocol = eth_type_trans(skb, net); @@ -792,14 +795,11 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net, */ int netvsc_recv_callback(struct net_device *net, struct netvsc_device *net_device, - struct vmbus_channel *channel, - void *data, u32 len, - const struct ndis_tcp_ip_checksum_info *csum_info, - const struct ndis_pkt_8021q_info *vlan) + struct netvsc_channel *nvchan) { struct net_device_context *net_device_ctx = netdev_priv(net); + struct vmbus_channel *channel = nvchan->channel; u16 q_idx = channel->offermsg.offer.sub_channel_index; - struct netvsc_channel *nvchan = &net_device->chan_table[q_idx]; struct sk_buff *skb; struct netvsc_stats *rx_stats; @@ -807,8 +807,8 @@ int netvsc_recv_callback(struct net_device *net, return NVSP_STAT_FAIL; /* Allocate a skb - TODO direct I/O to pages? */ - skb = netvsc_alloc_recv_skb(net, &nvchan->napi, - csum_info, vlan, data, len); + skb = netvsc_alloc_recv_skb(net, nvchan); + if (unlikely(!skb)) { ++net_device_ctx->eth_stats.rx_no_memory; rcu_read_unlock(); @@ -825,7 +825,7 @@ int netvsc_recv_callback(struct net_device *net, rx_stats = &nvchan->rx_stats; u64_stats_update_begin(&rx_stats->syncp); rx_stats->packets++; - rx_stats->bytes += len; + rx_stats->bytes += nvchan->rsc.pktlen; if (skb->pkt_type == PACKET_BROADCAST) ++rx_stats->broadcast; @@ -1006,6 +1006,8 @@ static void netvsc_init_settings(struct net_device *dev) ndc->speed = SPEED_UNKNOWN; ndc->duplex = DUPLEX_FULL; + + dev->features = NETIF_F_LRO; } static int netvsc_get_link_ksettings(struct net_device *dev, @@ -1733,6 +1735,33 @@ static int netvsc_set_ringparam(struct net_device *ndev, return ret; } +static int netvsc_set_features(struct net_device *ndev, + netdev_features_t features) +{ + netdev_features_t change = features ^ ndev->features; + struct net_device_context *ndevctx = netdev_priv(ndev); + struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev); + struct ndis_offload_params offloads; + + if (!nvdev || nvdev->destroy) + return -ENODEV; + + if (!(change & NETIF_F_LRO)) + return 0; + + memset(&offloads, 0, sizeof(struct ndis_offload_params)); + + if (features & NETIF_F_LRO) { + offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED; + offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED; + } else { + offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED; + offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED; + } + + return rndis_filter_set_offload_params(ndev, nvdev, &offloads); +} + static u32 netvsc_get_msglevel(struct net_device *ndev) { struct net_device_context *ndev_ctx = netdev_priv(ndev); @@ -1776,6 +1805,7 @@ static const struct net_device_ops device_ops = { .ndo_start_xmit = netvsc_start_xmit, .ndo_change_rx_flags = netvsc_change_rx_flags, .ndo_set_rx_mode = netvsc_set_rx_mode, + .ndo_set_features = netvsc_set_features, .ndo_change_mtu = netvsc_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = netvsc_set_mac_addr, diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 2a5209f23f296a8036c8aafd8248cc71a1582d23..8b537a049c1e5960c2f3f31c3afc59b4b4ef19e0 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -342,7 +342,8 @@ static void rndis_filter_receive_response(struct net_device *ndev, * Get the Per-Packet-Info with the specified type * return NULL if not found. */ -static inline void *rndis_get_ppi(struct rndis_packet *rpkt, u32 type) +static inline void *rndis_get_ppi(struct rndis_packet *rpkt, + u32 type, u8 internal) { struct rndis_per_packet_info *ppi; int len; @@ -355,7 +356,7 @@ static inline void *rndis_get_ppi(struct rndis_packet *rpkt, u32 type) len = rpkt->per_pkt_info_len; while (len > 0) { - if (ppi->type == type) + if (ppi->type == type && ppi->internal == internal) return (void *)((ulong)ppi + ppi->ppi_offset); len -= ppi->size; ppi = (struct rndis_per_packet_info *)((ulong)ppi + ppi->size); @@ -364,17 +365,41 @@ static inline void *rndis_get_ppi(struct rndis_packet *rpkt, u32 type) return NULL; } +static inline +void rsc_add_data(struct netvsc_channel *nvchan, + const struct ndis_pkt_8021q_info *vlan, + const struct ndis_tcp_ip_checksum_info *csum_info, + void *data, u32 len) +{ + u32 cnt = nvchan->rsc.cnt; + + if (cnt) { + nvchan->rsc.pktlen += len; + } else { + nvchan->rsc.vlan = vlan; + nvchan->rsc.csum_info = csum_info; + nvchan->rsc.pktlen = len; + } + + nvchan->rsc.data[cnt] = data; + nvchan->rsc.len[cnt] = len; + nvchan->rsc.cnt++; +} + static int rndis_filter_receive_data(struct net_device *ndev, struct netvsc_device *nvdev, - struct vmbus_channel *channel, + struct netvsc_channel *nvchan, struct rndis_message *msg, u32 data_buflen) { struct rndis_packet *rndis_pkt = &msg->msg.pkt; const struct ndis_tcp_ip_checksum_info *csum_info; const struct ndis_pkt_8021q_info *vlan; + const struct rndis_pktinfo_id *pktinfo_id; u32 data_offset; void *data; + bool rsc_more = false; + int ret; /* Remove the rndis header and pass it back up the stack */ data_offset = RNDIS_HEADER_SIZE + rndis_pkt->data_offset; @@ -393,25 +418,59 @@ static int rndis_filter_receive_data(struct net_device *ndev, return NVSP_STAT_FAIL; } - vlan = rndis_get_ppi(rndis_pkt, IEEE_8021Q_INFO); + vlan = rndis_get_ppi(rndis_pkt, IEEE_8021Q_INFO, 0); + + csum_info = rndis_get_ppi(rndis_pkt, TCPIP_CHKSUM_PKTINFO, 0); - csum_info = rndis_get_ppi(rndis_pkt, TCPIP_CHKSUM_PKTINFO); + pktinfo_id = rndis_get_ppi(rndis_pkt, RNDIS_PKTINFO_ID, 1); data = (void *)msg + data_offset; - /* - * Remove the rndis trailer padding from rndis packet message + /* Identify RSC frags, drop erroneous packets */ + if (pktinfo_id && (pktinfo_id->flag & RNDIS_PKTINFO_SUBALLOC)) { + if (pktinfo_id->flag & RNDIS_PKTINFO_1ST_FRAG) + nvchan->rsc.cnt = 0; + else if (nvchan->rsc.cnt == 0) + goto drop; + + rsc_more = true; + + if (pktinfo_id->flag & RNDIS_PKTINFO_LAST_FRAG) + rsc_more = false; + + if (rsc_more && nvchan->rsc.is_last) + goto drop; + } else { + nvchan->rsc.cnt = 0; + } + + if (unlikely(nvchan->rsc.cnt >= NVSP_RSC_MAX)) + goto drop; + + /* Put data into per channel structure. + * Also, remove the rndis trailer padding from rndis packet message * rndis_pkt->data_len tell us the real data length, we only copy * the data packet to the stack, without the rndis trailer padding */ - return netvsc_recv_callback(ndev, nvdev, channel, - data, rndis_pkt->data_len, - csum_info, vlan); + rsc_add_data(nvchan, vlan, csum_info, data, rndis_pkt->data_len); + + if (rsc_more) + return NVSP_STAT_SUCCESS; + + ret = netvsc_recv_callback(ndev, nvdev, nvchan); + nvchan->rsc.cnt = 0; + + return ret; + +drop: + /* Drop incomplete packet */ + nvchan->rsc.cnt = 0; + return NVSP_STAT_FAIL; } int rndis_filter_receive(struct net_device *ndev, struct netvsc_device *net_dev, - struct vmbus_channel *channel, + struct netvsc_channel *nvchan, void *data, u32 buflen) { struct net_device_context *net_device_ctx = netdev_priv(ndev); @@ -422,7 +481,7 @@ int rndis_filter_receive(struct net_device *ndev, switch (rndis_msg->ndis_msg_type) { case RNDIS_MSG_PACKET: - return rndis_filter_receive_data(ndev, net_dev, channel, + return rndis_filter_receive_data(ndev, net_dev, nvchan, rndis_msg, buflen); case RNDIS_MSG_INIT_C: case RNDIS_MSG_QUERY_C: @@ -657,7 +716,7 @@ int rndis_filter_set_device_mac(struct netvsc_device *nvdev, return ret; } -static int +int rndis_filter_set_offload_params(struct net_device *ndev, struct netvsc_device *nvdev, struct ndis_offload_params *req_offloads) @@ -1184,6 +1243,18 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, } } + if (hwcaps.rsc.ip4 && hwcaps.rsc.ip6) { + net->hw_features |= NETIF_F_LRO; + + if (net->features & NETIF_F_LRO) { + offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED; + offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED; + } else { + offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED; + offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED; + } + } + /* In case some hw_features disappeared we need to remove them from * net->features list as they're no longer supported. */