diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 9f71844042634e06af148b872e6a343c54a789f1..d9292c59789b09d2bea268ea4b61af45b0f8e619 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -47,11 +47,25 @@
 #include
 #include

+/*
+ * This is the maximum slots a skb can have. If a guest sends a skb
+ * which exceeds this limit it is considered malicious.
+ */
+#define MAX_SKB_SLOTS_DEFAULT 20
+static unsigned int max_skb_slots = MAX_SKB_SLOTS_DEFAULT;
+module_param(max_skb_slots, uint, 0444);
+
+typedef unsigned int pending_ring_idx_t;
+#define INVALID_PENDING_RING_IDX (~0U)
+
 struct pending_tx_info {
-	struct xen_netif_tx_request req;
+	struct xen_netif_tx_request req; /* coalesced tx request */
 	struct xenvif *vif;
+	pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX
+				  * if it is head of one or more tx
+				  * reqs
+				  */
 };
-typedef unsigned int pending_ring_idx_t;

 struct netbk_rx_meta {
 	int id;
@@ -102,7 +116,11 @@ struct xen_netbk {
 	atomic_t netfront_count;

 	struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
-	struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS];
+	/* Coalescing tx requests before copying makes number of grant
+	 * copy ops greater or equal to number of slots required. In
+	 * worst case a tx request consumes 2 gnttab_copy.
+	 */
+	struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];

 	u16 pending_ring[MAX_PENDING_REQS];
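
A tx request is at most PAGE_SIZE bytes (netbk_count_requests() below rejects anything that crosses a page boundary), so when it is coalesced into a destination page it can straddle at most one page boundary and therefore needs at most two gnttab_copy operations. That is the bound the 2*MAX_PENDING_REQS sizing above relies on. A minimal sketch of the per-request cost, with an invented helper name that is not part of the patch:

    static unsigned int copies_for_request(unsigned int dst_offset,
                                           unsigned int req_size)
    {
            /* Space left in the destination page currently being filled. */
            unsigned int room = PAGE_SIZE - dst_offset;

            /* Whatever does not fit spills into exactly one more page. */
            return (req_size > room) ? 2 : 1;
    }
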
@@ -118,6 +136,16 @@ struct xen_netbk {
 static struct xen_netbk *xen_netbk;
 static int xen_netbk_group_nr;

+/*
+ * If head != INVALID_PENDING_RING_IDX, it means this tx request is head of
+ * one or more merged tx requests, otherwise it is the continuation of
+ * previous tx request.
+ */
+static inline int pending_tx_is_head(struct xen_netbk *netbk, RING_IDX idx)
+{
+	return netbk->pending_tx_info[idx].head != INVALID_PENDING_RING_IDX;
+}
+
 void xen_netbk_add_xenvif(struct xenvif *vif)
 {
 	int i;
@@ -250,6 +278,7 @@ static int max_required_rx_slots(struct xenvif *vif)
 {
 	int max = DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE);

+	/* XXX FIXME: RX path dependent on MAX_SKB_FRAGS */
 	if (vif->can_sg || vif->gso || vif->gso_prefix)
 		max += MAX_SKB_FRAGS + 1; /* extra_info + frags */
@@ -657,6 +686,7 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk)
 		__skb_queue_tail(&rxq, skb);

 		/* Filled the batch queue? */
+		/* XXX FIXME: RX path dependent on MAX_SKB_FRAGS */
 		if (count + MAX_SKB_FRAGS >= XEN_NETIF_RX_RING_SIZE)
 			break;
 	}
@@ -898,47 +928,78 @@ static void netbk_fatal_tx_err(struct xenvif *vif)

 static int netbk_count_requests(struct xenvif *vif,
 				struct xen_netif_tx_request *first,
+				RING_IDX first_idx,
 				struct xen_netif_tx_request *txp,
 				int work_to_do)
 {
 	RING_IDX cons = vif->tx.req_cons;
-	int frags = 0;
+	int slots = 0;
+	int drop_err = 0;

 	if (!(first->flags & XEN_NETTXF_more_data))
 		return 0;

 	do {
-		if (frags >= work_to_do) {
-			netdev_err(vif->dev, "Need more frags\n");
+		if (slots >= work_to_do) {
+			netdev_err(vif->dev,
+				   "Asked for %d slots but exceeds this limit\n",
+				   work_to_do);
 			netbk_fatal_tx_err(vif);
 			return -ENODATA;
 		}

-		if (unlikely(frags >= MAX_SKB_FRAGS)) {
-			netdev_err(vif->dev, "Too many frags\n");
+		/* This guest is really using too many slots and
+		 * considered malicious.
+		 */
+		if (unlikely(slots >= max_skb_slots)) {
+			netdev_err(vif->dev,
+				   "Malicious frontend using %d slots, threshold %u\n",
+				   slots, max_skb_slots);
 			netbk_fatal_tx_err(vif);
 			return -E2BIG;
 		}

-		memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + frags),
+		/* Xen network protocol had implicit dependency on
+		 * MAX_SKB_FRAGS. XEN_NETIF_NR_SLOTS_MIN is set to the
+		 * historical MAX_SKB_FRAGS value 18 to honor the same
+		 * behavior as before. Any packet using more than 18
+		 * slots but less than max_skb_slots slots is dropped
+		 */
+		if (!drop_err && slots >= XEN_NETIF_NR_SLOTS_MIN) {
+			if (net_ratelimit())
+				netdev_dbg(vif->dev,
+					   "Too many slots (%d) exceeding limit (%d), dropping packet\n",
+					   slots, XEN_NETIF_NR_SLOTS_MIN);
+			drop_err = -E2BIG;
+		}
+
+		memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + slots),
 		       sizeof(*txp));
 		if (txp->size > first->size) {
-			netdev_err(vif->dev, "Frag is bigger than frame.\n");
+			netdev_err(vif->dev,
+				   "Invalid tx request, slot size %u > remaining size %u\n",
+				   txp->size, first->size);
 			netbk_fatal_tx_err(vif);
 			return -EIO;
 		}

 		first->size -= txp->size;
-		frags++;
+		slots++;

 		if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
-			netdev_err(vif->dev, "txp->offset: %x, size: %u\n",
+			netdev_err(vif->dev, "Cross page boundary, txp->offset: %x, size: %u\n",
 				   txp->offset, txp->size);
 			netbk_fatal_tx_err(vif);
 			return -EINVAL;
 		}
 	} while ((txp++)->flags & XEN_NETTXF_more_data);

-	return frags;
+
+	if (drop_err) {
+		netbk_tx_err(vif, first, first_idx + slots);
+		return drop_err;
+	}
+
+	return slots;
 }

 static struct page *xen_netbk_alloc_page(struct xen_netbk *netbk,
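
Taken together, the three checks above give a graded response instead of the old all-or-nothing MAX_SKB_FRAGS test. A rough summary as code, purely illustrative and not part of the patch (the enum and helper names are invented; 'slots' is the number of follow-on requests chained after the first one):

    enum count_verdict { COUNT_OK, COUNT_DROP, COUNT_FATAL };

    static enum count_verdict classify_chain(int slots, int work_to_do)
    {
            if (slots >= work_to_do || slots >= max_skb_slots)
                    return COUNT_FATAL;  /* vif torn down via netbk_fatal_tx_err() */
            if (slots >= XEN_NETIF_NR_SLOTS_MIN)
                    return COUNT_DROP;   /* packet dropped via netbk_tx_err(), vif survives */
            return COUNT_OK;
    }

With the default max_skb_slots of 20, a packet whose chain reaches 18 follow-on slots is merely dropped, while one that reaches 20 is treated as malicious and the interface is disabled.
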
@@ -962,48 +1023,114 @@ static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	skb_frag_t *frags = shinfo->frags;
 	u16 pending_idx = *((u16 *)skb->data);
-	int i, start;
+	u16 head_idx = 0;
+	int slot, start;
+	struct page *page;
+	pending_ring_idx_t index, start_idx = 0;
+	uint16_t dst_offset;
+	unsigned int nr_slots;
+	struct pending_tx_info *first = NULL;
+
+	/* At this point shinfo->nr_frags is in fact the number of
+	 * slots, which can be as large as XEN_NETIF_NR_SLOTS_MIN.
+	 */
+	nr_slots = shinfo->nr_frags;

 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);

-	for (i = start; i < shinfo->nr_frags; i++, txp++) {
-		struct page *page;
-		pending_ring_idx_t index;
+	/* Coalesce tx requests, at this point the packet passed in
+	 * should be <= 64K. Any packets larger than 64K have been
+	 * handled in netbk_count_requests().
+	 */
+	for (shinfo->nr_frags = slot = start; slot < nr_slots;
+	     shinfo->nr_frags++) {
 		struct pending_tx_info *pending_tx_info =
 			netbk->pending_tx_info;

-		index = pending_index(netbk->pending_cons++);
-		pending_idx = netbk->pending_ring[index];
-		page = xen_netbk_alloc_page(netbk, pending_idx);
+		page = alloc_page(GFP_KERNEL|__GFP_COLD);
 		if (!page)
 			goto err;

-		gop->source.u.ref = txp->gref;
-		gop->source.domid = vif->domid;
-		gop->source.offset = txp->offset;
-
-		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
-		gop->dest.domid = DOMID_SELF;
-		gop->dest.offset = txp->offset;
-
-		gop->len = txp->size;
-		gop->flags = GNTCOPY_source_gref;
+		dst_offset = 0;
+		first = NULL;
+		while (dst_offset < PAGE_SIZE && slot < nr_slots) {
+			gop->flags = GNTCOPY_source_gref;
+
+			gop->source.u.ref = txp->gref;
+			gop->source.domid = vif->domid;
+			gop->source.offset = txp->offset;
+
+			gop->dest.domid = DOMID_SELF;
+
+			gop->dest.offset = dst_offset;
+			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+
+			if (dst_offset + txp->size > PAGE_SIZE) {
+				/* This page can only merge a portion
+				 * of tx request. Do not increment any
+				 * pointer / counter here. The txp
+				 * will be dealt with in future
+				 * rounds, eventually hitting the
+				 * `else` branch.
+				 */
+				gop->len = PAGE_SIZE - dst_offset;
+				txp->offset += gop->len;
+				txp->size -= gop->len;
+				dst_offset += gop->len; /* quit loop */
+			} else {
+				/* This tx request can be merged in the page */
+				gop->len = txp->size;
+				dst_offset += gop->len;
+
+				index = pending_index(netbk->pending_cons++);
+
+				pending_idx = netbk->pending_ring[index];
+
+				memcpy(&pending_tx_info[pending_idx].req, txp,
+				       sizeof(*txp));
+				xenvif_get(vif);
+
+				pending_tx_info[pending_idx].vif = vif;
+
+				/* Poison these fields, corresponding
+				 * fields for head tx req will be set
+				 * to correct values after the loop.
+				 */
+				netbk->mmap_pages[pending_idx] = (void *)(~0UL);
+				pending_tx_info[pending_idx].head =
+					INVALID_PENDING_RING_IDX;
+
+				if (!first) {
+					first = &pending_tx_info[pending_idx];
+					start_idx = index;
+					head_idx = pending_idx;
+				}
+
+				txp++;
+				slot++;
+			}

-		gop++;
+			gop++;
+		}

-		memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
-		xenvif_get(vif);
-		pending_tx_info[pending_idx].vif = vif;
-		frag_set_pending_idx(&frags[i], pending_idx);
+		first->req.offset = 0;
+		first->req.size = dst_offset;
+		first->head = start_idx;
+		set_page_ext(page, netbk, head_idx);
+		netbk->mmap_pages[head_idx] = page;
+		frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
 	}

+	BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
+
 	return gop;
 err:
 	/* Unwind, freeing all pages and sending error responses. */
-	while (i-- > start) {
-		xen_netbk_idx_release(netbk, frag_get_pending_idx(&frags[i]),
-				      XEN_NETIF_RSP_ERROR);
+	while (shinfo->nr_frags-- > start) {
+		xen_netbk_idx_release(netbk,
				      frag_get_pending_idx(&frags[shinfo->nr_frags]),
+				      XEN_NETIF_RSP_ERROR);
 	}
 	/* The head too, if necessary. */
 	if (start)
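
Note that only the first request merged into each destination page keeps a valid head (set to start_idx after the inner loop); its continuations stay poisoned with INVALID_PENDING_RING_IDX. Later code walks one coalesced group by starting at the head and following the pending ring until it meets the next head. An illustrative helper showing the walk (not part of the patch):

    static unsigned int coalesced_group_size(struct xen_netbk *netbk,
                                             pending_ring_idx_t head)
    {
            unsigned int n = 0;
            u16 peek;

            do {
                    n++;    /* the head slot itself, then each continuation */
                    peek = netbk->pending_ring[pending_index(++head)];
            } while (!pending_tx_is_head(netbk, peek));

            return n;
    }

xen_netbk_tx_check_gop() and xen_netbk_idx_release() below use the same do/while shape to process a group.
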
@@ -1019,8 +1146,10 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
 	struct gnttab_copy *gop = *gopp;
 	u16 pending_idx = *((u16 *)skb->data);
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct pending_tx_info *tx_info;
 	int nr_frags = shinfo->nr_frags;
 	int i, err, start;
+	u16 peek; /* peek into next tx request */

 	/* Check status of header. */
 	err = gop->status;
@@ -1032,11 +1161,20 @@ static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,

 	for (i = start; i < nr_frags; i++) {
 		int j, newerr;
+		pending_ring_idx_t head;

 		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
+		tx_info = &netbk->pending_tx_info[pending_idx];
+		head = tx_info->head;

 		/* Check error status: if okay then remember grant handle. */
-		newerr = (++gop)->status;
+		do {
+			newerr = (++gop)->status;
+			if (newerr)
+				break;
+			peek = netbk->pending_ring[pending_index(++head)];
+		} while (!pending_tx_is_head(netbk, peek));
+
 		if (likely(!newerr)) {
 			/* Had a previous error? Invalidate this fragment. */
 			if (unlikely(err))
@@ -1256,11 +1394,12 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
 	struct sk_buff *skb;
 	int ret;

-	while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+	while ((nr_pending_reqs(netbk) + XEN_NETIF_NR_SLOTS_MIN
+		< MAX_PENDING_REQS) &&
 		!list_empty(&netbk->net_schedule_list)) {
 		struct xenvif *vif;
 		struct xen_netif_tx_request txreq;
-		struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
+		struct xen_netif_tx_request txfrags[max_skb_slots];
 		struct page *page;
 		struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
 		u16 pending_idx;
@@ -1321,7 +1460,8 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
 			continue;
 		}

-		ret = netbk_count_requests(vif, &txreq, txfrags, work_to_do);
+		ret = netbk_count_requests(vif, &txreq, idx,
+					   txfrags, work_to_do);
 		if (unlikely(ret < 0))
 			continue;
@@ -1348,7 +1488,7 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
 		pending_idx = netbk->pending_ring[index];

 		data_len = (txreq.size > PKT_PROT_LEN &&
-			    ret < MAX_SKB_FRAGS) ?
+			    ret < XEN_NETIF_NR_SLOTS_MIN) ?
 			PKT_PROT_LEN : txreq.size;

 		skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
@@ -1398,6 +1538,7 @@ static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
 		memcpy(&netbk->pending_tx_info[pending_idx].req,
 		       &txreq, sizeof(txreq));
 		netbk->pending_tx_info[pending_idx].vif = vif;
+		netbk->pending_tx_info[pending_idx].head = index;
 		*((u16 *)skb->data) = pending_idx;

 		__skb_put(skb, data_len);
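
The reworked loop condition above reserves XEN_NETIF_NR_SLOTS_MIN free pending slots, rather than MAX_SKB_FRAGS, before another packet is dequeued. A quick worked check of why that is enough, assuming MAX_PENDING_REQS is 256 as defined in this file:

    nr_pending_reqs(netbk) + 18 < 256   =>   at least 19 pending slots are free

A packet that survives netbk_count_requests() uses fewer than 18 follow-on slots, so together with the slot for its first request it consumes at most 18 pending slots and is guaranteed to fit.
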
+ */ + pending_tx_info->head = 0; + + index = pending_index(netbk->pending_prod++); + netbk->pending_ring[index] = netbk->pending_ring[info_idx]; - netbk->mmap_pages[pending_idx]->mapping = NULL; + xenvif_put(vif); + + peek = netbk->pending_ring[pending_index(++head)]; + + } while (!pending_tx_is_head(netbk, peek)); + + netbk->mmap_pages[pending_idx]->mapping = 0; put_page(netbk->mmap_pages[pending_idx]); netbk->mmap_pages[pending_idx] = NULL; } + static void make_tx_response(struct xenvif *vif, struct xen_netif_tx_request *txp, s8 st) @@ -1602,8 +1767,9 @@ static inline int rx_work_todo(struct xen_netbk *netbk) static inline int tx_work_todo(struct xen_netbk *netbk) { - if (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) && - !list_empty(&netbk->net_schedule_list)) + if ((nr_pending_reqs(netbk) + XEN_NETIF_NR_SLOTS_MIN + < MAX_PENDING_REQS) && + !list_empty(&netbk->net_schedule_list)) return 1; return 0; @@ -1686,6 +1852,13 @@ static int __init netback_init(void) if (!xen_domain()) return -ENODEV; + if (max_skb_slots < XEN_NETIF_NR_SLOTS_MIN) { + printk(KERN_INFO + "xen-netback: max_skb_slots too small (%d), bump it to XEN_NETIF_NR_SLOTS_MIN (%d)\n", + max_skb_slots, XEN_NETIF_NR_SLOTS_MIN); + max_skb_slots = XEN_NETIF_NR_SLOTS_MIN; + } + xen_netbk_group_nr = num_online_cpus(); xen_netbk = vzalloc(sizeof(struct xen_netbk) * xen_netbk_group_nr); if (!xen_netbk) diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h index 58fadcac33a17c004c1cdf17557bc570b8a4f33e..3ef3fe05ee99c55aa27d7c3eb5f828935b89ed59 100644 --- a/include/xen/interface/io/netif.h +++ b/include/xen/interface/io/netif.h @@ -12,6 +12,24 @@ #include #include +/* + * Older implementation of Xen network frontend / backend has an + * implicit dependency on the MAX_SKB_FRAGS as the maximum number of + * ring slots a skb can use. Netfront / netback may not work as + * expected when frontend and backend have different MAX_SKB_FRAGS. + * + * A better approach is to add mechanism for netfront / netback to + * negotiate this value. However we cannot fix all possible + * frontends, so we need to define a value which states the minimum + * slots backend must support. + * + * The minimum value derives from older Linux kernel's MAX_SKB_FRAGS + * (18), which is proved to work with most frontends. Any new backend + * which doesn't negotiate with frontend should expect frontend to + * send a valid packet using slots up to this value. + */ +#define XEN_NETIF_NR_SLOTS_MIN 18 + /* * Notifications after enqueuing any type of message should be conditional on * the appropriate req_event or rsp_event field in the shared ring.