diff mbox series

[net-next] avoid fragmenting page memory with netdev_alloc_cache

Message ID 20210212001842.32714-1-doshir@vmware.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net-next] avoid fragmenting page memory with netdev_alloc_cache | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success Link
netdev/cc_maintainers success CCed 19 of 19 maintainers
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit fail Errors and warnings before: 8793 this patch: 8794
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns WARNING: line length of 99 exceeds 80 columns
netdev/build_allmodconfig_warn fail Errors and warnings before: 9192 this patch: 9193
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Ronak Doshi Feb. 12, 2021, 12:18 a.m. UTC
From: Todd Sabin <tsabin@vmware.com>

The Linux network stack uses a page allocation cache for skbs.  The
purpose is to reduce the number of page allocations that it needs to
make, and it works by allocating a group of pages, and then
sub-allocating skb memory from them.  When all skbs referencing the
shared pages are freed, then the block of pages is finally freed.

When these skbs are all freed close together in time, this works fine.
However, what can happen is that there are multiple nics (or multiple
rx-queues in a single nic), and the skbs are allocated to fill the rx
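ring(s). If some nics or queues are far more active than others, the
entries in the less busy nic/queue may end up referencing a page
block, while all of the other packets that referenced that block of
pages are freed.  Because a block is only released once every skb
allocated from it has been freed, a single idle rx buffer keeps the
whole block of pages allocated.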

The result of this is that the memory used by an appliance for its rx
rings can slowly grow to be much greater than it was originally.

This patch fixes that by giving each vmxnet3 device a per-rx-queue page
cache.

Signed-off-by: Todd Sabin <tsabin@vmware.com>
Signed-off-by: Ronak Doshi <doshir@vmware.com>
---
 drivers/net/vmxnet3/vmxnet3_drv.c | 30 ++++++++++++++++++++++++------
 drivers/net/vmxnet3/vmxnet3_int.h |  2 ++
 include/linux/skbuff.h            |  2 ++
 net/core/skbuff.c                 | 21 +++++++++++++++------
 4 files changed, 43 insertions(+), 12 deletions(-)

Comments

kernel test robot Feb. 12, 2021, 1:45 a.m. UTC | #1
Hi Ronak,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Ronak-Doshi/avoid-fragmenting-page-memory-with-netdev_alloc_cache/20210212-082217
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git e4b62cf7559f2ef9a022de235e5a09a8d7ded520
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/9f45ca1995ce8958b4ee24fcdc80639314ce25aa
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Ronak-Doshi/avoid-fragmenting-page-memory-with-netdev_alloc_cache/20210212-082217
        git checkout 9f45ca1995ce8958b4ee24fcdc80639314ce25aa
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc 

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   drivers/net/vmxnet3/vmxnet3_drv.c: In function 'vmxnet3_rq_rx_complete':
>> drivers/net/vmxnet3/vmxnet3_drv.c:1402:8: warning: variable 'len' set but not used [-Wunused-but-set-variable]
    1402 |    u16 len;
         |        ^~~


vim +/len +1402 drivers/net/vmxnet3/vmxnet3_drv.c

45dac1d6ea045a Shreyas Bhatewara  2015-06-19  1343  
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1344  static int
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1345  vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1346  		       struct vmxnet3_adapter *adapter, int quota)
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1347  {
215faf9c5f6e31 Joe Perches        2010-12-21  1348  	static const u32 rxprod_reg[2] = {
215faf9c5f6e31 Joe Perches        2010-12-21  1349  		VMXNET3_REG_RXPROD, VMXNET3_REG_RXPROD2
215faf9c5f6e31 Joe Perches        2010-12-21  1350  	};
0769636cb5b956 Neil Horman        2015-07-07  1351  	u32 num_pkts = 0;
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1352  	bool skip_page_frags = false;
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1353  	struct Vmxnet3_RxCompDesc *rcd;
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1354  	struct vmxnet3_rx_ctx *ctx = &rq->rx_ctx;
45dac1d6ea045a Shreyas Bhatewara  2015-06-19  1355  	u16 segCnt = 0, mss = 0;
115924b6bdc7cc Shreyas Bhatewara  2009-11-16  1356  #ifdef __BIG_ENDIAN_BITFIELD
115924b6bdc7cc Shreyas Bhatewara  2009-11-16  1357  	struct Vmxnet3_RxDesc rxCmdDesc;
115924b6bdc7cc Shreyas Bhatewara  2009-11-16  1358  	struct Vmxnet3_RxCompDesc rxComp;
115924b6bdc7cc Shreyas Bhatewara  2009-11-16  1359  #endif
115924b6bdc7cc Shreyas Bhatewara  2009-11-16  1360  	vmxnet3_getRxComp(rcd, &rq->comp_ring.base[rq->comp_ring.next2proc].rcd,
115924b6bdc7cc Shreyas Bhatewara  2009-11-16  1361  			  &rxComp);
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1362  	while (rcd->gen == rq->comp_ring.gen) {
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1363  		struct vmxnet3_rx_buf_info *rbi;
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1364  		struct sk_buff *skb, *new_skb = NULL;
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1365  		struct page *new_page = NULL;
5738a09d58d5ad Alexey Khoroshilov 2015-11-28  1366  		dma_addr_t new_dma_addr;
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1367  		int num_to_alloc;
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1368  		struct Vmxnet3_RxDesc *rxd;
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1369  		u32 idx, ring_idx;
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1370  		struct vmxnet3_cmd_ring	*ring = NULL;
0769636cb5b956 Neil Horman        2015-07-07  1371  		if (num_pkts >= quota) {
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1372  			/* we may stop even before we see the EOP desc of
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1373  			 * the current pkt
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1374  			 */
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1375  			break;
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1376  		}
f3002c1374fb23 hpreg@vmware.com   2018-05-14  1377  
f3002c1374fb23 hpreg@vmware.com   2018-05-14  1378  		/* Prevent any rcd field from being (speculatively) read before
f3002c1374fb23 hpreg@vmware.com   2018-05-14  1379  		 * rcd->gen is read.
f3002c1374fb23 hpreg@vmware.com   2018-05-14  1380  		 */
f3002c1374fb23 hpreg@vmware.com   2018-05-14  1381  		dma_rmb();
f3002c1374fb23 hpreg@vmware.com   2018-05-14  1382  
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1383  		BUG_ON(rcd->rqID != rq->qid && rcd->rqID != rq->qid2 &&
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1384  		       rcd->rqID != rq->dataRingQid);
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1385  		idx = rcd->rxdIdx;
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1386  		ring_idx = VMXNET3_GET_RING_IDX(adapter, rcd->rqID);
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1387  		ring = rq->rx_ring + ring_idx;
115924b6bdc7cc Shreyas Bhatewara  2009-11-16  1388  		vmxnet3_getRxDesc(rxd, &rq->rx_ring[ring_idx].base[idx].rxd,
115924b6bdc7cc Shreyas Bhatewara  2009-11-16  1389  				  &rxCmdDesc);
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1390  		rbi = rq->buf_info[ring_idx] + idx;
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1391  
115924b6bdc7cc Shreyas Bhatewara  2009-11-16  1392  		BUG_ON(rxd->addr != rbi->dma_addr ||
115924b6bdc7cc Shreyas Bhatewara  2009-11-16  1393  		       rxd->len != rbi->len);
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1394  
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1395  		if (unlikely(rcd->eop && rcd->err)) {
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1396  			vmxnet3_rx_error(rq, rcd, ctx, adapter);
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1397  			goto rcd_done;
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1398  		}
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1399  
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1400  		if (rcd->sop) { /* first buf of the pkt */
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1401  			bool rxDataRingUsed;
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16 @1402  			u16 len;
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1403  
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1404  			BUG_ON(rxd->btype != VMXNET3_RXD_BTYPE_HEAD ||
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1405  			       (rcd->rqID != rq->qid &&
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1406  				rcd->rqID != rq->dataRingQid));
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1407  
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1408  			BUG_ON(rbi->buf_type != VMXNET3_RX_BUF_SKB);
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1409  			BUG_ON(ctx->skb != NULL || rbi->skb == NULL);
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1410  
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1411  			if (unlikely(rcd->len == 0)) {
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1412  				/* Pretend the rx buffer is skipped. */
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1413  				BUG_ON(!(rcd->sop && rcd->eop));
fdcd79b94b2441 Stephen Hemminger  2013-01-15  1414  				netdev_dbg(adapter->netdev,
f6965582ac9b87 Randy Dunlap       2009-10-16  1415  					"rxRing[%u][%u] 0 length\n",
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1416  					ring_idx, idx);
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1417  				goto rcd_done;
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1418  			}
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1419  
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1420  			skip_page_frags = false;
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1421  			ctx->skb = rbi->skb;
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1422  
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1423  			rxDataRingUsed =
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1424  				VMXNET3_RX_DATA_RING(adapter, rcd->rqID);
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1425  			len = rxDataRingUsed ? rcd->len : rbi->len;
9f45ca1995ce89 Todd Sabin         2021-02-11  1426  			new_skb = ___netdev_alloc_skb(adapter->netdev,
9f45ca1995ce89 Todd Sabin         2021-02-11  1427  						      rbi->len + NET_IP_ALIGN, GFP_ATOMIC,
9f45ca1995ce89 Todd Sabin         2021-02-11  1428  						      &adapter->frag_cache[rq->qid]);
9f45ca1995ce89 Todd Sabin         2021-02-11  1429  			if (NET_IP_ALIGN && new_skb)
9f45ca1995ce89 Todd Sabin         2021-02-11  1430  				skb_reserve(new_skb, NET_IP_ALIGN);
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1431  			if (new_skb == NULL) {
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1432  				/* Skb allocation failed, do not handover this
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1433  				 * skb to stack. Reuse it. Drop the existing pkt
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1434  				 */
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1435  				rq->stats.rx_buf_alloc_failure++;
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1436  				ctx->skb = NULL;
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1437  				rq->stats.drop_total++;
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1438  				skip_page_frags = true;
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1439  				goto rcd_done;
5318d809d7b497 Shreyas Bhatewara  2011-07-05  1440  			}
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1441  
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1442  			if (rxDataRingUsed) {
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1443  				size_t sz;
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1444  
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1445  				BUG_ON(rcd->len > rq->data_ring.desc_size);
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1446  
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1447  				ctx->skb = new_skb;
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1448  				sz = rcd->rxdIdx * rq->data_ring.desc_size;
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1449  				memcpy(new_skb->data,
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1450  				       &rq->data_ring.base[sz], rcd->len);
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1451  			} else {
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1452  				ctx->skb = rbi->skb;
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1453  
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1454  				new_dma_addr =
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1455  					dma_map_single(&adapter->pdev->dev,
5738a09d58d5ad Alexey Khoroshilov 2015-11-28  1456  						       new_skb->data, rbi->len,
5738a09d58d5ad Alexey Khoroshilov 2015-11-28  1457  						       PCI_DMA_FROMDEVICE);
5738a09d58d5ad Alexey Khoroshilov 2015-11-28  1458  				if (dma_mapping_error(&adapter->pdev->dev,
5738a09d58d5ad Alexey Khoroshilov 2015-11-28  1459  						      new_dma_addr)) {
5738a09d58d5ad Alexey Khoroshilov 2015-11-28  1460  					dev_kfree_skb(new_skb);
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1461  					/* Skb allocation failed, do not
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1462  					 * handover this skb to stack. Reuse
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1463  					 * it. Drop the existing pkt.
5738a09d58d5ad Alexey Khoroshilov 2015-11-28  1464  					 */
5738a09d58d5ad Alexey Khoroshilov 2015-11-28  1465  					rq->stats.rx_buf_alloc_failure++;
5738a09d58d5ad Alexey Khoroshilov 2015-11-28  1466  					ctx->skb = NULL;
5738a09d58d5ad Alexey Khoroshilov 2015-11-28  1467  					rq->stats.drop_total++;
5738a09d58d5ad Alexey Khoroshilov 2015-11-28  1468  					skip_page_frags = true;
5738a09d58d5ad Alexey Khoroshilov 2015-11-28  1469  					goto rcd_done;
5738a09d58d5ad Alexey Khoroshilov 2015-11-28  1470  				}
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1471  
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1472  				dma_unmap_single(&adapter->pdev->dev,
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1473  						 rbi->dma_addr,
b0eb57cb97e783 Andy King          2013-08-23  1474  						 rbi->len,
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1475  						 PCI_DMA_FROMDEVICE);
d1a890fa37f27d Shreyas Bhatewara  2009-10-13  1476  
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1477  				/* Immediate refill */
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1478  				rbi->skb = new_skb;
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1479  				rbi->dma_addr = new_dma_addr;
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1480  				rxd->addr = cpu_to_le64(rbi->dma_addr);
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1481  				rxd->len = rbi->len;
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1482  			}
50a5ce3e7116a7 Shrikrishna Khare  2016-06-16  1483  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Ronak Doshi Feb. 24, 2021, 7:31 p.m. UTC | #2
On 2/11/21, 4:18 PM, "Ronak Doshi" <doshir@vmware.com> wrote:
>    From: Todd Sabin <tsabin@vmware.com>
>
>    Linux network stack uses an allocation page cache for skbs.  The
>    purpose is to reduce the number of page allocations that it needs to
>    make, and it works by allocating a group of pages, and then
>    sub-allocating skb memory from them.  When all skbs referencing the
>    shared pages are freed, then the block of pages is finally freed.
>
>    When these skbs are all freed close together in time, this works fine.
>    However, what can happen is that there are multiple nics (or multiple
>    rx-queues in a single nic), and the skbs are allocated to fill the rx
>    ring(s). If some nics or queues are far more active than others, the
>    entries in the less busy nic/queue may end up referencing a page
>    block, while all of the other packets that referenced that block of
>    pages are freed.
>
>    The result of this is that the memory used by an appliance for its rx
>    rings can slowly grow to be much greater than it was originally.
>
>    This patch fixes that by giving each vmxnet3 device a per-rx-queue page
>    cache.
>
>    Signed-off-by: Todd Sabin <tsabin@vmware.com>
>    Signed-off-by: Ronak Doshi <doshir@vmware.com>
>    ---
>     drivers/net/vmxnet3/vmxnet3_drv.c | 30 ++++++++++++++++++++++++------
>     drivers/net/vmxnet3/vmxnet3_int.h |  2 ++
>     include/linux/skbuff.h            |  2 ++
>     net/core/skbuff.c                 | 21 +++++++++++++++------
>     4 files changed, 43 insertions(+), 12 deletions(-)

Any update on this patch?

Thanks,
Ronak
diff mbox series

Patch

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 6e87f1fc4874..edcbc38c3ff6 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -574,9 +574,11 @@  vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx,
 
 		if (rbi->buf_type == VMXNET3_RX_BUF_SKB) {
 			if (rbi->skb == NULL) {
-				rbi->skb = __netdev_alloc_skb_ip_align(adapter->netdev,
-								       rbi->len,
-								       GFP_KERNEL);
+				rbi->skb = ___netdev_alloc_skb(adapter->netdev,
+							       rbi->len + NET_IP_ALIGN, GFP_KERNEL,
+							       &adapter->frag_cache[rq->qid]);
+				if (NET_IP_ALIGN && rbi->skb)
+					skb_reserve(rbi->skb, NET_IP_ALIGN);
 				if (unlikely(rbi->skb == NULL)) {
 					rq->stats.rx_buf_alloc_failure++;
 					break;
@@ -1421,8 +1423,11 @@  vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 			rxDataRingUsed =
 				VMXNET3_RX_DATA_RING(adapter, rcd->rqID);
 			len = rxDataRingUsed ? rcd->len : rbi->len;
-			new_skb = netdev_alloc_skb_ip_align(adapter->netdev,
-							    len);
+			new_skb = ___netdev_alloc_skb(adapter->netdev,
+						      rbi->len + NET_IP_ALIGN, GFP_ATOMIC,
+						      &adapter->frag_cache[rq->qid]);
+			if (NET_IP_ALIGN && new_skb)
+				skb_reserve(new_skb, NET_IP_ALIGN);
 			if (new_skb == NULL) {
 				/* Skb allocation failed, do not handover this
 				 * skb to stack. Reuse it. Drop the existing pkt
@@ -1483,6 +1488,7 @@  vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 					     le32_to_cpu(rcd->rssHash),
 					     PKT_HASH_TYPE_L3);
 #endif
+			skb_record_rx_queue(ctx->skb, rq->qid);
 			skb_put(ctx->skb, rcd->len);
 
 			if (VMXNET3_VERSION_GE_2(adapter) &&
@@ -3652,7 +3658,7 @@  vmxnet3_remove_device(struct pci_dev *pdev)
 {
 	struct net_device *netdev = pci_get_drvdata(pdev);
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
-	int size = 0;
+	int size = 0, i;
 	int num_rx_queues;
 
 #ifdef VMXNET3_RSS
@@ -3691,6 +3697,18 @@  vmxnet3_remove_device(struct pci_dev *pdev)
 			  adapter->shared, adapter->shared_pa);
 	dma_unmap_single(&adapter->pdev->dev, adapter->adapter_pa,
 			 sizeof(struct vmxnet3_adapter), PCI_DMA_TODEVICE);
+	for (i = 0; i < VMXNET3_DEVICE_MAX_RX_QUEUES; i++) {
+		struct page *page;
+		struct page_frag_cache *nc;
+
+		nc = &adapter->frag_cache[i];
+		if (unlikely(!nc->va)) {
+			/* nothing to do */
+			continue;
+		}
+		page = virt_to_page(nc->va);
+		__page_frag_cache_drain(page, nc->pagecnt_bias);
+	}
 	free_netdev(netdev);
 }
 
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index e910596b79cf..7e8767007203 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -42,6 +42,7 @@ 
 #include <linux/interrupt.h>
 #include <linux/workqueue.h>
 #include <linux/uaccess.h>
+#include <linux/mm.h>
 #include <asm/dma.h>
 #include <asm/page.h>
 
@@ -362,6 +363,7 @@  struct vmxnet3_adapter {
 	dma_addr_t			shared_pa;
 	dma_addr_t queue_desc_pa;
 	dma_addr_t coal_conf_pa;
+	struct page_frag_cache          frag_cache[VMXNET3_DEVICE_MAX_RX_QUEUES];
 
 	/* Wake-on-LAN */
 	u32     wol;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0a4e91a2f873..b57485016e04 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2841,6 +2841,8 @@  static inline void *netdev_alloc_frag_align(unsigned int fragsz,
 
 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
 				   gfp_t gfp_mask);
+struct sk_buff *___netdev_alloc_skb(struct net_device *dev, unsigned int length,
+				    gfp_t gfp_mask, struct page_frag_cache *nc);
 
 /**
  *	netdev_alloc_skb - allocate an skbuff for rx on a specific device
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d380c7b5a12d..ee0611345f6c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -409,10 +409,11 @@  void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 EXPORT_SYMBOL(__netdev_alloc_frag_align);
 
 /**
- *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ *	___netdev_alloc_skb - allocate an skbuff for rx on a specific device
  *	@dev: network device to receive on
  *	@len: length to allocate
  *	@gfp_mask: get_free_pages mask, passed to alloc_skb
+ *	@nc: page frag cache
  *
  *	Allocate a new &sk_buff and assign it a usage count of one. The
  *	buffer has NET_SKB_PAD headroom built in. Users should allocate
@@ -421,10 +422,9 @@  EXPORT_SYMBOL(__netdev_alloc_frag_align);
  *
  *	%NULL is returned if there is no free memory.
  */
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
-				   gfp_t gfp_mask)
+struct sk_buff *___netdev_alloc_skb(struct net_device *dev, unsigned int len,
+				    gfp_t gfp_mask, struct page_frag_cache *nc)
 {
-	struct page_frag_cache *nc;
 	struct sk_buff *skb;
 	bool pfmemalloc;
 	void *data;
@@ -450,12 +450,14 @@  struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 		gfp_mask |= __GFP_MEMALLOC;
 
 	if (in_irq() || irqs_disabled()) {
-		nc = this_cpu_ptr(&netdev_alloc_cache);
+		if (!nc)
+			nc = this_cpu_ptr(&netdev_alloc_cache);
 		data = page_frag_alloc(nc, len, gfp_mask);
 		pfmemalloc = nc->pfmemalloc;
 	} else {
 		local_bh_disable();
-		nc = this_cpu_ptr(&napi_alloc_cache.page);
+		if (!nc)
+			nc = this_cpu_ptr(&napi_alloc_cache.page);
 		data = page_frag_alloc(nc, len, gfp_mask);
 		pfmemalloc = nc->pfmemalloc;
 		local_bh_enable();
@@ -481,6 +483,13 @@  struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 skb_fail:
 	return skb;
 }
+EXPORT_SYMBOL(___netdev_alloc_skb);
+
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
+				   gfp_t gfp_mask)
+{
+	return ___netdev_alloc_skb(dev, len, gfp_mask, NULL);
+}
 EXPORT_SYMBOL(__netdev_alloc_skb);
 
 /**