diff mbox series

delayacct: track delays from write-protect copy

Message ID 20220408103708.2495882-1-yang.yang29@zte.com.cn (mailing list archive)
State New
Headers show
Series delayacct: track delays from write-protect copy | expand

Commit Message

CGEL April 8, 2022, 10:37 a.m. UTC
From: Yang Yang <yang.yang29@zte.com.cn>

Delay accounting does not track the delay of write-protect copy. When
tasks trigger many write-protect copies (including COW and unsharing of
anonymous pages[1]), they may spend a considerable amount of time waiting
for them. Tracking the delay of tasks in write-protect copy helps users
evaluate the impact of using KSM, fork() or GUP.

Also update tools/accounting/getdelays.c:

    / # ./getdelays -dl -p 231
    print delayacct stats ON
    listen forever
    PID     231

    CPU             count     real total  virtual total    delay total  delay average
                     6247     1859000000     2154070021     1674255063          0.268ms
    IO              count    delay total  delay average
                        0              0              0ms
    SWAP            count    delay total  delay average
                        0              0              0ms
    RECLAIM         count    delay total  delay average
                        0              0              0ms
    THRASHING       count    delay total  delay average
                        0              0              0ms
    COMPACT         count    delay total  delay average
                        3          72758              0ms
    WPCOPY          count    delay total  delay average
                     3635      271567604              0ms

[1] commit 31cc5bc4af70 ("mm: support GUP-triggered unsharing of anonymous pages")

Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jiang Xuexin <jiang.xuexin@zte.com.cn>
Reviewed-by: Ran Xiaokai <ran.xiaokai@zte.com.cn>
Reviewed-by: wangyong <wang.yong12@zte.com.cn>
---
 Documentation/accounting/delay-accounting.rst |  5 +++-
 include/linux/delayacct.h                     | 28 +++++++++++++++++++
 include/uapi/linux/taskstats.h                |  6 +++-
 kernel/delayacct.c                            | 16 +++++++++++
 mm/hugetlb.c                                  |  7 +++++
 mm/memory.c                                   |  8 ++++++
 tools/accounting/getdelays.c                  |  8 +++++-
 7 files changed, 75 insertions(+), 3 deletions(-)

Comments

kernel test robot April 8, 2022, 3:02 p.m. UTC | #1
Hi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on hnaz-mm/master]
[also build test ERROR on next-20220408]
[cannot apply to linus/master linux/master v5.18-rc1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/cgel-zte-gmail-com/delayacct-track-delays-from-write-protect-copy/20220408-183941
base:   https://github.com/hnaz/linux-mm master
config: parisc-randconfig-r036-20220408 (https://download.01.org/0day-ci/archive/20220408/202204082220.1w5xTsNt-lkp@intel.com/config)
compiler: hppa64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/6ae63c8c70d9597a9e67cb06b6a23a5b01e52da6
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review cgel-zte-gmail-com/delayacct-track-delays-from-write-protect-copy/20220408-183941
        git checkout 6ae63c8c70d9597a9e67cb06b6a23a5b01e52da6
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=parisc64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   mm/hugetlb.c: In function 'hugetlb_wp':
>> mm/hugetlb.c:5176:9: error: implicit declaration of function 'delayacct_wpcopy_start' [-Werror=implicit-function-declaration]
    5176 |         delayacct_wpcopy_start();
         |         ^~~~~~~~~~~~~~~~~~~~~~
>> mm/hugetlb.c:5189:17: error: implicit declaration of function 'delayacct_wpcopy_end' [-Werror=implicit-function-declaration]
    5189 |                 delayacct_wpcopy_end();
         |                 ^~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors


vim +/delayacct_wpcopy_start +5176 mm/hugetlb.c

  5150	
  5151	/*
  5152	 * hugetlb_wp() should be called with page lock of the original hugepage held.
  5153	 * Called with hugetlb_fault_mutex_table held and pte_page locked so we
  5154	 * cannot race with other handlers or page migration.
  5155	 * Keep the pte_same checks anyway to make transition from the mutex easier.
  5156	 */
  5157	static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
  5158			       unsigned long address, pte_t *ptep, unsigned int flags,
  5159			       struct page *pagecache_page, spinlock_t *ptl)
  5160	{
  5161		const bool unshare = flags & FAULT_FLAG_UNSHARE;
  5162		pte_t pte;
  5163		struct hstate *h = hstate_vma(vma);
  5164		struct page *old_page, *new_page;
  5165		int outside_reserve = 0;
  5166		vm_fault_t ret = 0;
  5167		unsigned long haddr = address & huge_page_mask(h);
  5168		struct mmu_notifier_range range;
  5169	
  5170		VM_BUG_ON(unshare && (flags & FOLL_WRITE));
  5171		VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
  5172	
  5173		pte = huge_ptep_get(ptep);
  5174		old_page = pte_page(pte);
  5175	
> 5176		delayacct_wpcopy_start();
  5177	
  5178	retry_avoidcopy:
  5179		/*
  5180		 * If no-one else is actually using this page, we're the exclusive
  5181		 * owner and can reuse this page.
  5182		 */
  5183		if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
  5184			if (!PageAnonExclusive(old_page))
  5185				page_move_anon_rmap(old_page, vma);
  5186			if (likely(!unshare))
  5187				set_huge_ptep_writable(vma, haddr, ptep);
  5188	
> 5189			delayacct_wpcopy_end();
  5190			return 0;
  5191		}
  5192		VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
  5193			       old_page);
  5194	
  5195		/*
  5196		 * If the process that created a MAP_PRIVATE mapping is about to
  5197		 * perform a COW due to a shared page count, attempt to satisfy
  5198		 * the allocation without using the existing reserves. The pagecache
  5199		 * page is used to determine if the reserve at this address was
  5200		 * consumed or not. If reserves were used, a partial faulted mapping
  5201		 * at the time of fork() could consume its reserves on COW instead
  5202		 * of the full address range.
  5203		 */
  5204		if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
  5205				old_page != pagecache_page)
  5206			outside_reserve = 1;
  5207	
  5208		get_page(old_page);
  5209	
  5210		/*
  5211		 * Drop page table lock as buddy allocator may be called. It will
  5212		 * be acquired again before returning to the caller, as expected.
  5213		 */
  5214		spin_unlock(ptl);
  5215		new_page = alloc_huge_page(vma, haddr, outside_reserve);
  5216	
  5217		if (IS_ERR(new_page)) {
  5218			/*
  5219			 * If a process owning a MAP_PRIVATE mapping fails to COW,
  5220			 * it is due to references held by a child and an insufficient
  5221			 * huge page pool. To guarantee the original mappers
  5222			 * reliability, unmap the page from child processes. The child
  5223			 * may get SIGKILLed if it later faults.
  5224			 */
  5225			if (outside_reserve) {
  5226				struct address_space *mapping = vma->vm_file->f_mapping;
  5227				pgoff_t idx;
  5228				u32 hash;
  5229	
  5230				put_page(old_page);
  5231				BUG_ON(huge_pte_none(pte));
  5232				/*
  5233				 * Drop hugetlb_fault_mutex and i_mmap_rwsem before
  5234				 * unmapping.  unmapping needs to hold i_mmap_rwsem
  5235				 * in write mode.  Dropping i_mmap_rwsem in read mode
  5236				 * here is OK as COW mappings do not interact with
  5237				 * PMD sharing.
  5238				 *
  5239				 * Reacquire both after unmap operation.
  5240				 */
  5241				idx = vma_hugecache_offset(h, vma, haddr);
  5242				hash = hugetlb_fault_mutex_hash(mapping, idx);
  5243				mutex_unlock(&hugetlb_fault_mutex_table[hash]);
  5244				i_mmap_unlock_read(mapping);
  5245	
  5246				unmap_ref_private(mm, vma, old_page, haddr);
  5247	
  5248				i_mmap_lock_read(mapping);
  5249				mutex_lock(&hugetlb_fault_mutex_table[hash]);
  5250				spin_lock(ptl);
  5251				ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
  5252				if (likely(ptep &&
  5253					   pte_same(huge_ptep_get(ptep), pte)))
  5254					goto retry_avoidcopy;
  5255				/*
  5256				 * race occurs while re-acquiring page table
  5257				 * lock, and our job is done.
  5258				 */
  5259				delayacct_wpcopy_end();
  5260				return 0;
  5261			}
  5262	
  5263			ret = vmf_error(PTR_ERR(new_page));
  5264			goto out_release_old;
  5265		}
  5266	
  5267		/*
  5268		 * When the original hugepage is shared one, it does not have
  5269		 * anon_vma prepared.
  5270		 */
  5271		if (unlikely(anon_vma_prepare(vma))) {
  5272			ret = VM_FAULT_OOM;
  5273			goto out_release_all;
  5274		}
  5275	
  5276		copy_user_huge_page(new_page, old_page, address, vma,
  5277				    pages_per_huge_page(h));
  5278		__SetPageUptodate(new_page);
  5279	
  5280		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
  5281					haddr + huge_page_size(h));
  5282		mmu_notifier_invalidate_range_start(&range);
  5283	
  5284		/*
  5285		 * Retake the page table lock to check for racing updates
  5286		 * before the page tables are altered
  5287		 */
  5288		spin_lock(ptl);
  5289		ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
  5290		if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
  5291			ClearHPageRestoreReserve(new_page);
  5292	
  5293			/* Break COW or unshare */
  5294			huge_ptep_clear_flush(vma, haddr, ptep);
  5295			mmu_notifier_invalidate_range(mm, range.start, range.end);
  5296			page_remove_rmap(old_page, vma, true);
  5297			hugepage_add_new_anon_rmap(new_page, vma, haddr);
  5298			set_huge_pte_at(mm, haddr, ptep,
  5299					make_huge_pte(vma, new_page, !unshare));
  5300			SetHPageMigratable(new_page);
  5301			/* Make the old page be freed below */
  5302			new_page = old_page;
  5303		}
  5304		spin_unlock(ptl);
  5305		mmu_notifier_invalidate_range_end(&range);
  5306	out_release_all:
  5307		/*
  5308		 * No restore in case of successful pagetable update (Break COW or
  5309		 * unshare)
  5310		 */
  5311		if (new_page != old_page)
  5312			restore_reserve_on_error(h, vma, haddr, new_page);
  5313		put_page(new_page);
  5314	out_release_old:
  5315		put_page(old_page);
  5316	
  5317		spin_lock(ptl); /* Caller expects lock to be held */
  5318	
  5319		delayacct_wpcopy_end();
  5320		return ret;
  5321	}
  5322
CGEL May 6, 2022, 2:48 a.m. UTC | #2
Hi
    I found this patch is first merged into linux-next, and dropped silently
without sending any mail. Could you please tell the reason? Thanks!
    It was merged by this:
    https://lore.kernel.org/all/20220409031312.583D3C385A4@smtp.kernel.org/
Andrew Morton May 6, 2022, 4:57 a.m. UTC | #3
On Fri, 6 May 2022 02:48:04 +0000 CGEL <cgel.zte@gmail.com> wrote:

>     I found this patch is first merged into linux-next, and dropped silently
> without sending any mail. Could you please tell the reason? Thanks!

It's still in my tree, part of a small queue of post-linux-next patches
which I'm not presently distributing.  I'll bring it back soon.
CGEL May 13, 2022, 1:30 a.m. UTC | #4
On Thu, May 05, 2022 at 09:57:06PM -0700, Andrew Morton wrote:
> On Fri, 6 May 2022 02:48:04 +0000 CGEL <cgel.zte@gmail.com> wrote:
> 
> >     I found this patch is first merged into linux-next, and dropped silently
> > without sending any mail. Could you please tell the reason? Thanks!
> 
> It's still in my tree, part of a small queue of post-linux-next patches
> which I'm not presently distributing.  I'll bring it back soon.

Thanks! I want to write some other patches based on this patch, so please
notify me when it is back.
diff mbox series

Patch

diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
index 197fe319cbec..241d1a87f2cd 100644
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@@ -15,6 +15,7 @@  c) swapping in pages
 d) memory reclaim
 e) thrashing page cache
 f) direct compact
+g) write-protect copy
 
 and makes these statistics available to userspace through
 the taskstats interface.
@@ -48,7 +49,7 @@  this structure. See
 for a description of the fields pertaining to delay accounting.
 It will generally be in the form of counters returning the cumulative
 delay seen for cpu, sync block I/O, swapin, memory reclaim, thrash page
-cache, direct compact etc.
+cache, direct compact, write-protect copy etc.
 
 Taking the difference of two successive readings of a given
 counter (say cpu_delay_total) for a task will give the delay
@@ -117,6 +118,8 @@  Get sum of delays, since system boot, for all pids with tgid 5::
 	                    0              0              0ms
 	COMPACT         count    delay total  delay average
 	                    0              0              0ms
+        WPCOPY          count    delay total  delay average
+                            0              0              0ms
 
 Get IO accounting for pid 1, it works only with -p::
 
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 6b16a6930a19..58aea2d7385c 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -45,9 +45,13 @@  struct task_delay_info {
 	u64 compact_start;
 	u64 compact_delay;	/* wait for memory compact */
 
+	u64 wpcopy_start;
+	u64 wpcopy_delay;	/* wait for write-protect copy */
+
 	u32 freepages_count;	/* total count of memory reclaim */
 	u32 thrashing_count;	/* total count of thrash waits */
 	u32 compact_count;	/* total count of memory compact */
+	u32 wpcopy_count;	/* total count of write-protect copy */
 };
 #endif
 
@@ -75,6 +79,8 @@  extern void __delayacct_swapin_start(void);
 extern void __delayacct_swapin_end(void);
 extern void __delayacct_compact_start(void);
 extern void __delayacct_compact_end(void);
+extern void __delayacct_wpcopy_start(void);
+extern void __delayacct_wpcopy_end(void);
 
 static inline void delayacct_tsk_init(struct task_struct *tsk)
 {
@@ -191,6 +197,24 @@  static inline void delayacct_compact_end(void)
 		__delayacct_compact_end();
 }
 
+static inline void delayacct_wpcopy_start(void)
+{
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
+	if (current->delays)
+		__delayacct_wpcopy_start();
+}
+
+static inline void delayacct_wpcopy_end(void)
+{
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
+	if (current->delays)
+		__delayacct_wpcopy_end();
+}
+
 #else
 static inline void delayacct_init(void)
 {}
@@ -225,6 +249,10 @@  static inline void delayacct_compact_start(void)
 {}
 static inline void delayacct_compact_end(void)
 {}
+static inline void delayacct_wpcopy_start(void)
+{}
+static inline void delayacct_wpcopy_end(void)
+{}
 
 #endif /* CONFIG_TASK_DELAY_ACCT */
 
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index 736154171489..a7f5b11a8f1b 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -34,7 +34,7 @@ 
  */
 
 
-#define TASKSTATS_VERSION	12
+#define TASKSTATS_VERSION	13
 #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
 					 * in linux/sched.h */
 
@@ -194,6 +194,10 @@  struct taskstats {
 	__u64   ac_exe_dev;     /* program binary device ID */
 	__u64   ac_exe_inode;   /* program binary inode number */
 	/* v12 end */
+
+	/* v13: Delay waiting for write-protect copy */
+	__u64    wpcopy_count;
+	__u64    wpcopy_delay_total;
 };
 
 
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 2c1e18f7c5cf..164ed9ef77a3 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -177,11 +177,14 @@  int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
 	tmp = d->compact_delay_total + tsk->delays->compact_delay;
 	d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
+	tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay;
+	d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp;
 	d->blkio_count += tsk->delays->blkio_count;
 	d->swapin_count += tsk->delays->swapin_count;
 	d->freepages_count += tsk->delays->freepages_count;
 	d->thrashing_count += tsk->delays->thrashing_count;
 	d->compact_count += tsk->delays->compact_count;
+	d->wpcopy_count += tsk->delays->wpcopy_count;
 	raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
 
 	return 0;
@@ -249,3 +252,16 @@  void __delayacct_compact_end(void)
 		      &current->delays->compact_delay,
 		      &current->delays->compact_count);
 }
+
+void __delayacct_wpcopy_start(void)
+{
+	current->delays->wpcopy_start = local_clock();
+}
+
+void __delayacct_wpcopy_end(void)
+{
+	delayacct_end(&current->delays->lock,
+		      &current->delays->wpcopy_start,
+		      &current->delays->wpcopy_delay,
+		      &current->delays->wpcopy_count);
+}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fb5a549169ce..b131d44741dd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5173,6 +5173,8 @@  static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte = huge_ptep_get(ptep);
 	old_page = pte_page(pte);
 
+	delayacct_wpcopy_start();
+
 retry_avoidcopy:
 	/*
 	 * If no-one else is actually using this page, we're the exclusive
@@ -5183,6 +5185,8 @@  static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 			page_move_anon_rmap(old_page, vma);
 		if (likely(!unshare))
 			set_huge_ptep_writable(vma, haddr, ptep);
+
+		delayacct_wpcopy_end();
 		return 0;
 	}
 	VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
@@ -5252,6 +5256,7 @@  static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 			 * race occurs while re-acquiring page table
 			 * lock, and our job is done.
 			 */
+			delayacct_wpcopy_end();
 			return 0;
 		}
 
@@ -5310,6 +5315,8 @@  static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 	put_page(old_page);
 
 	spin_lock(ptl); /* Caller expects lock to be held */
+
+	delayacct_wpcopy_end();
 	return ret;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index a82bf21be5e3..aad64c51c175 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3008,6 +3008,8 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	int page_copied = 0;
 	struct mmu_notifier_range range;
 
+	delayacct_wpcopy_start();
+
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
 
@@ -3032,6 +3034,8 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 			put_page(new_page);
 			if (old_page)
 				put_page(old_page);
+
+			delayacct_wpcopy_end();
 			return 0;
 		}
 	}
@@ -3138,12 +3142,16 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 			free_swap_cache(old_page);
 		put_page(old_page);
 	}
+
+	delayacct_wpcopy_end();
 	return page_copied && !unshare ? VM_FAULT_WRITE : 0;
 oom_free_new:
 	put_page(new_page);
 oom:
 	if (old_page)
 		put_page(old_page);
+
+	delayacct_wpcopy_end();
 	return VM_FAULT_OOM;
 }
 
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index 11e86739456d..e83e6e47a21e 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -207,6 +207,8 @@  static void print_delayacct(struct taskstats *t)
 	       "THRASHING%12s%15s%15s\n"
 	       "      %15llu%15llu%15llums\n"
 	       "COMPACT  %12s%15s%15s\n"
+	       "      %15llu%15llu%15llums\n"
+	       "WPCOPY   %12s%15s%15s\n"
 	       "      %15llu%15llu%15llums\n",
 	       "count", "real total", "virtual total",
 	       "delay total", "delay average",
@@ -234,7 +236,11 @@  static void print_delayacct(struct taskstats *t)
 	       "count", "delay total", "delay average",
 	       (unsigned long long)t->compact_count,
 	       (unsigned long long)t->compact_delay_total,
-	       average_ms(t->compact_delay_total, t->compact_count));
+	       average_ms(t->compact_delay_total, t->compact_count),
+	       "count", "delay total", "delay average",
+	       (unsigned long long)t->wpcopy_count,
+	       (unsigned long long)t->wpcopy_delay_total,
+	       average_ms(t->wpcopy_delay_total, t->wpcopy_count));
 }
 
 static void task_context_switch_counts(struct taskstats *t)