Message ID | 20241108212946.2642085-3-joshua.hahnjy@gmail.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | memcg/hugetlb: Rework memcg hugetlb charging | expand |
On Fri, Nov 08, 2024 at 01:29:45PM -0800, Joshua Hahn wrote: > This patch introduces mem_cgroup_charge_hugetlb, which combines the > logic of mem_cgroup{try,commit}_hugetlb. This reduces the footprint of > memcg in hugetlb code, and also consolidates the error path that memcg > can take into just one point. > > Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com> > > --- > include/linux/memcontrol.h | 2 ++ > mm/hugetlb.c | 34 ++++++++++++---------------------- > mm/memcontrol.c | 19 +++++++++++++++++++ > 3 files changed, 33 insertions(+), 22 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index bb49e0d4b377..d90c1ac791f1 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -678,6 +678,8 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, > int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, > long nr_pages); > > +int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); > + > int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, > gfp_t gfp, swp_entry_t entry); > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index fbb10e52d7ea..6f6841483039 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -2967,21 +2967,13 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, > struct hugepage_subpool *spool = subpool_vma(vma); > struct hstate *h = hstate_vma(vma); > struct folio *folio; > - long map_chg, map_commit, nr_pages = pages_per_huge_page(h); > + long map_chg, map_commit; > long gbl_chg; > - int memcg_charge_ret, ret, idx; > + int ret, idx; > struct hugetlb_cgroup *h_cg = NULL; > - struct mem_cgroup *memcg; > bool deferred_reserve; > gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; > > - memcg = get_mem_cgroup_from_current(); > - memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); > - if (memcg_charge_ret == -ENOMEM) { > - mem_cgroup_put(memcg); > - return ERR_PTR(-ENOMEM); > - } > - > idx 
= hstate_index(h); > /* > * Examine the region/reserve map to determine if the process > @@ -2989,12 +2981,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, > * code of zero indicates a reservation exists (no change). > */ > map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); > - if (map_chg < 0) { > - if (!memcg_charge_ret) > - mem_cgroup_cancel_charge(memcg, nr_pages); > - mem_cgroup_put(memcg); > + if (map_chg < 0) > return ERR_PTR(-ENOMEM); > - } > > /* > * Processes that did not create the mapping will have no > @@ -3092,10 +3080,15 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, > } > } > > - if (!memcg_charge_ret) > - mem_cgroup_commit_charge(folio, memcg); > - lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); > - mem_cgroup_put(memcg); > + ret = mem_cgroup_charge_hugetlb(folio, gfp); > + if (ret == -ENOMEM) { > + spin_unlock_irq(&hugetlb_lock); spin_unlock_irq?? > + free_huge_folio(folio); free_huge_folio() will call lruvec_stat_mod_folio() unconditionally but you are only calling it on success. This may underflow the metric. 
> + return ERR_PTR(-ENOMEM); > + } > + else if (!ret) > + lruvec_stat_mod_folio(folio, NR_HUGETLB, > + pages_per_huge_page(h)); > > return folio; > > @@ -3110,9 +3103,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, > hugepage_subpool_put_pages(spool, 1); > out_end_reservation: > vma_end_reservation(h, vma, addr); > - if (!memcg_charge_ret) > - mem_cgroup_cancel_charge(memcg, nr_pages); > - mem_cgroup_put(memcg); > return ERR_PTR(-ENOSPC); > } > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 97f63ec9c9fb..95ee77fe27af 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -4529,6 +4529,25 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, > return 0; > } > > +int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) > +{ > + struct mem_cgroup *memcg = get_mem_cgroup_from_current(); > + int ret = 0; > + > + if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() || > + !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) { > + ret = -EOPNOTSUPP; why EOPNOTSUPP? You need to return 0 here. We do want lruvec_stat_mod_folio() to be called. > + goto out; > + } > + > + if (charge_memcg(folio, memcg, gfp)) > + ret = -ENOMEM; > + > +out: > + mem_cgroup_put(memcg); > + return ret; > +} > + > /** > * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. > * @folio: folio to charge. > -- > 2.43.5 >
Hi Joshua, On Fri, 8 Nov 2024 13:29:45 -0800 Joshua Hahn <joshua.hahnjy@gmail.com> wrote: > This patch introduces mem_cgroup_charge_hugetlb, which combines the > logic of mem_cgroup{try,commit}_hugetlb. This reduces the footprint of Nit. Seems the regular expression is not technically correct? > memcg in hugetlb code, and also consolidates the error path that memcg > can take into just one point. > > Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com> > > --- > include/linux/memcontrol.h | 2 ++ > mm/hugetlb.c | 34 ++++++++++++---------------------- > mm/memcontrol.c | 19 +++++++++++++++++++ > 3 files changed, 33 insertions(+), 22 deletions(-) [...] > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 97f63ec9c9fb..95ee77fe27af 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -4529,6 +4529,25 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, > return 0; > } > > +int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) Can we add a kernel-doc comment for this function? Maybe that for mem_cgroup_hugetlb_try_charge() can be stolen with only small updates? > +{ > + struct mem_cgroup *memcg = get_mem_cgroup_from_current(); > + int ret = 0; > + > + if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() || > + !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) { > + ret = -EOPNOTSUPP; > + goto out; > + } > + > + if (charge_memcg(folio, memcg, gfp)) > + ret = -ENOMEM; > + > +out: > + mem_cgroup_put(memcg); > + return ret; > +} > + > /** > * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. > * @folio: folio to charge. > -- > 2.43.5 Thanks, SJ
Hi Joshua, kernel test robot noticed the following build errors: [auto build test ERROR on akpm-mm/mm-everything] [also build test ERROR on next-20241108] [cannot apply to linus/master v6.12-rc6] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Joshua-Hahn/memcg-hugetlb-Introduce-memcg_accounts_hugetlb/20241109-053045 base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything patch link: https://lore.kernel.org/r/20241108212946.2642085-3-joshua.hahnjy%40gmail.com patch subject: [PATCH 2/3] memcg/hugetlb: Introduce mem_cgroup_charge_hugetlb config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20241109/202411091159.y6yDCkdf-lkp@intel.com/config) compiler: gcc-11 (Debian 11.3.0-12) 11.3.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241109/202411091159.y6yDCkdf-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202411091159.y6yDCkdf-lkp@intel.com/ All errors (new ones prefixed by >>): mm/hugetlb.c: In function 'alloc_hugetlb_folio': >> mm/hugetlb.c:3083:15: error: implicit declaration of function 'mem_cgroup_charge_hugetlb'; did you mean 'mem_cgroup_charge_skmem'? 
[-Werror=implicit-function-declaration] 3083 | ret = mem_cgroup_charge_hugetlb(folio, gfp); | ^~~~~~~~~~~~~~~~~~~~~~~~~ | mem_cgroup_charge_skmem cc1: some warnings being treated as errors vim +3083 mm/hugetlb.c 2963 2964 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, 2965 unsigned long addr, int avoid_reserve) 2966 { 2967 struct hugepage_subpool *spool = subpool_vma(vma); 2968 struct hstate *h = hstate_vma(vma); 2969 struct folio *folio; 2970 long map_chg, map_commit; 2971 long gbl_chg; 2972 int ret, idx; 2973 struct hugetlb_cgroup *h_cg = NULL; 2974 bool deferred_reserve; 2975 gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; 2976 2977 idx = hstate_index(h); 2978 /* 2979 * Examine the region/reserve map to determine if the process 2980 * has a reservation for the page to be allocated. A return 2981 * code of zero indicates a reservation exists (no change). 2982 */ 2983 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); 2984 if (map_chg < 0) 2985 return ERR_PTR(-ENOMEM); 2986 2987 /* 2988 * Processes that did not create the mapping will have no 2989 * reserves as indicated by the region/reserve map. Check 2990 * that the allocation will not exceed the subpool limit. 2991 * Allocations for MAP_NORESERVE mappings also need to be 2992 * checked against any subpool limit. 2993 */ 2994 if (map_chg || avoid_reserve) { 2995 gbl_chg = hugepage_subpool_get_pages(spool, 1); 2996 if (gbl_chg < 0) 2997 goto out_end_reservation; 2998 2999 /* 3000 * Even though there was no reservation in the region/reserve 3001 * map, there could be reservations associated with the 3002 * subpool that can be used. This would be indicated if the 3003 * return value of hugepage_subpool_get_pages() is zero. 3004 * However, if avoid_reserve is specified we still avoid even 3005 * the subpool reservations. 3006 */ 3007 if (avoid_reserve) 3008 gbl_chg = 1; 3009 } 3010 3011 /* If this allocation is not consuming a reservation, charge it now. 
3012 */ 3013 deferred_reserve = map_chg || avoid_reserve; 3014 if (deferred_reserve) { 3015 ret = hugetlb_cgroup_charge_cgroup_rsvd( 3016 idx, pages_per_huge_page(h), &h_cg); 3017 if (ret) 3018 goto out_subpool_put; 3019 } 3020 3021 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 3022 if (ret) 3023 goto out_uncharge_cgroup_reservation; 3024 3025 spin_lock_irq(&hugetlb_lock); 3026 /* 3027 * glb_chg is passed to indicate whether or not a page must be taken 3028 * from the global free pool (global change). gbl_chg == 0 indicates 3029 * a reservation exists for the allocation. 3030 */ 3031 folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); 3032 if (!folio) { 3033 spin_unlock_irq(&hugetlb_lock); 3034 folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); 3035 if (!folio) 3036 goto out_uncharge_cgroup; 3037 spin_lock_irq(&hugetlb_lock); 3038 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { 3039 folio_set_hugetlb_restore_reserve(folio); 3040 h->resv_huge_pages--; 3041 } 3042 list_add(&folio->lru, &h->hugepage_activelist); 3043 folio_ref_unfreeze(folio, 1); 3044 /* Fall through */ 3045 } 3046 3047 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); 3048 /* If allocation is not consuming a reservation, also store the 3049 * hugetlb_cgroup pointer on the page. 3050 */ 3051 if (deferred_reserve) { 3052 hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), 3053 h_cg, folio); 3054 } 3055 3056 spin_unlock_irq(&hugetlb_lock); 3057 3058 hugetlb_set_folio_subpool(folio, spool); 3059 3060 map_commit = vma_commit_reservation(h, vma, addr); 3061 if (unlikely(map_chg > map_commit)) { 3062 /* 3063 * The page was added to the reservation map between 3064 * vma_needs_reservation and vma_commit_reservation. 3065 * This indicates a race with hugetlb_reserve_pages. 3066 * Adjust for the subpool count incremented above AND 3067 * in hugetlb_reserve_pages for the same page. 
Also, 3068 * the reservation count added in hugetlb_reserve_pages 3069 * no longer applies. 3070 */ 3071 long rsv_adjust; 3072 3073 rsv_adjust = hugepage_subpool_put_pages(spool, 1); 3074 hugetlb_acct_memory(h, -rsv_adjust); 3075 if (deferred_reserve) { 3076 spin_lock_irq(&hugetlb_lock); 3077 hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), 3078 pages_per_huge_page(h), folio); 3079 spin_unlock_irq(&hugetlb_lock); 3080 } 3081 } 3082 > 3083 ret = mem_cgroup_charge_hugetlb(folio, gfp); 3084 if (ret == -ENOMEM) { 3085 spin_unlock_irq(&hugetlb_lock); 3086 free_huge_folio(folio); 3087 return ERR_PTR(-ENOMEM); 3088 } 3089 else if (!ret) 3090 lruvec_stat_mod_folio(folio, NR_HUGETLB, 3091 pages_per_huge_page(h)); 3092 3093 return folio; 3094 3095 out_uncharge_cgroup: 3096 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); 3097 out_uncharge_cgroup_reservation: 3098 if (deferred_reserve) 3099 hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), 3100 h_cg); 3101 out_subpool_put: 3102 if (map_chg || avoid_reserve) 3103 hugepage_subpool_put_pages(spool, 1); 3104 out_end_reservation: 3105 vma_end_reservation(h, vma, addr); 3106 return ERR_PTR(-ENOSPC); 3107 } 3108
Hi Joshua, kernel test robot noticed the following build errors: [auto build test ERROR on akpm-mm/mm-everything] [also build test ERROR on next-20241108] [cannot apply to linus/master v6.12-rc6] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Joshua-Hahn/memcg-hugetlb-Introduce-memcg_accounts_hugetlb/20241109-053045 base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything patch link: https://lore.kernel.org/r/20241108212946.2642085-3-joshua.hahnjy%40gmail.com patch subject: [PATCH 2/3] memcg/hugetlb: Introduce mem_cgroup_charge_hugetlb config: x86_64-buildonly-randconfig-002-20241109 (https://download.01.org/0day-ci/archive/20241109/202411091344.51lAbqmY-lkp@intel.com/config) compiler: clang version 19.1.3 (https://github.com/llvm/llvm-project ab51eccf88f5321e7c60591c5546b254b6afab99) reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241109/202411091344.51lAbqmY-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. 
not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202411091344.51lAbqmY-lkp@intel.com/ All errors (new ones prefixed by >>): In file included from mm/hugetlb.c:8: In file included from include/linux/mm.h:2211: include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion] 518 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" | ~~~~~~~~~~~ ^ ~~~ In file included from mm/hugetlb.c:37: include/linux/mm_inline.h:47:41: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion] 47 | __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); | ~~~~~~~~~~~ ^ ~~~ include/linux/mm_inline.h:49:22: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum lru_list') [-Wenum-enum-conversion] 49 | NR_ZONE_LRU_BASE + lru, nr_pages); | ~~~~~~~~~~~~~~~~ ^ ~~~ >> mm/hugetlb.c:3083:8: error: call to undeclared function 'mem_cgroup_charge_hugetlb'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] 3083 | ret = mem_cgroup_charge_hugetlb(folio, gfp); | ^ mm/hugetlb.c:3083:8: note: did you mean 'mem_cgroup_charge_skmem'? include/linux/memcontrol.h:1613:6: note: 'mem_cgroup_charge_skmem' declared here 1613 | bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, | ^ 3 warnings and 1 error generated. 
vim +/mem_cgroup_charge_hugetlb +3083 mm/hugetlb.c 2963 2964 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, 2965 unsigned long addr, int avoid_reserve) 2966 { 2967 struct hugepage_subpool *spool = subpool_vma(vma); 2968 struct hstate *h = hstate_vma(vma); 2969 struct folio *folio; 2970 long map_chg, map_commit; 2971 long gbl_chg; 2972 int ret, idx; 2973 struct hugetlb_cgroup *h_cg = NULL; 2974 bool deferred_reserve; 2975 gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; 2976 2977 idx = hstate_index(h); 2978 /* 2979 * Examine the region/reserve map to determine if the process 2980 * has a reservation for the page to be allocated. A return 2981 * code of zero indicates a reservation exists (no change). 2982 */ 2983 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); 2984 if (map_chg < 0) 2985 return ERR_PTR(-ENOMEM); 2986 2987 /* 2988 * Processes that did not create the mapping will have no 2989 * reserves as indicated by the region/reserve map. Check 2990 * that the allocation will not exceed the subpool limit. 2991 * Allocations for MAP_NORESERVE mappings also need to be 2992 * checked against any subpool limit. 2993 */ 2994 if (map_chg || avoid_reserve) { 2995 gbl_chg = hugepage_subpool_get_pages(spool, 1); 2996 if (gbl_chg < 0) 2997 goto out_end_reservation; 2998 2999 /* 3000 * Even though there was no reservation in the region/reserve 3001 * map, there could be reservations associated with the 3002 * subpool that can be used. This would be indicated if the 3003 * return value of hugepage_subpool_get_pages() is zero. 3004 * However, if avoid_reserve is specified we still avoid even 3005 * the subpool reservations. 3006 */ 3007 if (avoid_reserve) 3008 gbl_chg = 1; 3009 } 3010 3011 /* If this allocation is not consuming a reservation, charge it now. 
3012 */ 3013 deferred_reserve = map_chg || avoid_reserve; 3014 if (deferred_reserve) { 3015 ret = hugetlb_cgroup_charge_cgroup_rsvd( 3016 idx, pages_per_huge_page(h), &h_cg); 3017 if (ret) 3018 goto out_subpool_put; 3019 } 3020 3021 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 3022 if (ret) 3023 goto out_uncharge_cgroup_reservation; 3024 3025 spin_lock_irq(&hugetlb_lock); 3026 /* 3027 * glb_chg is passed to indicate whether or not a page must be taken 3028 * from the global free pool (global change). gbl_chg == 0 indicates 3029 * a reservation exists for the allocation. 3030 */ 3031 folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); 3032 if (!folio) { 3033 spin_unlock_irq(&hugetlb_lock); 3034 folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); 3035 if (!folio) 3036 goto out_uncharge_cgroup; 3037 spin_lock_irq(&hugetlb_lock); 3038 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { 3039 folio_set_hugetlb_restore_reserve(folio); 3040 h->resv_huge_pages--; 3041 } 3042 list_add(&folio->lru, &h->hugepage_activelist); 3043 folio_ref_unfreeze(folio, 1); 3044 /* Fall through */ 3045 } 3046 3047 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); 3048 /* If allocation is not consuming a reservation, also store the 3049 * hugetlb_cgroup pointer on the page. 3050 */ 3051 if (deferred_reserve) { 3052 hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), 3053 h_cg, folio); 3054 } 3055 3056 spin_unlock_irq(&hugetlb_lock); 3057 3058 hugetlb_set_folio_subpool(folio, spool); 3059 3060 map_commit = vma_commit_reservation(h, vma, addr); 3061 if (unlikely(map_chg > map_commit)) { 3062 /* 3063 * The page was added to the reservation map between 3064 * vma_needs_reservation and vma_commit_reservation. 3065 * This indicates a race with hugetlb_reserve_pages. 3066 * Adjust for the subpool count incremented above AND 3067 * in hugetlb_reserve_pages for the same page. 
Also, 3068 * the reservation count added in hugetlb_reserve_pages 3069 * no longer applies. 3070 */ 3071 long rsv_adjust; 3072 3073 rsv_adjust = hugepage_subpool_put_pages(spool, 1); 3074 hugetlb_acct_memory(h, -rsv_adjust); 3075 if (deferred_reserve) { 3076 spin_lock_irq(&hugetlb_lock); 3077 hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), 3078 pages_per_huge_page(h), folio); 3079 spin_unlock_irq(&hugetlb_lock); 3080 } 3081 } 3082 > 3083 ret = mem_cgroup_charge_hugetlb(folio, gfp); 3084 if (ret == -ENOMEM) { 3085 spin_unlock_irq(&hugetlb_lock); 3086 free_huge_folio(folio); 3087 return ERR_PTR(-ENOMEM); 3088 } 3089 else if (!ret) 3090 lruvec_stat_mod_folio(folio, NR_HUGETLB, 3091 pages_per_huge_page(h)); 3092 3093 return folio; 3094 3095 out_uncharge_cgroup: 3096 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); 3097 out_uncharge_cgroup_reservation: 3098 if (deferred_reserve) 3099 hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), 3100 h_cg); 3101 out_subpool_put: 3102 if (map_chg || avoid_reserve) 3103 hugepage_subpool_put_pages(spool, 1); 3104 out_end_reservation: 3105 vma_end_reservation(h, vma, addr); 3106 return ERR_PTR(-ENOSPC); 3107 } 3108
Hello SJ, thank you for reviewing my patch! On Fri, Nov 8, 2024 at 8:03 PM SeongJae Park <sj@kernel.org> wrote: > > Hi Joshua, > > On Fri, 8 Nov 2024 13:29:45 -0800 Joshua Hahn <joshua.hahnjy@gmail.com> wrote: > > > This patch introduces mem_cgroup_charge_hugetlb, which combines the > > > logic of mem_cgroup{try,commit}_hugetlb. This reduces the footprint of > > > > Nit. Seems the regular expression is not technically correct? > > I see, I will change it to expand it out to include both. What I meant to say is that it combines the functionality of both the functions, but I think there was a typo there. I will just expand it out so that it is more clear to readers! > > > +int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) > > Can we add a kernel-doc comment for this function? Maybe that for > mem_cgroup_hugetlb_try_charge() can be stolen with only small updates? Yes, I can definitely add a kernel-doc for this function. Would you mind expanding on the "stolen only with small updates" part? Do you mean that instead of writing a completely new section in the kernel-doc, I can just change the name of the section and modify small parts of the description? > Thanks, > SJ Thank you for your time! I hope you have a good weekend! Joshua
Hello Shakeel, thank you for reviewing my patch! On Fri, Nov 8, 2024 at 5:43 PM Shakeel Butt <shakeel.butt@linux.dev> wrote: > > On Fri, Nov 08, 2024 at 01:29:45PM -0800, Joshua Hahn wrote: > > This patch introduces mem_cgroup_charge_hugetlb, which combines the > > logic of mem_cgroup{try,commit}_hugetlb. This reduces the footprint of > > memcg in hugetlb code, and also consolidates the error path that memcg > > can take into just one point. > > > > Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com> > > - if (!memcg_charge_ret) > > - mem_cgroup_commit_charge(folio, memcg); > > - lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); > > - mem_cgroup_put(memcg); > > + ret = mem_cgroup_charge_hugetlb(folio, gfp); > > + if (ret == -ENOMEM) { > > + spin_unlock_irq(&hugetlb_lock); > > spin_unlock_irq?? Thank you for the catch. I completely missed this after I swapped the position of mem_cgroup_charge_hugetlb to be called without the lock. I will fix this. > > + free_huge_folio(folio); > > free_huge_folio() will call lruvec_stat_mod_folio() unconditionally but > you are only calling it on success. This may underflow the metric. I was actually thinking about this too. I was wondering what would make sense -- in the original draft of this patch, I had the charge increment be called unconditionally as well. The idea was that even though it would not make sense to have the stat incremented when there is an error, it would eventually be corrected by free_huge_folio's decrement. However, because there is nothing stopping the user from checking the stat in this period, they may temporarily see that the value is incremented in memory.stat, even though they were not able to obtain this page. With that said, maybe it makes sense to increment unconditionally, if free also decrements unconditionally. 
This race condition is not something that will cause a huge problem for the user, although users relying on userspace monitors for memory.stat to handle memory management may see some problems. Maybe what would make the most sense is to do both incrementing & decrementing conditionally as well. Thank you for your feedback, I will iterate on this for the next version! > > +int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) > > +{ > > + struct mem_cgroup *memcg = get_mem_cgroup_from_current(); > > + int ret = 0; > > + > > + if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() || > > + !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) { > > + ret = -EOPNOTSUPP; > > why EOPNOTSUPP? You need to return 0 here. We do want > lruvec_stat_mod_folio() to be called. In this case, I was just preserving the original code's return statements. That is, in mem_cgroup_hugetlb_try_charge, the intended behavior was to return -EOPNOTSUPP if any of these conditions were met. If I understand the code correctly, calling lruvec_stat_mod_folio() on this failure will be a noop, since either memcg doesn't account hugetlb folios / there is no memcg / memcg is disabled. With all of this said, I think your feedback makes the most sense here, given the new semantics of the function: if there is no memcg or memcg doesn't account hugetlb, then there is no way that the limit can be reached! I will go forward with returning 0, and calling lruvec_stat_mod_folio (which will be a noop). Thank you for your detailed feedback. I wish I had caught these errors myself, thank you for your time in reviewing my patch. I hope you have a great rest of your weekend! Joshua
On Sat, 9 Nov 2024 13:41:31 -0500 Joshua Hahn <joshua.hahnjy@gmail.com> wrote: > Hello SJ, thank you for reviewing my patch! > > On Fri, Nov 8, 2024 at 8:03 PM SeongJae Park <sj@kernel.org> wrote: > > > > Hi Joshua, > > > > On Fri, 8 Nov 2024 13:29:45 -0800 Joshua Hahn <joshua.hahnjy@gmail.com> wrote: > > > > > This patch introduces mem_cgroup_charge_hugetlb, which combines the > > > logic of mem_cgroup{try,commit}_hugetlb. This reduces the footprint of > > > > Nit. Seems the regular expression is not technically correct? > > I see, I will change it expand it out to include both. What I meant to > say is that it combines the functionality of both the functions, but > I think there was a typo there. I will just expand it out so that it is > more clear to readers! Thank you :) > > > > +int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) > > > > Can we add a kernel-doc comment for this function? Maybe that for > > mem_cgroup_hugetlb_try_charge() can be stolen with only small updates? > > Yes, I can definitely add a kernel-doc for this function. Would > you mind expanding on the "stolen only with small updates" part? > Do you mean that instead of writing a completely new section > in the kernel-doc, I can just change the name of the section > and modify small parts of the description? You're right. I just thought that might save some of your time if that makes sense. I don't really mind about this, so do whatever as you prefer :) Thanks, SJ [...]
On Sat, Nov 09, 2024 at 01:58:46PM -0500, Joshua Hahn wrote: [...] > > > > + free_huge_folio(folio); > > > > free_huge_folio() will call lruvec_stat_mod_folio() unconditionally but > > you are only calling it on success. This may underflow the metric. > > I was actually thinking about this too. I was wondering what would > make sense -- in the original draft of this patch, I had the charge > increment be called unconditionally as well. The idea was that > even though it would not make sense to have the stat incremented > when there is an error, it would eventually be corrected by > free_huge_folio's decrement. However, because there is nothing > stopping the user from checking the stat in this period, they may > temporarily see that the value is incremented in memory.stat, > even though they were not able to obtain this page. > On the alloc path, unconditionally calling lruvec_stat_mod_folio() after mem_cgroup_charge_hugetlb() but before free_huge_folio() is fine. Please note that if mem_cgroup_charge_hugetlb() has failed, lruvec_stat_mod_folio() will not be incrementing the memcg stat. The system level stats may get elevated for small time window but that is fine. > With that said, maybe it makes sense to increment unconditionally, > if free also decrements unconditionally. This race condition is > not something that will cause a huge problem for the user, > although users relying on userspace monitors for memory.stat > to handle memory management may see some problems. > > Maybe what would make the most sense is to do both > incrementing & decrementing conditionally as well. > Thank you for your feedback, I will iterate on this for the next version! 
> > > > +int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) > > > +{ > > > + struct mem_cgroup *memcg = get_mem_cgroup_from_current(); > > > + int ret = 0; > > > + > > > + if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() || > > > + !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) { > > > + ret = -EOPNOTSUPP; > > > > why EOPNOTSUPP? You need to return 0 here. We do want > > lruvec_stat_mod_folio() to be called. > > In this case, I was just preserving the original code's return statements. > That is, in mem_cgroup_hugetlb_try_charge, the intended behavior > was to return -EOPNOTSUPP if any of these conditions were met. > If I understand the code correctly, calling lruvec_stat_mod_folio() > on this failure will be a noop, since either memcg doesn't account > hugetlb folios / there is no memcg / memcg is disabled. > lruvec_stat_mod_folio() does manipulate system level stats, so even if memcg accounting of hugetlb is not enabled we do want system level stats get updated.
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bb49e0d4b377..d90c1ac791f1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -678,6 +678,8 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages); +int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); + int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fbb10e52d7ea..6f6841483039 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2967,21 +2967,13 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long map_chg, map_commit, nr_pages = pages_per_huge_page(h); + long map_chg, map_commit; long gbl_chg; - int memcg_charge_ret, ret, idx; + int ret, idx; struct hugetlb_cgroup *h_cg = NULL; - struct mem_cgroup *memcg; bool deferred_reserve; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; - memcg = get_mem_cgroup_from_current(); - memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); - if (memcg_charge_ret == -ENOMEM) { - mem_cgroup_put(memcg); - return ERR_PTR(-ENOMEM); - } - idx = hstate_index(h); /* * Examine the region/reserve map to determine if the process @@ -2989,12 +2981,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * code of zero indicates a reservation exists (no change). 
*/ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) { - if (!memcg_charge_ret) - mem_cgroup_cancel_charge(memcg, nr_pages); - mem_cgroup_put(memcg); + if (map_chg < 0) return ERR_PTR(-ENOMEM); - } /* * Processes that did not create the mapping will have no @@ -3092,10 +3080,15 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, } } - if (!memcg_charge_ret) - mem_cgroup_commit_charge(folio, memcg); - lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); - mem_cgroup_put(memcg); + ret = mem_cgroup_charge_hugetlb(folio, gfp); + if (ret == -ENOMEM) { + spin_unlock_irq(&hugetlb_lock); + free_huge_folio(folio); + return ERR_PTR(-ENOMEM); + } + else if (!ret) + lruvec_stat_mod_folio(folio, NR_HUGETLB, + pages_per_huge_page(h)); return folio; @@ -3110,9 +3103,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugepage_subpool_put_pages(spool, 1); out_end_reservation: vma_end_reservation(h, vma, addr); - if (!memcg_charge_ret) - mem_cgroup_cancel_charge(memcg, nr_pages); - mem_cgroup_put(memcg); return ERR_PTR(-ENOSPC); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 97f63ec9c9fb..95ee77fe27af 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4529,6 +4529,25 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, return 0; } +int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) +{ + struct mem_cgroup *memcg = get_mem_cgroup_from_current(); + int ret = 0; + + if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() || + !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (charge_memcg(folio, memcg, gfp)) + ret = -ENOMEM; + +out: + mem_cgroup_put(memcg); + return ret; +} + /** * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. * @folio: folio to charge.
This patch introduces mem_cgroup_charge_hugetlb, which combines the logic of mem_cgroup{try,commit}_hugetlb. This reduces the footprint of memcg in hugetlb code, and also consolidates the error path that memcg can take into just one point. Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com> --- include/linux/memcontrol.h | 2 ++ mm/hugetlb.c | 34 ++++++++++++---------------------- mm/memcontrol.c | 19 +++++++++++++++++++ 3 files changed, 33 insertions(+), 22 deletions(-)