
[RFC,06/13] nexthop: Implement notifiers for resilient nexthop groups

Message ID 4664eeb381eefcc74bb1ff1e8a0f4da329bbf000.1612815058.git.petrm@nvidia.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Series nexthop: Resilient next-hop groups

Checks

Context                         Check    Description
netdev/cover_letter             success
netdev/fixes_present            success
netdev/patch_count              success
netdev/tree_selection           success  Guessed tree name to be net-next
netdev/subject_prefix           success
netdev/cc_maintainers           warning  1 maintainers not CCed: yoshfuji@linux-ipv6.org
netdev/source_inline            success  Was 0 now: 0
netdev/verify_signedoff         success
netdev/module_param             success  Was 0 now: 0
netdev/build_32bit              success  Errors and warnings before: 2 this patch: 2
netdev/kdoc                     success  Errors and warnings before: 0 this patch: 0
netdev/verify_fixes             success
netdev/checkpatch               warning  WARNING: line length of 85 exceeds 80 columns
netdev/build_allmodconfig_warn  success  Errors and warnings before: 2 this patch: 2
netdev/header_inline            success
netdev/stable                   success  Stable not CCed

Commit Message

Petr Machata Feb. 8, 2021, 8:42 p.m. UTC
Implement the following notifications towards drivers:

- NEXTHOP_EVENT_REPLACE, when a resilient nexthop group is created.

- NEXTHOP_EVENT_BUCKET_REPLACE, any time there is a change in assignment of
  next hops to hash table buckets. That includes replacements, deletions,
  and delayed upkeep cycles. Some bucket notifications can be vetoed by the
  driver, to make it possible to propagate bucket busyness flags from the
  HW back to the algorithm. Some are, however, forced: e.g. if a next hop
  is deleted, all buckets that use it must be migrated, whether the HW
  wishes so or not.

- NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, before a resilient nexthop group is
  replaced. Usually the driver will get the bucket notifications as well,
  and can veto those. But in some cases a bucket may not be migrated
  immediately, only during delayed upkeep, and that is too late to roll the
  transaction back. This notification allows the driver to inspect and veto
  the proposed new group up front, before anything is committed.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
---
 net/ipv4/nexthop.c | 320 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 308 insertions(+), 12 deletions(-)
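
As an illustration of the notifications described above, a minimal,
hypothetical listener for the two new resilient-group events might look as
follows (NEXTHOP_EVENT_REPLACE handling is the same as for existing multipath
groups and is omitted). The event names and the nh_notifier_info fields are
taken from this series; everything prefixed my_drv_ is a placeholder, not
part of the patch.

#include <linux/notifier.h>
#include <net/nexthop.h>

/* Stubbed-out device programming; a real driver would talk to HW here. */
static int my_drv_res_table_check(const struct nh_notifier_info *info)
{
	return 0;
}

static int my_drv_bucket_write(const struct nh_notifier_info *info)
{
	return 0;
}

static int my_drv_nexthop_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct nh_notifier_info *info = ptr;
	int err = 0;

	switch (event) {
	case NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE:
		/* The replacement group is not programmed yet; only validate
		 * it and veto unsupported configurations via info->extack.
		 */
		err = my_drv_res_table_check(info);
		break;
	case NEXTHOP_EVENT_BUCKET_REPLACE:
		/* info->nh_res_bucket describes one bucket migration:
		 * bucket_index, old_nh, new_nh, idle_timer_ms and force.
		 * A veto (non-zero error) is honored only when !force.
		 */
		err = my_drv_bucket_write(info);
		break;
	default:
		break;
	}

	return notifier_from_errno(err);
}

Such a block would be registered with register_nexthop_notifier(), the same
way existing listeners subscribe to multipath group notifications.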

Patch

diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 4ce282b0a65f..fe91f3a0fb1e 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -115,6 +115,37 @@  static int nh_notifier_mp_info_init(struct nh_notifier_info *info,
 	return 0;
 }
 
+static int nh_notifier_res_table_info_init(struct nh_notifier_info *info,
+					   struct nh_group *nhg)
+{
+	struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
+	u32 num_nh_buckets = res_table->num_nh_buckets;
+	unsigned long size;
+	u32 i;
+
+	info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE;
+	size = struct_size(info->nh_res_table, nhs, num_nh_buckets);
+	info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO |
+				       __GFP_NOWARN);
+	if (!info->nh_res_table)
+		return -ENOMEM;
+
+	info->nh_res_table->num_nh_buckets = num_nh_buckets;
+
+	for (i = 0; i < num_nh_buckets; i++) {
+		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+		struct nh_grp_entry *nhge;
+		struct nh_info *nhi;
+
+		nhge = rtnl_dereference(bucket->nh_entry);
+		nhi = rtnl_dereference(nhge->nh->nh_info);
+		__nh_notifier_single_info_init(&info->nh_res_table->nhs[i],
+					       nhi);
+	}
+
+	return 0;
+}
+
 static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
 				     const struct nexthop *nh)
 {
@@ -122,6 +153,8 @@  static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
 
 	if (nhg->mpath)
 		return nh_notifier_mp_info_init(info, nhg);
+	else if (nhg->resilient)
+		return nh_notifier_res_table_info_init(info, nhg);
 	return -EINVAL;
 }
 
@@ -132,6 +165,8 @@  static void nh_notifier_grp_info_fini(struct nh_notifier_info *info,
 
 	if (nhg->mpath)
 		kfree(info->nh_grp);
+	else if (nhg->resilient)
+		vfree(info->nh_res_table);
 }
 
 static int nh_notifier_info_init(struct nh_notifier_info *info,
@@ -183,6 +218,107 @@  static int call_nexthop_notifiers(struct net *net,
 	return notifier_to_errno(err);
 }
 
+static int
+nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info,
+				      bool force, unsigned int *p_idle_timer_ms)
+{
+	struct nh_res_table *res_table;
+	struct nh_group *nhg;
+	struct nexthop *nh;
+	int err = 0;
+
+	/* When 'force' is false, nexthop bucket replacement is performed
+	 * because the bucket was deemed to be idle. In this case, capable
+	 * listeners can choose to perform an atomic replacement: The bucket is
+	 * only replaced if it is inactive. However, if the idle timer interval
+	 * is smaller than the interval in which a listener is querying
+	 * buckets' activity from the device, then atomic replacement should
+	 * not be tried. Pass the idle timer value to listeners, so that they
+	 * could determine which type of replacement to perform.
+	 */
+	if (force) {
+		*p_idle_timer_ms = 0;
+		return 0;
+	}
+
+	rcu_read_lock();
+
+	nh = nexthop_find_by_id(info->net, info->id);
+	if (!nh) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	nhg = rcu_dereference(nh->nh_grp);
+	res_table = rcu_dereference(nhg->res_table);
+	*p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer);
+
+out:
+	rcu_read_unlock();
+
+	return err;
+}
+
+static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info,
+					    u32 bucket_index, bool force,
+					    struct nh_info *oldi,
+					    struct nh_info *newi)
+{
+	unsigned int idle_timer_ms;
+	int err;
+
+	err = nh_notifier_res_bucket_idle_timer_get(info, force,
+						    &idle_timer_ms);
+	if (err)
+		return err;
+
+	info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET;
+	info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket),
+				      GFP_KERNEL);
+	if (!info->nh_res_bucket)
+		return -ENOMEM;
+
+	info->nh_res_bucket->bucket_index = bucket_index;
+	info->nh_res_bucket->idle_timer_ms = idle_timer_ms;
+	info->nh_res_bucket->force = force;
+	__nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi);
+	__nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi);
+	return 0;
+}
+
+static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info)
+{
+	kfree(info->nh_res_bucket);
+}
+
+static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
+					       u32 bucket_index, bool force,
+					       struct nh_info *oldi,
+					       struct nh_info *newi,
+					       struct netlink_ext_ack *extack)
+{
+	struct nh_notifier_info info = {
+		.net = net,
+		.extack = extack,
+		.id = nhg_id,
+	};
+	int err;
+
+	if (nexthop_notifiers_is_empty(net))
+		return 0;
+
+	err = nh_notifier_res_bucket_info_init(&info, bucket_index, force,
+					       oldi, newi);
+	if (err)
+		return err;
+
+	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+					   NEXTHOP_EVENT_BUCKET_REPLACE, &info);
+	nh_notifier_res_bucket_info_fini(&info);
+
+	return notifier_to_errno(err);
+}
+
 /* There are three users of RES_TABLE, and NHs etc. referenced from there:
  *
  * 1) a collection of callbacks for NH maintenance. This operates under
@@ -207,6 +343,53 @@  static int call_nexthop_notifiers(struct net *net,
  */
 #define nh_res_dereference(p) (rcu_dereference_raw(p))
 
+static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
+					     u32 bucket_index, bool force,
+					     struct nexthop *old_nh,
+					     struct nexthop *new_nh,
+					     struct netlink_ext_ack *extack)
+{
+	struct nh_info *oldi = nh_res_dereference(old_nh->nh_info);
+	struct nh_info *newi = nh_res_dereference(new_nh->nh_info);
+
+	return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index,
+						   force, oldi, newi, extack);
+}
+
+static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh,
+					    struct netlink_ext_ack *extack)
+{
+	struct nh_notifier_info info = {
+		.net = net,
+		.extack = extack,
+	};
+	struct nh_group *nhg;
+	int err;
+
+	ASSERT_RTNL();
+
+	if (nexthop_notifiers_is_empty(net))
+		return 0;
+
+	/* At this point, the nexthop buckets are still not populated. Only
+	 * emit a notification with the logical nexthops, so that a listener
+	 * could potentially veto it in case of unsupported configuration.
+	 */
+	nhg = rtnl_dereference(nh->nh_grp);
+	err = nh_notifier_mp_info_init(&info, nhg);
+	if (err) {
+		NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
+		return err;
+	}
+
+	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+					   NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
+					   &info);
+	kfree(info.nh_grp);
+
+	return notifier_to_errno(err);
+}
+
 static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
 				 enum nexthop_event_type event_type,
 				 struct nexthop *nh,
@@ -1144,10 +1327,12 @@  static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table,
 }
 
 static bool nh_res_bucket_migrate(struct nh_res_table *res_table,
-				  u32 bucket_index, bool force)
+				  u32 bucket_index, bool notify, bool force)
 {
 	struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
 	struct nh_grp_entry *new_nhge;
+	struct netlink_ext_ack extack;
+	int err;
 
 	new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries,
 					    struct nh_grp_entry,
@@ -1160,6 +1345,28 @@  static bool nh_res_bucket_migrate(struct nh_res_table *res_table,
 		 */
 		return false;
 
+	if (notify) {
+		struct nh_grp_entry *old_nhge;
+
+		old_nhge = nh_res_dereference(bucket->nh_entry);
+		err = call_nexthop_res_bucket_notifiers(res_table->net,
+							res_table->nhg_id,
+							bucket_index, force,
+							old_nhge->nh,
+							new_nhge->nh, &extack);
+		if (err) {
+			pr_err_ratelimited("%s\n", extack._msg);
+			if (!force)
+				return false;
+			/* It is not possible to veto a forced replacement, so
+			 * just clear the hardware flags from the nexthop
+			 * bucket to indicate to user space that this bucket is
+			 * not correctly populated in hardware.
+			 */
+			bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
+		}
+	}
+
 	nh_res_bucket_set_nh(bucket, new_nhge);
 	nh_res_bucket_set_idle(res_table, bucket);
 
@@ -1170,7 +1377,7 @@  static bool nh_res_bucket_migrate(struct nh_res_table *res_table,
 
 #define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2)
 
-static void nh_res_table_upkeep(struct nh_res_table *res_table)
+static void nh_res_table_upkeep(struct nh_res_table *res_table, bool notify)
 {
 	unsigned long now = jiffies;
 	unsigned long deadline;
@@ -1194,7 +1401,8 @@  static void nh_res_table_upkeep(struct nh_res_table *res_table)
 
 		if (nh_res_bucket_should_migrate(res_table, bucket,
 						 &deadline, &force)) {
-			if (!nh_res_bucket_migrate(res_table, i, force)) {
+			if (!nh_res_bucket_migrate(res_table, i, notify,
+						   force)) {
 				unsigned long idle_point;
 
 				/* A driver can override the migration
@@ -1235,7 +1443,7 @@  static void nh_res_table_upkeep_dw(struct work_struct *work)
 	struct nh_res_table *res_table;
 
 	res_table = container_of(dw, struct nh_res_table, upkeep_dw);
-	nh_res_table_upkeep(res_table);
+	nh_res_table_upkeep(res_table, true);
 }
 
 static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
@@ -1323,7 +1531,7 @@  static void replace_nexthop_grp_res(struct nh_group *oldg,
 	nh_res_group_rebalance(newg, old_res_table);
 	if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries))
 		old_res_table->unbalanced_since = prev_unbalanced_since;
-	nh_res_table_upkeep(old_res_table);
+	nh_res_table_upkeep(old_res_table, true);
 }
 
 static void nh_mp_group_rebalance(struct nh_group *nhg)
@@ -1406,9 +1614,15 @@  static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
 	list_del(&nhge->nh_list);
 	nexthop_put(nhge->nh);
 
-	err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack);
-	if (err)
-		pr_err("%s\n", extack._msg);
+	/* Removal of a NH from a resilient group is notified through
+	 * bucket notifications.
+	 */
+	if (newg->mpath) {
+		err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
+					     &extack);
+		if (err)
+			pr_err("%s\n", extack._msg);
+	}
 
 	if (nlinfo)
 		nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
@@ -1561,6 +1775,16 @@  static int replace_nexthop_grp(struct net *net, struct nexthop *old,
 			return -EINVAL;
 		}
 
+		/* Emit a pre-replace notification so that listeners could veto
+		 * a potentially unsupported configuration. Otherwise,
+		 * individual bucket replacement notifications would need to be
+		 * vetoed, which is something that should only happen if the
+		 * bucket is currently active.
+		 */
+		err = call_nexthop_res_table_notifiers(net, new, extack);
+		if (err)
+			return err;
+
 		if (cfg->nh_grp_res_has_idle_timer)
 			old_res_table->idle_timer = cfg->nh_grp_res_idle_timer;
 		if (cfg->nh_grp_res_has_unbalanced_timer)
@@ -1610,6 +1834,71 @@  static void nh_group_v4_update(struct nh_group *nhg)
 	nhg->has_v4 = has_v4;
 }
 
+static int replace_nexthop_single_notify_res(struct net *net,
+					     struct nh_res_table *res_table,
+					     struct nexthop *old,
+					     struct nh_info *oldi,
+					     struct nh_info *newi,
+					     struct netlink_ext_ack *extack)
+{
+	u32 nhg_id = res_table->nhg_id;
+	int err;
+	u32 i;
+
+	for (i = 0; i < res_table->num_nh_buckets; i++) {
+		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+		struct nh_grp_entry *nhge;
+
+		nhge = rtnl_dereference(bucket->nh_entry);
+		if (nhge->nh == old) {
+			err = __call_nexthop_res_bucket_notifiers(net, nhg_id,
+								  i, true,
+								  oldi, newi,
+								  extack);
+			if (err)
+				goto err_notify;
+		}
+	}
+
+	return 0;
+
+err_notify:
+	while (i-- > 0) {
+		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+		struct nh_grp_entry *nhge;
+
+		nhge = rtnl_dereference(bucket->nh_entry);
+		if (nhge->nh == old)
+			__call_nexthop_res_bucket_notifiers(net, nhg_id, i,
+							    true, newi, oldi,
+							    extack);
+	}
+	return err;
+}
+
+static int replace_nexthop_single_notify(struct net *net,
+					 struct nexthop *group_nh,
+					 struct nexthop *old,
+					 struct nh_info *oldi,
+					 struct nh_info *newi,
+					 struct netlink_ext_ack *extack)
+{
+	struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp);
+	struct nh_res_table *res_table;
+
+	if (nhg->mpath) {
+		return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE,
+					      group_nh, extack);
+	} else if (nhg->resilient) {
+		res_table = rtnl_dereference(nhg->res_table);
+		return replace_nexthop_single_notify_res(net, res_table,
+							 old, oldi, newi,
+							 extack);
+	}
+
+	return -EINVAL;
+}
+
 static int replace_nexthop_single(struct net *net, struct nexthop *old,
 				  struct nexthop *new,
 				  struct netlink_ext_ack *extack)
@@ -1652,8 +1941,8 @@  static int replace_nexthop_single(struct net *net, struct nexthop *old,
 	list_for_each_entry(nhge, &old->grp_list, nh_list) {
 		struct nexthop *nhp = nhge->nh_parent;
 
-		err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
-					     extack);
+		err = replace_nexthop_single_notify(net, nhp, old, oldi, newi,
+						    extack);
 		if (err)
 			goto err_notify;
 	}
@@ -1683,7 +1972,7 @@  static int replace_nexthop_single(struct net *net, struct nexthop *old,
 	list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) {
 		struct nexthop *nhp = nhge->nh_parent;
 
-		call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, extack);
+		replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL);
 	}
 	call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack);
 	return err;
@@ -1851,13 +2140,20 @@  static int insert_nexthop(struct net *net, struct nexthop *new_nh,
 			}
 
 			nh_res_group_rebalance(nhg, res_table);
-			nh_res_table_upkeep(res_table);
+
+			/* Do not send bucket notifications, we do full
+			 * notification below.
+			 */
+			nh_res_table_upkeep(res_table, false);
 		}
 	}
 
 	rb_link_node_rcu(&new_nh->rb_node, parent, pp);
 	rb_insert_color(&new_nh->rb_node, root);
 
+	/* The initial insertion is a full notification for mpath as well
+	 * as resilient groups.
+	 */
 	rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack);
 	if (rc)
 		rb_erase(&new_nh->rb_node, &net->nexthop.rb_root);
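
A usage note on idle_timer_ms, following the comment in
nh_notifier_res_bucket_idle_timer_get() above: a listener that periodically
polls bucket activity from the device could choose between an atomic and an
unconditional bucket update roughly as sketched below. This is an assumption
about driver behavior, not part of the patch; MY_DRV_ACTIVITY_POLL_MS and the
my_drv_* helpers are hypothetical.

#include <net/nexthop.h>

#define MY_DRV_ACTIVITY_POLL_MS 1000	/* assumed HW activity polling period */

static int my_drv_bucket_write_plain(const struct nh_notifier_info *info)
{
	return 0;	/* would program the bucket unconditionally */
}

static int my_drv_bucket_write_atomic(const struct nh_notifier_info *info)
{
	return 0;	/* would program the bucket only if still inactive in HW */
}

static int my_drv_bucket_replace(const struct nh_notifier_info *info)
{
	/* Forced migrations (e.g. after a nexthop removal) must be applied
	 * unconditionally and cannot be vetoed.
	 */
	if (info->nh_res_bucket->force)
		return my_drv_bucket_write_plain(info);

	/* An idle-driven migration can be applied atomically, i.e. only if
	 * the bucket is still inactive in HW. That is only meaningful when
	 * the kernel's idle timer covers at least one of the driver's
	 * activity polling periods; otherwise the activity data may be
	 * stale, so fall back to an unconditional write.
	 */
	if (info->nh_res_bucket->idle_timer_ms < MY_DRV_ACTIVITY_POLL_MS)
		return my_drv_bucket_write_plain(info);

	return my_drv_bucket_write_atomic(info);
}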