Message ID: 20210913054121.616001-14-hch@lst.de (mailing list archive)
State: New, archived
Series: [01/13] seq_file: mark seq_get_buf as deprecated
On Mon, Sep 13, 2021 at 07:41:21AM +0200, Christoph Hellwig wrote:
> Trivial conversion to the seq_file based sysfs attributes.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_stats.c | 24 +++++-------
>  fs/xfs/xfs_stats.h |  2 +-
>  fs/xfs/xfs_sysfs.c | 96 +++++++++++++++++++++++-----------------------
>  3 files changed, 58 insertions(+), 64 deletions(-)
>
> diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
> index 20e0534a772c9..71e7a84ba0403 100644
> --- a/fs/xfs/xfs_stats.c
> +++ b/fs/xfs/xfs_stats.c
> @@ -16,10 +16,9 @@ static int counter_val(struct xfsstats __percpu *stats, int idx)
>  	return val;
>  }
>
> -int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
> +void xfs_stats_format(struct xfsstats __percpu *stats, struct seq_file *sf)
>  {
>  	int		i, j;
> -	int		len = 0;
>  	uint64_t	xs_xstrat_bytes = 0;
>  	uint64_t	xs_write_bytes = 0;
>  	uint64_t	xs_read_bytes = 0;
> @@ -58,13 +57,12 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
>  	/* Loop over all stats groups */
>
>  	for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
> -		len += scnprintf(buf + len, PATH_MAX - len, "%s",
> -				xstats[i].desc);
> +		seq_printf(sf, "%s", xstats[i].desc);
> +
>  		/* inner loop does each group */
>  		for (; j < xstats[i].endpoint; j++)
> -			len += scnprintf(buf + len, PATH_MAX - len, " %u",
> -					counter_val(stats, j));
> -		len += scnprintf(buf + len, PATH_MAX - len, "\n");
> +			seq_printf(sf, " %u", counter_val(stats, j));
> +		seq_printf(sf, "\n");
>  	}
>  	/* extra precision counters */
>  	for_each_possible_cpu(i) {
> @@ -74,18 +72,14 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
>  		defer_relog += per_cpu_ptr(stats, i)->s.defer_relog;
>  	}
>
> -	len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
> +	seq_printf(sf, "xpc %Lu %Lu %Lu\n",
>  			xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
> -	len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n",
> -			defer_relog);
> -	len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n",
> +	seq_printf(sf, "defer_relog %llu\n", defer_relog);
>  #if defined(DEBUG)
> -		1);
> +	seq_printf(sf, "debug 1\n");
>  #else
> -		0);
> +	seq_printf(sf, "debug 0\n");
>  #endif
> -
> -	return len;
>  }

That is a sysfs file?  What happened to the "one value per file" rule here?

Ugh.

Anyway, I like the idea, but as you can see here, it could lead to even more abuse of sysfs files.  We are just now getting people to use sysfs_emit() and that is showing us where people have been abusing the api in bad ways.

Is there any way that sysfs can keep the existing show functionality and just do the seq_printf() for the buffer returned by the attribute file inside of the sysfs core?  That would allow us to keep all of the existing attribute file functions as-is, and still get rid of the sysfs core usage here?

thanks,

greg k-h
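What Greg is suggesting could, very roughly, look like the sketch below: the sysfs core keeps the classic ->show(kobject, attribute, buf) contract, formats into its own page, and copies the result into the seq_file. This is only an illustration of the idea, not the real fs/sysfs/file.c code; the way the kobject is recovered from the seq_file and the kobj_attr_of() helper are invented assumptions.

```c
/*
 * Illustrative sketch only -- not the actual sysfs implementation.
 * Idea: the attribute keeps returning a formatted string in a
 * PAGE_SIZE buffer, and the sysfs core copies it into the seq_file.
 */
static int sysfs_seq_show_from_buf(struct seq_file *sf, void *v)
{
	struct kobject *kobj = sf->private;		/* assumed wiring */
	struct attribute *attr = kobj_attr_of(sf);	/* assumed helper */
	const struct sysfs_ops *ops = kobj->ktype->sysfs_ops;
	char *buf;
	ssize_t count;

	buf = (char *)get_zeroed_page(GFP_KERNEL);	/* one page here... */
	if (!buf)
		return -ENOMEM;

	count = ops->show(kobj, attr, buf);		/* unchanged attribute code */
	if (count > 0)
		seq_write(sf, buf, min_t(ssize_t, count, PAGE_SIZE));

	free_page((unsigned long)buf);
	return count < 0 ? count : 0;	/* ...plus the page seq_file already holds */
}
```

Note that this needs a page in the sysfs core in addition to the buffer seq_file manages internally, which is exactly the duplication Christoph points out further down the thread.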
On Mon, Sep 13, 2021 at 08:27:50AM +0200, Greg Kroah-Hartman wrote:
> On Mon, Sep 13, 2021 at 07:41:21AM +0200, Christoph Hellwig wrote:
> > Trivial conversion to the seq_file based sysfs attributes.
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
[...]
>
> That is a sysfs file?  What happened to the "one value per file" rule
> here?

There is no "rule" that says sysfs files must contain one value per file; the documentation says that one value per file is the "preferred" format. Documentation/filesystems/sysfs.rst:

[...]
Attributes
...
Attributes should be ASCII text files, preferably with only one value
per file. It is noted that it may not be efficient to contain only one
value per file, so it is socially acceptable to express an array of
values of the same type.
[...]

We are exposing a large array of integer values here, so multiple values per file are explicitly considered an acceptable format.

Further, as there are roughly 200 individual stats in this file and calculating each stat requires per-cpu aggregation, the cost of calculating and reading each stat individually is prohibitive, not just inefficient.

So, yes, we might have multiple lines in the file that you can frown about, but OTOH the file format has been exposed as a kernel ABI for a couple of decades via /proc/fs/xfs/stat.
Hence exposing it in sysfs to provide a more fine-grained breakdown of the stats (per mount instead of global) is a no-brainer. We don't have to rewrite the parsing engines in multiple userspace monitoring programs to extract this information from the kernel - they just create a new instance and read a different file and it all just works.

Indeed, there's precedent for such /proc file formats in more fine-grained sysfs files. e.g. /sys/bus/node/devices/node<n>/vmstat and /sys/bus/node/devices/node<n>/meminfo retain the same format (and hence userspace parsers) for the per-node stats as /proc/vmstat and /proc/meminfo use for the global stats...

tl;dr: the file contains arrays of values, it's inefficient to read values one at a time, it's a pre-existing ABI-constrained file format, there's precedent in core kernel statistics implementations, and the documented guidelines allow this sort of usage in these cases.

Cheers,

Dave.
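To make the "the parsers just read a different file" point concrete, a minimal reader for this "name v1 v2 ... vN" line format could look like the sketch below. It is not taken from PCP or any of the monitoring tools mentioned above, and the per-mount sysfs path is illustrative only.

```c
#include <stdio.h>

/* Parse one stats file in the "name v1 v2 ... vN" format.  The same
 * code works for the global file and a per-mount copy of the format. */
static void parse_xfs_stats(const char *path)
{
	char line[1024];
	FILE *f = fopen(path, "r");

	if (!f)
		return;

	while (fgets(line, sizeof(line), f)) {
		char tag[64];
		unsigned long long val;
		char *p = line;
		int n;

		/* first token on the line names the stats group */
		if (sscanf(p, "%63s%n", tag, &n) != 1)
			continue;
		p += n;

		/* the rest of the line is an array of counters */
		while (sscanf(p, "%llu%n", &val, &n) == 1) {
			/* hand (tag, val) to the monitoring tool here */
			p += n;
		}
	}
	fclose(f);
}

int main(void)
{
	parse_xfs_stats("/proc/fs/xfs/stat");             /* global stats */
	parse_xfs_stats("/sys/fs/xfs/sda1/stats/stats");  /* per-mount; path illustrative */
	return 0;
}
```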
On Tue, Sep 14, 2021 at 11:20:29AM +1000, Dave Chinner wrote:
> On Mon, Sep 13, 2021 at 08:27:50AM +0200, Greg Kroah-Hartman wrote:
> > On Mon, Sep 13, 2021 at 07:41:21AM +0200, Christoph Hellwig wrote:
> > > Trivial conversion to the seq_file based sysfs attributes.
> > >
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > > ---
[...]
> >
> > That is a sysfs file?  What happened to the "one value per file" rule
> > here?
>
> There is no "rule" that says sysfs files must contain one value per
> file; the documentation says that one value per file is the
> "preferred" format. Documentation/filesystems/sysfs.rst:
>
> [...]
> Attributes
> ...
> Attributes should be ASCII text files, preferably with only one value
> per file. It is noted that it may not be efficient to contain only one
> value per file, so it is socially acceptable to express an array of
> values of the same type.
> [...]

An array of values is one thing, like "what are the power states for this device". A list of different key/value pairs is a totally different thing entirely.

> We are exposing a large array of integer values here, so multiple
> values per file are explicitly considered an acceptable format.

Not really, that was not the goal of sysfs at all.
> Further, as there are roughly 200 individual stats in this file and
> calculating each stat requires per-cpu aggregation, the cost of
> calculating and reading each stat individually is prohibitive, not
> just inefficient.

Have you measured it? How often does the file get read and by what tools?

We have learned from our past mistakes in /proc where we did this in the past and required keeping obsolete values and constantly tweaking userspace parsers. That is why we made sysfs one-value-per-file. If the file is not there, the value is not there, much easier to handle future changes.

> So, yes, we might have multiple lines in the file that you can frown
> about, but OTOH the file format has been exposed as a kernel ABI for
> a couple of decades via /proc/fs/xfs/stat.

proc had no such rules, but we have learned :)

> Hence exposing it in sysfs to provide a more fine-grained breakdown
> of the stats (per mount instead of global) is a no-brainer. We don't
> have to rewrite the parsing engines in multiple userspace monitoring
> programs to extract this information from the kernel - they just
> create a new instance and read a different file and it all just works.

But then you run into the max size restriction on sysfs files (PAGE_SIZE) and things break down.

Please don't do this.

> Indeed, there's precedent for such /proc file formats in more
> fine-grained sysfs files. e.g. /sys/bus/node/devices/node<n>/vmstat
> and /sys/bus/node/devices/node<n>/meminfo retain the same format
> (and hence userspace parsers) for the per-node stats as /proc/vmstat
> and /proc/meminfo use for the global stats...

And I have complained about those files in the past many times. And they are running into problems in places dealing with them too.

> tl;dr: the file contains arrays of values, it's inefficient to read
> values one at a time, it's a pre-existing ABI-constrained file
> format, there's precedent in core kernel statistics
> implementations and the documented guidelines allow this sort of
> usage in these cases.

I would prefer not to do this, and I will not take core sysfs changes to make this any easier.

Which is one big reason why I don't like just making sysfs use the seq file api, it would allow stuff like this to propagate to other places in the kernel.

Maybe I should cut the file size of a sysfs file down to PAGE_SIZE/4 or less, that might be better :)

thanks,

greg k-h
On Mon, Sep 13, 2021 at 08:27:50AM +0200, Greg Kroah-Hartman wrote:
> Anyway, I like the idea, but as you can see here, it could lead to even
> more abuse of sysfs files.  We are just now getting people to use
> sysfs_emit() and that is showing us where people have been abusing the
> api in bad ways.

To be honest I've always seen sysfs_emit as at best a horrible band aid to enforce the PAGE_SIZE bounds checking. Better than nothing, but not a solution at all, as you can't force anyone to actually use it.

> Is there any way that sysfs can keep the existing show functionality and
> just do the seq_printf() for the buffer returned by the attribute file
> inside of the sysfs core?

Well, you'd need one page allocated in the seq_file code, and one in the sysfs code. At which point we might as well drop using seq_file at all.

But in general seq_file seems like a very nice helper for overflow-free printing into a buffer. If sysfs files actually were all limited to a single print we wouldn't really need it, and could just have something like sysfs_emit, just with the buffer hidden inside a structure that is opaque to the caller. But looking at various attributes that is not exactly the case. While the majority certainly uses a single value and a single print statement, there is plenty where this is not the case. Either because they use multiple values, or often also because they dynamically append to the string to print things like comma-separated flags.
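As a rough illustration of the "sysfs_emit with the buffer hidden inside an opaque structure" idea floated above: nothing like this exists in the kernel today, and the type and function names below are invented purely for the example.

```c
/* Invented for illustration -- not an existing kernel API. */
struct sysfs_out {
	char	*page;	/* one PAGE_SIZE buffer owned by the sysfs core */
	size_t	len;	/* bytes formatted so far */
};

static __printf(2, 3)
void sysfs_out_printf(struct sysfs_out *out, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	/* vscnprintf() never writes past the remaining space */
	out->len += vscnprintf(out->page + out->len,
			       PAGE_SIZE - out->len, fmt, args);
	va_end(args);
}
```

A show callback would then take a struct sysfs_out * instead of a raw char *, which gives the same bounds checking as sysfs_emit() without ever exposing the buffer pointer, while still allowing attributes that append multiple values or flag strings.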
On Tue, Sep 14, 2021 at 07:12:43AM +0200, Greg Kroah-Hartman wrote:
> On Tue, Sep 14, 2021 at 11:20:29AM +1000, Dave Chinner wrote:
> > On Mon, Sep 13, 2021 at 08:27:50AM +0200, Greg Kroah-Hartman wrote:
> > > On Mon, Sep 13, 2021 at 07:41:21AM +0200, Christoph Hellwig wrote:
> > > That is a sysfs file?  What happened to the "one value per file" rule
> > > here?
> >
> > There is no "rule" that says sysfs files must contain one value per
> > file; the documentation says that one value per file is the
> > "preferred" format. Documentation/filesystems/sysfs.rst:
> >
> > [...]
> > Attributes
> > ...
> > Attributes should be ASCII text files, preferably with only one value
> > per file. It is noted that it may not be efficient to contain only one
> > value per file, so it is socially acceptable to express an array of
> > values of the same type.
> > [...]
>
> An array of values is one thing, like "what are the power states for
> this device".  A list of different key/value pairs is a totally
> different thing entirely.

Sure, that's your opinion. Nothing more, nothing less. I could easily argue that "what power states does this thing have" is a list and not an array, but this would be just another opinion and completely beside the point of the discussion.

> > We are exposing a large array of integer values here, so multiple
> > values per file are explicitly considered an acceptable format.
>
> Not really, that was not the goal of sysfs at all.

Sure. We all know this and we follow the guidelines in most cases. Indeed, every other sysfs file that XFS exposes is a value-per-file object. This makes sense for most things we expose through sysfs. Not everything maps to value-per-file cleanly or efficiently, and the documentation acknowledges that there are alternative use cases for sysfs files like this.

Just because you don't like it doesn't mean there aren't cases where multiple values per file is the best thing to do. Blind obedience to rules almost always results in poor outcomes.

> > Further, as there are roughly 200 individual stats in this file and
> > calculating each stat requires per-cpu aggregation, the cost of
> > calculating and reading each stat individually is prohibitive, not
> > just inefficient.
>
> Have you measured it?  How often does the file get read and by what
> tools?

Of course I've measured it. I sample and graph hundreds of stats in realtime at 10-20Hz across multiple machines as part of my daily development routine with PCP. I see the overhead of reading stats out of the kernel in kernel profiles and dropped samples in recorded archives all the time.

For example, reading the global XFS stats at 20Hz via pmcd/pmdaxfs during testing on a 32p machine results in xfs_stats_format() consuming about 1% of a CPU. From the flat perf top profile:

   0.93  [kernel]  [k] xfs_stats_format

That's typical high frequency sampling overhead of a single filesystem being actively used.

Note that micro-benchmarks give highly unrealistic results. If I pin a cpu in a tight loop doing something like this:

	for (i = 0; i < cnt; i++) {
		fd = open("/sys/fs/xfs/stats/stats", O_RDONLY);
		read(fd, buf, 4096);
		close(fd);
	}

it gives a number of 20,000 reads a second, with 90% of the CPU usage being in xfs_stats_format(). This is unrealistic because it is hitting hot, clean caches and doing absolutely nothing with the data that is returned.
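(For reference, a self-contained version of that loop might look like the following; the iteration count is arbitrary, there is no timing built in, and it is only meant to reproduce the "open/read/close in a tight loop" pattern described above, e.g. under time(1) or perf.)

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	long i, cnt = 1000000;	/* arbitrary; time the run externally */
	int fd;

	for (i = 0; i < cnt; i++) {
		fd = open("/sys/fs/xfs/stats/stats", O_RDONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		read(fd, buf, sizeof(buf));	/* discard the data, as in the loop above */
		close(fd);
	}
	return 0;
}
```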
Userspace needs to do work with the data, and that is at least as much CPU overhead as formatting the stats: turning them back into proper integer values and dumping them into a shared memory segment for another process to collate before it can read more data from the kernel (the underlying PCP userspace architecture).

So, realistically, sampling from 5,000 XFS stats files per CPU per second from userspace on a 32p machine is about the max rate we can reliably sustain on a single CPU. The biggest cost is the read() cost (open/close is less than 10% of the cost of the above loop), and the flat profile shows:

  27.47%  [k] xfs_stats_format
  16.11%  [k] _find_next_bit
  11.11%  [k] cpumask_next
   4.36%  [k] vsnprintf
   4.07%  [k] format_decode
   3.99%  [k] number

70% of the "cpu usage" of xfs_stats_format() is the L1 cacheline miss in the for_each_possible_cpu() loop reading the percpu stats values. That's despite the cachelines being clean and likely hot in other larger local CPU caches. The percpu loop also accounts for the cpumask_next() CPU usage, and the _find_next_bit CPU usage comes from cpumask_next() calling it.

IOWs, about half of the kernel CPU usage reading the XFS stats in a tight loop on a 32p machine comes from the per-cpu aggregation overhead. That aggregation overhead will be the overall limiting factor for XFS stats sampling.

With 100 filesystems we could, in theory, sample them all at about 50Hz if we're doing nothing else with that CPU. But the data set size we are reading has gone up by a factor of 100, and the stats data is not going to be hot in CPU caches anymore (neither in the kernel nor in userspace). Per-cpu aggregation will also hit dirty remote cachelines on active systems, so it's *much* slower than the above loop would indicate. The original profile indicates maybe 20Hz is achievable, not 50Hz.

With this in mind, we then have to consider the overhead on machines with thousands of CPUs rather than a few tens of CPUs. The cost of per-cpu aggregation on those machines is a couple of magnitudes higher than on these machines (aggregation overhead increases linearly with CPU count), so if we are burning the same amount of CPU time doing aggregation, then the number of times we can aggregate the stats goes down by a couple of orders of magnitude. Those are the behavioural characteristics that we have to design for, not what my tiny little 32p system can do.

IOWs, on a system with a hundred filesystems and a thousand CPUs, the existing "all stats in one file" setup can currently manage (roughly) a 1Hz sampling rate on all filesystems using a single CPU.

Now, let's put that in a value-per-file setup. This means we have to read 200 individual files per filesystem to build the entire sample set. This means we do 200 per-cpu aggregations instead of 1 for this sample set. Given that the majority of the overhead is the per-cpu aggregation, the filesystem stats set takes 100-200x longer to sample than an "all in one file" setup.

IOWs, a sample set for a *single* filesystem on a 1000 CPU machine now takes 1-2s to complete. At 100 filesystems, we now take 100-200s of CPU time to sample all the filesystem stats once. That's a massive amount of extra overhead, and simply not acceptable nor necessary.

To compound the regression caused by moving to a value per file, we then have to spend another whole chunk of engineering work to mitigate it.
We'd have to completely rewrite the stats aggregation code, and we'd probably end up losing high frequency aggregation capabilities as a result. This is a lose-lose-lose-lose scenario, and no one in their right mind would consider it a viable way forward.

The "efficiency exception" in the documentation was intended for exactly this sort of situation. So, really, it doesn't matter whether you like it or not; it's the only viable solution we currently have.

> We have learned from our past mistakes in /proc where we did this in the
> past and required keeping obsolete values and constantly tweaking
> userspace parsers.  That is why we made sysfs one-value-per-file.  If
> the file is not there, the value is not there, much easier to handle
> future changes.

We've had rules for extending the xfs stats file to avoid breaking the userspace parsers for a couple of decades, too. There are some advantages to value-per-file when it comes to deprecation, but really that's not a problem we need to solve for the XFS stats.

> > So, yes, we might have multiple lines in the file that you can frown
> > about, but OTOH the file format has been exposed as a kernel ABI for
> > a couple of decades via /proc/fs/xfs/stat.
>
> proc had no such rules, but we have learned :)

And so now you know better than everyone else. :/

> > Hence exposing it in sysfs to provide a more fine-grained breakdown
> > of the stats (per mount instead of global) is a no-brainer. We don't
> > have to rewrite the parsing engines in multiple userspace monitoring
> > programs to extract this information from the kernel - they just
> > create a new instance and read a different file and it all just works.
>
> But then you run into the max size restriction on sysfs files
> (PAGE_SIZE) and things break down.

Yup, but we're nowhere near the max size restriction. Output currently requires 12 bytes per stat maximum, so we're still good to add another 100-150 or so stats before we have to worry about PAGE_SIZE limits...

> Please don't do this.

Please don't take us for idiots - you know full well that the ABI horse bolted long ago and we don't break userspace like this.

If you've got a more efficient generic way of exporting large volumes of dynamically instanced stats arrays at high frequency than what we do right now, then I'm all ears. But if all you are going to say is "I say you can't do that" then you're not being helpful or constructive.

> > Indeed, there's precedent for such /proc file formats in more
> > fine-grained sysfs files. e.g. /sys/bus/node/devices/node<n>/vmstat
> > and /sys/bus/node/devices/node<n>/meminfo retain the same format
> > (and hence userspace parsers) for the per-node stats as /proc/vmstat
> > and /proc/meminfo use for the global stats...
>
> And I have complained about those files in the past many times.  And
> they are running into problems in places dealing with them too.

So come up with a generic, scalable, low overhead solution to the stats export problem, convert all the kernel code and userspace applications to use it, address all the regressions on large CPU count machines it causes, start a long term deprecation period for the existing stats files (because it's part of the kernel ABI), and then maybe in 10 or 15 years time we can get rid of this stuff.
> > tl;dr: the file contains arrays of values, it's inefficient to read
> > values one at a time, it's a pre-existing ABI-constrained file
> > format, there's precedent in core kernel statistics
> > implementations and the documented guidelines allow this sort of
> > usage in these cases.
>
> I would prefer not to do this, and I will not take core sysfs changes to
> make this any easier.
>
> Which is one big reason why I don't like just making sysfs use the seq
> file api, it would allow stuff like this to propagate to other places in
> the kernel.
>
> Maybe I should cut the file size of a sysfs file down to PAGE_SIZE/4 or
> less, that might be better :)

Yeah, good on yah, Greg. That'll show everyone who's boss, won't it?

-Dave.
On Tue, Sep 14, 2021 at 09:30:03AM +0200, Christoph Hellwig wrote:
> On Mon, Sep 13, 2021 at 08:27:50AM +0200, Greg Kroah-Hartman wrote:
> > Anyway, I like the idea, but as you can see here, it could lead to even
> > more abuse of sysfs files.  We are just now getting people to use
> > sysfs_emit() and that is showing us where people have been abusing the
> > api in bad ways.
>
> To be honest I've always seen sysfs_emit as at best a horrible band aid
> to enforce the PAGE_SIZE bounds checking.  Better than nothing, but
> not a solution at all, as you can't force anyone to actually use it.

We can "force" it by not allowing buffers to be bigger than that, which is what the code has always done. I think we want to keep that for now and not add the new seq_show api.

I've taken patches 2-6 in this series now, as those were great sysfs and kernfs cleanups, thanks for those. I can also take patch 1 if no one objects (I can edit the typo.)

I agree that getting rid of seq_get_buf() is good, and can work on getting rid of the use here in sysfs if it's annoying people.

thanks,

greg k-h
On Tue, Sep 14, 2021 at 05:28:08PM +0200, Greg Kroah-Hartman wrote:
> We can "force" it by not allowing buffers to be bigger than that, which
> is what the code has always done.  I think we want to keep that for now
> and not add the new seq_show api.

The buffer already is not larger than that. The problem is that sysfs_emit does not actually work for the non-trivial attributes, which generally are the source of bugs.
On Tue, Sep 14, 2021 at 05:30:11PM +0200, Christoph Hellwig wrote:
> On Tue, Sep 14, 2021 at 05:28:08PM +0200, Greg Kroah-Hartman wrote:
> > We can "force" it by not allowing buffers to be bigger than that, which
> > is what the code has always done.  I think we want to keep that for now
> > and not add the new seq_show api.
>
> The buffer already is not larger than that.  The problem is that
> sysfs_emit does not actually work for the non-trivial attributes,
> which generally are the source of bugs.

The huge majority of sysfs attributes are "trivial". So for maybe at least 95% of the users, if not more, using sysfs_emit() is just fine as all you "should" be doing is emitting a single value.

For those that are non-trivial, yes, that will be harder, but as the xfs discussion shows, those are not normal at all, and I do not want to make creating them easier as that is not the model that sysfs was designed for, if at all possible.

thanks,

greg k-h
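For contrast with the multi-line XFS stats file, the kind of "trivial" attribute being described here typically looks like the sketch below: one value, one sysfs_emit() call. The struct and attribute names are made up for the example.

```c
/* Made-up single-value attribute; only the sysfs_emit() pattern matters. */
struct example_data {
	int	threshold;
};

static ssize_t threshold_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct example_data *data = dev_get_drvdata(dev);

	/* sysfs_emit() bounds the output to the single PAGE_SIZE buffer */
	return sysfs_emit(buf, "%d\n", data->threshold);
}
static DEVICE_ATTR_RO(threshold);
```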
On Tue, Sep 14, 2021 at 05:41:37PM +0200, Greg Kroah-Hartman wrote:
> The huge majority of sysfs attributes are "trivial".  So for maybe at
> least 95% of the users, if not more, using sysfs_emit() is just fine as
> all you "should" be doing is emitting a single value.

It is just fine if no one makes the obvious mistakes that an interface with a char * pointer leads to. And 5% of all attributes is still a huge attack surface.
On Wed, Sep 15, 2021 at 09:04:45AM +0200, Christoph Hellwig wrote:
> On Tue, Sep 14, 2021 at 05:41:37PM +0200, Greg Kroah-Hartman wrote:
> > The huge majority of sysfs attributes are "trivial".  So for maybe at
> > least 95% of the users, if not more, using sysfs_emit() is just fine as
> > all you "should" be doing is emitting a single value.
>
> It is just fine if no one makes the obvious mistakes that an interface
> with a char * pointer leads to.  And 5% of all attributes is still a huge
> attack surface.

It is probably less, I just pulled that number out of the air. With the other work we are doing to make sure we have documentation for all sysfs attributes in the kernel, we will soon know the real number.

thanks,

greg k-h
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 20e0534a772c9..71e7a84ba0403 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -16,10 +16,9 @@ static int counter_val(struct xfsstats __percpu *stats, int idx)
 	return val;
 }
 
-int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
+void xfs_stats_format(struct xfsstats __percpu *stats, struct seq_file *sf)
 {
 	int		i, j;
-	int		len = 0;
 	uint64_t	xs_xstrat_bytes = 0;
 	uint64_t	xs_write_bytes = 0;
 	uint64_t	xs_read_bytes = 0;
@@ -58,13 +57,12 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
 	/* Loop over all stats groups */
 
 	for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
-		len += scnprintf(buf + len, PATH_MAX - len, "%s",
-				xstats[i].desc);
+		seq_printf(sf, "%s", xstats[i].desc);
+
 		/* inner loop does each group */
 		for (; j < xstats[i].endpoint; j++)
-			len += scnprintf(buf + len, PATH_MAX - len, " %u",
-					counter_val(stats, j));
-		len += scnprintf(buf + len, PATH_MAX - len, "\n");
+			seq_printf(sf, " %u", counter_val(stats, j));
+		seq_printf(sf, "\n");
 	}
 	/* extra precision counters */
 	for_each_possible_cpu(i) {
@@ -74,18 +72,14 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
 		defer_relog += per_cpu_ptr(stats, i)->s.defer_relog;
 	}
 
-	len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
+	seq_printf(sf, "xpc %Lu %Lu %Lu\n",
 			xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
-	len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n",
-			defer_relog);
-	len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n",
+	seq_printf(sf, "defer_relog %llu\n", defer_relog);
 #if defined(DEBUG)
-		1);
+	seq_printf(sf, "debug 1\n");
 #else
-		0);
+	seq_printf(sf, "debug 0\n");
 #endif
-
-	return len;
 }
 
 void xfs_stats_clearall(struct xfsstats __percpu *stats)
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 43ffba74f045e..6bf80565893cf 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -156,7 +156,7 @@ struct xfsstats {
 	(offsetof(struct __xfsstats, member) / (int)sizeof(uint32_t))
 
-int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
+void xfs_stats_format(struct xfsstats __percpu *stats, struct seq_file *sf);
 void xfs_stats_clearall(struct xfsstats __percpu *stats);
 extern struct xstats xfsstats;
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 18dc5eca6c045..efa2aa07f1865 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -16,7 +16,7 @@ struct xfs_sysfs_attr {
 	struct attribute	attr;
-	ssize_t			(*show)(struct kobject *kobject, char *buf);
+	void			(*show)(struct kobject *kobject, struct seq_file *sf);
 	ssize_t			(*store)(struct kobject *kobject, const char *buf,
 					 size_t count);
 };
@@ -36,15 +36,17 @@ to_attr(struct attribute *attr)
 
 #define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
 
-STATIC ssize_t
-xfs_sysfs_object_show(
+STATIC int
+xfs_sysfs_object_seq_show(
 	struct kobject		*kobject,
 	struct attribute	*attr,
-	char			*buf)
+	struct seq_file		*sf)
 {
 	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
 
-	return xfs_attr->show ? xfs_attr->show(kobject, buf) : 0;
+	if (xfs_attr->show)
+		xfs_attr->show(kobject, sf);
+	return 0;
 }
 
 STATIC ssize_t
@@ -60,8 +62,8 @@ xfs_sysfs_object_store(
 }
 
 static const struct sysfs_ops xfs_sysfs_ops = {
-	.show = xfs_sysfs_object_show,
-	.store = xfs_sysfs_object_store,
+	.seq_show = xfs_sysfs_object_seq_show,
+	.store = xfs_sysfs_object_store,
 };
 
 static struct attribute *xfs_mp_attrs[] = {
@@ -100,12 +102,12 @@ bug_on_assert_store(
 	return count;
 }
 
-STATIC ssize_t
+STATIC void
 bug_on_assert_show(
 	struct kobject		*kobject,
-	char			*buf)
+	struct seq_file		*sf)
 {
-	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bug_on_assert ? 1 : 0);
+	seq_printf(sf, "%d\n", xfs_globals.bug_on_assert ? 1 : 0);
 }
 XFS_SYSFS_ATTR_RW(bug_on_assert);
 
@@ -130,12 +132,12 @@ log_recovery_delay_store(
 	return count;
 }
 
-STATIC ssize_t
+STATIC void
 log_recovery_delay_show(
 	struct kobject		*kobject,
-	char			*buf)
+	struct seq_file		*sf)
 {
-	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
+	seq_printf(sf, "%d\n", xfs_globals.log_recovery_delay);
 }
 XFS_SYSFS_ATTR_RW(log_recovery_delay);
 
@@ -160,12 +162,12 @@ mount_delay_store(
 	return count;
 }
 
-STATIC ssize_t
+STATIC void
 mount_delay_show(
 	struct kobject		*kobject,
-	char			*buf)
+	struct seq_file		*sf)
 {
-	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.mount_delay);
+	seq_printf(sf, "%d\n", xfs_globals.mount_delay);
 }
 XFS_SYSFS_ATTR_RW(mount_delay);
 
@@ -183,12 +185,12 @@ always_cow_store(
 	return count;
 }
 
-static ssize_t
+static void
 always_cow_show(
 	struct kobject		*kobject,
-	char			*buf)
+	struct seq_file		*sf)
 {
-	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+	seq_printf(sf, "%d\n", xfs_globals.always_cow);
 }
 XFS_SYSFS_ATTR_RW(always_cow);
 
@@ -219,12 +221,12 @@ pwork_threads_store(
 	return count;
 }
 
-STATIC ssize_t
+STATIC void
 pwork_threads_show(
 	struct kobject		*kobject,
-	char			*buf)
+	struct seq_file		*sf)
 {
-	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads);
+	seq_printf(sf, "%d\n", xfs_globals.pwork_threads);
 }
 XFS_SYSFS_ATTR_RW(pwork_threads);
 #endif /* DEBUG */
@@ -258,14 +260,12 @@ to_xstats(struct kobject *kobject)
 	return container_of(kobj, struct xstats, xs_kobj);
 }
 
-STATIC ssize_t
+STATIC void
 stats_show(
 	struct kobject		*kobject,
-	char			*buf)
+	struct seq_file		*sf)
 {
-	struct xstats	*stats = to_xstats(kobject);
-
-	return xfs_stats_format(stats->xs_stats, buf);
+	return xfs_stats_format(to_xstats(kobject)->xs_stats, sf);
 }
 XFS_SYSFS_ATTR_RO(stats);
 
@@ -313,10 +313,10 @@ to_xlog(struct kobject *kobject)
 	return container_of(kobj, struct xlog, l_kobj);
 }
 
-STATIC ssize_t
+STATIC void
 log_head_lsn_show(
 	struct kobject		*kobject,
-	char			*buf)
+	struct seq_file		*sf)
 {
 	int cycle;
 	int block;
@@ -327,28 +327,28 @@ log_head_lsn_show(
 	block = log->l_curr_block;
 	spin_unlock(&log->l_icloglock);
 
-	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+	seq_printf(sf, "%d:%d\n", cycle, block);
 }
 XFS_SYSFS_ATTR_RO(log_head_lsn);
 
-STATIC ssize_t
+STATIC void
 log_tail_lsn_show(
 	struct kobject		*kobject,
-	char			*buf)
+	struct seq_file		*sf)
 {
 	int cycle;
 	int block;
 	struct xlog *log = to_xlog(kobject);
 
 	xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
-	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+	seq_printf(sf, "%d:%d\n", cycle, block);
 }
 XFS_SYSFS_ATTR_RO(log_tail_lsn);
 
-STATIC ssize_t
+STATIC void
 reserve_grant_head_show(
 	struct kobject		*kobject,
-	char			*buf)
+	struct seq_file		*sf)
 {
 	int cycle;
@@ -356,21 +356,21 @@ reserve_grant_head_show(
 	struct xlog *log = to_xlog(kobject);
 
 	xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
-	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+	seq_printf(sf, "%d:%d\n", cycle, bytes);
 }
 XFS_SYSFS_ATTR_RO(reserve_grant_head);
 
-STATIC ssize_t
+STATIC void
 write_grant_head_show(
 	struct kobject		*kobject,
-	char			*buf)
+	struct seq_file		*sf)
 {
 	int cycle;
 	int bytes;
 	struct xlog *log = to_xlog(kobject);
 
 	xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
-	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+	seq_printf(sf, "%d:%d\n", cycle, bytes);
 }
 XFS_SYSFS_ATTR_RO(write_grant_head);
 
@@ -412,10 +412,10 @@ err_to_mp(struct kobject *kobject)
 	return container_of(kobj, struct xfs_mount, m_error_kobj);
 }
 
-static ssize_t
+static void
 max_retries_show(
 	struct kobject		*kobject,
-	char			*buf)
+	struct seq_file		*sf)
 {
 	int		retries;
 	struct xfs_error_cfg *cfg = to_error_cfg(kobject);
@@ -425,7 +425,7 @@ max_retries_show(
 	else
 		retries = cfg->max_retries;
 
-	return snprintf(buf, PAGE_SIZE, "%d\n", retries);
+	seq_printf(sf, "%d\n", retries);
 }
 
 static ssize_t
@@ -453,10 +453,10 @@ max_retries_store(
 }
 XFS_SYSFS_ATTR_RW(max_retries);
 
-static ssize_t
+static void
 retry_timeout_seconds_show(
 	struct kobject		*kobject,
-	char			*buf)
+	struct seq_file		*sf)
 {
 	int		timeout;
 	struct xfs_error_cfg *cfg = to_error_cfg(kobject);
@@ -466,7 +466,7 @@ retry_timeout_seconds_show(
 	else
 		timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC;
 
-	return snprintf(buf, PAGE_SIZE, "%d\n", timeout);
+	seq_printf(sf, "%d\n", timeout);
 }
 
 static ssize_t
@@ -497,14 +497,14 @@ retry_timeout_seconds_store(
 }
 XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
 
-static ssize_t
+static void
 fail_at_unmount_show(
 	struct kobject		*kobject,
-	char			*buf)
+	struct seq_file		*sf)
 {
 	struct xfs_mount	*mp = err_to_mp(kobject);
 
-	return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_unmount);
+	seq_printf(sf, "%d\n", mp->m_fail_unmount);
 }
 
 static ssize_t
Trivial conversion to the seq_file based sysfs attributes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_stats.c | 24 +++++-------
 fs/xfs/xfs_stats.h |  2 +-
 fs/xfs/xfs_sysfs.c | 96 +++++++++++++++++++++++-----------------------
 3 files changed, 58 insertions(+), 64 deletions(-)