diff mbox series

[10/10] powerpc: remove address space overrides using set_fs()

Message ID 20200827150030.282762-11-hch@lst.de (mailing list archive)
State New, archived
Headers show
Series [01/10] fs: don't allow kernel reads and writes without iter ops | expand

Commit Message

Christoph Hellwig Aug. 27, 2020, 3 p.m. UTC
Stop providing the possibility to override the address space using
set_fs() now that there is no need for that any more.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/powerpc/Kconfig                   |  1 -
 arch/powerpc/include/asm/processor.h   |  7 ---
 arch/powerpc/include/asm/thread_info.h |  5 +--
 arch/powerpc/include/asm/uaccess.h     | 62 ++++++++------------------
 arch/powerpc/kernel/signal.c           |  3 --
 arch/powerpc/lib/sstep.c               |  6 +--
 6 files changed, 22 insertions(+), 62 deletions(-)

Comments

Christophe Leroy Sept. 2, 2020, 6:15 a.m. UTC | #1
Le 27/08/2020 à 17:00, Christoph Hellwig a écrit :
> Stop providing the possibility to override the address space using
> set_fs() now that there is no need for that any more.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   arch/powerpc/Kconfig                   |  1 -
>   arch/powerpc/include/asm/processor.h   |  7 ---
>   arch/powerpc/include/asm/thread_info.h |  5 +--
>   arch/powerpc/include/asm/uaccess.h     | 62 ++++++++------------------
>   arch/powerpc/kernel/signal.c           |  3 --
>   arch/powerpc/lib/sstep.c               |  6 +--
>   6 files changed, 22 insertions(+), 62 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
> index 7fe3531ad36a77..39727537d39701 100644
> --- a/arch/powerpc/include/asm/uaccess.h
> +++ b/arch/powerpc/include/asm/uaccess.h
> @@ -8,62 +8,36 @@
>   #include <asm/extable.h>
>   #include <asm/kup.h>
>   
> -/*
> - * The fs value determines whether argument validity checking should be
> - * performed or not.  If get_fs() == USER_DS, checking is performed, with
> - * get_fs() == KERNEL_DS, checking is bypassed.
> - *
> - * For historical reasons, these macros are grossly misnamed.
> - *
> - * The fs/ds values are now the highest legal address in the "segment".
> - * This simplifies the checking in the routines below.
> - */
> -
> -#define MAKE_MM_SEG(s)  ((mm_segment_t) { (s) })
> -
> -#define KERNEL_DS	MAKE_MM_SEG(~0UL)
>   #ifdef __powerpc64__
>   /* We use TASK_SIZE_USER64 as TASK_SIZE is not constant */
> -#define USER_DS		MAKE_MM_SEG(TASK_SIZE_USER64 - 1)
> -#else
> -#define USER_DS		MAKE_MM_SEG(TASK_SIZE - 1)
> -#endif
> -
> -#define get_fs()	(current->thread.addr_limit)
> +#define TASK_SIZE_MAX		TASK_SIZE_USER64
>   
> -static inline void set_fs(mm_segment_t fs)
> +static inline bool __access_ok(unsigned long addr, unsigned long size)
>   {
> -	current->thread.addr_limit = fs;
> -	/* On user-mode return check addr_limit (fs) is correct */
> -	set_thread_flag(TIF_FSCHECK);
> +	if (addr >= TASK_SIZE_MAX)
> +		return false;
> +	/*
> +	 * This check is sufficient because there is a large enough gap between
> +	 * user addresses and the kernel addresses.
> +	 */
> +	return size <= TASK_SIZE_MAX;
>   }
> -
> -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg)
> -#define user_addr_max()	(get_fs().seg)
> -
> -#ifdef __powerpc64__
> -/*
> - * This check is sufficient because there is a large enough
> - * gap between user addresses and the kernel addresses
> - */
> -#define __access_ok(addr, size, segment)	\
> -	(((addr) <= (segment).seg) && ((size) <= (segment).seg))
> -
>   #else
> +#define TASK_SIZE_MAX		TASK_SIZE
>   
> -static inline int __access_ok(unsigned long addr, unsigned long size,
> -			mm_segment_t seg)
> +static inline bool __access_ok(unsigned long addr, unsigned long size)
>   {
> -	if (addr > seg.seg)
> -		return 0;
> -	return (size == 0 || size - 1 <= seg.seg - addr);
> +	if (addr >= TASK_SIZE_MAX)
> +		return false;
> +	if (size == 0)
> +		return false;

__access_ok() was returning true when size == 0 up to now. Any reason to 
return false now ?

> +	return size <= TASK_SIZE_MAX - addr;
>   }
> -
> -#endif
> +#endif /* __powerpc64__ */
>   
>   #define access_ok(addr, size)		\
>   	(__chk_user_ptr(addr),		\
> -	 __access_ok((__force unsigned long)(addr), (size), get_fs()))
> +	 __access_ok((unsigned long)(addr), (size)))
>   
>   /*
>    * These are the main single-value transfer routines.  They automatically

Christophe
Christoph Hellwig Sept. 2, 2020, 12:36 p.m. UTC | #2
On Wed, Sep 02, 2020 at 08:15:12AM +0200, Christophe Leroy wrote:
>> -		return 0;
>> -	return (size == 0 || size - 1 <= seg.seg - addr);
>> +	if (addr >= TASK_SIZE_MAX)
>> +		return false;
>> +	if (size == 0)
>> +		return false;
>
> __access_ok() was returning true when size == 0 up to now. Any reason to 
> return false now ?

No, this is accidental and broken.  Can you re-run your benchmark with
this fixed?
David Laight Sept. 2, 2020, 1:13 p.m. UTC | #3
From: Christoph Hellwig
> Sent: 02 September 2020 13:37
> 
> On Wed, Sep 02, 2020 at 08:15:12AM +0200, Christophe Leroy wrote:
> >> -		return 0;
> >> -	return (size == 0 || size - 1 <= seg.seg - addr);
> >> +	if (addr >= TASK_SIZE_MAX)
> >> +		return false;
> >> +	if (size == 0)
> >> +		return false;
> >
> > __access_ok() was returning true when size == 0 up to now. Any reason to
> > return false now ?
> 
> No, this is accidental and broken.  Can you re-run your benchmark with
> this fixed?

Is TASK_SIZE_MASK defined such that you can do:

	return (addr | size) < TASK_SIZE_MAX) || !size;

    David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Christophe Leroy Sept. 2, 2020, 1:24 p.m. UTC | #4
Le 02/09/2020 à 15:13, David Laight a écrit :
> From: Christoph Hellwig
>> Sent: 02 September 2020 13:37
>>
>> On Wed, Sep 02, 2020 at 08:15:12AM +0200, Christophe Leroy wrote:
>>>> -		return 0;
>>>> -	return (size == 0 || size - 1 <= seg.seg - addr);
>>>> +	if (addr >= TASK_SIZE_MAX)
>>>> +		return false;
>>>> +	if (size == 0)
>>>> +		return false;
>>>
>>> __access_ok() was returning true when size == 0 up to now. Any reason to
>>> return false now ?
>>
>> No, this is accidental and broken.  Can you re-run your benchmark with
>> this fixed?
> 
> Is TASK_SIZE_MASK defined such that you can do:
> 
> 	return (addr | size) < TASK_SIZE_MAX) || !size;

TASK_SIZE_MAX will usually be 0xc0000000

With:
addr = 0x80000000;
size = 0x80000000;

I expect it to fail ....

With the formula you propose it will succeed, won't it ?

Christophe
David Laight Sept. 2, 2020, 1:51 p.m. UTC | #5
From: Christophe Leroy
> Sent: 02 September 2020 14:25
> Le 02/09/2020 à 15:13, David Laight a écrit :
> > From: Christoph Hellwig
> >> Sent: 02 September 2020 13:37
> >>
> >> On Wed, Sep 02, 2020 at 08:15:12AM +0200, Christophe Leroy wrote:
> >>>> -		return 0;
> >>>> -	return (size == 0 || size - 1 <= seg.seg - addr);
> >>>> +	if (addr >= TASK_SIZE_MAX)
> >>>> +		return false;
> >>>> +	if (size == 0)
> >>>> +		return false;
> >>>
> >>> __access_ok() was returning true when size == 0 up to now. Any reason to
> >>> return false now ?
> >>
> >> No, this is accidental and broken.  Can you re-run your benchmark with
> >> this fixed?
> >
> > Is TASK_SIZE_MASK defined such that you can do:
> >
> > 	return (addr | size) < TASK_SIZE_MAX) || !size;
> 
> TASK_SIZE_MAX will usually be 0xc0000000
> 
> With:
> addr = 0x80000000;
> size = 0x80000000;
> 
> I expect it to fail ....
> 
> With the formula you propose it will succeed, won't it ?

Hmmm... Was i getting confused about some comments for 64bit
about there being such a big hole between valid user and kernel
addresses that it was enough to check that 'size < TASK_SIZE_MAX'.

That would be true for 64bit x86 (and probably ppc (& arm??))
if TASK_SIZE_MAX were 0x4 << 60.
IIUC the highest user address is (much) less than 0x0 << 60
and the lowest kernel address (much) greater than 0xf << 60
on all these 64bit platforms.

Actually if doing access_ok() inside get_user() you don't
need to check the size at all.
You don't even need to in copy_to/from_user() provided
it always does a forwards copy.
(Rather that copying the last word first for misaligned lengths.)

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Christophe Leroy Sept. 2, 2020, 2:12 p.m. UTC | #6
Le 02/09/2020 à 15:51, David Laight a écrit :
> From: Christophe Leroy
>> Sent: 02 September 2020 14:25
>> Le 02/09/2020 à 15:13, David Laight a écrit :
>>> From: Christoph Hellwig
>>>> Sent: 02 September 2020 13:37
>>>>
>>>> On Wed, Sep 02, 2020 at 08:15:12AM +0200, Christophe Leroy wrote:
>>>>>> -		return 0;
>>>>>> -	return (size == 0 || size - 1 <= seg.seg - addr);
>>>>>> +	if (addr >= TASK_SIZE_MAX)
>>>>>> +		return false;
>>>>>> +	if (size == 0)
>>>>>> +		return false;
>>>>>
>>>>> __access_ok() was returning true when size == 0 up to now. Any reason to
>>>>> return false now ?
>>>>
>>>> No, this is accidental and broken.  Can you re-run your benchmark with
>>>> this fixed?
>>>
>>> Is TASK_SIZE_MASK defined such that you can do:
>>>
>>> 	return (addr | size) < TASK_SIZE_MAX) || !size;
>>
>> TASK_SIZE_MAX will usually be 0xc0000000
>>
>> With:
>> addr = 0x80000000;
>> size = 0x80000000;
>>
>> I expect it to fail ....
>>
>> With the formula you propose it will succeed, won't it ?
> 
> Hmmm... Was i getting confused about some comments for 64bit
> about there being such a big hole between valid user and kernel
> addresses that it was enough to check that 'size < TASK_SIZE_MAX'.
> 
> That would be true for 64bit x86 (and probably ppc (& arm??))
> if TASK_SIZE_MAX were 0x4 << 60.
> IIUC the highest user address is (much) less than 0x0 << 60
> and the lowest kernel address (much) greater than 0xf << 60
> on all these 64bit platforms.
> 
> Actually if doing access_ok() inside get_user() you don't
> need to check the size at all.

You mean on 64 bit or on any platform ?

What about a word write to 0xbffffffe, won't it overwrite 0xc0000000 ?

> You don't even need to in copy_to/from_user() provided
> it always does a forwards copy.

Do you mean due to the gap ?
Is it garantied to be a gap ? Even on a 32 bits having TASK_SIZE set to 
0xc0000000 and PAGE_OFFSET set to the same ?


Christophe
David Laight Sept. 2, 2020, 3:02 p.m. UTC | #7
From: Christophe Leroy
> Sent: 02 September 2020 15:13
> 
> 
> Le 02/09/2020 à 15:51, David Laight a écrit :
> > From: Christophe Leroy
> >> Sent: 02 September 2020 14:25
> >> Le 02/09/2020 à 15:13, David Laight a écrit :
> >>> From: Christoph Hellwig
> >>>> Sent: 02 September 2020 13:37
> >>>>
> >>>> On Wed, Sep 02, 2020 at 08:15:12AM +0200, Christophe Leroy wrote:
> >>>>>> -		return 0;
> >>>>>> -	return (size == 0 || size - 1 <= seg.seg - addr);
> >>>>>> +	if (addr >= TASK_SIZE_MAX)
> >>>>>> +		return false;
> >>>>>> +	if (size == 0)
> >>>>>> +		return false;
> >>>>>
> >>>>> __access_ok() was returning true when size == 0 up to now. Any reason to
> >>>>> return false now ?
> >>>>
> >>>> No, this is accidental and broken.  Can you re-run your benchmark with
> >>>> this fixed?
> >>>
> >>> Is TASK_SIZE_MASK defined such that you can do:
> >>>
> >>> 	return (addr | size) < TASK_SIZE_MAX) || !size;
> >>
> >> TASK_SIZE_MAX will usually be 0xc0000000
> >>
> >> With:
> >> addr = 0x80000000;
> >> size = 0x80000000;
> >>
> >> I expect it to fail ....
> >>
> >> With the formula you propose it will succeed, won't it ?
> >
> > Hmmm... Was i getting confused about some comments for 64bit
> > about there being such a big hole between valid user and kernel
> > addresses that it was enough to check that 'size < TASK_SIZE_MAX'.
> >
> > That would be true for 64bit x86 (and probably ppc (& arm??))
> > if TASK_SIZE_MAX were 0x4 << 60.
> > IIUC the highest user address is (much) less than 0x0 << 60
> > and the lowest kernel address (much) greater than 0xf << 60
> > on all these 64bit platforms.
> >
> > Actually if doing access_ok() inside get_user() you don't
> > need to check the size at all.
> 
> You mean on 64 bit or on any platform ?

64bit and 32bit

> What about a word write to 0xbffffffe, won't it overwrite 0xc0000000 ?
> 
> > You don't even need to in copy_to/from_user() provided
> > it always does a forwards copy.
> 
> Do you mean due to the gap ?
> Is it garantied to be a gap ? Even on a 32 bits having TASK_SIZE set to
> 0xc0000000 and PAGE_OFFSET set to the same ?

I read somewhere (I won't find it again) that the last 4k page
(below 0xc0000000) must not be allocated on i386 because some
cpu (both intel and amd) do 'horrid things' if they try to
(IIRC) do instruction prefetches across the boundary.
So the accesses to 0xbffffffe will fault and the one to 0xc0000000
won't happen (in any useful way at least).

I'd suspect that not allocating the 3G-4k page would be a safe
bet on all architectures - even 68k.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Christophe Leroy Sept. 2, 2020, 3:17 p.m. UTC | #8
Le 02/09/2020 à 14:36, Christoph Hellwig a écrit :
> On Wed, Sep 02, 2020 at 08:15:12AM +0200, Christophe Leroy wrote:
>>> -		return 0;
>>> -	return (size == 0 || size - 1 <= seg.seg - addr);
>>> +	if (addr >= TASK_SIZE_MAX)
>>> +		return false;
>>> +	if (size == 0)
>>> +		return false;
>>
>> __access_ok() was returning true when size == 0 up to now. Any reason to
>> return false now ?
> 
> No, this is accidental and broken.  Can you re-run your benchmark with
> this fixed?
> 

With this fix, I get

root@vgoippro:~# time dd if=/dev/zero of=/dev/null count=1M
1048576+0 records in
1048576+0 records out
536870912 bytes (512.0MB) copied, 6.776327 seconds, 75.6MB/s
real    0m 6.78s
user    0m 1.64s
sys     0m 5.13s

That's still far from the 91.7MB/s I get with 5.9-rc2, but better than 
the 65.8MB/s I got yesterday with your series. Still some way to go thought.

Christophe
Linus Torvalds Sept. 2, 2020, 6:02 p.m. UTC | #9
On Wed, Sep 2, 2020 at 8:17 AM Christophe Leroy
<christophe.leroy@csgroup.eu> wrote:
>
>
> With this fix, I get
>
> root@vgoippro:~# time dd if=/dev/zero of=/dev/null count=1M
> 536870912 bytes (512.0MB) copied, 6.776327 seconds, 75.6MB/s
>
> That's still far from the 91.7MB/s I get with 5.9-rc2, but better than
> the 65.8MB/s I got yesterday with your series. Still some way to go thought.

I don't see why this change would make any difference.

And btw, why do the 32-bit and 64-bit checks even differ? It's not
like the extra (single) instruction should even matter. I think the
main reason is that the simpler 64-bit case could stay as a macro
(because it only uses "addr" and "size" once), but honestly, that
"simplification" doesn't help when you then need to have that #ifdef
for the 32-bit case and an inline function anyway.

So why isn't it just

  static inline int __access_ok(unsigned long addr, unsigned long size)
  { return addr <= TASK_SIZE_MAX && size <= TASK_SIZE_MAX-addr; }

for both and be done with it?

The "size=0" check is only relevant for the "addr == TASK_SIZE_MAX"
case, and existed in the old code because it had that "-1" thing
becasue "seg.seg" was actually TASK_SIZE-1.

Now that we don't have any TASK_SIZE-1, zero isn't special any more.

However, I suspect a bigger reason for the actual performance
degradation would be the patch that makes things use "write_iter()"
for writing, even when a simpler "write()" exists.

For writing to /dev/null, the cost of setting up iterators and all the
pointless indirection is all kinds of stupid.

So I think "write()" should just go back to default to using
"->write()" rather than "->write_iter()" if the simpler case exists.

                   Linus
Christoph Hellwig Sept. 3, 2020, 7:11 a.m. UTC | #10
On Wed, Sep 02, 2020 at 11:02:22AM -0700, Linus Torvalds wrote:
> I don't see why this change would make any difference.

Me neither, but while looking at a different project I did spot places
that actually do an access_ok with len 0, that's why I wanted him to
try.

That being said: Christophe are these number stables?  Do you get
similar numbers with multiple runs?

> And btw, why do the 32-bit and 64-bit checks even differ? It's not
> like the extra (single) instruction should even matter. I think the
> main reason is that the simpler 64-bit case could stay as a macro
> (because it only uses "addr" and "size" once), but honestly, that
> "simplification" doesn't help when you then need to have that #ifdef
> for the 32-bit case and an inline function anyway.

I'll have to leave that to the powerpc folks.  The intent was to not
change the behavior (and I even fucked that up for the the size == 0
case).

> However, I suspect a bigger reason for the actual performance
> degradation would be the patch that makes things use "write_iter()"
> for writing, even when a simpler "write()" exists.

Except that we do not actually have such a patch.  For normal user
writes we only use ->write_iter if ->write is not present.  But what
shows up in the profile is that /dev/zero only has a read_iter op and
not a normal read.  I've added a patch below that implements a normal
read which might help a tad with this workload, but should not be part
of a regression.

Also Christophe:  can you bisect which patch starts this?  Is it really
this last patch in the series?

---
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index abd4ffdc8cdebc..1dc99ab158457a 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -726,6 +726,27 @@ static ssize_t read_iter_zero(struct kiocb *iocb, struct iov_iter *iter)
 	return written;
 }
 
+static ssize_t read_zero(struct file *file, char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	size_t cleared = 0;
+
+	while (count) {
+		size_t chunk = min_t(size_t, count, PAGE_SIZE);
+
+		if (clear_user(buf + cleared, chunk))
+			return cleared ? cleared : -EFAULT;
+		cleared += chunk;
+		count -= chunk;
+
+		if (signal_pending(current))
+			return cleared ? cleared : -ERESTARTSYS;
+		cond_resched();
+	}
+
+	return cleared;
+}
+
 static int mmap_zero(struct file *file, struct vm_area_struct *vma)
 {
 #ifndef CONFIG_MMU
@@ -921,6 +942,7 @@ static const struct file_operations zero_fops = {
 	.llseek		= zero_lseek,
 	.write		= write_zero,
 	.read_iter	= read_iter_zero,
+	.read		= read_zero,
 	.write_iter	= write_iter_zero,
 	.mmap		= mmap_zero,
 	.get_unmapped_area = get_unmapped_area_zero,
Christophe Leroy Sept. 3, 2020, 7:20 a.m. UTC | #11
Le 02/09/2020 à 20:02, Linus Torvalds a écrit :
> On Wed, Sep 2, 2020 at 8:17 AM Christophe Leroy
> <christophe.leroy@csgroup.eu> wrote:
>>
>>
>> With this fix, I get
>>
>> root@vgoippro:~# time dd if=/dev/zero of=/dev/null count=1M
>> 536870912 bytes (512.0MB) copied, 6.776327 seconds, 75.6MB/s
>>
>> That's still far from the 91.7MB/s I get with 5.9-rc2, but better than
>> the 65.8MB/s I got yesterday with your series. Still some way to go thought.
> 
> I don't see why this change would make any difference.
> 

Neither do I.

Looks like nowadays, CONFIG_STACKPROTECTOR has become a default.
I rebuilt the kernel without it, I now get a throughput of 99.8MB/s both 
without and with this series.

Looking at the generated code (GCC 10.1), a small change in a function 
seems to make large changes in the generated code when 
CONFIG_STACKPROTECTOR is set.

In addition to that, trivial functions which don't use the stack at all 
get a stack frame anyway when CONFIG_STACKPROTECTOR is set, allthough 
that's only -fstack-protector-strong. And there is no canary check.

Without CONFIG_STACKPROTECTOR:

c01572a0 <no_llseek>:
c01572a0:	38 60 ff ff 	li      r3,-1
c01572a4:	38 80 ff e3 	li      r4,-29
c01572a8:	4e 80 00 20 	blr

With CONFIG_STACKPROTECTOR (regardless of CONFIG_STACKPROTECTOR_STRONG 
or not):

c0164e08 <no_llseek>:
c0164e08:	94 21 ff f0 	stwu    r1,-16(r1)
c0164e0c:	38 60 ff ff 	li      r3,-1
c0164e10:	38 80 ff e3 	li      r4,-29
c0164e14:	38 21 00 10 	addi    r1,r1,16
c0164e18:	4e 80 00 20 	blr

Wondering why CONFIG_STACKPROTECTOR has become the default. It seems to 
imply a 10% performance loss even in the best case (91.7MB/s versus 
99.8MB/s)

Note that without CONFIG_STACKPROTECTOR_STRONG, I'm at 99.3MB/s, so 
that's really the _STRONG alternative that hurts.

Christophe
Christophe Leroy Sept. 3, 2020, 7:27 a.m. UTC | #12
Le 03/09/2020 à 09:11, Christoph Hellwig a écrit :
> On Wed, Sep 02, 2020 at 11:02:22AM -0700, Linus Torvalds wrote:
>> I don't see why this change would make any difference.
> 
> Me neither, but while looking at a different project I did spot places
> that actually do an access_ok with len 0, that's why I wanted him to
> try.
> 
> That being said: Christophe are these number stables?  Do you get
> similar numbers with multiple runs?

Yes the numbers are similar with multiple runs and multiple reboots.

> 
>> And btw, why do the 32-bit and 64-bit checks even differ? It's not
>> like the extra (single) instruction should even matter. I think the
>> main reason is that the simpler 64-bit case could stay as a macro
>> (because it only uses "addr" and "size" once), but honestly, that
>> "simplification" doesn't help when you then need to have that #ifdef
>> for the 32-bit case and an inline function anyway.
> 
> I'll have to leave that to the powerpc folks.  The intent was to not
> change the behavior (and I even fucked that up for the the size == 0
> case).
> 
>> However, I suspect a bigger reason for the actual performance
>> degradation would be the patch that makes things use "write_iter()"
>> for writing, even when a simpler "write()" exists.
> 
> Except that we do not actually have such a patch.  For normal user
> writes we only use ->write_iter if ->write is not present.  But what
> shows up in the profile is that /dev/zero only has a read_iter op and
> not a normal read.  I've added a patch below that implements a normal
> read which might help a tad with this workload, but should not be part
> of a regression.
> 
> Also Christophe:  can you bisect which patch starts this?  Is it really
> this last patch in the series?

5.9-rc2: 91.5MB/s
Patch 1: 74.9MB/s
Patch 2: 97.9MB/s
Patch 3: 97.7MB/s
Patch 4 to 9: 97.9MB/s
Patch 10: 85.3MB/s
Patch 11: 75.4MB/s

See my other mail, when removing CONFIG_STACKPROTECTOR, I get a stable 
99.8MB/s throughput.

Christophe
Christophe Leroy Sept. 3, 2020, 8:55 a.m. UTC | #13
Le 03/09/2020 à 09:11, Christoph Hellwig a écrit :
> 
> Except that we do not actually have such a patch.  For normal user
> writes we only use ->write_iter if ->write is not present.  But what
> shows up in the profile is that /dev/zero only has a read_iter op and
> not a normal read.  I've added a patch below that implements a normal
> read which might help a tad with this workload, but should not be part
> of a regression.
> 

With that patch below, throughput is 113.5MB/s (instead of 99.9MB/s).
So a 14% improvement. That's not bad.

Christophe

> 
> ---
> diff --git a/drivers/char/mem.c b/drivers/char/mem.c
> index abd4ffdc8cdebc..1dc99ab158457a 100644
> --- a/drivers/char/mem.c
> +++ b/drivers/char/mem.c
> @@ -726,6 +726,27 @@ static ssize_t read_iter_zero(struct kiocb *iocb, struct iov_iter *iter)
>   	return written;
>   }
>   
> +static ssize_t read_zero(struct file *file, char __user *buf,
> +			 size_t count, loff_t *ppos)
> +{
> +	size_t cleared = 0;
> +
> +	while (count) {
> +		size_t chunk = min_t(size_t, count, PAGE_SIZE);
> +
> +		if (clear_user(buf + cleared, chunk))
> +			return cleared ? cleared : -EFAULT;
> +		cleared += chunk;
> +		count -= chunk;
> +
> +		if (signal_pending(current))
> +			return cleared ? cleared : -ERESTARTSYS;
> +		cond_resched();
> +	}
> +
> +	return cleared;
> +}
> +
>   static int mmap_zero(struct file *file, struct vm_area_struct *vma)
>   {
>   #ifndef CONFIG_MMU
> @@ -921,6 +942,7 @@ static const struct file_operations zero_fops = {
>   	.llseek		= zero_lseek,
>   	.write		= write_zero,
>   	.read_iter	= read_iter_zero,
> +	.read		= read_zero,
>   	.write_iter	= write_iter_zero,
>   	.mmap		= mmap_zero,
>   	.get_unmapped_area = get_unmapped_area_zero,
>
diff mbox series

Patch

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 3f09d6fdf89405..1f48bbfb3ce99d 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -249,7 +249,6 @@  config PPC
 	select PCI_SYSCALL			if PCI
 	select PPC_DAWR				if PPC64
 	select RTC_LIB
-	select SET_FS
 	select SPARSE_IRQ
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index ed0d633ab5aa42..f01e4d650c520a 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -83,10 +83,6 @@  struct task_struct;
 void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp);
 void release_thread(struct task_struct *);
 
-typedef struct {
-	unsigned long seg;
-} mm_segment_t;
-
 #define TS_FPR(i) fp_state.fpr[i][TS_FPROFFSET]
 #define TS_CKFPR(i) ckfp_state.fpr[i][TS_FPROFFSET]
 
@@ -148,7 +144,6 @@  struct thread_struct {
 	unsigned long	ksp_vsid;
 #endif
 	struct pt_regs	*regs;		/* Pointer to saved register state */
-	mm_segment_t	addr_limit;	/* for get_fs() validation */
 #ifdef CONFIG_BOOKE
 	/* BookE base exception scratch space; align on cacheline */
 	unsigned long	normsave[8] ____cacheline_aligned;
@@ -295,7 +290,6 @@  struct thread_struct {
 #define INIT_THREAD { \
 	.ksp = INIT_SP, \
 	.ksp_limit = INIT_SP_LIMIT, \
-	.addr_limit = KERNEL_DS, \
 	.pgdir = swapper_pg_dir, \
 	.fpexc_mode = MSR_FE0 | MSR_FE1, \
 	SPEFSCR_INIT \
@@ -303,7 +297,6 @@  struct thread_struct {
 #else
 #define INIT_THREAD  { \
 	.ksp = INIT_SP, \
-	.addr_limit = KERNEL_DS, \
 	.fpexc_mode = 0, \
 }
 #endif
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index ca6c9702570494..46a210b03d2b80 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -90,7 +90,6 @@  void arch_setup_new_exec(void);
 #define TIF_SYSCALL_TRACE	0	/* syscall trace active */
 #define TIF_SIGPENDING		1	/* signal pending */
 #define TIF_NEED_RESCHED	2	/* rescheduling necessary */
-#define TIF_FSCHECK		3	/* Check FS is USER_DS on return */
 #define TIF_SYSCALL_EMU		4	/* syscall emulation active */
 #define TIF_RESTORE_TM		5	/* need to restore TM FP/VEC/VSX */
 #define TIF_PATCH_PENDING	6	/* pending live patching update */
@@ -130,7 +129,6 @@  void arch_setup_new_exec(void);
 #define _TIF_SYSCALL_TRACEPOINT	(1<<TIF_SYSCALL_TRACEPOINT)
 #define _TIF_EMULATE_STACK_STORE	(1<<TIF_EMULATE_STACK_STORE)
 #define _TIF_NOHZ		(1<<TIF_NOHZ)
-#define _TIF_FSCHECK		(1<<TIF_FSCHECK)
 #define _TIF_SYSCALL_EMU	(1<<TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_DOTRACE	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
 				 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
@@ -138,8 +136,7 @@  void arch_setup_new_exec(void);
 
 #define _TIF_USER_WORK_MASK	(_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
 				 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
-				 _TIF_RESTORE_TM | _TIF_PATCH_PENDING | \
-				 _TIF_FSCHECK)
+				 _TIF_RESTORE_TM | _TIF_PATCH_PENDING)
 #define _TIF_PERSYSCALL_MASK	(_TIF_RESTOREALL|_TIF_NOERROR)
 
 /* Bits in local_flags */
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 7fe3531ad36a77..39727537d39701 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -8,62 +8,36 @@ 
 #include <asm/extable.h>
 #include <asm/kup.h>
 
-/*
- * The fs value determines whether argument validity checking should be
- * performed or not.  If get_fs() == USER_DS, checking is performed, with
- * get_fs() == KERNEL_DS, checking is bypassed.
- *
- * For historical reasons, these macros are grossly misnamed.
- *
- * The fs/ds values are now the highest legal address in the "segment".
- * This simplifies the checking in the routines below.
- */
-
-#define MAKE_MM_SEG(s)  ((mm_segment_t) { (s) })
-
-#define KERNEL_DS	MAKE_MM_SEG(~0UL)
 #ifdef __powerpc64__
 /* We use TASK_SIZE_USER64 as TASK_SIZE is not constant */
-#define USER_DS		MAKE_MM_SEG(TASK_SIZE_USER64 - 1)
-#else
-#define USER_DS		MAKE_MM_SEG(TASK_SIZE - 1)
-#endif
-
-#define get_fs()	(current->thread.addr_limit)
+#define TASK_SIZE_MAX		TASK_SIZE_USER64
 
-static inline void set_fs(mm_segment_t fs)
+static inline bool __access_ok(unsigned long addr, unsigned long size)
 {
-	current->thread.addr_limit = fs;
-	/* On user-mode return check addr_limit (fs) is correct */
-	set_thread_flag(TIF_FSCHECK);
+	if (addr >= TASK_SIZE_MAX)
+		return false;
+	/*
+	 * This check is sufficient because there is a large enough gap between
+	 * user addresses and the kernel addresses.
+	 */
+	return size <= TASK_SIZE_MAX;
 }
-
-#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg)
-#define user_addr_max()	(get_fs().seg)
-
-#ifdef __powerpc64__
-/*
- * This check is sufficient because there is a large enough
- * gap between user addresses and the kernel addresses
- */
-#define __access_ok(addr, size, segment)	\
-	(((addr) <= (segment).seg) && ((size) <= (segment).seg))
-
 #else
+#define TASK_SIZE_MAX		TASK_SIZE
 
-static inline int __access_ok(unsigned long addr, unsigned long size,
-			mm_segment_t seg)
+static inline bool __access_ok(unsigned long addr, unsigned long size)
 {
-	if (addr > seg.seg)
-		return 0;
-	return (size == 0 || size - 1 <= seg.seg - addr);
+	if (addr >= TASK_SIZE_MAX)
+		return false;
+	if (size == 0)
+		return false;
+	return size <= TASK_SIZE_MAX - addr;
 }
-
-#endif
+#endif /* __powerpc64__ */
 
 #define access_ok(addr, size)		\
 	(__chk_user_ptr(addr),		\
-	 __access_ok((__force unsigned long)(addr), (size), get_fs()))
+	 __access_ok((unsigned long)(addr), (size)))
 
 /*
  * These are the main single-value transfer routines.  They automatically
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index d15a98c758b8b4..df547d8e31e49c 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -312,9 +312,6 @@  void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
 {
 	user_exit();
 
-	/* Check valid addr_limit, TIF check is done there */
-	addr_limit_user_check();
-
 	if (thread_info_flags & _TIF_UPROBE)
 		uprobe_notify_resume(regs);
 
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index caee8cc77e1954..8342188ea1acd0 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -108,11 +108,11 @@  static nokprobe_inline long address_ok(struct pt_regs *regs,
 {
 	if (!user_mode(regs))
 		return 1;
-	if (__access_ok(ea, nb, USER_DS))
+	if (__access_ok(ea, nb))
 		return 1;
-	if (__access_ok(ea, 1, USER_DS))
+	if (__access_ok(ea, 1))
 		/* Access overlaps the end of the user region */
-		regs->dar = USER_DS.seg;
+		regs->dar = TASK_SIZE_MAX - 1;
 	else
 		regs->dar = ea;
 	return 0;