Patchwork [RFC] bpf: Add helpers to read useful task_struct members

login
register
mail settings
Submitter Sandipan Das
Date Nov. 3, 2017, 6:58 a.m.
Message ID <20171103065833.8076-1-sandipan@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/10039583/
State New
Headers show

Comments

Sandipan Das - Nov. 3, 2017, 6:58 a.m.
For added security, the layout of some structures can be
randomized by enabling CONFIG_GCC_PLUGIN_RANDSTRUCT. One
such structure is task_struct. To build BPF programs, we
use Clang which does not support this feature. So, if we
attempt to read a field of a structure with a randomized
layout within a BPF program, we do not get the expected
value because of incorrect offsets. To observe this, it
is not mandatory to have CONFIG_GCC_PLUGIN_RANDSTRUCT
enabled because the structure annotations/members added
for this purpose are enough to cause this. So, all kernel
builds are affected.

For example, considering samples/bpf/offwaketime_kern.c,
if we try to print the values of pid and comm inside the
task_struct passed to waker() by adding the following
lines of code at the appropriate place

  char fmt[] = "waker(): p->pid = %u, p->comm = %s\n";
  bpf_trace_printk(fmt, sizeof(fmt), _(p->pid), _(p->comm));

it is seen that upon rebuilding and running this sample
followed by inspecting /sys/kernel/debug/tracing/trace,
the output looks like the following

                               _-----=> irqs-off
                              / _----=> need-resched
                             | / _---=> hardirq/softirq
                             || / _--=> preempt-depth
                             ||| /     delay
            TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
               | |       |   ||||       |         |
          <idle>-0     [007] d.s.  1883.443594: 0x00000001: waker(): p->pid = 0, p->comm =
          <idle>-0     [018] d.s.  1883.453588: 0x00000001: waker(): p->pid = 0, p->comm =
          <idle>-0     [007] d.s.  1883.463584: 0x00000001: waker(): p->pid = 0, p->comm =
          <idle>-0     [009] d.s.  1883.483586: 0x00000001: waker(): p->pid = 0, p->comm =
          <idle>-0     [005] d.s.  1883.493583: 0x00000001: waker(): p->pid = 0, p->comm =
          <idle>-0     [009] d.s.  1883.503583: 0x00000001: waker(): p->pid = 0, p->comm =
          <idle>-0     [018] d.s.  1883.513578: 0x00000001: waker(): p->pid = 0, p->comm =
 systemd-journal-3140  [003] d...  1883.627660: 0x00000001: waker(): p->pid = 0, p->comm =
 systemd-journal-3140  [003] d...  1883.627704: 0x00000001: waker(): p->pid = 0, p->comm =
 systemd-journal-3140  [003] d...  1883.627723: 0x00000001: waker(): p->pid = 0, p->comm =

To avoid this, we add new BPF helpers that read the
correct values for some of the important task_struct
members such as pid, tgid, comm and flags which are
extensively used in BPF-based analysis tools such as
bcc. Since these helpers are built with GCC, they use
the correct offsets when referencing a member.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 include/linux/bpf.h                       |  3 ++
 include/uapi/linux/bpf.h                  | 13 ++++++
 kernel/bpf/core.c                         |  3 ++
 kernel/bpf/helpers.c                      | 75 +++++++++++++++++++++++++++++++
 kernel/trace/bpf_trace.c                  |  6 +++
 tools/testing/selftests/bpf/bpf_helpers.h |  9 ++++
 6 files changed, 109 insertions(+)
Alexei Starovoitov - Nov. 4, 2017, 9:34 a.m.
On 11/3/17 3:58 PM, Sandipan Das wrote:
> For added security, the layout of some structures can be
> randomized by enabling CONFIG_GCC_PLUGIN_RANDSTRUCT. One
> such structure is task_struct. To build BPF programs, we
> use Clang which does not support this feature. So, if we
> attempt to read a field of a structure with a randomized
> layout within a BPF program, we do not get the expected
> value because of incorrect offsets. To observe this, it
> is not mandatory to have CONFIG_GCC_PLUGIN_RANDSTRUCT
> enabled because the structure annotations/members added
> for this purpose are enough to cause this. So, all kernel
> builds are affected.
>
> For example, considering samples/bpf/offwaketime_kern.c,
> if we try to print the values of pid and comm inside the
> task_struct passed to waker() by adding the following
> lines of code at the appropriate place
>
>   char fmt[] = "waker(): p->pid = %u, p->comm = %s\n";
>   bpf_trace_printk(fmt, sizeof(fmt), _(p->pid), _(p->comm));
>
> it is seen that upon rebuilding and running this sample
> followed by inspecting /sys/kernel/debug/tracing/trace,
> the output looks like the following
>
>                                _-----=> irqs-off
>                               / _----=> need-resched
>                              | / _---=> hardirq/softirq
>                              || / _--=> preempt-depth
>                              ||| /     delay
>             TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
>                | |       |   ||||       |         |
>           <idle>-0     [007] d.s.  1883.443594: 0x00000001: waker(): p->pid = 0, p->comm =
>           <idle>-0     [018] d.s.  1883.453588: 0x00000001: waker(): p->pid = 0, p->comm =
>           <idle>-0     [007] d.s.  1883.463584: 0x00000001: waker(): p->pid = 0, p->comm =
>           <idle>-0     [009] d.s.  1883.483586: 0x00000001: waker(): p->pid = 0, p->comm =
>           <idle>-0     [005] d.s.  1883.493583: 0x00000001: waker(): p->pid = 0, p->comm =
>           <idle>-0     [009] d.s.  1883.503583: 0x00000001: waker(): p->pid = 0, p->comm =
>           <idle>-0     [018] d.s.  1883.513578: 0x00000001: waker(): p->pid = 0, p->comm =
>  systemd-journal-3140  [003] d...  1883.627660: 0x00000001: waker(): p->pid = 0, p->comm =
>  systemd-journal-3140  [003] d...  1883.627704: 0x00000001: waker(): p->pid = 0, p->comm =
>  systemd-journal-3140  [003] d...  1883.627723: 0x00000001: waker(): p->pid = 0, p->comm =
>
> To avoid this, we add new BPF helpers that read the
> correct values for some of the important task_struct
> members such as pid, tgid, comm and flags which are
> extensively used in BPF-based analysis tools such as
> bcc. Since these helpers are built with GCC, they use
> the correct offsets when referencing a member.
>
> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
...
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f90860d1f897..324508d27bd2 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -338,6 +338,16 @@ union bpf_attr {
>   *     @skb: pointer to skb
>   *     Return: classid if != 0
>   *
> + * u64 bpf_get_task_pid_tgid(struct task_struct *task)
> + *     Return: task->tgid << 32 | task->pid
> + *
> + * int bpf_get_task_comm(struct task_struct *task)
> + *     Stores task->comm into buf
> + *     Return: 0 on success or negative error
> + *
> + * u32 bpf_get_task_flags(struct task_struct *task)
> + *     Return: task->flags
> + *

I don't think it's a solution.
Tracing scripts read other fields too.
Making it work for these 3 fields is a drop in a bucket.
If randomization is used I think we have to accept
that existing bpf scripts won't be usable.
Long term solution is to support 'BPF Type Format' or BTF
(which is old C-Type Format) for kernel data structures,
so bcc scripts wouldn't need to use kernel headers and clang.
The proper offsets will be described in BTF.
We were planning to use it initially to describe map key/value,
but it applies for this case as well.
There will be a tool that will take dwarf from vmlinux and
compress it into BTF. Kernel will also be able to verify
that BTF is a valid BTF.
I'm assuming that gcc randomization plugin produces dwarf
with correct offsets, if not, it would have to be fixed.
Naveen N. Rao - Nov. 4, 2017, 5:31 p.m.
Hi Alexei,

Alexei Starovoitov wrote:
> On 11/3/17 3:58 PM, Sandipan Das wrote:
>> For added security, the layout of some structures can be
>> randomized by enabling CONFIG_GCC_PLUGIN_RANDSTRUCT. One
>> such structure is task_struct. To build BPF programs, we
>> use Clang which does not support this feature. So, if we
>> attempt to read a field of a structure with a randomized
>> layout within a BPF program, we do not get the expected
>> value because of incorrect offsets. To observe this, it
>> is not mandatory to have CONFIG_GCC_PLUGIN_RANDSTRUCT
>> enabled because the structure annotations/members added
>> for this purpose are enough to cause this. So, all kernel
>> builds are affected.
>>
>> For example, considering samples/bpf/offwaketime_kern.c,
>> if we try to print the values of pid and comm inside the
>> task_struct passed to waker() by adding the following
>> lines of code at the appropriate place
>>
>>   char fmt[] = "waker(): p->pid = %u, p->comm = %s\n";
>>   bpf_trace_printk(fmt, sizeof(fmt), _(p->pid), _(p->comm));
>>
>> it is seen that upon rebuilding and running this sample
>> followed by inspecting /sys/kernel/debug/tracing/trace,
>> the output looks like the following
>>
>>                                _-----=> irqs-off
>>                               / _----=> need-resched
>>                              | / _---=> hardirq/softirq
>>                              || / _--=> preempt-depth
>>                              ||| /     delay
>>             TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
>>                | |       |   ||||       |         |
>>           <idle>-0     [007] d.s.  1883.443594: 0x00000001: waker(): p->pid = 0, p->comm =
>>           <idle>-0     [018] d.s.  1883.453588: 0x00000001: waker(): p->pid = 0, p->comm =
>>           <idle>-0     [007] d.s.  1883.463584: 0x00000001: waker(): p->pid = 0, p->comm =
>>           <idle>-0     [009] d.s.  1883.483586: 0x00000001: waker(): p->pid = 0, p->comm =
>>           <idle>-0     [005] d.s.  1883.493583: 0x00000001: waker(): p->pid = 0, p->comm =
>>           <idle>-0     [009] d.s.  1883.503583: 0x00000001: waker(): p->pid = 0, p->comm =
>>           <idle>-0     [018] d.s.  1883.513578: 0x00000001: waker(): p->pid = 0, p->comm =
>>  systemd-journal-3140  [003] d...  1883.627660: 0x00000001: waker(): 
>>  p->pid = 0, p->comm =
>>  systemd-journal-3140  [003] d...  1883.627704: 0x00000001: waker(): 
>>  p->pid = 0, p->comm =
>>  systemd-journal-3140  [003] d...  1883.627723: 0x00000001: waker(): 
>>  p->pid = 0, p->comm =
>>
>> To avoid this, we add new BPF helpers that read the
>> correct values for some of the important task_struct
>> members such as pid, tgid, comm and flags which are
>> extensively used in BPF-based analysis tools such as
>> bcc. Since these helpers are built with GCC, they use
>> the correct offsets when referencing a member.
>>
>> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
> ...
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index f90860d1f897..324508d27bd2 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -338,6 +338,16 @@ union bpf_attr {
>>   *     @skb: pointer to skb
>>   *     Return: classid if != 0
>>   *
>> + * u64 bpf_get_task_pid_tgid(struct task_struct *task)
>> + *     Return: task->tgid << 32 | task->pid
>> + *
>> + * int bpf_get_task_comm(struct task_struct *task)
>> + *     Stores task->comm into buf
>> + *     Return: 0 on success or negative error
>> + *
>> + * u32 bpf_get_task_flags(struct task_struct *task)
>> + *     Return: task->flags
>> + *
> 
> I don't think it's a solution.
> Tracing scripts read other fields too.
> Making it work for these 3 fields is a drop in a bucket.

Indeed. However...

> If randomization is used I think we have to accept
> that existing bpf scripts won't be usable.

... the actual issue is that randomization isn't necessary for this to 
show up. The annotations added to mark off the structure members results 
in some structure members being moved into an anonymous structure, which 
would then get padded differently. So, *all* kernels since v4.13 are 
affected, afaict.

As such, we wanted to propose this as a short term solution, but I do 
agree that this doesn't solve the real issue.

> Long term solution is to support 'BPF Type Format' or BTF
> (which is old C-Type Format) for kernel data structures,
> so bcc scripts wouldn't need to use kernel headers and clang.
> The proper offsets will be described in BTF.
> We were planning to use it initially to describe map key/value,
> but it applies for this case as well.
> There will be a tool that will take dwarf from vmlinux and
> compress it into BTF. Kernel will also be able to verify
> that BTF is a valid BTF.

This is the first that I've heard about BTF. Can you share more details 
about it, or point me to some place where it has been discussed?

We considered having tools derive the structure offsets from debuginfo, 
but debuginfo may not always be present on production systems. So, it 
isn't clear if having that dependency is fine. I'm not sure how BTF will
be different.

> I'm assuming that gcc randomization plugin produces dwarf
> with correct offsets, if not, it would have to be fixed.

I think the offsets described in dwarf were incorrect with 
CONFIG_GCC_PLUGIN_RANDSTRUCT, but I'll let Sandipan confirm that.


- Naveen
Alexei Starovoitov - Nov. 4, 2017, 9:10 p.m.
On 11/5/17 2:31 AM, Naveen N. Rao wrote:
> Hi Alexei,
>
> Alexei Starovoitov wrote:
>> On 11/3/17 3:58 PM, Sandipan Das wrote:
>>> For added security, the layout of some structures can be
>>> randomized by enabling CONFIG_GCC_PLUGIN_RANDSTRUCT. One
>>> such structure is task_struct. To build BPF programs, we
>>> use Clang which does not support this feature. So, if we
>>> attempt to read a field of a structure with a randomized
>>> layout within a BPF program, we do not get the expected
>>> value because of incorrect offsets. To observe this, it
>>> is not mandatory to have CONFIG_GCC_PLUGIN_RANDSTRUCT
>>> enabled because the structure annotations/members added
>>> for this purpose are enough to cause this. So, all kernel
>>> builds are affected.
>>>
>>> For example, considering samples/bpf/offwaketime_kern.c,
>>> if we try to print the values of pid and comm inside the
>>> task_struct passed to waker() by adding the following
>>> lines of code at the appropriate place
>>>
>>>   char fmt[] = "waker(): p->pid = %u, p->comm = %s\n";
>>>   bpf_trace_printk(fmt, sizeof(fmt), _(p->pid), _(p->comm));
>>>
>>> it is seen that upon rebuilding and running this sample
>>> followed by inspecting /sys/kernel/debug/tracing/trace,
>>> the output looks like the following
>>>
>>>                                _-----=> irqs-off
>>>                               / _----=> need-resched
>>>                              | / _---=> hardirq/softirq
>>>                              || / _--=> preempt-depth
>>>                              ||| /     delay
>>>             TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
>>>                | |       |   ||||       |         |
>>>           <idle>-0     [007] d.s.  1883.443594: 0x00000001: waker():
>>> p->pid = 0, p->comm =
>>>           <idle>-0     [018] d.s.  1883.453588: 0x00000001: waker():
>>> p->pid = 0, p->comm =
>>>           <idle>-0     [007] d.s.  1883.463584: 0x00000001: waker():
>>> p->pid = 0, p->comm =
>>>           <idle>-0     [009] d.s.  1883.483586: 0x00000001: waker():
>>> p->pid = 0, p->comm =
>>>           <idle>-0     [005] d.s.  1883.493583: 0x00000001: waker():
>>> p->pid = 0, p->comm =
>>>           <idle>-0     [009] d.s.  1883.503583: 0x00000001: waker():
>>> p->pid = 0, p->comm =
>>>           <idle>-0     [018] d.s.  1883.513578: 0x00000001: waker():
>>> p->pid = 0, p->comm =
>>>  systemd-journal-3140  [003] d...  1883.627660: 0x00000001: waker():
>>>  p->pid = 0, p->comm =
>>>  systemd-journal-3140  [003] d...  1883.627704: 0x00000001: waker():
>>>  p->pid = 0, p->comm =
>>>  systemd-journal-3140  [003] d...  1883.627723: 0x00000001: waker():
>>>  p->pid = 0, p->comm =
>>>
>>> To avoid this, we add new BPF helpers that read the
>>> correct values for some of the important task_struct
>>> members such as pid, tgid, comm and flags which are
>>> extensively used in BPF-based analysis tools such as
>>> bcc. Since these helpers are built with GCC, they use
>>> the correct offsets when referencing a member.
>>>
>>> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
>> ...
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index f90860d1f897..324508d27bd2 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -338,6 +338,16 @@ union bpf_attr {
>>>   *     @skb: pointer to skb
>>>   *     Return: classid if != 0
>>>   *
>>> + * u64 bpf_get_task_pid_tgid(struct task_struct *task)
>>> + *     Return: task->tgid << 32 | task->pid
>>> + *
>>> + * int bpf_get_task_comm(struct task_struct *task)
>>> + *     Stores task->comm into buf
>>> + *     Return: 0 on success or negative error
>>> + *
>>> + * u32 bpf_get_task_flags(struct task_struct *task)
>>> + *     Return: task->flags
>>> + *
>>
>> I don't think it's a solution.
>> Tracing scripts read other fields too.
>> Making it work for these 3 fields is a drop in a bucket.
>
> Indeed. However...
>
>> If randomization is used I think we have to accept
>> that existing bpf scripts won't be usable.
>
> ... the actual issue is that randomization isn't necessary for this to
> show up. The annotations added to mark off the structure members results
> in some structure members being moved into an anonymous structure, which
> would then get padded differently. So, *all* kernels since v4.13 are
> affected, afaict.

hmm. why would all 4.13+ be affected?
It's just an anonymous struct inside task_struct.
Are you saying that due to clang not adding this 'struct { };' treatment 
to task_struct?
I thought such struct shouldn't change layout.
If it is we need to fix include/linux/compiler-clang.h to do that
anon struct as well.

> As such, we wanted to propose this as a short term solution, but I do
> agree that this doesn't solve the real issue.
>
>> Long term solution is to support 'BPF Type Format' or BTF
>> (which is old C-Type Format) for kernel data structures,
>> so bcc scripts wouldn't need to use kernel headers and clang.
>> The proper offsets will be described in BTF.
>> We were planning to use it initially to describe map key/value,
>> but it applies for this case as well.
>> There will be a tool that will take dwarf from vmlinux and
>> compress it into BTF. Kernel will also be able to verify
>> that BTF is a valid BTF.
>
> This is the first that I've heard about BTF. Can you share more details
> about it, or point me to some place where it has been discussed?
>
> We considered having tools derive the structure offsets from debuginfo,
> but debuginfo may not always be present on production systems. So, it
> isn't clear if having that dependency is fine. I'm not sure how BTF will
> be different.

It was discussed at this year plumbers:
https://lwn.net/Articles/734453/

btw the name BTF is work in progress. We started with CTF, but
it conflicts with all other meanings of this abbreviation.
Likely we will call it something different at the end.

Initial goal was to describe key/map values of bpf maps to
make debugging easier, but now we want to use this compressed
type format for tracing as well, since installing kernel headers
everywhere doesn't scale well while CTF can be embedded in vmlinux

We were also thinking to improve verifier with CTF knowledge too.
Like if CTF describes that map value is two u32, but bpf program
is doing 8-byte access then something is wrong and either warn
or reject such program.
Sandipan Das - Nov. 6, 2017, 5:16 a.m.
Hi Alexei, Naveen,

On 11/04/2017 11:01 PM, Naveen N. Rao wrote:
> 
> I think the offsets described in dwarf were incorrect with CONFIG_GCC_PLUGIN_RANDSTRUCT, but I'll let Sandipan confirm that.
> 

I think that the offsets described in dwarf are probably incorrect when
CONFIG_GCC_PLUGIN_RANDSTRUCT is enabled. To verify this, I used perf
to attach a probe to try_to_wake_up() which is the also the function to
which waker() is attached in the previously mentioned kernel sample. So,
if the run the following:

# perf probe "try_to_wake_up" "p->pid"
# perf record -a -e probe:try_to_wake_up
# perf script

The value of p->pid is reported as 0. Similarly, if I try to read
p->comm, it is reported to be an empty string. The same problem is
seen with systemtap as well.

Also, if I do a printk with offsetof(struct task_struct, pid) and
offsetof(struct task_struct, comm) inside the kernel code and then
compare the values with the offsets reported by pahole, they are
completely different.

- Sandipan
Naveen N. Rao - Nov. 6, 2017, 3:55 p.m.
Alexei Starovoitov wrote:
> On 11/5/17 2:31 AM, Naveen N. Rao wrote:
>> Hi Alexei,
>>
>> Alexei Starovoitov wrote:
>>> On 11/3/17 3:58 PM, Sandipan Das wrote:
>>>> For added security, the layout of some structures can be
>>>> randomized by enabling CONFIG_GCC_PLUGIN_RANDSTRUCT. One
>>>> such structure is task_struct. To build BPF programs, we
>>>> use Clang which does not support this feature. So, if we
>>>> attempt to read a field of a structure with a randomized
>>>> layout within a BPF program, we do not get the expected
>>>> value because of incorrect offsets. To observe this, it
>>>> is not mandatory to have CONFIG_GCC_PLUGIN_RANDSTRUCT
>>>> enabled because the structure annotations/members added
>>>> for this purpose are enough to cause this. So, all kernel
>>>> builds are affected.
>>>>

...

>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>> index f90860d1f897..324508d27bd2 100644
>>>> --- a/include/uapi/linux/bpf.h
>>>> +++ b/include/uapi/linux/bpf.h
>>>> @@ -338,6 +338,16 @@ union bpf_attr {
>>>>   *     @skb: pointer to skb
>>>>   *     Return: classid if != 0
>>>>   *
>>>> + * u64 bpf_get_task_pid_tgid(struct task_struct *task)
>>>> + *     Return: task->tgid << 32 | task->pid
>>>> + *
>>>> + * int bpf_get_task_comm(struct task_struct *task)
>>>> + *     Stores task->comm into buf
>>>> + *     Return: 0 on success or negative error
>>>> + *
>>>> + * u32 bpf_get_task_flags(struct task_struct *task)
>>>> + *     Return: task->flags
>>>> + *
>>>
>>> I don't think it's a solution.
>>> Tracing scripts read other fields too.
>>> Making it work for these 3 fields is a drop in a bucket.
>>
>> Indeed. However...
>>
>>> If randomization is used I think we have to accept
>>> that existing bpf scripts won't be usable.
>>
>> ... the actual issue is that randomization isn't necessary for this to
>> show up. The annotations added to mark off the structure members results
>> in some structure members being moved into an anonymous structure, which
>> would then get padded differently. So, *all* kernels since v4.13 are
>> affected, afaict.
> 
> hmm. why would all 4.13+ be affected?
> It's just an anonymous struct inside task_struct.
> Are you saying that due to clang not adding this 'struct { };' treatment 
> to task_struct?

Yes, that's what it looked like.

> I thought such struct shouldn't change layout.
> If it is we need to fix include/linux/compiler-clang.h to do that
> anon struct as well.

We considered that, but it looked to be very dependent on the version of 
gcc used to build the kernel. But, this may be a simpler approach for 
the shorter term.

> 
>> As such, we wanted to propose this as a short term solution, but I do
>> agree that this doesn't solve the real issue.
>>
>>> Long term solution is to support 'BPF Type Format' or BTF
>>> (which is old C-Type Format) for kernel data structures,
>>> so bcc scripts wouldn't need to use kernel headers and clang.
>>> The proper offsets will be described in BTF.
>>> We were planning to use it initially to describe map key/value,
>>> but it applies for this case as well.
>>> There will be a tool that will take dwarf from vmlinux and
>>> compress it into BTF. Kernel will also be able to verify
>>> that BTF is a valid BTF.
>>
>> This is the first that I've heard about BTF. Can you share more details
>> about it, or point me to some place where it has been discussed?
>>
>> We considered having tools derive the structure offsets from debuginfo,
>> but debuginfo may not always be present on production systems. So, it
>> isn't clear if having that dependency is fine. I'm not sure how BTF will
>> be different.
> 
> It was discussed at this year plumbers:
> https://lwn.net/Articles/734453/
> 
> btw the name BTF is work in progress. We started with CTF, but
> it conflicts with all other meanings of this abbreviation.
> Likely we will call it something different at the end.
> 
> Initial goal was to describe key/map values of bpf maps to
> make debugging easier, but now we want to use this compressed
> type format for tracing as well, since installing kernel headers
> everywhere doesn't scale well while CTF can be embedded in vmlinux

Makes sense, though I'm curious on how you're planning to have this work
without the kernel headers :)

> 
> We were also thinking to improve verifier with CTF knowledge too.
> Like if CTF describes that map value is two u32, but bpf program
> is doing 8-byte access then something is wrong and either warn
> or reject such program.

Sounds good. I look forward to more details/patches on this front once 
you're ready to share more.

Thanks,
- Naveen
Tushar Dave - Nov. 7, 2017, 12:16 a.m.
On 11/02/2017 11:58 PM, Sandipan Das wrote:
> For added security, the layout of some structures can be
> randomized by enabling CONFIG_GCC_PLUGIN_RANDSTRUCT. One
> such structure is task_struct. To build BPF programs, we
> use Clang which does not support this feature. So, if we
> attempt to read a field of a structure with a randomized
> layout within a BPF program, we do not get the expected
> value because of incorrect offsets. To observe this, it
> is not mandatory to have CONFIG_GCC_PLUGIN_RANDSTRUCT
> enabled because the structure annotations/members added
> for this purpose are enough to cause this. So, all kernel
> builds are affected.
> 
> For example, considering samples/bpf/offwaketime_kern.c,
> if we try to print the values of pid and comm inside the
> task_struct passed to waker() by adding the following
> lines of code at the appropriate place
> 
>    char fmt[] = "waker(): p->pid = %u, p->comm = %s\n";
>    bpf_trace_printk(fmt, sizeof(fmt), _(p->pid), _(p->comm));
> 
> it is seen that upon rebuilding and running this sample
> followed by inspecting /sys/kernel/debug/tracing/trace,
> the output looks like the following
> 
>                                 _-----=> irqs-off
>                                / _----=> need-resched
>                               | / _---=> hardirq/softirq
>                               || / _--=> preempt-depth
>                               ||| /     delay
>              TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
>                 | |       |   ||||       |         |
>            <idle>-0     [007] d.s.  1883.443594: 0x00000001: waker(): p->pid = 0, p->comm =
>            <idle>-0     [018] d.s.  1883.453588: 0x00000001: waker(): p->pid = 0, p->comm =
>            <idle>-0     [007] d.s.  1883.463584: 0x00000001: waker(): p->pid = 0, p->comm =
>            <idle>-0     [009] d.s.  1883.483586: 0x00000001: waker(): p->pid = 0, p->comm =
>            <idle>-0     [005] d.s.  1883.493583: 0x00000001: waker(): p->pid = 0, p->comm =
>            <idle>-0     [009] d.s.  1883.503583: 0x00000001: waker(): p->pid = 0, p->comm =
>            <idle>-0     [018] d.s.  1883.513578: 0x00000001: waker(): p->pid = 0, p->comm =
>   systemd-journal-3140  [003] d...  1883.627660: 0x00000001: waker(): p->pid = 0, p->comm =
>   systemd-journal-3140  [003] d...  1883.627704: 0x00000001: waker(): p->pid = 0, p->comm =
>   systemd-journal-3140  [003] d...  1883.627723: 0x00000001: waker(): p->pid = 0, p->comm =
> 
> To avoid this, we add new BPF helpers that read the
> correct values for some of the important task_struct
> members such as pid, tgid, comm and flags which are
> extensively used in BPF-based analysis tools such as
> bcc. Since these helpers are built with GCC, they use
> the correct offsets when referencing a member.
Just to add that we were seeing the same issue (but had no clue until 
looked at this patch , thanks). Its easy to reproduce by running bcc 
example task_switch.py where pid (prev_pid) is retrieved from struct 
task_struct and that is always zero. we tried printing other task_struct 
members such as 'comm' and see that as empty string as well.


-Tushar
> 
> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
> ---
>   include/linux/bpf.h                       |  3 ++
>   include/uapi/linux/bpf.h                  | 13 ++++++
>   kernel/bpf/core.c                         |  3 ++
>   kernel/bpf/helpers.c                      | 75 +++++++++++++++++++++++++++++++
>   kernel/trace/bpf_trace.c                  |  6 +++
>   tools/testing/selftests/bpf/bpf_helpers.h |  9 ++++
>   6 files changed, 109 insertions(+)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index f1af7d63d678..5993a0f5262b 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -418,6 +418,9 @@ extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
>   extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
>   extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
>   extern const struct bpf_func_proto bpf_get_current_comm_proto;
> +extern const struct bpf_func_proto bpf_get_task_pid_tgid_proto;
> +extern const struct bpf_func_proto bpf_get_task_comm_proto;
> +extern const struct bpf_func_proto bpf_get_task_flags_proto;
>   extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
>   extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
>   extern const struct bpf_func_proto bpf_get_stackid_proto;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f90860d1f897..324508d27bd2 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -338,6 +338,16 @@ union bpf_attr {
>    *     @skb: pointer to skb
>    *     Return: classid if != 0
>    *
> + * u64 bpf_get_task_pid_tgid(struct task_struct *task)
> + *     Return: task->tgid << 32 | task->pid
> + *
> + * int bpf_get_task_comm(struct task_struct *task)
> + *     Stores task->comm into buf
> + *     Return: 0 on success or negative error
> + *
> + * u32 bpf_get_task_flags(struct task_struct *task)
> + *     Return: task->flags
> + *
>    * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci)
>    *     Return: 0 on success or negative error
>    *
> @@ -602,6 +612,9 @@ union bpf_attr {
>   	FN(get_current_uid_gid),	\
>   	FN(get_current_comm),		\
>   	FN(get_cgroup_classid),		\
> +	FN(get_task_pid_tgid),		\
> +	FN(get_task_comm),		\
> +	FN(get_task_flags),		\
>   	FN(skb_vlan_push),		\
>   	FN(skb_vlan_pop),		\
>   	FN(skb_get_tunnel_key),		\
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 7b62df86be1d..c69c17d6514a 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -1438,6 +1438,9 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
>   const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
>   const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
>   const struct bpf_func_proto bpf_get_current_comm_proto __weak;
> +const struct bpf_func_proto bpf_get_task_pid_tgid_proto __weak;
> +const struct bpf_func_proto bpf_get_task_comm_proto __weak;
> +const struct bpf_func_proto bpf_get_task_flags_proto __weak;
>   const struct bpf_func_proto bpf_sock_map_update_proto __weak;
>   
>   const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 3d24e238221e..f45259dce117 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -179,3 +179,78 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
>   	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
>   	.arg2_type	= ARG_CONST_SIZE,
>   };
> +
> +BPF_CALL_1(bpf_get_task_pid_tgid, struct task_struct *, task)
> +{
> +	int ret;
> +	u32 pid, tgid;
> +
> +	ret = probe_kernel_read(&pid, &task->pid, sizeof(pid));
> +	if (unlikely(ret < 0))
> +		goto err;
> +
> +	ret = probe_kernel_read(&tgid, &task->tgid, sizeof(tgid));
> +	if (unlikely(ret < 0))
> +		goto err;
> +
> +	return (u64) tgid << 32 | pid;
> +err:
> +	return -EINVAL;
> +}
> +
> +const struct bpf_func_proto bpf_get_task_pid_tgid_proto = {
> +	.func		= bpf_get_task_pid_tgid,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_ANYTHING,
> +};
> +
> +BPF_CALL_3(bpf_get_task_comm, struct task_struct *, task, char *, buf, u32, size)
> +{
> +	int ret;
> +	char comm[TASK_COMM_LEN];
> +
> +	ret = probe_kernel_read(comm, task->comm, sizeof(comm));
> +	if (unlikely(ret < 0))
> +		goto err_clear;
> +
> +	strncpy(buf, comm, size);
> +
> +	/* Verifier guarantees that size > 0. For task->comm exceeding
> +	 * size, guarantee that buf is %NUL-terminated. Unconditionally
> +	 * done here to save the size test.
> +	 */
> +	buf[size - 1] = 0;
> +	return 0;
> +err_clear:
> +	memset(buf, 0, size);
> +	return -EINVAL;
> +}
> +
> +const struct bpf_func_proto bpf_get_task_comm_proto = {
> +	.func		= bpf_get_task_comm,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_ANYTHING,
> +	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
> +	.arg3_type	= ARG_CONST_SIZE,
> +};
> +
> +BPF_CALL_1(bpf_get_task_flags, struct task_struct *, task)
> +{
> +	int ret;
> +	unsigned int flags;
> +
> +	ret = probe_kernel_read(&flags, &task->flags, sizeof(flags));
> +	if (unlikely(ret < 0))
> +		return -EINVAL;
> +
> +	return flags;
> +}
> +
> +const struct bpf_func_proto bpf_get_task_flags_proto = {
> +	.func		= bpf_get_task_flags,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_ANYTHING,
> +};
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index dc498b605d5d..a31f5cf68cbc 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -477,6 +477,12 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
>   		return &bpf_get_smp_processor_id_proto;
>   	case BPF_FUNC_get_numa_node_id:
>   		return &bpf_get_numa_node_id_proto;
> +	case BPF_FUNC_get_task_pid_tgid:
> +		return &bpf_get_task_pid_tgid_proto;
> +	case BPF_FUNC_get_task_comm:
> +		return &bpf_get_task_comm_proto;
> +	case BPF_FUNC_get_task_flags:
> +		return &bpf_get_task_flags_proto;
>   	case BPF_FUNC_perf_event_read:
>   		return &bpf_perf_event_read_proto;
>   	case BPF_FUNC_probe_write_user:
> diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
> index b2e02bdcd098..8c64df027d2c 100644
> --- a/tools/testing/selftests/bpf/bpf_helpers.h
> +++ b/tools/testing/selftests/bpf/bpf_helpers.h
> @@ -1,6 +1,8 @@
>   #ifndef __BPF_HELPERS_H
>   #define __BPF_HELPERS_H
>   
> +struct task_struct;
> +
>   /* helper macro to place programs, maps, license in
>    * different sections in elf_bpf file. Section names
>    * are interpreted by elf_bpf loader
> @@ -31,6 +33,13 @@ static unsigned long long (*bpf_get_current_uid_gid)(void) =
>   	(void *) BPF_FUNC_get_current_uid_gid;
>   static int (*bpf_get_current_comm)(void *buf, int buf_size) =
>   	(void *) BPF_FUNC_get_current_comm;
> +static unsigned long long (*bpf_get_task_pid_tgid)(struct task_struct *task) =
> +	(void *) BPF_FUNC_get_task_pid_tgid;
> +static int (*bpf_get_task_comm)(struct task_struct *task,
> +				void *buf, int buf_size) =
> +	(void *) BPF_FUNC_get_task_comm;
> +static unsigned int (*bpf_get_task_flags)(struct task_struct *task) =
> +	(void *) BPF_FUNC_get_task_flags;
>   static unsigned long long (*bpf_perf_event_read)(void *map,
>   						 unsigned long long flags) =
>   	(void *) BPF_FUNC_perf_event_read;
>

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f1af7d63d678..5993a0f5262b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -418,6 +418,9 @@  extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
 extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
 extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
+extern const struct bpf_func_proto bpf_get_task_pid_tgid_proto;
+extern const struct bpf_func_proto bpf_get_task_comm_proto;
+extern const struct bpf_func_proto bpf_get_task_flags_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f90860d1f897..324508d27bd2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -338,6 +338,16 @@  union bpf_attr {
  *     @skb: pointer to skb
  *     Return: classid if != 0
  *
+ * u64 bpf_get_task_pid_tgid(struct task_struct *task)
+ *     Return: task->tgid << 32 | task->pid
+ *
+ * int bpf_get_task_comm(struct task_struct *task)
+ *     Stores task->comm into buf
+ *     Return: 0 on success or negative error
+ *
+ * u32 bpf_get_task_flags(struct task_struct *task)
+ *     Return: task->flags
+ *
  * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci)
  *     Return: 0 on success or negative error
  *
@@ -602,6 +612,9 @@  union bpf_attr {
 	FN(get_current_uid_gid),	\
 	FN(get_current_comm),		\
 	FN(get_cgroup_classid),		\
+	FN(get_task_pid_tgid),		\
+	FN(get_task_comm),		\
+	FN(get_task_flags),		\
 	FN(skb_vlan_push),		\
 	FN(skb_vlan_pop),		\
 	FN(skb_get_tunnel_key),		\
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7b62df86be1d..c69c17d6514a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1438,6 +1438,9 @@  const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
 const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+const struct bpf_func_proto bpf_get_task_pid_tgid_proto __weak;
+const struct bpf_func_proto bpf_get_task_comm_proto __weak;
+const struct bpf_func_proto bpf_get_task_flags_proto __weak;
 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3d24e238221e..f45259dce117 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -179,3 +179,78 @@  const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
 	.arg2_type	= ARG_CONST_SIZE,
 };
+
+BPF_CALL_1(bpf_get_task_pid_tgid, struct task_struct *, task)
+{
+	int ret;
+	u32 pid, tgid;
+
+	ret = probe_kernel_read(&pid, &task->pid, sizeof(pid));
+	if (unlikely(ret < 0))
+		goto err;
+
+	ret = probe_kernel_read(&tgid, &task->tgid, sizeof(tgid));
+	if (unlikely(ret < 0))
+		goto err;
+
+	return (u64) tgid << 32 | pid;
+err:
+	return -EINVAL;
+}
+
+const struct bpf_func_proto bpf_get_task_pid_tgid_proto = {
+	.func		= bpf_get_task_pid_tgid,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_get_task_comm, struct task_struct *, task, char *, buf, u32, size)
+{
+	int ret;
+	char comm[TASK_COMM_LEN];
+
+	ret = probe_kernel_read(comm, task->comm, sizeof(comm));
+	if (unlikely(ret < 0))
+		goto err_clear;
+
+	strncpy(buf, comm, size);
+
+	/* Verifier guarantees that size > 0. For task->comm exceeding
+	 * size, guarantee that buf is %NUL-terminated. Unconditionally
+	 * done here to save the size test.
+	 */
+	buf[size - 1] = 0;
+	return 0;
+err_clear:
+	memset(buf, 0, size);
+	return -EINVAL;
+}
+
+const struct bpf_func_proto bpf_get_task_comm_proto = {
+	.func		= bpf_get_task_comm,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+
+BPF_CALL_1(bpf_get_task_flags, struct task_struct *, task)
+{
+	int ret;
+	unsigned int flags;
+
+	ret = probe_kernel_read(&flags, &task->flags, sizeof(flags));
+	if (unlikely(ret < 0))
+		return -EINVAL;
+
+	return flags;
+}
+
+const struct bpf_func_proto bpf_get_task_flags_proto = {
+	.func		= bpf_get_task_flags,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+};
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index dc498b605d5d..a31f5cf68cbc 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -477,6 +477,12 @@  static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_get_numa_node_id:
 		return &bpf_get_numa_node_id_proto;
+	case BPF_FUNC_get_task_pid_tgid:
+		return &bpf_get_task_pid_tgid_proto;
+	case BPF_FUNC_get_task_comm:
+		return &bpf_get_task_comm_proto;
+	case BPF_FUNC_get_task_flags:
+		return &bpf_get_task_flags_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
 	case BPF_FUNC_probe_write_user:
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index b2e02bdcd098..8c64df027d2c 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -1,6 +1,8 @@ 
 #ifndef __BPF_HELPERS_H
 #define __BPF_HELPERS_H
 
+struct task_struct;
+
 /* helper macro to place programs, maps, license in
  * different sections in elf_bpf file. Section names
  * are interpreted by elf_bpf loader
@@ -31,6 +33,13 @@  static unsigned long long (*bpf_get_current_uid_gid)(void) =
 	(void *) BPF_FUNC_get_current_uid_gid;
 static int (*bpf_get_current_comm)(void *buf, int buf_size) =
 	(void *) BPF_FUNC_get_current_comm;
+static unsigned long long (*bpf_get_task_pid_tgid)(struct task_struct *task) =
+	(void *) BPF_FUNC_get_task_pid_tgid;
+static int (*bpf_get_task_comm)(struct task_struct *task,
+				void *buf, int buf_size) =
+	(void *) BPF_FUNC_get_task_comm;
+static unsigned int (*bpf_get_task_flags)(struct task_struct *task) =
+	(void *) BPF_FUNC_get_task_flags;
 static unsigned long long (*bpf_perf_event_read)(void *map,
 						 unsigned long long flags) =
 	(void *) BPF_FUNC_perf_event_read;