diff mbox series

[net-next,v4,01/12] inet: homa: define user-visible API for Homa

Message ID 20241217000626.2958-2-ouster@cs.stanford.edu (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series Begin upstreaming Homa transport protocol | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/build_tools success Errors and warnings before: 0 (+23) this patch: 0 (+23)
netdev/cc_maintainers success CCed 1 of 1 maintainers
netdev/build_clang success Errors and warnings before: 135 this patch: 135
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4 this patch: 4
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 183 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc fail Errors and warnings before: 0 this patch: 2
netdev/source_inline success Was 0 now: 0

Commit Message

John Ousterhout Dec. 17, 2024, 12:06 a.m. UTC
Note: for man pages, see the Homa Wiki at:
https://homa-transport.atlassian.net/wiki/spaces/HOMA/overview

Signed-off-by: John Ousterhout <ouster@cs.stanford.edu>
---
 MAINTAINERS               |   7 ++
 include/uapi/linux/homa.h | 170 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 177 insertions(+)
 create mode 100644 include/uapi/linux/homa.h

Comments

Jakub Kicinski Dec. 19, 2024, 1:43 a.m. UTC | #1
On Mon, 16 Dec 2024 16:06:14 -0800 John Ousterhout wrote:
> +#ifdef __cplusplus
> +extern "C"
> +{
> +#endif

I'm not aware of any networking header wrapped in extern "C"
Let's not make this precedent?

> +/* IANA-assigned Internet Protocol number for Homa. */
> +#define IPPROTO_HOMA 146
> +
> +/**
> + * define HOMA_MAX_MESSAGE_LENGTH - Maximum bytes of payload in a Homa
> + * request or response message.
> + */
> +#define HOMA_MAX_MESSAGE_LENGTH 1000000
> +
> +/**
> + * define HOMA_BPAGE_SIZE - Number of bytes in pages used for receive
> + * buffers. Must be power of two.
> + */
> +#define HOMA_BPAGE_SIZE (1 << HOMA_BPAGE_SHIFT)
> +#define HOMA_BPAGE_SHIFT 16
> +
> +/**
> + * define HOMA_MAX_BPAGES - The largest number of bpages that will be required
> + * to store an incoming message.
> + */
> +#define HOMA_MAX_BPAGES ((HOMA_MAX_MESSAGE_LENGTH + HOMA_BPAGE_SIZE - 1) \
> +		>> HOMA_BPAGE_SHIFT)
> +
> +/**
> + * define HOMA_MIN_DEFAULT_PORT - The 16-bit port space is divided into
> + * two nonoverlapping regions. Ports 1-32767 are reserved exclusively
> + * for well-defined server ports. The remaining ports are used for client
> + * ports; these are allocated automatically by Homa. Port 0 is reserved.
> + */
> +#define HOMA_MIN_DEFAULT_PORT 0x8000

Not sure why but ./scripts/kernel-doc does not like this:

include/uapi/linux/homa.h:51: warning: expecting prototype for HOMA_MIN_DEFAULT_PORT - The 16(). Prototype was for HOMA_MIN_DEFAULT_PORT() instead

> +/**
> + * struct homa_sendmsg_args - Provides information needed by Homa's
> + * sendmsg; passed to sendmsg using the msg_control field.
> + */
> +struct homa_sendmsg_args {
> +	/**
> +	 * @id: (in/out) An initial value of 0 means a new request is
> +	 * being sent; nonzero means the message is a reply to the given
> +	 * id. If the message is a request, then the value is modified to
> +	 * hold the id of the new RPC.
> +	 */
> +	uint64_t id;

Please use Linux uapi types, __u64

> +	/**
> +	 * @completion_cookie: (in) Used only for request messages; will be
> +	 * returned by recvmsg when the RPC completes. Typically used to
> +	 * locate app-specific info about the RPC.
> +	 */
> +	uint64_t completion_cookie;
> +};
> +
> +#if !defined(__cplusplus)
> +_Static_assert(sizeof(struct homa_sendmsg_args) >= 16,
> +	       "homa_sendmsg_args shrunk");
> +_Static_assert(sizeof(struct homa_sendmsg_args) <= 16,
> +	       "homa_sendmsg_args grew");
> +#endif
> +
> +/**
> + * struct homa_recvmsg_args - Provides information needed by Homa's
> + * recvmsg; passed to recvmsg using the msg_control field.
> + */
> +struct homa_recvmsg_args {
> +	/**
> +	 * @id: (in/out) Initially specifies the id of the desired RPC, or 0
> +	 * if any RPC is OK; returns the actual id received.
> +	 */
> +	uint64_t id;
> +
> +	/**
> +	 * @completion_cookie: (out) If the incoming message is a response,
> +	 * this will return the completion cookie specified when the
> +	 * request was sent. For requests this will always be zero.
> +	 */
> +	uint64_t completion_cookie;
> +
> +	/**
> +	 * @flags: (in) OR-ed combination of bits that control the operation.
> +	 * See below for values.
> +	 */
> +	uint32_t flags;
> +
> +	/**
> +	 * @num_bpages: (in/out) Number of valid entries in @bpage_offsets.
> +	 * Passes in bpages from previous messages that can now be
> +	 * recycled; returns bpages from the new message.
> +	 */
> +	uint32_t num_bpages;
> +
> +	/**
> +	 * @bpage_offsets: (in/out) Each entry is an offset into the buffer
> +	 * region for the socket pool. When returned from recvmsg, the
> +	 * offsets indicate where fragments of the new message are stored. All
> +	 * entries but the last refer to full buffer pages (HOMA_BPAGE_SIZE
> +	 * bytes) and are bpage-aligned. The last entry may refer to a bpage
> +	 * fragment and is not necessarily aligned. The application now owns
> +	 * these bpages and must eventually return them to Homa, using
> +	 * bpage_offsets in a future recvmsg invocation.
> +	 */
> +	uint32_t bpage_offsets[HOMA_MAX_BPAGES];
> +};
> +
> +#if !defined(__cplusplus)
> +_Static_assert(sizeof(struct homa_recvmsg_args) >= 88,
> +	       "homa_recvmsg_args shrunk");
> +_Static_assert(sizeof(struct homa_recvmsg_args) <= 88,
> +	       "homa_recvmsg_args grew");
> +#endif
> +
> +/* Flag bits for homa_recvmsg_args.flags (see man page for documentation):
> + */
> +#define HOMA_RECVMSG_REQUEST       0x01
> +#define HOMA_RECVMSG_RESPONSE      0x02
> +#define HOMA_RECVMSG_NONBLOCKING   0x04
> +#define HOMA_RECVMSG_VALID_FLAGS   0x07
> +
> +/** define SO_HOMA_RCVBUF - setsockopt option for specifying buffer region. */
> +#define SO_HOMA_RCVBUF 10
> +
> +/** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. */
> +struct homa_rcvbuf_args {
> +	/** @start: First byte of buffer region. */
> +	void *start;

I'm not sure if pointers are legal in uAPI.
I *think* we are supposed to use __aligned_u64, because pointers
will be different size for 32b binaries running in compat mode
on 64b kernels, or some such.

> +	/** @length: Total number of bytes available at @start. */
> +	size_t length;
> +};
> +
> +/* Meanings of the bits in Homa's flag word, which can be set using
> + * "sysctl /net/homa/flags".
> + */
> +
> +/**
> + * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism:
> + * always send all packets immediately.
> + */

Also makes kernel-doc unhappy:

include/uapi/linux/homa.h:159: warning: expecting prototype for HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism(). Prototype was for HOMA_FLAG_DONT_THROTTLE() instead

Note that next patch adds more kernel-doc warnings, you probably want
to TAL at those as well. Use

  ./scripts/kernel-doc -none -Wall $file

> +#define HOMA_FLAG_DONT_THROTTLE   2
> +
> +/* I/O control calls on Homa sockets. These are mapped into the
> + * SIOCPROTOPRIVATE range of 0x89e0 through 0x89ef.
> + */
> +
> +#define HOMAIOCFREEZE _IO(0x89, 0xef)
> +
> +#ifdef __cplusplus
> +}
> +#endif
John Ousterhout Dec. 19, 2024, 6:57 p.m. UTC | #2
On Wed, Dec 18, 2024 at 5:43 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Mon, 16 Dec 2024 16:06:14 -0800 John Ousterhout wrote:
> > +#ifdef __cplusplus
> > +extern "C"
> > +{
> > +#endif
>
> I'm not aware of any networking header wrapped in extern "C"
> Let's not make this precedent?

Without this I don't seem to be able to use this header in C++ files:
I end up getting linker errors such as 'undefined reference to
`homa_replyv(int, iovec const*, int, sockaddr const*, unsigned int,
unsigned long)'.

Any suggestions on how to make the header file work with C++ files
without the #ifdef __cplusplus?

> > +/**
> > + * define HOMA_MIN_DEFAULT_PORT - The 16-bit port space is divided into
> > + * two nonoverlapping regions. Ports 1-32767 are reserved exclusively
> > + * for well-defined server ports. The remaining ports are used for client
> > + * ports; these are allocated automatically by Homa. Port 0 is reserved.
> > + */
> > +#define HOMA_MIN_DEFAULT_PORT 0x8000
>
> Not sure why but ./scripts/kernel-doc does not like this:
>
> include/uapi/linux/homa.h:51: warning: expecting prototype for HOMA_MIN_DEFAULT_PORT - The 16(). Prototype was for HOMA_MIN_DEFAULT_PORT() instead

I saw this warning from kernel-doc before I posted the patch, but I
couldn't figure out why it is happening. After staring at the error
message some more I figured it out: kernel-doc is getting confused by
the "-" in "16-bit" (it seems to use the last "-" on the line rather
than the first). I've modified the comment to replace "16-bit" with
"16 bit" and filed a bug report for kernel-doc.

> > +/**
> > + * struct homa_sendmsg_args - Provides information needed by Homa's
> > + * sendmsg; passed to sendmsg using the msg_control field.
> > + */
> > +struct homa_sendmsg_args {
> > +     /**
> > +      * @id: (in/out) An initial value of 0 means a new request is
> > +      * being sent; nonzero means the message is a reply to the given
> > +      * id. If the message is a request, then the value is modified to
> > +      * hold the id of the new RPC.
> > +      */
> > +     uint64_t id;
>
> Please use Linux uapi types, __u64

Done. In the process I discovered that the underlying type for __u64
is not the same as that for uint64_t; this results in awkwardness in
programs that use both...

> > +/** define SO_HOMA_RCVBUF - setsockopt option for specifying buffer region. */
> > +#define SO_HOMA_RCVBUF 10
> > +
> > +/** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. */
> > +struct homa_rcvbuf_args {
> > +     /** @start: First byte of buffer region. */
> > +     void *start;
>
> I'm not sure if pointers are legal in uAPI.
> I *think* we are supposed to use __aligned_u64, because pointers
> will be different size for 32b binaries running in compat mode
> on 64b kernels, or some such.

I see that "void *" is used in the declaration for struct msghdr
(along with some other pointer types as well) and struct msghdr is
part of several uAPI interfaces, no?

> > +/**
> > + * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism:
> > + * always send all packets immediately.
> > + */
>
> Also makes kernel-doc unhappy:
>
> include/uapi/linux/homa.h:159: warning: expecting prototype for HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism(). Prototype was for HOMA_FLAG_DONT_THROTTLE() instead

It seems that the ":" also confuses kernel-doc. I've worked around this as well.

> Note that next patch adds more kernel-doc warnings, you probably want
> to TAL at those as well. Use
>
>   ./scripts/kernel-doc -none -Wall $file

Hmm, I did run kernel-doc before posting the patch, but maybe I missed
some stuff. I'll take another look. There are a few things kernel-doc
complained about where the requested documentation would add no useful
information; it would just end up repeating what is already obvious
from the code. Is any discretion allowed for cases like this? If the
expectation is that there will be zero kernel-doc complaints, then
I'll go ahead and add the useless documentation.

> > +#ifdef __cplusplus
> > +}
> > +#endif
> --
> pw-bot: cr

I'm not sure what "pw-bot: cr" means; I assume this is related to the
"#ifdef __cplusplus" discussion above?

Thanks for the comments.

-John-
Przemek Kitszel Dec. 19, 2024, 9:57 p.m. UTC | #3
On 12/19/24 02:43, Jakub Kicinski wrote:
> On Mon, 16 Dec 2024 16:06:14 -0800 John Ousterhout wrote:


>> + * define HOMA_MIN_DEFAULT_PORT - The 16-bit port space is divided into
>> + * two nonoverlapping regions. Ports 1-32767 are reserved exclusively
>> + * for well-defined server ports. The remaining ports are used for client
>> + * ports; these are allocated automatically by Homa. Port 0 is reserved.
>> + */
>> +#define HOMA_MIN_DEFAULT_PORT 0x8000
> 
> Not sure why but ./scripts/kernel-doc does not like this:
> 
> include/uapi/linux/homa.h:51: warning: expecting prototype for HOMA_MIN_DEFAULT_PORT - The 16(). Prototype was for HOMA_MIN_DEFAULT_PORT() instead
> 


>> +/**
>> + * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism:
>> + * always send all packets immediately.
>> + */
> 
> Also makes kernel-doc unhappy:
> 
> include/uapi/linux/homa.h:159: warning: expecting prototype for HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism(). Prototype was for HOMA_FLAG_DONT_THROTTLE() instead
> 
> Note that next patch adds more kernel-doc warnings, you probably want
> to TAL at those as well. Use
> 
>    ./scripts/kernel-doc -none -Wall $file
> 
turns out that when you have the word "define" at the front
and then a colon (:) or two dashes (-) it complains (?!!)

my advice is to remove the word "define" from the first kdoc line,
as it does not happen to bring any value (the emitted doc still has
a "function" instead of "define" used as a type)
Przemek Kitszel Dec. 19, 2024, 10:05 p.m. UTC | #4
On 12/19/24 19:57, John Ousterhout wrote:
> On Wed, Dec 18, 2024 at 5:43 PM Jakub Kicinski <kuba@kernel.org> wrote:
>>
>> On Mon, 16 Dec 2024 16:06:14 -0800 John Ousterhout wrote:


>>> +#ifdef __cplusplus
>>> +}
>>> +#endif
>> --
>> pw-bot: cr
> 
> I'm not sure what "pw-bot: cr" means; I assume this is related to the
> "#ifdef __cplusplus" discussion above?

it's a shortcut for:
Patchwork Bot, please mark that as "ChangesRequested" [by Maintainer]

C++ program could simply wrap all includes by extern "C" section

not sure what is better, but there is very little precedence for any
courtesy for C++ in the kernel ATM

> 
> Thanks for the comments.
> 
> -John-
>
Andrew Lunn Dec. 19, 2024, 10:32 p.m. UTC | #5
> > --
> > pw-bot: cr
> 
> I'm not sure what "pw-bot: cr" means; I assume this is related to the
> "#ifdef __cplusplus" discussion above?

It is documented here:

https://www.kernel.org/doc/html/latest/process/maintainer-netdev.html#updating-patch-status

although the documentation spells it out in full, but the patchworks
bot understands a few abbreviations like cr for change-request.

Marking a patchset as changes-requested will drop it out of the list
of patches waiting to be merged, so included it at the end of a review
comment can save the Maintainers some time.

	Andrew
Jakub Kicinski Dec. 20, 2024, 1:41 a.m. UTC | #6
On Thu, 19 Dec 2024 10:57:22 -0800 John Ousterhout wrote:
> On Wed, Dec 18, 2024 at 5:43 PM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Mon, 16 Dec 2024 16:06:14 -0800 John Ousterhout wrote:  
> > > +#ifdef __cplusplus
> > > +extern "C"
> > > +{
> > > +#endif  
> >
> > I'm not aware of any networking header wrapped in extern "C"
> > Let's not make this precedent?  
> 
> Without this I don't seem to be able to use this header in C++ files:
> I end up getting linker errors such as 'undefined reference to
> `homa_replyv(int, iovec const*, int, sockaddr const*, unsigned int,
> unsigned long)'.

No idea TBH, I don't see homa_reply anywhere in the submission

> Any suggestions on how to make the header file work with C++ files
> without the #ifdef __cplusplus?

With the little C++ understanding I have, I _think_ the include site
can wrap:

extern "C" {
#include "<linux/homa.h>"
}

> > > +/**
> > > + * define HOMA_MIN_DEFAULT_PORT - The 16-bit port space is divided into
> > > + * two nonoverlapping regions. Ports 1-32767 are reserved exclusively
> > > + * for well-defined server ports. The remaining ports are used for client
> > > + * ports; these are allocated automatically by Homa. Port 0 is reserved.
> > > + */
> > > +#define HOMA_MIN_DEFAULT_PORT 0x8000  
> >
> > Not sure why but ./scripts/kernel-doc does not like this:
> >
> > include/uapi/linux/homa.h:51: warning: expecting prototype for HOMA_MIN_DEFAULT_PORT - The 16(). Prototype was for HOMA_MIN_DEFAULT_PORT() instead  
> 
> I saw this warning from kernel-doc before I posted the patch, but I
> couldn't figure out why it is happening. After staring at the error
> message some more I figured it out: kernel-doc is getting confused by
> the "-" in "16-bit" (it seems to use the last "-" on the line rather
> than the first). I've modified the comment to replace "16-bit" with
> "16 bit" and filed a bug report for kernel-doc.

Unless you're planning to render the docs on docs.kernel.org you can
just switch from /** to /* comments. IMVHO kdoc is overused, unless
there's full documentation with API rendered like
https://www.kernel.org/doc/html/latest/networking/net_dim.html
kdoc is more trouble than gain.

> > > +/** define SO_HOMA_RCVBUF - setsockopt option for specifying buffer region. */
> > > +#define SO_HOMA_RCVBUF 10
> > > +
> > > +/** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. */
> > > +struct homa_rcvbuf_args {
> > > +     /** @start: First byte of buffer region. */
> > > +     void *start;  
> >
> > I'm not sure if pointers are legal in uAPI.
> > I *think* we are supposed to use __aligned_u64, because pointers
> > will be different size for 32b binaries running in compat mode
> > on 64b kernels, or some such.  
> 
> I see that "void *" is used in the declaration for struct msghdr
> (along with some other pointer types as well) and struct msghdr is
> part of several uAPI interfaces, no?

Off the top off my head this use is a source of major pain, grep around
for compat_msghdr.

> > > +/**
> > > + * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism:
> > > + * always send all packets immediately.
> > > + */  
> >
> > Also makes kernel-doc unhappy:
> >
> > include/uapi/linux/homa.h:159: warning: expecting prototype for HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism(). Prototype was for HOMA_FLAG_DONT_THROTTLE() instead  
> 
> It seems that the ":" also confuses kernel-doc. I've worked around this as well.
> 
> > Note that next patch adds more kernel-doc warnings, you probably want
> > to TAL at those as well. Use
> >
> >   ./scripts/kernel-doc -none -Wall $file  
> 
> Hmm, I did run kernel-doc before posting the patch, but maybe I missed
> some stuff. I'll take another look. There are a few things kernel-doc
> complained about where the requested documentation would add no useful
> information; it would just end up repeating what is already obvious
> from the code. Is any discretion allowed for cases like this? If the
> expectation is that there will be zero kernel-doc complaints, then
> I'll go ahead and add the useless documentation.

My recommendation is to use normal comments where kdoc just repeats
obvious stuff. All these warnings sooner or later will result in some
semi-automated and often poor quality patch submissions to "fix" it.
Which is just work for maintainers to deal with :(
John Ousterhout Dec. 20, 2024, 5:25 p.m. UTC | #7
On Thu, Dec 19, 2024 at 2:06 PM Przemek Kitszel
<przemyslaw.kitszel@intel.com> wrote:
>
> On 12/19/24 19:57, John Ousterhout wrote:
> > On Wed, Dec 18, 2024 at 5:43 PM Jakub Kicinski <kuba@kernel.org> wrote:
> >>
> >> On Mon, 16 Dec 2024 16:06:14 -0800 John Ousterhout wrote:
>
>
> >>> +#ifdef __cplusplus
> >>> +}
> >>> +#endif
> >> --
> >> pw-bot: cr
> >
> > I'm not sure what "pw-bot: cr" means; I assume this is related to the
> > "#ifdef __cplusplus" discussion above?
>
> it's a shortcut for:
> Patchwork Bot, please mark that as "ChangesRequested" [by Maintainer]
>
> C++ program could simply wrap all includes by extern "C" section
>
> not sure what is better, but there is very little precedence for any
> courtesy for C++ in the kernel ATM

I have now removed the 'extern "C"' from homa.h and moved it to all of
the programs that #include it.

-John-
John Ousterhout Dec. 20, 2024, 5:59 p.m. UTC | #8
On Thu, Dec 19, 2024 at 5:41 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> > Any suggestions on how to make the header file work with C++ files
> > without the #ifdef __cplusplus?
>
> With the little C++ understanding I have, I _think_ the include site
> can wrap:
>
> extern "C" {
> #include "<linux/homa.h>"
> }

I have done this now. I was hesitant to do that because it seemed like
it was creating unnecessary extra work for anyone who uses homa.h in a
C++ program, but since this seems to be the convention I have
conformed to it.

> > > > +/** define SO_HOMA_RCVBUF - setsockopt option for specifying buffer region. */
> > > > +#define SO_HOMA_RCVBUF 10
> > > > +
> > > > +/** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. */
> > > > +struct homa_rcvbuf_args {
> > > > +     /** @start: First byte of buffer region. */
> > > > +     void *start;
> > >
> > > I'm not sure if pointers are legal in uAPI.
> > > I *think* we are supposed to use __aligned_u64, because pointers
> > > will be different size for 32b binaries running in compat mode
> > > on 64b kernels, or some such.
> >
> > I see that "void *" is used in the declaration for struct msghdr
> > (along with some other pointer types as well) and struct msghdr is
> > part of several uAPI interfaces, no?
>
> Off the top off my head this use is a source of major pain, grep around
> for compat_msghdr.

How should I go about confirming that this __aligned_u64 is indeed the
expected convention (sounds like you aren't certain)? Also, any idea
why it needs to be aligned rather than just __u64?

> My recommendation is to use normal comments where kdoc just repeats
> obvious stuff. All these warnings sooner or later will result in some
> semi-automated and often poor quality patch submissions to "fix" it.
> Which is just work for maintainers to deal with :(

I have done this now. I had assumed that kdoc would also complain if
there was a declaration without official kdoc documentation, but now
that I see that it won't, I'll use that approach for places where kdoc
style is awkward. Personally I'm agnostic about whether to use kdoc; I
assumed that I should use it since it exists.

-John-
Jakub Kicinski Dec. 20, 2024, 7:31 p.m. UTC | #9
On Fri, 20 Dec 2024 09:59:53 -0800 John Ousterhout wrote:
> > > I see that "void *" is used in the declaration for struct msghdr
> > > (along with some other pointer types as well) and struct msghdr is
> > > part of several uAPI interfaces, no?  
> >
> > Off the top off my head this use is a source of major pain, grep around
> > for compat_msghdr.  
> 
> How should I go about confirming that this __aligned_u64 is indeed the
> expected convention (sounds like you aren't certain)?

Let me add Arnd Bergmann to the CC list, he will correct me if 
I'm wrong. Otherwise you can trust my intuition :)

> Also, any idea why it needs to be aligned rather than just __u64?

The main problem is that if __u64 doesn't force alignment on a 32b
version of a platform the struct may have holes and padding in
different places on 32b vs 64b compat.

Double checking, I don't think that's the case for your structs, 
so you can most likely go with a plain __u64.
Arnd Bergmann Dec. 20, 2024, 9:12 p.m. UTC | #10
On Fri, Dec 20, 2024, at 20:31, Jakub Kicinski wrote:
> On Fri, 20 Dec 2024 09:59:53 -0800 John Ousterhout wrote:
>> > > I see that "void *" is used in the declaration for struct msghdr
>> > > (along with some other pointer types as well) and struct msghdr is
>> > > part of several uAPI interfaces, no?  
>> >
>> > Off the top off my head this use is a source of major pain, grep around
>> > for compat_msghdr.  
>> 
>> How should I go about confirming that this __aligned_u64 is indeed the
>> expected convention (sounds like you aren't certain)?
>
> Let me add Arnd Bergmann to the CC list, he will correct me if 
> I'm wrong. Otherwise you can trust my intuition :)

You are right that for the purposes of the user API, structures
should use __u64 or __aligned_u64 in place of pointers, there are
some more details on this in Documentation/driver-api/ioctl.rst.

What worries me more in this particular case is the way that
this pointer is passed through setsockopt(), which really doesn't
take any pointers in other protocols.

I have not fully understood what is behind the pointer, but
it looks like this gets stored in the kernel in a per-socket
structure that is annotated as a kernel pointer, not a user
pointer, which may cause additional problems.

I don't know if the same pointer ever points to a kernel
structure, but if it does, that needs to be fixed first.

Assuming this is actually meant as a persistent __user
pointer, I'm still unsure what this means if the socket is
available to more than one process, e.g. through a fork()
or explicit file descriptor passing, or if the original
process dies while there is still a transfer in progress.
I realize that there is a lot of information already out
there that I haven't all read, so this is probably explained
somewhere, but it would be nice to point to that documentation
somewhere near the code to clarify the corner cases.

That probably also explains what type of memory the
__user buffer can point to, but I would like to make
sure that this has well-defined behavior e.g. if that
buffer is an mmap()ed file on NFS that was itself
mounted over a homa socket. Is there any guarantee that
this is either prohibited or is free of deadlocks and
recursion?

      Arnd
Andrew Lunn Dec. 20, 2024, 9:18 p.m. UTC | #11
On Fri, Dec 20, 2024 at 09:59:53AM -0800, John Ousterhout wrote:
> On Thu, Dec 19, 2024 at 5:41 PM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > > Any suggestions on how to make the header file work with C++ files
> > > without the #ifdef __cplusplus?
> >
> > With the little C++ understanding I have, I _think_ the include site
> > can wrap:
> >
> > extern "C" {
> > #include "<linux/homa.h>"
> > }
> 
> I have done this now. I was hesitant to do that because it seemed like
> it was creating unnecessary extra work for anyone who uses homa.h in a
> C++ program, but since this seems to be the convention I have
> conformed to it.

Do C++ programmes directly use this header, or is there a library in
the middle?

Often kernel APIs are too low level for an application to use
directly, and a library is used to provide a higher level API. That
would allow you to provide a different header file which is more C++
friendly.

	Andrew
John Ousterhout Dec. 20, 2024, 11:42 p.m. UTC | #12
On Fri, Dec 20, 2024 at 1:13 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Fri, Dec 20, 2024, at 20:31, Jakub Kicinski wrote:
> > On Fri, 20 Dec 2024 09:59:53 -0800 John Ousterhout wrote:
> >> > > I see that "void *" is used in the declaration for struct msghdr
> >> > > (along with some other pointer types as well) and struct msghdr is
> >> > > part of several uAPI interfaces, no?
> >> >
> >> > Off the top off my head this use is a source of major pain, grep around
> >> > for compat_msghdr.
> >>
> >> How should I go about confirming that this __aligned_u64 is indeed the
> >> expected convention (sounds like you aren't certain)?
> >
> > Let me add Arnd Bergmann to the CC list, he will correct me if
> > I'm wrong. Otherwise you can trust my intuition :)
>
> You are right that for the purposes of the user API, structures
> should use __u64 or __aligned_u64 in place of pointers, there are
> some more details on this in Documentation/driver-api/ioctl.rst.

I have now changed the type from void * to __u64.

> What worries me more in this particular case is the way that
> this pointer is passed through setsockopt(), which really doesn't
> take any pointers in other protocols.
>
> I have not fully understood what is behind the pointer, but
> it looks like this gets stored in the kernel in a per-socket
> structure that is annotated as a kernel pointer, not a user
> pointer, which may cause additional problems.

It is actually a user pointer; I had forgotten the __user annotation.
I have fixed this now as well.

> I don't know if the same pointer ever points to a kernel
> structure, but if it does, that needs to be fixed first.

It doesn't.

> Assuming this is actually meant as a persistent __user
> pointer, I'm still unsure what this means if the socket is
> available to more than one process, e.g. through a fork()
> or explicit file descriptor passing, or if the original
> process dies while there is still a transfer in progress.
> I realize that there is a lot of information already out
> there that I haven't all read, so this is probably explained
> somewhere, but it would be nice to point to that documentation
> somewhere near the code to clarify the corner cases.

I hadn't considered this, but the buffering mechanism prevents the
same socket from being shared across processes. I'm okay with that:
I'm not sure that sharing between processes adds much value for Homa,
and the performance benefit from the buffer mechanism is quite large.
I will document this. Is there a way to prevent a socket from being
shared across processes (e.g. can I set close-on-exec from within the
kernel?) I don't think there is any risk to kernel integrity if the
socket does end up shared; the worst that will happen is that the
memory of one of the processes will get trashed because Homa will
write to memory that isn't actually buffer space in that process.

> That probably also explains what type of memory the
> __user buffer can point to, but I would like to make
> sure that this has well-defined behavior e.g. if that
> buffer is an mmap()ed file on NFS that was itself
> mounted over a homa socket. Is there any guarantee that
> this is either prohibited or is free of deadlocks and
> recursion?

Given the API incompatibilities between Homa and TCP, I don't think it
is possible to have NFS mounted over a Homa socket. But you raise the
issue of whether some kinds of addresses might not be suitable for
Homa's buffer use this way. I don't know enough about the various
possible kinds of memory to know what kinds of problems could occur.
My assumption is that the buffer area will be a simple mmap()ed
region. The only use Homa makes of the buffer address is to call
import_ubuf with addresses in the buffer region, followed by
skb_copy_datagram_iter with the resulting iov_iter.

Is there some way I can check the "kind" of memory behind the buffer
pointer, so Homa could reject anything other than the simple case?
However, even if Homa checks when it sets up the buffer region, the
application could always rearrange its memory later, so I'm not sure
this would be effective. Also, if there is some sort of weird memory
that would cause problems for Homa, couldn't an application pass this
same memory into a call to recvmsg on a normal socket? Given that
Homa's use of the address is similar to what already happens in other
sockets, I don't think that the fact that Homa keeps the address
around in the kernel for a long time should cause additional problems,
no?

-John-
John Ousterhout Dec. 20, 2024, 11:51 p.m. UTC | #13
On Fri, Dec 20, 2024 at 3:42 PM John Ousterhout <ouster@cs.stanford.edu> wrote:
>
> I hadn't considered this, but the buffering mechanism prevents the
> same socket from being shared across processes.

It just occurred to me that a Homa socket should be sharable by
multiple processes as long as the buffer region is also shared at the
same virtual address in each process.

-John-
Arnd Bergmann Dec. 21, 2024, 1:43 p.m. UTC | #14
On Sat, Dec 21, 2024, at 00:42, John Ousterhout wrote:
> On Fri, Dec 20, 2024 at 1:13 PM Arnd Bergmann <arnd@arndb.de> wrote:
>> Assuming this is actually meant as a persistent __user
>> pointer, I'm still unsure what this means if the socket is
>> available to more than one process, e.g. through a fork()
>> or explicit file descriptor passing, or if the original
>> process dies while there is still a transfer in progress.
>> I realize that there is a lot of information already out
>> there that I haven't all read, so this is probably explained
>> somewhere, but it would be nice to point to that documentation
>> somewhere near the code to clarify the corner cases.
>
> I hadn't considered this, but the buffering mechanism prevents the
> same socket from being shared across processes. I'm okay with that:
> I'm not sure that sharing between processes adds much value for Homa,
> and the performance benefit from the buffer mechanism is quite large.
> I will document this. Is there a way to prevent a socket from being
> shared across processes (e.g. can I set close-on-exec from within the
> kernel?) I don't think there is any risk to kernel integrity if the
> socket does end up shared; the worst that will happen is that the
> memory of one of the processes will get trashed because Homa will
> write to memory that isn't actually buffer space in that process.

It would definitely be nicer to ensure that it's only available
for a particular 'struct mm'. Setting O_CLOEXEC is probably not
be enough since this does not close the fd on a fork without exec,
and does not prevent the flag from being reset through fcntl().

Maybe see what io_uring() does to handle userspace pointers
here, I think the problem is quite similar there.

>> That probably also explains what type of memory the
>> __user buffer can point to, but I would like to make
>> sure that this has well-defined behavior e.g. if that
>> buffer is an mmap()ed file on NFS that was itself
>> mounted over a homa socket. Is there any guarantee that
>> this is either prohibited or is free of deadlocks and
>> recursion?
>
> Given the API incompatibilities between Homa and TCP, I don't think it
> is possible to have NFS mounted over a Homa socket. But you raise the
> issue of whether some kinds of addresses might not be suitable for
> Homa's buffer use this way. I don't know enough about the various
> possible kinds of memory to know what kinds of problems could occur.
> My assumption is that the buffer area will be a simple mmap()ed
> region. The only use Homa makes of the buffer address is to call
> import_ubuf with addresses in the buffer region, followed by
> skb_copy_datagram_iter with the resulting iov_iter.

Right, NFS was just an example, but there are other interesting
cases. You certainly have to deal with buffers in userspace
memory that are blocked indefinitely. Another interesting case
is memory that has additional constraints, e.g. the MMIO
space of a PCI device like a GPU, which may fault when writing
data into it, or which cannot be mapped into the DMA space
of a network device.

> Is there some way I can check the "kind" of memory behind the buffer
> pointer, so Homa could reject anything other than the simple case?

I don't think so. I still don't know what the exact constraints
are that you have here, but I suspect this would all be a lot
simpler if you could change the interface to not pass arbitrary
user addresses but instead have a single file descriptor that
backs the buffers, either by passing a tmpfs/hugetlbfs file into
the socket instead of a pointer, or by using mmap() on the
socket to map it into userspace like we do for packet sockets.

      Arnd
John Ousterhout Dec. 23, 2024, 5:17 p.m. UTC | #15
On Sat, Dec 21, 2024 at 5:43 AM Arnd Bergmann <arnd@arndb.de> wrote:
>
> >> That probably also explains what type of memory the
> >> __user buffer can point to, but I would like to make
> >> sure that this has well-defined behavior e.g. if that
> >> buffer is an mmap()ed file on NFS that was itself
> >> mounted over a homa socket. Is there any guarantee that
> >> this is either prohibited or is free of deadlocks and
> >> recursion?
> >
> > Given the API incompatibilities between Homa and TCP, I don't think it
> > is possible to have NFS mounted over a Homa socket. But you raise the
> > issue of whether some kinds of addresses might not be suitable for
> > Homa's buffer use this way. I don't know enough about the various
> > possible kinds of memory to know what kinds of problems could occur.
> > My assumption is that the buffer area will be a simple mmap()ed
> > region. The only use Homa makes of the buffer address is to call
> > import_ubuf with addresses in the buffer region, followed by
> > skb_copy_datagram_iter with the resulting iov_iter.
>
> Right, NFS was just an example, but there are other interesting
> cases. You certainly have to deal with buffers in userspace
> memory that are blocked indefinitely. Another interesting case
> is memory that has additional constraints, e.g. the MMIO
> space of a PCI device like a GPU, which may fault when writing
> data into it, or which cannot be mapped into the DMA space
> of a network device.

Homa doesn't map the user receive buffers into DMA space, so this last
issue won't come up.

> > Is there some way I can check the "kind" of memory behind the buffer
> > pointer, so Homa could reject anything other than the simple case?
>
> I don't think so. I still don't know what the exact constraints
> are that you have here, but I suspect this would all be a lot
> simpler if you could change the interface to not pass arbitrary
> user addresses but instead have a single file descriptor that
> backs the buffers, either by passing a tmpfs/hugetlbfs file into
> the socket instead of a pointer, or by using mmap() on the
> socket to map it into userspace like we do for packet sockets.

After thinking about this some more, I don't think there should be any
need for Homa to constrain the addresses passed into it. Homa's usage
of buffer addresses is the same as TCP's; the only difference is that
instead of receiving a buffer address in a recvmsg call, the address
is passed in earlier in a setsockopt call. In either case, the
application could pass in an arbitrary address. Assuming that TCP
properly handles all of the possible variations in addresses that
could be passed in, Homa should properly handle them as well, since it
invokes the same underlying functions. Is there a specific example you
have in mind of an address that would be problematic for Homa but not
also problematic for TCP?

-John-
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 1389704c7d8d..935d1e995018 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10391,6 +10391,13 @@  F:	lib/test_hmm*
 F:	mm/hmm*
 F:	tools/testing/selftests/mm/*hmm*
 
+HOMA TRANSPORT PROTOCOL
+M:	John Ousterhout <ouster@cs.stanford.edu>
+S:	Maintained
+W:	https://homa-transport.atlassian.net/wiki/spaces/HOMA/overview
+F:	include/uapi/linux/homa.h
+F:	net/homa/
+
 HONEYWELL HSC030PA PRESSURE SENSOR SERIES IIO DRIVER
 M:	Petre Rodan <petre.rodan@subdimension.ro>
 L:	linux-iio@vger.kernel.org
diff --git a/include/uapi/linux/homa.h b/include/uapi/linux/homa.h
new file mode 100644
index 000000000000..555efa600b3f
--- /dev/null
+++ b/include/uapi/linux/homa.h
@@ -0,0 +1,170 @@ 
+/* SPDX-License-Identifier: BSD-2-Clause */
+
+/* This file defines the kernel call interface for the Homa
+ * transport protocol.
+ */
+
+#ifndef _UAPI_LINUX_HOMA_H
+#define _UAPI_LINUX_HOMA_H
+
+#include <linux/types.h>
+#ifndef __KERNEL__
+#include <netinet/in.h>
+#include <sys/socket.h>
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* IANA-assigned Internet Protocol number for Homa. */
+#define IPPROTO_HOMA 146
+
+/**
+ * define HOMA_MAX_MESSAGE_LENGTH - Maximum bytes of payload in a Homa
+ * request or response message.
+ */
+#define HOMA_MAX_MESSAGE_LENGTH 1000000
+
+/**
+ * define HOMA_BPAGE_SIZE - Number of bytes in pages used for receive
+ * buffers. Must be power of two.
+ */
+#define HOMA_BPAGE_SIZE (1 << HOMA_BPAGE_SHIFT)
+#define HOMA_BPAGE_SHIFT 16
+
+/**
+ * define HOMA_MAX_BPAGES - The largest number of bpages that will be required
+ * to store an incoming message.
+ */
+#define HOMA_MAX_BPAGES ((HOMA_MAX_MESSAGE_LENGTH + HOMA_BPAGE_SIZE - 1) \
+		>> HOMA_BPAGE_SHIFT)
+
+/**
+ * define HOMA_MIN_DEFAULT_PORT - The 16-bit port space is divided into
+ * two nonoverlapping regions. Ports 1-32767 are reserved exclusively
+ * for well-defined server ports. The remaining ports are used for client
+ * ports; these are allocated automatically by Homa. Port 0 is reserved.
+ */
+#define HOMA_MIN_DEFAULT_PORT 0x8000
+
+/**
+ * struct homa_sendmsg_args - Provides information needed by Homa's
+ * sendmsg; passed to sendmsg using the msg_control field.
+ */
+struct homa_sendmsg_args {
+	/**
+	 * @id: (in/out) An initial value of 0 means a new request is
+	 * being sent; nonzero means the message is a reply to the given
+	 * id. If the message is a request, then the value is modified to
+	 * hold the id of the new RPC.
+	 */
+	uint64_t id;
+
+	/**
+	 * @completion_cookie: (in) Used only for request messages; will be
+	 * returned by recvmsg when the RPC completes. Typically used to
+	 * locate app-specific info about the RPC.
+	 */
+	uint64_t completion_cookie;
+};
+
+#if !defined(__cplusplus)
+_Static_assert(sizeof(struct homa_sendmsg_args) >= 16,
+	       "homa_sendmsg_args shrunk");
+_Static_assert(sizeof(struct homa_sendmsg_args) <= 16,
+	       "homa_sendmsg_args grew");
+#endif
+
+/**
+ * struct homa_recvmsg_args - Provides information needed by Homa's
+ * recvmsg; passed to recvmsg using the msg_control field.
+ */
+struct homa_recvmsg_args {
+	/**
+	 * @id: (in/out) Initially specifies the id of the desired RPC, or 0
+	 * if any RPC is OK; returns the actual id received.
+	 */
+	uint64_t id;
+
+	/**
+	 * @completion_cookie: (out) If the incoming message is a response,
+	 * this will return the completion cookie specified when the
+	 * request was sent. For requests this will always be zero.
+	 */
+	uint64_t completion_cookie;
+
+	/**
+	 * @flags: (in) OR-ed combination of bits that control the operation.
+	 * See below for values.
+	 */
+	uint32_t flags;
+
+	/**
+	 * @num_bpages: (in/out) Number of valid entries in @bpage_offsets.
+	 * Passes in bpages from previous messages that can now be
+	 * recycled; returns bpages from the new message.
+	 */
+	uint32_t num_bpages;
+
+	/**
+	 * @bpage_offsets: (in/out) Each entry is an offset into the buffer
+	 * region for the socket pool. When returned from recvmsg, the
+	 * offsets indicate where fragments of the new message are stored. All
+	 * entries but the last refer to full buffer pages (HOMA_BPAGE_SIZE
+	 * bytes) and are bpage-aligned. The last entry may refer to a bpage
+	 * fragment and is not necessarily aligned. The application now owns
+	 * these bpages and must eventually return them to Homa, using
+	 * bpage_offsets in a future recvmsg invocation.
+	 */
+	uint32_t bpage_offsets[HOMA_MAX_BPAGES];
+};
+
+#if !defined(__cplusplus)
+_Static_assert(sizeof(struct homa_recvmsg_args) >= 88,
+	       "homa_recvmsg_args shrunk");
+_Static_assert(sizeof(struct homa_recvmsg_args) <= 88,
+	       "homa_recvmsg_args grew");
+#endif
+
+/* Flag bits for homa_recvmsg_args.flags (see man page for documentation):
+ */
+#define HOMA_RECVMSG_REQUEST       0x01
+#define HOMA_RECVMSG_RESPONSE      0x02
+#define HOMA_RECVMSG_NONBLOCKING   0x04
+#define HOMA_RECVMSG_VALID_FLAGS   0x07
+
+/** define SO_HOMA_RCVBUF - setsockopt option for specifying buffer region. */
+#define SO_HOMA_RCVBUF 10
+
+/** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. */
+struct homa_rcvbuf_args {
+	/** @start: First byte of buffer region. */
+	void *start;
+
+	/** @length: Total number of bytes available at @start. */
+	size_t length;
+};
+
+/* Meanings of the bits in Homa's flag word, which can be set using
+ * "sysctl /net/homa/flags".
+ */
+
+/**
+ * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism:
+ * always send all packets immediately.
+ */
+#define HOMA_FLAG_DONT_THROTTLE   2
+
+/* I/O control calls on Homa sockets. These are mapped into the
+ * SIOCPROTOPRIVATE range of 0x89e0 through 0x89ef.
+ */
+
+#define HOMAIOCFREEZE _IO(0x89, 0xef)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _UAPI_LINUX_HOMA_H */