Message ID | 20241217000626.2958-2-ouster@cs.stanford.edu (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | Begin upstreaming Homa transport protocol | expand |
On Mon, 16 Dec 2024 16:06:14 -0800 John Ousterhout wrote: > +#ifdef __cplusplus > +extern "C" > +{ > +#endif I'm not aware of any networking header wrapped in extern "C" Let's not make this precedent? > +/* IANA-assigned Internet Protocol number for Homa. */ > +#define IPPROTO_HOMA 146 > + > +/** > + * define HOMA_MAX_MESSAGE_LENGTH - Maximum bytes of payload in a Homa > + * request or response message. > + */ > +#define HOMA_MAX_MESSAGE_LENGTH 1000000 > + > +/** > + * define HOMA_BPAGE_SIZE - Number of bytes in pages used for receive > + * buffers. Must be power of two. > + */ > +#define HOMA_BPAGE_SIZE (1 << HOMA_BPAGE_SHIFT) > +#define HOMA_BPAGE_SHIFT 16 > + > +/** > + * define HOMA_MAX_BPAGES - The largest number of bpages that will be required > + * to store an incoming message. > + */ > +#define HOMA_MAX_BPAGES ((HOMA_MAX_MESSAGE_LENGTH + HOMA_BPAGE_SIZE - 1) \ > + >> HOMA_BPAGE_SHIFT) > + > +/** > + * define HOMA_MIN_DEFAULT_PORT - The 16-bit port space is divided into > + * two nonoverlapping regions. Ports 1-32767 are reserved exclusively > + * for well-defined server ports. The remaining ports are used for client > + * ports; these are allocated automatically by Homa. Port 0 is reserved. > + */ > +#define HOMA_MIN_DEFAULT_PORT 0x8000 Not sure why but ./scripts/kernel-doc does not like this: include/uapi/linux/homa.h:51: warning: expecting prototype for HOMA_MIN_DEFAULT_PORT - The 16(). Prototype was for HOMA_MIN_DEFAULT_PORT() instead > +/** > + * struct homa_sendmsg_args - Provides information needed by Homa's > + * sendmsg; passed to sendmsg using the msg_control field. > + */ > +struct homa_sendmsg_args { > + /** > + * @id: (in/out) An initial value of 0 means a new request is > + * being sent; nonzero means the message is a reply to the given > + * id. If the message is a request, then the value is modified to > + * hold the id of the new RPC. > + */ > + uint64_t id; Please use Linux uapi types, __u64 > + /** > + * @completion_cookie: (in) Used only for request messages; will be > + * returned by recvmsg when the RPC completes. Typically used to > + * locate app-specific info about the RPC. > + */ > + uint64_t completion_cookie; > +}; > + > +#if !defined(__cplusplus) > +_Static_assert(sizeof(struct homa_sendmsg_args) >= 16, > + "homa_sendmsg_args shrunk"); > +_Static_assert(sizeof(struct homa_sendmsg_args) <= 16, > + "homa_sendmsg_args grew"); > +#endif > + > +/** > + * struct homa_recvmsg_args - Provides information needed by Homa's > + * recvmsg; passed to recvmsg using the msg_control field. > + */ > +struct homa_recvmsg_args { > + /** > + * @id: (in/out) Initially specifies the id of the desired RPC, or 0 > + * if any RPC is OK; returns the actual id received. > + */ > + uint64_t id; > + > + /** > + * @completion_cookie: (out) If the incoming message is a response, > + * this will return the completion cookie specified when the > + * request was sent. For requests this will always be zero. > + */ > + uint64_t completion_cookie; > + > + /** > + * @flags: (in) OR-ed combination of bits that control the operation. > + * See below for values. > + */ > + uint32_t flags; > + > + /** > + * @num_bpages: (in/out) Number of valid entries in @bpage_offsets. > + * Passes in bpages from previous messages that can now be > + * recycled; returns bpages from the new message. > + */ > + uint32_t num_bpages; > + > + /** > + * @bpage_offsets: (in/out) Each entry is an offset into the buffer > + * region for the socket pool. When returned from recvmsg, the > + * offsets indicate where fragments of the new message are stored. All > + * entries but the last refer to full buffer pages (HOMA_BPAGE_SIZE > + * bytes) and are bpage-aligned. The last entry may refer to a bpage > + * fragment and is not necessarily aligned. The application now owns > + * these bpages and must eventually return them to Homa, using > + * bpage_offsets in a future recvmsg invocation. > + */ > + uint32_t bpage_offsets[HOMA_MAX_BPAGES]; > +}; > + > +#if !defined(__cplusplus) > +_Static_assert(sizeof(struct homa_recvmsg_args) >= 88, > + "homa_recvmsg_args shrunk"); > +_Static_assert(sizeof(struct homa_recvmsg_args) <= 88, > + "homa_recvmsg_args grew"); > +#endif > + > +/* Flag bits for homa_recvmsg_args.flags (see man page for documentation): > + */ > +#define HOMA_RECVMSG_REQUEST 0x01 > +#define HOMA_RECVMSG_RESPONSE 0x02 > +#define HOMA_RECVMSG_NONBLOCKING 0x04 > +#define HOMA_RECVMSG_VALID_FLAGS 0x07 > + > +/** define SO_HOMA_RCVBUF - setsockopt option for specifying buffer region. */ > +#define SO_HOMA_RCVBUF 10 > + > +/** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. */ > +struct homa_rcvbuf_args { > + /** @start: First byte of buffer region. */ > + void *start; I'm not sure if pointers are legal in uAPI. I *think* we are supposed to use __aligned_u64, because pointers will be different size for 32b binaries running in compat mode on 64b kernels, or some such. > + /** @length: Total number of bytes available at @start. */ > + size_t length; > +}; > + > +/* Meanings of the bits in Homa's flag word, which can be set using > + * "sysctl /net/homa/flags". > + */ > + > +/** > + * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism: > + * always send all packets immediately. > + */ Also makes kernel-doc unhappy: include/uapi/linux/homa.h:159: warning: expecting prototype for HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism(). Prototype was for HOMA_FLAG_DONT_THROTTLE() instead Note that next patch adds more kernel-doc warnings, you probably want to TAL at those as well. Use ./scripts/kernel-doc -none -Wall $file > +#define HOMA_FLAG_DONT_THROTTLE 2 > + > +/* I/O control calls on Homa sockets. These are mapped into the > + * SIOCPROTOPRIVATE range of 0x89e0 through 0x89ef. > + */ > + > +#define HOMAIOCFREEZE _IO(0x89, 0xef) > + > +#ifdef __cplusplus > +} > +#endif
On Wed, Dec 18, 2024 at 5:43 PM Jakub Kicinski <kuba@kernel.org> wrote: > > On Mon, 16 Dec 2024 16:06:14 -0800 John Ousterhout wrote: > > +#ifdef __cplusplus > > +extern "C" > > +{ > > +#endif > > I'm not aware of any networking header wrapped in extern "C" > Let's not make this precedent? Without this I don't seem to be able to use this header in C++ files: I end up getting linker errors such as 'undefined reference to `homa_replyv(int, iovec const*, int, sockaddr const*, unsigned int, unsigned long)'. Any suggestions on how to make the header file work with C++ files without the #ifdef __cplusplus? > > +/** > > + * define HOMA_MIN_DEFAULT_PORT - The 16-bit port space is divided into > > + * two nonoverlapping regions. Ports 1-32767 are reserved exclusively > > + * for well-defined server ports. The remaining ports are used for client > > + * ports; these are allocated automatically by Homa. Port 0 is reserved. > > + */ > > +#define HOMA_MIN_DEFAULT_PORT 0x8000 > > Not sure why but ./scripts/kernel-doc does not like this: > > include/uapi/linux/homa.h:51: warning: expecting prototype for HOMA_MIN_DEFAULT_PORT - The 16(). Prototype was for HOMA_MIN_DEFAULT_PORT() instead I saw this warning from kernel-doc before I posted the patch, but I couldn't figure out why it is happening. After staring at the error message some more I figured it out: kernel-doc is getting confused by the "-" in "16-bit" (it seems to use the last "-" on the line rather than the first). I've modified the comment to replace "16-bit" with "16 bit" and filed a bug report for kernel-doc. > > +/** > > + * struct homa_sendmsg_args - Provides information needed by Homa's > > + * sendmsg; passed to sendmsg using the msg_control field. > > + */ > > +struct homa_sendmsg_args { > > + /** > > + * @id: (in/out) An initial value of 0 means a new request is > > + * being sent; nonzero means the message is a reply to the given > > + * id. If the message is a request, then the value is modified to > > + * hold the id of the new RPC. > > + */ > > + uint64_t id; > > Please use Linux uapi types, __u64 Done. In the process I discovered that the underlying type for __u64 is not the same as that for uint64_t; this results in awkwardness in programs that use both... > > +/** define SO_HOMA_RCVBUF - setsockopt option for specifying buffer region. */ > > +#define SO_HOMA_RCVBUF 10 > > + > > +/** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. */ > > +struct homa_rcvbuf_args { > > + /** @start: First byte of buffer region. */ > > + void *start; > > I'm not sure if pointers are legal in uAPI. > I *think* we are supposed to use __aligned_u64, because pointers > will be different size for 32b binaries running in compat mode > on 64b kernels, or some such. I see that "void *" is used in the declaration for struct msghdr (along with some other pointer types as well) and struct msghdr is part of several uAPI interfaces, no? > > +/** > > + * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism: > > + * always send all packets immediately. > > + */ > > Also makes kernel-doc unhappy: > > include/uapi/linux/homa.h:159: warning: expecting prototype for HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism(). Prototype was for HOMA_FLAG_DONT_THROTTLE() instead It seems that the ":" also confuses kernel-doc. I've worked around this as well. > Note that next patch adds more kernel-doc warnings, you probably want > to TAL at those as well. Use > > ./scripts/kernel-doc -none -Wall $file Hmm, I did run kernel-doc before posting the patch, but maybe I missed some stuff. I'll take another look. There are a few things kernel-doc complained about where the requested documentation would add no useful information; it would just end up repeating what is already obvious from the code. Is any discretion allowed for cases like this? If the expectation is that there will be zero kernel-doc complaints, then I'll go ahead and add the useless documentation. > > +#ifdef __cplusplus > > +} > > +#endif > -- > pw-bot: cr I'm not sure what "pw-bot: cr" means; I assume this is related to the "#ifdef __cplusplus" discussion above? Thanks for the comments. -John-
On 12/19/24 02:43, Jakub Kicinski wrote: > On Mon, 16 Dec 2024 16:06:14 -0800 John Ousterhout wrote: >> + * define HOMA_MIN_DEFAULT_PORT - The 16-bit port space is divided into >> + * two nonoverlapping regions. Ports 1-32767 are reserved exclusively >> + * for well-defined server ports. The remaining ports are used for client >> + * ports; these are allocated automatically by Homa. Port 0 is reserved. >> + */ >> +#define HOMA_MIN_DEFAULT_PORT 0x8000 > > Not sure why but ./scripts/kernel-doc does not like this: > > include/uapi/linux/homa.h:51: warning: expecting prototype for HOMA_MIN_DEFAULT_PORT - The 16(). Prototype was for HOMA_MIN_DEFAULT_PORT() instead > >> +/** >> + * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism: >> + * always send all packets immediately. >> + */ > > Also makes kernel-doc unhappy: > > include/uapi/linux/homa.h:159: warning: expecting prototype for HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism(). Prototype was for HOMA_FLAG_DONT_THROTTLE() instead > > Note that next patch adds more kernel-doc warnings, you probably want > to TAL at those as well. Use > > ./scripts/kernel-doc -none -Wall $file > turns out that when you have the word "define" at the front and then a colon (:) or two dashes (-) it complains (?!!) my advice is to remove the word "define" from the first kdoc line, as it does not happen to bring any value (the emitted doc still has a "function" instead of "define" used as a type)
On 12/19/24 19:57, John Ousterhout wrote: > On Wed, Dec 18, 2024 at 5:43 PM Jakub Kicinski <kuba@kernel.org> wrote: >> >> On Mon, 16 Dec 2024 16:06:14 -0800 John Ousterhout wrote: >>> +#ifdef __cplusplus >>> +} >>> +#endif >> -- >> pw-bot: cr > > I'm not sure what "pw-bot: cr" means; I assume this is related to the > "#ifdef __cplusplus" discussion above? it's a shortcut for: Patchwork Bot, please mark that as "ChangesRequested" [by Maintainer] C++ program could simply wrap all includes by extern "C" section not sure what is better, but there is very little precedence for any courtesy for C++ in the kernel ATM > > Thanks for the comments. > > -John- >
> > -- > > pw-bot: cr > > I'm not sure what "pw-bot: cr" means; I assume this is related to the > "#ifdef __cplusplus" discussion above? It is documented here: https://www.kernel.org/doc/html/latest/process/maintainer-netdev.html#updating-patch-status although the documentation spells it out in full, but the patchworks bot understands a few abbreviations like cr for change-request. Marking a patchset as changes-requested will drop it out of the list of patches waiting to be merged, so included it at the end of a review comment can save the Maintainers some time. Andrew
On Thu, 19 Dec 2024 10:57:22 -0800 John Ousterhout wrote: > On Wed, Dec 18, 2024 at 5:43 PM Jakub Kicinski <kuba@kernel.org> wrote: > > > > On Mon, 16 Dec 2024 16:06:14 -0800 John Ousterhout wrote: > > > +#ifdef __cplusplus > > > +extern "C" > > > +{ > > > +#endif > > > > I'm not aware of any networking header wrapped in extern "C" > > Let's not make this precedent? > > Without this I don't seem to be able to use this header in C++ files: > I end up getting linker errors such as 'undefined reference to > `homa_replyv(int, iovec const*, int, sockaddr const*, unsigned int, > unsigned long)'. No idea TBH, I don't see homa_reply anywhere in the submission > Any suggestions on how to make the header file work with C++ files > without the #ifdef __cplusplus? With the little C++ understanding I have, I _think_ the include site can wrap: extern "C" { #include "<linux/homa.h>" } > > > +/** > > > + * define HOMA_MIN_DEFAULT_PORT - The 16-bit port space is divided into > > > + * two nonoverlapping regions. Ports 1-32767 are reserved exclusively > > > + * for well-defined server ports. The remaining ports are used for client > > > + * ports; these are allocated automatically by Homa. Port 0 is reserved. > > > + */ > > > +#define HOMA_MIN_DEFAULT_PORT 0x8000 > > > > Not sure why but ./scripts/kernel-doc does not like this: > > > > include/uapi/linux/homa.h:51: warning: expecting prototype for HOMA_MIN_DEFAULT_PORT - The 16(). Prototype was for HOMA_MIN_DEFAULT_PORT() instead > > I saw this warning from kernel-doc before I posted the patch, but I > couldn't figure out why it is happening. After staring at the error > message some more I figured it out: kernel-doc is getting confused by > the "-" in "16-bit" (it seems to use the last "-" on the line rather > than the first). I've modified the comment to replace "16-bit" with > "16 bit" and filed a bug report for kernel-doc. Unless you're planning to render the docs on docs.kernel.org you can just switch from /** to /* comments. IMVHO kdoc is overused, unless there's full documentation with API rendered like https://www.kernel.org/doc/html/latest/networking/net_dim.html kdoc is more trouble than gain. > > > +/** define SO_HOMA_RCVBUF - setsockopt option for specifying buffer region. */ > > > +#define SO_HOMA_RCVBUF 10 > > > + > > > +/** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. */ > > > +struct homa_rcvbuf_args { > > > + /** @start: First byte of buffer region. */ > > > + void *start; > > > > I'm not sure if pointers are legal in uAPI. > > I *think* we are supposed to use __aligned_u64, because pointers > > will be different size for 32b binaries running in compat mode > > on 64b kernels, or some such. > > I see that "void *" is used in the declaration for struct msghdr > (along with some other pointer types as well) and struct msghdr is > part of several uAPI interfaces, no? Off the top off my head this use is a source of major pain, grep around for compat_msghdr. > > > +/** > > > + * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism: > > > + * always send all packets immediately. > > > + */ > > > > Also makes kernel-doc unhappy: > > > > include/uapi/linux/homa.h:159: warning: expecting prototype for HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism(). Prototype was for HOMA_FLAG_DONT_THROTTLE() instead > > It seems that the ":" also confuses kernel-doc. I've worked around this as well. > > > Note that next patch adds more kernel-doc warnings, you probably want > > to TAL at those as well. Use > > > > ./scripts/kernel-doc -none -Wall $file > > Hmm, I did run kernel-doc before posting the patch, but maybe I missed > some stuff. I'll take another look. There are a few things kernel-doc > complained about where the requested documentation would add no useful > information; it would just end up repeating what is already obvious > from the code. Is any discretion allowed for cases like this? If the > expectation is that there will be zero kernel-doc complaints, then > I'll go ahead and add the useless documentation. My recommendation is to use normal comments where kdoc just repeats obvious stuff. All these warnings sooner or later will result in some semi-automated and often poor quality patch submissions to "fix" it. Which is just work for maintainers to deal with :(
On Thu, Dec 19, 2024 at 2:06 PM Przemek Kitszel <przemyslaw.kitszel@intel.com> wrote: > > On 12/19/24 19:57, John Ousterhout wrote: > > On Wed, Dec 18, 2024 at 5:43 PM Jakub Kicinski <kuba@kernel.org> wrote: > >> > >> On Mon, 16 Dec 2024 16:06:14 -0800 John Ousterhout wrote: > > > >>> +#ifdef __cplusplus > >>> +} > >>> +#endif > >> -- > >> pw-bot: cr > > > > I'm not sure what "pw-bot: cr" means; I assume this is related to the > > "#ifdef __cplusplus" discussion above? > > it's a shortcut for: > Patchwork Bot, please mark that as "ChangesRequested" [by Maintainer] > > C++ program could simply wrap all includes by extern "C" section > > not sure what is better, but there is very little precedence for any > courtesy for C++ in the kernel ATM I have now removed the 'extern "C"' from homa.h and moved it to all of the programs that #include it. -John-
On Thu, Dec 19, 2024 at 5:41 PM Jakub Kicinski <kuba@kernel.org> wrote: > > > Any suggestions on how to make the header file work with C++ files > > without the #ifdef __cplusplus? > > With the little C++ understanding I have, I _think_ the include site > can wrap: > > extern "C" { > #include "<linux/homa.h>" > } I have done this now. I was hesitant to do that because it seemed like it was creating unnecessary extra work for anyone who uses homa.h in a C++ program, but since this seems to be the convention I have conformed to it. > > > > +/** define SO_HOMA_RCVBUF - setsockopt option for specifying buffer region. */ > > > > +#define SO_HOMA_RCVBUF 10 > > > > + > > > > +/** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. */ > > > > +struct homa_rcvbuf_args { > > > > + /** @start: First byte of buffer region. */ > > > > + void *start; > > > > > > I'm not sure if pointers are legal in uAPI. > > > I *think* we are supposed to use __aligned_u64, because pointers > > > will be different size for 32b binaries running in compat mode > > > on 64b kernels, or some such. > > > > I see that "void *" is used in the declaration for struct msghdr > > (along with some other pointer types as well) and struct msghdr is > > part of several uAPI interfaces, no? > > Off the top off my head this use is a source of major pain, grep around > for compat_msghdr. How should I go about confirming that this __aligned_u64 is indeed the expected convention (sounds like you aren't certain)? Also, any idea why it needs to be aligned rather than just __u64? > My recommendation is to use normal comments where kdoc just repeats > obvious stuff. All these warnings sooner or later will result in some > semi-automated and often poor quality patch submissions to "fix" it. > Which is just work for maintainers to deal with :( I have done this now. I had assumed that kdoc would also complain if there was a declaration without official kdoc documentation, but now that I see that it won't, I'll use that approach for places where kdoc style is awkward. Personally I'm agnostic about whether to use kdoc; I assumed that I should use it since it exists. -John-
On Fri, 20 Dec 2024 09:59:53 -0800 John Ousterhout wrote: > > > I see that "void *" is used in the declaration for struct msghdr > > > (along with some other pointer types as well) and struct msghdr is > > > part of several uAPI interfaces, no? > > > > Off the top off my head this use is a source of major pain, grep around > > for compat_msghdr. > > How should I go about confirming that this __aligned_u64 is indeed the > expected convention (sounds like you aren't certain)? Let me add Arnd Bergmann to the CC list, he will correct me if I'm wrong. Otherwise you can trust my intuition :) > Also, any idea why it needs to be aligned rather than just __u64? The main problem is that if __u64 doesn't force alignment on a 32b version of a platform the struct may have holes and padding in different places on 32b vs 64b compat. Double checking, I don't think that's the case for your structs, so you can most likely go with a plain __u64.
On Fri, Dec 20, 2024, at 20:31, Jakub Kicinski wrote: > On Fri, 20 Dec 2024 09:59:53 -0800 John Ousterhout wrote: >> > > I see that "void *" is used in the declaration for struct msghdr >> > > (along with some other pointer types as well) and struct msghdr is >> > > part of several uAPI interfaces, no? >> > >> > Off the top off my head this use is a source of major pain, grep around >> > for compat_msghdr. >> >> How should I go about confirming that this __aligned_u64 is indeed the >> expected convention (sounds like you aren't certain)? > > Let me add Arnd Bergmann to the CC list, he will correct me if > I'm wrong. Otherwise you can trust my intuition :) You are right that for the purposes of the user API, structures should use __u64 or __aligned_u64 in place of pointers, there are some more details on this in Documentation/driver-api/ioctl.rst. What worries me more in this particular case is the way that this pointer is passed through setsockopt(), which really doesn't take any pointers in other protocols. I have not fully understood what is behind the pointer, but it looks like this gets stored in the kernel in a per-socket structure that is annotated as a kernel pointer, not a user pointer, which may cause additional problems. I don't know if the same pointer ever points to a kernel structure, but if it does, that needs to be fixed first. Assuming this is actually meant as a persistent __user pointer, I'm still unsure what this means if the socket is available to more than one process, e.g. through a fork() or explicit file descriptor passing, or if the original process dies while there is still a transfer in progress. I realize that there is a lot of information already out there that I haven't all read, so this is probably explained somewhere, but it would be nice to point to that documentation somewhere near the code to clarify the corner cases. That probably also explains what type of memory the __user buffer can point to, but I would like to make sure that this has well-defined behavior e.g. if that buffer is an mmap()ed file on NFS that was itself mounted over a homa socket. Is there any guarantee that this is either prohibited or is free of deadlocks and recursion? Arnd
On Fri, Dec 20, 2024 at 09:59:53AM -0800, John Ousterhout wrote: > On Thu, Dec 19, 2024 at 5:41 PM Jakub Kicinski <kuba@kernel.org> wrote: > > > > > Any suggestions on how to make the header file work with C++ files > > > without the #ifdef __cplusplus? > > > > With the little C++ understanding I have, I _think_ the include site > > can wrap: > > > > extern "C" { > > #include "<linux/homa.h>" > > } > > I have done this now. I was hesitant to do that because it seemed like > it was creating unnecessary extra work for anyone who uses homa.h in a > C++ program, but since this seems to be the convention I have > conformed to it. Do C++ programmes directly use this header, or is there a library in the middle? Often kernel APIs are too low level for an application to use directly, and a library is used to provide a higher level API. That would allow you to provide a different header file which is more C++ friendly. Andrew
On Fri, Dec 20, 2024 at 1:13 PM Arnd Bergmann <arnd@arndb.de> wrote: > > On Fri, Dec 20, 2024, at 20:31, Jakub Kicinski wrote: > > On Fri, 20 Dec 2024 09:59:53 -0800 John Ousterhout wrote: > >> > > I see that "void *" is used in the declaration for struct msghdr > >> > > (along with some other pointer types as well) and struct msghdr is > >> > > part of several uAPI interfaces, no? > >> > > >> > Off the top off my head this use is a source of major pain, grep around > >> > for compat_msghdr. > >> > >> How should I go about confirming that this __aligned_u64 is indeed the > >> expected convention (sounds like you aren't certain)? > > > > Let me add Arnd Bergmann to the CC list, he will correct me if > > I'm wrong. Otherwise you can trust my intuition :) > > You are right that for the purposes of the user API, structures > should use __u64 or __aligned_u64 in place of pointers, there are > some more details on this in Documentation/driver-api/ioctl.rst. I have now changed the type from void * to __u64. > What worries me more in this particular case is the way that > this pointer is passed through setsockopt(), which really doesn't > take any pointers in other protocols. > > I have not fully understood what is behind the pointer, but > it looks like this gets stored in the kernel in a per-socket > structure that is annotated as a kernel pointer, not a user > pointer, which may cause additional problems. It is actually a user pointer; I had forgotten the __user annotation. I have fixed this now as well. > I don't know if the same pointer ever points to a kernel > structure, but if it does, that needs to be fixed first. It doesn't. > Assuming this is actually meant as a persistent __user > pointer, I'm still unsure what this means if the socket is > available to more than one process, e.g. through a fork() > or explicit file descriptor passing, or if the original > process dies while there is still a transfer in progress. > I realize that there is a lot of information already out > there that I haven't all read, so this is probably explained > somewhere, but it would be nice to point to that documentation > somewhere near the code to clarify the corner cases. I hadn't considered this, but the buffering mechanism prevents the same socket from being shared across processes. I'm okay with that: I'm not sure that sharing between processes adds much value for Homa, and the performance benefit from the buffer mechanism is quite large. I will document this. Is there a way to prevent a socket from being shared across processes (e.g. can I set close-on-exec from within the kernel?) I don't think there is any risk to kernel integrity if the socket does end up shared; the worst that will happen is that the memory of one of the processes will get trashed because Homa will write to memory that isn't actually buffer space in that process. > That probably also explains what type of memory the > __user buffer can point to, but I would like to make > sure that this has well-defined behavior e.g. if that > buffer is an mmap()ed file on NFS that was itself > mounted over a homa socket. Is there any guarantee that > this is either prohibited or is free of deadlocks and > recursion? Given the API incompatibilities between Homa and TCP, I don't think it is possible to have NFS mounted over a Homa socket. But you raise the issue of whether some kinds of addresses might not be suitable for Homa's buffer use this way. I don't know enough about the various possible kinds of memory to know what kinds of problems could occur. My assumption is that the buffer area will be a simple mmap()ed region. The only use Homa makes of the buffer address is to call import_ubuf with addresses in the buffer region, followed by skb_copy_datagram_iter with the resulting iov_iter. Is there some way I can check the "kind" of memory behind the buffer pointer, so Homa could reject anything other than the simple case? However, even if Homa checks when it sets up the buffer region, the application could always rearrange its memory later, so I'm not sure this would be effective. Also, if there is some sort of weird memory that would cause problems for Homa, couldn't an application pass this same memory into a call to recvmsg on a normal socket? Given that Homa's use of the address is similar to what already happens in other sockets, I don't think that the fact that Homa keeps the address around in the kernel for a long time should cause additional problems, no? -John-
On Fri, Dec 20, 2024 at 3:42 PM John Ousterhout <ouster@cs.stanford.edu> wrote: > > I hadn't considered this, but the buffering mechanism prevents the > same socket from being shared across processes. It just occurred to me that a Homa socket should be sharable by multiple processes as long as the buffer region is also shared at the same virtual address in each process. -John-
On Sat, Dec 21, 2024, at 00:42, John Ousterhout wrote: > On Fri, Dec 20, 2024 at 1:13 PM Arnd Bergmann <arnd@arndb.de> wrote: >> Assuming this is actually meant as a persistent __user >> pointer, I'm still unsure what this means if the socket is >> available to more than one process, e.g. through a fork() >> or explicit file descriptor passing, or if the original >> process dies while there is still a transfer in progress. >> I realize that there is a lot of information already out >> there that I haven't all read, so this is probably explained >> somewhere, but it would be nice to point to that documentation >> somewhere near the code to clarify the corner cases. > > I hadn't considered this, but the buffering mechanism prevents the > same socket from being shared across processes. I'm okay with that: > I'm not sure that sharing between processes adds much value for Homa, > and the performance benefit from the buffer mechanism is quite large. > I will document this. Is there a way to prevent a socket from being > shared across processes (e.g. can I set close-on-exec from within the > kernel?) I don't think there is any risk to kernel integrity if the > socket does end up shared; the worst that will happen is that the > memory of one of the processes will get trashed because Homa will > write to memory that isn't actually buffer space in that process. It would definitely be nicer to ensure that it's only available for a particular 'struct mm'. Setting O_CLOEXEC is probably not be enough since this does not close the fd on a fork without exec, and does not prevent the flag from being reset through fcntl(). Maybe see what io_uring() does to handle userspace pointers here, I think the problem is quite similar there. >> That probably also explains what type of memory the >> __user buffer can point to, but I would like to make >> sure that this has well-defined behavior e.g. if that >> buffer is an mmap()ed file on NFS that was itself >> mounted over a homa socket. Is there any guarantee that >> this is either prohibited or is free of deadlocks and >> recursion? > > Given the API incompatibilities between Homa and TCP, I don't think it > is possible to have NFS mounted over a Homa socket. But you raise the > issue of whether some kinds of addresses might not be suitable for > Homa's buffer use this way. I don't know enough about the various > possible kinds of memory to know what kinds of problems could occur. > My assumption is that the buffer area will be a simple mmap()ed > region. The only use Homa makes of the buffer address is to call > import_ubuf with addresses in the buffer region, followed by > skb_copy_datagram_iter with the resulting iov_iter. Right, NFS was just an example, but there are other interesting cases. You certainly have to deal with buffers in userspace memory that are blocked indefinitely. Another interesting case is memory that has additional constraints, e.g. the MMIO space of a PCI device like a GPU, which may fault when writing data into it, or which cannot be mapped into the DMA space of a network device. > Is there some way I can check the "kind" of memory behind the buffer > pointer, so Homa could reject anything other than the simple case? I don't think so. I still don't know what the exact constraints are that you have here, but I suspect this would all be a lot simpler if you could change the interface to not pass arbitrary user addresses but instead have a single file descriptor that backs the buffers, either by passing a tmpfs/hugetlbfs file into the socket instead of a pointer, or by using mmap() on the socket to map it into userspace like we do for packet sockets. Arnd
On Sat, Dec 21, 2024 at 5:43 AM Arnd Bergmann <arnd@arndb.de> wrote: > > >> That probably also explains what type of memory the > >> __user buffer can point to, but I would like to make > >> sure that this has well-defined behavior e.g. if that > >> buffer is an mmap()ed file on NFS that was itself > >> mounted over a homa socket. Is there any guarantee that > >> this is either prohibited or is free of deadlocks and > >> recursion? > > > > Given the API incompatibilities between Homa and TCP, I don't think it > > is possible to have NFS mounted over a Homa socket. But you raise the > > issue of whether some kinds of addresses might not be suitable for > > Homa's buffer use this way. I don't know enough about the various > > possible kinds of memory to know what kinds of problems could occur. > > My assumption is that the buffer area will be a simple mmap()ed > > region. The only use Homa makes of the buffer address is to call > > import_ubuf with addresses in the buffer region, followed by > > skb_copy_datagram_iter with the resulting iov_iter. > > Right, NFS was just an example, but there are other interesting > cases. You certainly have to deal with buffers in userspace > memory that are blocked indefinitely. Another interesting case > is memory that has additional constraints, e.g. the MMIO > space of a PCI device like a GPU, which may fault when writing > data into it, or which cannot be mapped into the DMA space > of a network device. Homa doesn't map the user receive buffers into DMA space, so this last issue won't come up. > > Is there some way I can check the "kind" of memory behind the buffer > > pointer, so Homa could reject anything other than the simple case? > > I don't think so. I still don't know what the exact constraints > are that you have here, but I suspect this would all be a lot > simpler if you could change the interface to not pass arbitrary > user addresses but instead have a single file descriptor that > backs the buffers, either by passing a tmpfs/hugetlbfs file into > the socket instead of a pointer, or by using mmap() on the > socket to map it into userspace like we do for packet sockets. After thinking about this some more, I don't think there should be any need for Homa to constrain the addresses passed into it. Homa's usage of buffer addresses is the same as TCP's; the only difference is that instead of receiving a buffer address in a recvmsg call, the address is passed in earlier in a setsockopt call. In either case, the application could pass in an arbitrary address. Assuming that TCP properly handles all of the possible variations in addresses that could be passed in, Homa should properly handle them as well, since it invokes the same underlying functions. Is there a specific example you have in mind of an address that would be problematic for Homa but not also problematic for TCP? -John-
diff --git a/MAINTAINERS b/MAINTAINERS index 1389704c7d8d..935d1e995018 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10391,6 +10391,13 @@ F: lib/test_hmm* F: mm/hmm* F: tools/testing/selftests/mm/*hmm* +HOMA TRANSPORT PROTOCOL +M: John Ousterhout <ouster@cs.stanford.edu> +S: Maintained +W: https://homa-transport.atlassian.net/wiki/spaces/HOMA/overview +F: include/uapi/linux/homa.h +F: net/homa/ + HONEYWELL HSC030PA PRESSURE SENSOR SERIES IIO DRIVER M: Petre Rodan <petre.rodan@subdimension.ro> L: linux-iio@vger.kernel.org diff --git a/include/uapi/linux/homa.h b/include/uapi/linux/homa.h new file mode 100644 index 000000000000..555efa600b3f --- /dev/null +++ b/include/uapi/linux/homa.h @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ + +/* This file defines the kernel call interface for the Homa + * transport protocol. + */ + +#ifndef _UAPI_LINUX_HOMA_H +#define _UAPI_LINUX_HOMA_H + +#include <linux/types.h> +#ifndef __KERNEL__ +#include <netinet/in.h> +#include <sys/socket.h> +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + +/* IANA-assigned Internet Protocol number for Homa. */ +#define IPPROTO_HOMA 146 + +/** + * define HOMA_MAX_MESSAGE_LENGTH - Maximum bytes of payload in a Homa + * request or response message. + */ +#define HOMA_MAX_MESSAGE_LENGTH 1000000 + +/** + * define HOMA_BPAGE_SIZE - Number of bytes in pages used for receive + * buffers. Must be power of two. + */ +#define HOMA_BPAGE_SIZE (1 << HOMA_BPAGE_SHIFT) +#define HOMA_BPAGE_SHIFT 16 + +/** + * define HOMA_MAX_BPAGES - The largest number of bpages that will be required + * to store an incoming message. + */ +#define HOMA_MAX_BPAGES ((HOMA_MAX_MESSAGE_LENGTH + HOMA_BPAGE_SIZE - 1) \ + >> HOMA_BPAGE_SHIFT) + +/** + * define HOMA_MIN_DEFAULT_PORT - The 16-bit port space is divided into + * two nonoverlapping regions. Ports 1-32767 are reserved exclusively + * for well-defined server ports. The remaining ports are used for client + * ports; these are allocated automatically by Homa. Port 0 is reserved. + */ +#define HOMA_MIN_DEFAULT_PORT 0x8000 + +/** + * struct homa_sendmsg_args - Provides information needed by Homa's + * sendmsg; passed to sendmsg using the msg_control field. + */ +struct homa_sendmsg_args { + /** + * @id: (in/out) An initial value of 0 means a new request is + * being sent; nonzero means the message is a reply to the given + * id. If the message is a request, then the value is modified to + * hold the id of the new RPC. + */ + uint64_t id; + + /** + * @completion_cookie: (in) Used only for request messages; will be + * returned by recvmsg when the RPC completes. Typically used to + * locate app-specific info about the RPC. + */ + uint64_t completion_cookie; +}; + +#if !defined(__cplusplus) +_Static_assert(sizeof(struct homa_sendmsg_args) >= 16, + "homa_sendmsg_args shrunk"); +_Static_assert(sizeof(struct homa_sendmsg_args) <= 16, + "homa_sendmsg_args grew"); +#endif + +/** + * struct homa_recvmsg_args - Provides information needed by Homa's + * recvmsg; passed to recvmsg using the msg_control field. + */ +struct homa_recvmsg_args { + /** + * @id: (in/out) Initially specifies the id of the desired RPC, or 0 + * if any RPC is OK; returns the actual id received. + */ + uint64_t id; + + /** + * @completion_cookie: (out) If the incoming message is a response, + * this will return the completion cookie specified when the + * request was sent. For requests this will always be zero. + */ + uint64_t completion_cookie; + + /** + * @flags: (in) OR-ed combination of bits that control the operation. + * See below for values. + */ + uint32_t flags; + + /** + * @num_bpages: (in/out) Number of valid entries in @bpage_offsets. + * Passes in bpages from previous messages that can now be + * recycled; returns bpages from the new message. + */ + uint32_t num_bpages; + + /** + * @bpage_offsets: (in/out) Each entry is an offset into the buffer + * region for the socket pool. When returned from recvmsg, the + * offsets indicate where fragments of the new message are stored. All + * entries but the last refer to full buffer pages (HOMA_BPAGE_SIZE + * bytes) and are bpage-aligned. The last entry may refer to a bpage + * fragment and is not necessarily aligned. The application now owns + * these bpages and must eventually return them to Homa, using + * bpage_offsets in a future recvmsg invocation. + */ + uint32_t bpage_offsets[HOMA_MAX_BPAGES]; +}; + +#if !defined(__cplusplus) +_Static_assert(sizeof(struct homa_recvmsg_args) >= 88, + "homa_recvmsg_args shrunk"); +_Static_assert(sizeof(struct homa_recvmsg_args) <= 88, + "homa_recvmsg_args grew"); +#endif + +/* Flag bits for homa_recvmsg_args.flags (see man page for documentation): + */ +#define HOMA_RECVMSG_REQUEST 0x01 +#define HOMA_RECVMSG_RESPONSE 0x02 +#define HOMA_RECVMSG_NONBLOCKING 0x04 +#define HOMA_RECVMSG_VALID_FLAGS 0x07 + +/** define SO_HOMA_RCVBUF - setsockopt option for specifying buffer region. */ +#define SO_HOMA_RCVBUF 10 + +/** struct homa_rcvbuf_args - setsockopt argument for SO_HOMA_RCVBUF. */ +struct homa_rcvbuf_args { + /** @start: First byte of buffer region. */ + void *start; + + /** @length: Total number of bytes available at @start. */ + size_t length; +}; + +/* Meanings of the bits in Homa's flag word, which can be set using + * "sysctl /net/homa/flags". + */ + +/** + * define HOMA_FLAG_DONT_THROTTLE - disable the output throttling mechanism: + * always send all packets immediately. + */ +#define HOMA_FLAG_DONT_THROTTLE 2 + +/* I/O control calls on Homa sockets. These are mapped into the + * SIOCPROTOPRIVATE range of 0x89e0 through 0x89ef. + */ + +#define HOMAIOCFREEZE _IO(0x89, 0xef) + +#ifdef __cplusplus +} +#endif + +#endif /* _UAPI_LINUX_HOMA_H */
Note: for man pages, see the Homa Wiki at: https://homa-transport.atlassian.net/wiki/spaces/HOMA/overview Signed-off-by: John Ousterhout <ouster@cs.stanford.edu> --- MAINTAINERS | 7 ++ include/uapi/linux/homa.h | 170 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+) create mode 100644 include/uapi/linux/homa.h