diff mbox series

[v6,18/21] fanotify: Emit generic error info type for error event

Message ID 20210812214010.3197279-19-krisman@collabora.com (mailing list archive)
State New, archived
Headers show
Series File system wide monitoring | expand

Commit Message

Gabriel Krisman Bertazi Aug. 12, 2021, 9:40 p.m. UTC
The Error info type is a record sent to users on FAN_FS_ERROR events
documenting the type of error.  It also carries an error count,
documenting how many errors were observed since the last reporting.

Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>

---
Changes since v5:
  - Move error code here
---
 fs/notify/fanotify/fanotify.c      |  1 +
 fs/notify/fanotify/fanotify.h      |  1 +
 fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++++++++
 include/uapi/linux/fanotify.h      |  7 ++++++
 4 files changed, 45 insertions(+)

Comments

Amir Goldstein Aug. 13, 2021, 8:47 a.m. UTC | #1
On Fri, Aug 13, 2021 at 12:41 AM Gabriel Krisman Bertazi
<krisman@collabora.com> wrote:
>
> The Error info type is a record sent to users on FAN_FS_ERROR events
> documenting the type of error.  It also carries an error count,
> documenting how many errors were observed since the last reporting.
>
> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
>

Reviewed-by: Amir Goldstein <amir73il@gmail.com>

> ---
> Changes since v5:
>   - Move error code here
> ---
>  fs/notify/fanotify/fanotify.c      |  1 +
>  fs/notify/fanotify/fanotify.h      |  1 +
>  fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++++++++
>  include/uapi/linux/fanotify.h      |  7 ++++++
>  4 files changed, 45 insertions(+)
>
> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> index f5c16ac37835..b49a474c1d7f 100644
> --- a/fs/notify/fanotify/fanotify.c
> +++ b/fs/notify/fanotify/fanotify.c
> @@ -745,6 +745,7 @@ static int fanotify_handle_error_event(struct fsnotify_iter_info *iter_info,
>         spin_unlock(&group->notification_lock);
>
>         fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR;
> +       fee->error = report->error;
>         fee->fsid = fee->sb_mark->fsn_mark.connector->fsid;
>
>         fh_len = fanotify_encode_fh_len(inode);
> diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
> index 158cf0c4b0bd..0cfe376c6fd9 100644
> --- a/fs/notify/fanotify/fanotify.h
> +++ b/fs/notify/fanotify/fanotify.h
> @@ -220,6 +220,7 @@ FANOTIFY_NE(struct fanotify_event *event)
>
>  struct fanotify_error_event {
>         struct fanotify_event fae;
> +       s32 error; /* Error reported by the Filesystem. */
>         u32 err_count; /* Suppressed errors count */
>
>         struct fanotify_sb_mark *sb_mark; /* Back reference to the mark. */
> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index 1ab8f9d8b3ac..ca53159ce673 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -107,6 +107,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
>  #define FANOTIFY_EVENT_ALIGN 4
>  #define FANOTIFY_INFO_HDR_LEN \
>         (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
> +#define FANOTIFY_INFO_ERROR_LEN \
> +       (sizeof(struct fanotify_event_info_error))
>
>  static int fanotify_fid_info_len(int fh_len, int name_len)
>  {
> @@ -130,6 +132,9 @@ static size_t fanotify_event_len(struct fanotify_event *event,
>         if (!fid_mode)
>                 return event_len;
>
> +       if (fanotify_is_error_event(event->mask))
> +               event_len += FANOTIFY_INFO_ERROR_LEN;
> +
>         info = fanotify_event_info(event);
>         dir_fh_len = fanotify_event_dir_fh_len(event);
>         fh_len = fanotify_event_object_fh_len(event);
> @@ -176,6 +181,7 @@ static struct fanotify_event *fanotify_dup_error_to_stack(
>         error_on_stack->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR;
>         error_on_stack->err_count = fee->err_count;
>         error_on_stack->sb_mark = fee->sb_mark;
> +       error_on_stack->error = fee->error;
>
>         error_on_stack->fsid = fee->fsid;
>
> @@ -342,6 +348,28 @@ static int process_access_response(struct fsnotify_group *group,
>         return -ENOENT;
>  }
>
> +static size_t copy_error_info_to_user(struct fanotify_event *event,
> +                                     char __user *buf, int count)
> +{
> +       struct fanotify_event_info_error info;
> +       struct fanotify_error_event *fee = FANOTIFY_EE(event);
> +
> +       info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
> +       info.hdr.pad = 0;
> +       info.hdr.len = FANOTIFY_INFO_ERROR_LEN;
> +
> +       if (WARN_ON(count < info.hdr.len))
> +               return -EFAULT;
> +
> +       info.error = fee->error;
> +       info.error_count = fee->err_count;
> +
> +       if (copy_to_user(buf, &info, sizeof(info)))
> +               return -EFAULT;
> +
> +       return info.hdr.len;
> +}
> +
>  static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
>                              int info_type, const char *name, size_t name_len,
>                              char __user *buf, size_t count)
> @@ -505,6 +533,14 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
>         if (f)
>                 fd_install(fd, f);
>
> +       if (fanotify_is_error_event(event->mask)) {
> +               ret = copy_error_info_to_user(event, buf, count);
> +               if (ret < 0)
> +                       goto out_close_fd;
> +               buf += ret;
> +               count -= ret;
> +       }
> +
>         /* Event info records order is: dir fid + name, child fid */
>         if (fanotify_event_dir_fh_len(event)) {
>                 info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
> diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> index 16402037fc7a..80040a92e9d9 100644
> --- a/include/uapi/linux/fanotify.h
> +++ b/include/uapi/linux/fanotify.h
> @@ -124,6 +124,7 @@ struct fanotify_event_metadata {
>  #define FAN_EVENT_INFO_TYPE_FID                1
>  #define FAN_EVENT_INFO_TYPE_DFID_NAME  2
>  #define FAN_EVENT_INFO_TYPE_DFID       3
> +#define FAN_EVENT_INFO_TYPE_ERROR      4
>
>  /* Variable length info record following event metadata */
>  struct fanotify_event_info_header {
> @@ -149,6 +150,12 @@ struct fanotify_event_info_fid {
>         unsigned char handle[0];
>  };
>
> +struct fanotify_event_info_error {
> +       struct fanotify_event_info_header hdr;
> +       __s32 error;
> +       __u32 error_count;
> +};
> +
>  struct fanotify_response {
>         __s32 fd;
>         __u32 response;
> --
> 2.32.0
>
Jan Kara Aug. 16, 2021, 4:23 p.m. UTC | #2
On Thu 12-08-21 17:40:07, Gabriel Krisman Bertazi wrote:
> The Error info type is a record sent to users on FAN_FS_ERROR events
> documenting the type of error.  It also carries an error count,
> documenting how many errors were observed since the last reporting.
> 
> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> 
> ---
> Changes since v5:
>   - Move error code here
> ---
>  fs/notify/fanotify/fanotify.c      |  1 +
>  fs/notify/fanotify/fanotify.h      |  1 +
>  fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++++++++
>  include/uapi/linux/fanotify.h      |  7 ++++++
>  4 files changed, 45 insertions(+)
> 
> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> index f5c16ac37835..b49a474c1d7f 100644
> --- a/fs/notify/fanotify/fanotify.c
> +++ b/fs/notify/fanotify/fanotify.c
> @@ -745,6 +745,7 @@ static int fanotify_handle_error_event(struct fsnotify_iter_info *iter_info,
>  	spin_unlock(&group->notification_lock);
>  
>  	fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR;
> +	fee->error = report->error;
>  	fee->fsid = fee->sb_mark->fsn_mark.connector->fsid;
>  
>  	fh_len = fanotify_encode_fh_len(inode);
> diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
> index 158cf0c4b0bd..0cfe376c6fd9 100644
> --- a/fs/notify/fanotify/fanotify.h
> +++ b/fs/notify/fanotify/fanotify.h
> @@ -220,6 +220,7 @@ FANOTIFY_NE(struct fanotify_event *event)
>  
>  struct fanotify_error_event {
>  	struct fanotify_event fae;
> +	s32 error; /* Error reported by the Filesystem. */
>  	u32 err_count; /* Suppressed errors count */
>  
>  	struct fanotify_sb_mark *sb_mark; /* Back reference to the mark. */
> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index 1ab8f9d8b3ac..ca53159ce673 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -107,6 +107,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
>  #define FANOTIFY_EVENT_ALIGN 4
>  #define FANOTIFY_INFO_HDR_LEN \
>  	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
> +#define FANOTIFY_INFO_ERROR_LEN \
> +	(sizeof(struct fanotify_event_info_error))
>  
>  static int fanotify_fid_info_len(int fh_len, int name_len)
>  {
> @@ -130,6 +132,9 @@ static size_t fanotify_event_len(struct fanotify_event *event,
>  	if (!fid_mode)
>  		return event_len;
>  
> +	if (fanotify_is_error_event(event->mask))
> +		event_len += FANOTIFY_INFO_ERROR_LEN;
> +
>  	info = fanotify_event_info(event);
>  	dir_fh_len = fanotify_event_dir_fh_len(event);
>  	fh_len = fanotify_event_object_fh_len(event);
> @@ -176,6 +181,7 @@ static struct fanotify_event *fanotify_dup_error_to_stack(
>  	error_on_stack->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR;
>  	error_on_stack->err_count = fee->err_count;
>  	error_on_stack->sb_mark = fee->sb_mark;
> +	error_on_stack->error = fee->error;
>  
>  	error_on_stack->fsid = fee->fsid;
>  
> @@ -342,6 +348,28 @@ static int process_access_response(struct fsnotify_group *group,
>  	return -ENOENT;
>  }
>  
> +static size_t copy_error_info_to_user(struct fanotify_event *event,
> +				      char __user *buf, int count)
> +{
> +	struct fanotify_event_info_error info;
> +	struct fanotify_error_event *fee = FANOTIFY_EE(event);
> +
> +	info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
> +	info.hdr.pad = 0;
> +	info.hdr.len = FANOTIFY_INFO_ERROR_LEN;
> +
> +	if (WARN_ON(count < info.hdr.len))
> +		return -EFAULT;
> +
> +	info.error = fee->error;
> +	info.error_count = fee->err_count;
> +
> +	if (copy_to_user(buf, &info, sizeof(info)))
> +		return -EFAULT;
> +
> +	return info.hdr.len;
> +}
> +
>  static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
>  			     int info_type, const char *name, size_t name_len,
>  			     char __user *buf, size_t count)
> @@ -505,6 +533,14 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
>  	if (f)
>  		fd_install(fd, f);
>  
> +	if (fanotify_is_error_event(event->mask)) {
> +		ret = copy_error_info_to_user(event, buf, count);
> +		if (ret < 0)
> +			goto out_close_fd;
> +		buf += ret;
> +		count -= ret;
> +	}
> +
>  	/* Event info records order is: dir fid + name, child fid */
>  	if (fanotify_event_dir_fh_len(event)) {
>  		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
> diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> index 16402037fc7a..80040a92e9d9 100644
> --- a/include/uapi/linux/fanotify.h
> +++ b/include/uapi/linux/fanotify.h
> @@ -124,6 +124,7 @@ struct fanotify_event_metadata {
>  #define FAN_EVENT_INFO_TYPE_FID		1
>  #define FAN_EVENT_INFO_TYPE_DFID_NAME	2
>  #define FAN_EVENT_INFO_TYPE_DFID	3
> +#define FAN_EVENT_INFO_TYPE_ERROR	4
>  
>  /* Variable length info record following event metadata */
>  struct fanotify_event_info_header {
> @@ -149,6 +150,12 @@ struct fanotify_event_info_fid {
>  	unsigned char handle[0];
>  };
>  
> +struct fanotify_event_info_error {
> +	struct fanotify_event_info_header hdr;
> +	__s32 error;
> +	__u32 error_count;
> +};
> +
>  struct fanotify_response {
>  	__s32 fd;
>  	__u32 response;
> -- 
> 2.32.0
>
Darrick J. Wong Aug. 16, 2021, 9:41 p.m. UTC | #3
On Thu, Aug 12, 2021 at 05:40:07PM -0400, Gabriel Krisman Bertazi wrote:
> The Error info type is a record sent to users on FAN_FS_ERROR events
> documenting the type of error.  It also carries an error count,
> documenting how many errors were observed since the last reporting.
> 
> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> 
> ---
> Changes since v5:
>   - Move error code here
> ---
>  fs/notify/fanotify/fanotify.c      |  1 +
>  fs/notify/fanotify/fanotify.h      |  1 +
>  fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++++++++
>  include/uapi/linux/fanotify.h      |  7 ++++++
>  4 files changed, 45 insertions(+)

<snip>

> diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> index 16402037fc7a..80040a92e9d9 100644
> --- a/include/uapi/linux/fanotify.h
> +++ b/include/uapi/linux/fanotify.h
> @@ -124,6 +124,7 @@ struct fanotify_event_metadata {
>  #define FAN_EVENT_INFO_TYPE_FID		1
>  #define FAN_EVENT_INFO_TYPE_DFID_NAME	2
>  #define FAN_EVENT_INFO_TYPE_DFID	3
> +#define FAN_EVENT_INFO_TYPE_ERROR	4
>  
>  /* Variable length info record following event metadata */
>  struct fanotify_event_info_header {
> @@ -149,6 +150,12 @@ struct fanotify_event_info_fid {
>  	unsigned char handle[0];
>  };
>  
> +struct fanotify_event_info_error {
> +	struct fanotify_event_info_header hdr;
> +	__s32 error;
> +	__u32 error_count;
> +};

My apologies for not having time to review this patchset since it was
redesigned to use fanotify.  Someday it would be helpful to be able to
export more detailed error reports from XFS, but as I'm not ready to
move forward and write that today, I'll try to avoid derailing this at
the last minute.

Eventually, XFS might want to be able to report errors in file data,
file metadata, allocation group metadata, and whole-filesystem metadata.
Userspace can already gather reports from XFS about corruptions reported
by the online fsck code (see xfs_health.c).

I /think/ we could subclass the file error structure that you've
provided like so:

struct fanotify_event_info_xfs_filesystem_error {
	struct fanotify_event_info_error	base;

	__u32 magic; /* 0x58465342 to identify xfs */
	__u32 type; /* quotas, realtime bitmap, etc. */
};

struct fanotify_event_info_xfs_perag_error {
	struct fanotify_event_info_error	base;

	__u32 magic; /* 0x58465342 to identify xfs */
	__u32 type; /* agf, agi, agfl, bno btree, ino btree, etc. */
	__u32 agno; /* allocation group number */
};

struct fanotify_event_info_xfs_file_error {
	struct fanotify_event_info_error	base;

	__u32 magic; /* 0x58465342 to identify xfs */
	__u32 type; /* extent map, dir, attr, etc. */
	__u64 offset; /* file data offset, if applicable */
	__u64 length; /* file data length, if applicable */
};

(A real XFS implementation might have one structure with the type code
providing for a tagged union or something; I split it into three
separate structs here to avoid confusing things.)

I have three questions at this point:

1) What's the maximum size of a fanotify event structure?  None of these
structures exceed 36 bytes, which I hope will fit within whatever size
constraints exist?

2) If a program written for today's notification events sees a
fanotify_event_info_header from future-XFS with a header length that is
larger than FANOTIFY_INFO_ERROR_LEN, will it be able to react
appropriately?  Which is to say, ignore it on the grounds that the
length is unexpectedly large?

It /looks/ like this is the case; really I'm just fishing around here
to make sure nothing in the design of /this/ patchset would make it Very
Difficult(tm) to add more information later.

3) Once we let filesystem implementations create their own extended
error notifications, should we have a "u32 magic" to aid in decoding?
Or even add it to fanotify_event_info_error now?

--D

> +
>  struct fanotify_response {
>  	__s32 fd;
>  	__u32 response;
> -- 
> 2.32.0
>
Jan Kara Aug. 17, 2021, 9:05 a.m. UTC | #4
On Mon 16-08-21 14:41:03, Darrick J. Wong wrote:
> On Thu, Aug 12, 2021 at 05:40:07PM -0400, Gabriel Krisman Bertazi wrote:
> > The Error info type is a record sent to users on FAN_FS_ERROR events
> > documenting the type of error.  It also carries an error count,
> > documenting how many errors were observed since the last reporting.
> > 
> > Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> > 
> > ---
> > Changes since v5:
> >   - Move error code here
> > ---
> >  fs/notify/fanotify/fanotify.c      |  1 +
> >  fs/notify/fanotify/fanotify.h      |  1 +
> >  fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++++++++
> >  include/uapi/linux/fanotify.h      |  7 ++++++
> >  4 files changed, 45 insertions(+)
> 
> <snip>
> 
> > diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> > index 16402037fc7a..80040a92e9d9 100644
> > --- a/include/uapi/linux/fanotify.h
> > +++ b/include/uapi/linux/fanotify.h
> > @@ -124,6 +124,7 @@ struct fanotify_event_metadata {
> >  #define FAN_EVENT_INFO_TYPE_FID		1
> >  #define FAN_EVENT_INFO_TYPE_DFID_NAME	2
> >  #define FAN_EVENT_INFO_TYPE_DFID	3
> > +#define FAN_EVENT_INFO_TYPE_ERROR	4
> >  
> >  /* Variable length info record following event metadata */
> >  struct fanotify_event_info_header {
> > @@ -149,6 +150,12 @@ struct fanotify_event_info_fid {
> >  	unsigned char handle[0];
> >  };
> >  
> > +struct fanotify_event_info_error {
> > +	struct fanotify_event_info_header hdr;
> > +	__s32 error;
> > +	__u32 error_count;
> > +};
> 
> My apologies for not having time to review this patchset since it was
> redesigned to use fanotify.  Someday it would be helpful to be able to
> export more detailed error reports from XFS, but as I'm not ready to
> move forward and write that today, I'll try to avoid derailling this at
> the last minute.

I think we are not quite there and tweaking the passed structure is easy
enough so no worries. Eventually, passing some filesystem-specific blob
together with the event was the plan AFAIR. You're right that now is a good
moment to think about how exactly we want that passed.

> Eventually, XFS might want to be able to report errors in file data,
> file metadata, allocation group metadata, and whole-filesystem metadata.
> Userspace can already gather reports from XFS about corruptions reported
> by the online fsck code (see xfs_health.c).

Yes, although note that the current plan is to keep only one error event
queued; further errors are just added to error_count until the event is
fetched by userspace (on the grounds that the first error is usually the
most meaningful, the others are usually just cascading problems). But I'm
not sure if this scheme would be suitable for online fsck usecase since we
may discard even valid independent errors this way.

> I /think/ we could subclass the file error structure that you've
> provided like so:
> 
> struct fanotify_event_info_xfs_filesystem_error {
> 	struct fanotify_event_info_error	base;
> 
> 	__u32 magic; /* 0x58465342 to identify xfs */
> 	__u32 type; /* quotas, realtime bitmap, etc. */
> };
> 
> struct fanotify_event_info_xfs_perag_error {
> 	struct fanotify_event_info_error	base;
> 
> 	__u32 magic; /* 0x58465342 to identify xfs */
> 	__u32 type; /* agf, agi, agfl, bno btree, ino btree, etc. */
> 	__u32 agno; /* allocation group number */
> };
> 
> struct fanotify_event_info_xfs_file_error {
> 	struct fanotify_event_info_error	base;
> 
> 	__u32 magic; /* 0x58465342 to identify xfs */
> 	__u32 type; /* extent map, dir, attr, etc. */
> 	__u64 offset; /* file data offset, if applicable */
> 	__u64 length; /* file data length, if applicable */
> };
> 
> (A real XFS implementation might have one structure with the type code
> providing for a tagged union or something; I split it into three
> separate structs here to avoid confusing things.)

The structure of fanotify event as passed to userspace generally is:

struct fanotify_event_metadata {
        __u32 event_len;
        __u8 vers;
        __u8 reserved;
        __u16 metadata_len;
        __aligned_u64 mask;
        __s32 fd;
        __s32 pid;
};

If event_len is > sizeof(struct fanotify_event_metadata), userspace is
expected to look for struct fanotify_event_info_header after struct
fanotify_event_metadata. struct fanotify_event_info_header looks like:

struct fanotify_event_info_header {
        __u8 info_type;
        __u8 pad;
        __u16 len;
};

Again if the end of this info (defined by 'len') is smaller than
'event_len', there is next header with next payload of data. So for example
error event will have:

struct fanotify_event_metadata
struct fanotify_event_info_error
struct fanotify_event_info_fid

Now either we could add fs specific blob into fanotify_event_info_error
(but then it would be good to add 'magic' to fanotify_event_info_error now
and define that if 'len' is larger, fs-specific blob follows after fixed
data) or we can add another info type FAN_EVENT_INFO_TYPE_ERROR_FS_DATA
(i.e., attach another structure into the event) which would contain the
'magic' and then blob of data. I don't have strong preference.

> I have three questions at this point:
> 
> 1) What's the maximum size of a fanotify event structure?  None of these
> structures exceed 36 bytes, which I hope will fit in whatever size
> constraints?

Whole event must fit into 4G, each event info needs to fit in 64k. At least
these are the limits of the interface. Practically, it would be difficult
and inefficient to manipulate such huge events... 

> 2) If a program written for today's notification events sees a
> fanotify_event_info_header from future-XFS with a header length that is
> larger than FANOTIFY_INFO_ERROR_LEN, will it be able to react
> appropriately?  Which is to say, ignore it on the grounds that the
> length is unexpectedly large?

That is the expected behavior :). But I guess separate info type for
fs-specific blob might be more foolproof in this sense - when parsing
events, you are expected to just skip info_types you don't understand
(based on 'len' and 'type' in the common header) and generally different
events have different sets of infos attached to them so you mostly have to
implement this logic to be able to process events.

> It /looks/ like this is the case; really I'm just fishing around here
> to make sure nothing in the design of /this/ patchset would make it Very
> Difficult(tm) to add more information later.
> 
> 3) Once we let filesystem implementations create their own extended
> error notifications, should we have a "u32 magic" to aid in decoding?
> Or even add it to fanotify_event_info_error now?

If we go via the 'separate info type' route, then the magic can go into
that structure and there's no great use for 'magic' in
fanotify_event_info_error.

								Honza
Amir Goldstein Aug. 17, 2021, 10:08 a.m. UTC | #5
On Tue, Aug 17, 2021 at 12:05 PM Jan Kara <jack@suse.cz> wrote:
>
> On Mon 16-08-21 14:41:03, Darrick J. Wong wrote:
> > On Thu, Aug 12, 2021 at 05:40:07PM -0400, Gabriel Krisman Bertazi wrote:
> > > The Error info type is a record sent to users on FAN_FS_ERROR events
> > > documenting the type of error.  It also carries an error count,
> > > documenting how many errors were observed since the last reporting.
> > >
> > > Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> > >
> > > ---
> > > Changes since v5:
> > >   - Move error code here
> > > ---
> > >  fs/notify/fanotify/fanotify.c      |  1 +
> > >  fs/notify/fanotify/fanotify.h      |  1 +
> > >  fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++++++++
> > >  include/uapi/linux/fanotify.h      |  7 ++++++
> > >  4 files changed, 45 insertions(+)
> >
> > <snip>
> >
> > > diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> > > index 16402037fc7a..80040a92e9d9 100644
> > > --- a/include/uapi/linux/fanotify.h
> > > +++ b/include/uapi/linux/fanotify.h
> > > @@ -124,6 +124,7 @@ struct fanotify_event_metadata {
> > >  #define FAN_EVENT_INFO_TYPE_FID            1
> > >  #define FAN_EVENT_INFO_TYPE_DFID_NAME      2
> > >  #define FAN_EVENT_INFO_TYPE_DFID   3
> > > +#define FAN_EVENT_INFO_TYPE_ERROR  4
> > >
> > >  /* Variable length info record following event metadata */
> > >  struct fanotify_event_info_header {
> > > @@ -149,6 +150,12 @@ struct fanotify_event_info_fid {
> > >     unsigned char handle[0];
> > >  };
> > >
> > > +struct fanotify_event_info_error {
> > > +   struct fanotify_event_info_header hdr;
> > > +   __s32 error;
> > > +   __u32 error_count;
> > > +};
> >
> > My apologies for not having time to review this patchset since it was
> > redesigned to use fanotify.  Someday it would be helpful to be able to
> > export more detailed error reports from XFS, but as I'm not ready to
> > move forward and write that today, I'll try to avoid derailling this at
> > the last minute.
>
> I think we are not quite there and tweaking the passed structure is easy
> enough so no worries. Eventually, passing some filesystem-specific blob
> together with the event was the plan AFAIR. You're right now is a good
> moment to think how exactly we want that passed.
>
> > Eventually, XFS might want to be able to report errors in file data,
> > file metadata, allocation group metadata, and whole-filesystem metadata.
> > Userspace can already gather reports from XFS about corruptions reported
> > by the online fsck code (see xfs_health.c).
>
> Yes, although note that the current plan is that we currently have only one
> error event queue, others are just added to error_count until the event is
> fetched by userspace (on the grounds that the first error is usually the
> most meaningful, the others are usually just cascading problems). But I'm
> not sure if this scheme would be suitable for online fsck usecase since we
> may discard even valid independent errors this way.
>
> > I /think/ we could subclass the file error structure that you've
> > provided like so:
> >
> > struct fanotify_event_info_xfs_filesystem_error {
> >       struct fanotify_event_info_error        base;
> >
> >       __u32 magic; /* 0x58465342 to identify xfs */
> >       __u32 type; /* quotas, realtime bitmap, etc. */
> > };
> >
> > struct fanotify_event_info_xfs_perag_error {
> >       struct fanotify_event_info_error        base;
> >
> >       __u32 magic; /* 0x58465342 to identify xfs */
> >       __u32 type; /* agf, agi, agfl, bno btree, ino btree, etc. */
> >       __u32 agno; /* allocation group number */
> > };
> >
> > struct fanotify_event_info_xfs_file_error {
> >       struct fanotify_event_info_error        base;
> >
> >       __u32 magic; /* 0x58465342 to identify xfs */
> >       __u32 type; /* extent map, dir, attr, etc. */
> >       __u64 offset; /* file data offset, if applicable */
> >       __u64 length; /* file data length, if applicable */
> > };
> >
> > (A real XFS implementation might have one structure with the type code
> > providing for a tagged union or something; I split it into three
> > separate structs here to avoid confusing things.)
>
> The structure of fanotify event as passed to userspace generally is:
>
> struct fanotify_event_metadata {
>         __u32 event_len;
>         __u8 vers;
>         __u8 reserved;
>         __u16 metadata_len;
>         __aligned_u64 mask;
>         __s32 fd;
>         __s32 pid;
> };
>
> If event_len is > sizeof(struct fanotify_event_metadata), userspace is
> expected to look for struct fanotify_event_info_header after struct
> fanotify_event_metadata. struct fanotify_event_info_header looks like:
>
> struct fanotify_event_info_header {
>         __u8 info_type;
>         __u8 pad;
>         __u16 len;
> };
>
> Again if the end of this info (defined by 'len') is smaller than
> 'event_len', there is next header with next payload of data. So for example
> error event will have:
>
> struct fanotify_event_metadata
> struct fanotify_event_info_error
> struct fanotify_event_info_fid
>
> Now either we could add fs specific blob into fanotify_event_info_error
> (but then it would be good to add 'magic' to fanotify_event_info_error now
> and define that if 'len' is larger, fs-specific blob follows after fixed
> data) or we can add another info type FAN_EVENT_INFO_TYPE_ERROR_FS_DATA
> (i.e., attach another structure into the event) which would contain the
> 'magic' and then blob of data. I don't have strong preference.
>
> > I have three questions at this point:
> >
> > 1) What's the maximum size of a fanotify event structure?  None of these
> > structures exceed 36 bytes, which I hope will fit in whatever size
> > constraints?
>
> Whole event must fit into 4G, each event info needs to fit in 64k. At least
> these are the limits of the interface. Practically, it would be difficult
> and inefficient to manipulate such huge events...
>

Just keep in mind that the current scheme pre-allocates the single event slot
at fanotify_mark() time and (I think) we agreed to pre-allocate
sizeof(fsnotify_error_event) + MAX_HANDLE_SZ.
If filesystems want to store some variable length fs specific info,
a future implementation will have to take that into account.

> > 2) If a program written for today's notification events sees a
> > fanotify_event_info_header from future-XFS with a header length that is
> > larger than FANOTIFY_INFO_ERROR_LEN, will it be able to react
> > appropriately?  Which is to say, ignore it on the grounds that the
> > length is unexpectedly large?
>
> That is the expected behavior :). But I guess separate info type for
> fs-specific blob might be more foolproof in this sense - when parsing
> events, you are expected to just skip info_types you don't understand
> (based on 'len' and 'type' in the common header) and generally different
> events have different sets of infos attached to them so you mostly have to
> implement this logic to be able to process events.
>
> > It /looks/ like this is the case; really I'm just fishing around here
> > to make sure nothing in the design of /this/ patchset would make it Very
> > Difficult(tm) to add more information later.
> >
> > 3) Once we let filesystem implementations create their own extended
> > error notifications, should we have a "u32 magic" to aid in decoding?
> > Or even add it to fanotify_event_info_error now?
>
> If we go via the 'separate info type' route, then the magic can go into
> that structure and there's no great use for 'magic' in
> fanotify_event_info_error.

My 0.02$:
With the current patch set, a filesystem reports an error using:
fsnotify_sb_error(sb, inode, error)

The optional @inode argument is encoded to a filesystem opaque
blob using  exportfs_encode_inode_fh(), recorded in the event
as a blob and reported to userspace as a blob.

If a filesystem would like to report a different type of opaque blob
(e.g. xfs_perag_info), the interface should be extended to:
fsnotify_sb_error(sb, inode, error, info, info_len)
and the 'separate info type' route seems like the best and most natural
way to deal with the case of information that is only emitted from
a specific filesystem with a specific feature enabled (online fsck).

IOW, there is no need for fanotify_event_info_xfs_perag_error
in fanotify UAPI if you ask me.

Regarding 'magic' in fanotify_event_info_error, I also don't see the
need for that, because the event already has fsid which can be
used to identify the filesystem in question.

Keep in mind that the value of handle_type inside struct file_handle
inside struct fanotify_event_info_fid is not a universal classifier.
Specifically, the type 0x81 means "XFS_FILEID_INO64_GEN"
only in the context of XFS and it can mean something else in the
context of another type of filesystem.

If we add a new info record fanotify_event_info_fs_private
it could even be an alias to fanotify_event_info_fid with the only
difference that the handle[0] member is not expected to be
struct file_handle, but some other fs private struct.

Thanks,
Amir.
Darrick J. Wong Aug. 18, 2021, 12:10 a.m. UTC | #6
On Tue, Aug 17, 2021 at 11:05:38AM +0200, Jan Kara wrote:
> On Mon 16-08-21 14:41:03, Darrick J. Wong wrote:
> > On Thu, Aug 12, 2021 at 05:40:07PM -0400, Gabriel Krisman Bertazi wrote:
> > > The Error info type is a record sent to users on FAN_FS_ERROR events
> > > documenting the type of error.  It also carries an error count,
> > > documenting how many errors were observed since the last reporting.
> > > 
> > > Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> > > 
> > > ---
> > > Changes since v5:
> > >   - Move error code here
> > > ---
> > >  fs/notify/fanotify/fanotify.c      |  1 +
> > >  fs/notify/fanotify/fanotify.h      |  1 +
> > >  fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++++++++
> > >  include/uapi/linux/fanotify.h      |  7 ++++++
> > >  4 files changed, 45 insertions(+)
> > 
> > <snip>
> > 
> > > diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> > > index 16402037fc7a..80040a92e9d9 100644
> > > --- a/include/uapi/linux/fanotify.h
> > > +++ b/include/uapi/linux/fanotify.h
> > > @@ -124,6 +124,7 @@ struct fanotify_event_metadata {
> > >  #define FAN_EVENT_INFO_TYPE_FID		1
> > >  #define FAN_EVENT_INFO_TYPE_DFID_NAME	2
> > >  #define FAN_EVENT_INFO_TYPE_DFID	3
> > > +#define FAN_EVENT_INFO_TYPE_ERROR	4
> > >  
> > >  /* Variable length info record following event metadata */
> > >  struct fanotify_event_info_header {
> > > @@ -149,6 +150,12 @@ struct fanotify_event_info_fid {
> > >  	unsigned char handle[0];
> > >  };
> > >  
> > > +struct fanotify_event_info_error {
> > > +	struct fanotify_event_info_header hdr;
> > > +	__s32 error;
> > > +	__u32 error_count;
> > > +};
> > 
> > My apologies for not having time to review this patchset since it was
> > redesigned to use fanotify.  Someday it would be helpful to be able to
> > export more detailed error reports from XFS, but as I'm not ready to
> > move forward and write that today, I'll try to avoid derailling this at
> > the last minute.
> 
> I think we are not quite there and tweaking the passed structure is easy
> enough so no worries. Eventually, passing some filesystem-specific blob
> together with the event was the plan AFAIR. You're right now is a good
> moment to think how exactly we want that passed.
> 
> > Eventually, XFS might want to be able to report errors in file data,
> > file metadata, allocation group metadata, and whole-filesystem metadata.
> > Userspace can already gather reports from XFS about corruptions reported
> > by the online fsck code (see xfs_health.c).
> 
> Yes, although note that the current plan is that we currently have only one
> error event queue, others are just added to error_count until the event is
> fetched by userspace (on the grounds that the first error is usually the
> most meaningful, the others are usually just cascading problems). But I'm
> not sure if this scheme would be suitable for online fsck usecase since we
> may discard even valid independent errors this way.

<nod> The use-cases might split here -- we probably don't want online
fsck to be generating fs error events since the only tool that can do
anything about the broken metadata is the online fsck tool itself.

However, for random errors found by regular reader/writer threads, I
have a patchset in djwong-dev that adds recording of those errors;
that's the place where I think I'd want to add the ability to send
notification blobs to userspace.

Hmm.  For handling accumulated errors, can we still access the
fanotify_event_info_* object once we've handed it to fanotify?  If the
user hasn't picked up the event yet, it might be acceptable to set more
bits in the type mask and bump the error count.  In other words, every
time userspace actually reads the event, it'll get the latest error
state.  I /think/ that's where the design of this patchset is going,
right?

> > I /think/ we could subclass the file error structure that you've
> > provided like so:
> > 
> > struct fanotify_event_info_xfs_filesystem_error {
> > 	struct fanotify_event_info_error	base;
> > 
> > 	__u32 magic; /* 0x58465342 to identify xfs */
> > 	__u32 type; /* quotas, realtime bitmap, etc. */
> > };
> > 
> > struct fanotify_event_info_xfs_perag_error {
> > 	struct fanotify_event_info_error	base;
> > 
> > 	__u32 magic; /* 0x58465342 to identify xfs */
> > 	__u32 type; /* agf, agi, agfl, bno btree, ino btree, etc. */
> > 	__u32 agno; /* allocation group number */
> > };
> > 
> > struct fanotify_event_info_xfs_file_error {
> > 	struct fanotify_event_info_error	base;
> > 
> > 	__u32 magic; /* 0x58465342 to identify xfs */
> > 	__u32 type; /* extent map, dir, attr, etc. */
> > 	__u64 offset; /* file data offset, if applicable */
> > 	__u64 length; /* file data length, if applicable */
> > };
> > 
> > (A real XFS implementation might have one structure with the type code
> > providing for a tagged union or something; I split it into three
> > separate structs here to avoid confusing things.)
> 
> The structure of fanotify event as passed to userspace generally is:
> 
> struct fanotify_event_metadata {
>         __u32 event_len;
>         __u8 vers;
>         __u8 reserved;
>         __u16 metadata_len;
>         __aligned_u64 mask;
>         __s32 fd;
>         __s32 pid;
> };
> 
> If event_len is > sizeof(struct fanotify_event_metadata), userspace is
> expected to look for struct fanotify_event_info_header after struct
> fanotify_event_metadata. struct fanotify_event_info_header looks like:
> 
> struct fanotify_event_info_header {
>         __u8 info_type;
>         __u8 pad;
>         __u16 len;
> };
> 
> Again if the end of this info (defined by 'len') is smaller than
> 'event_len', there is next header with next payload of data. So for example
> error event will have:
> 
> struct fanotify_event_metadata
> struct fanotify_event_info_error
> struct fanotify_event_info_fid
> 
> Now either we could add fs specific blob into fanotify_event_info_error
> (but then it would be good to add 'magic' to fanotify_event_info_error now
> and define that if 'len' is larger, fs-specific blob follows after fixed
> data) or we can add another info type FAN_EVENT_INFO_TYPE_ERROR_FS_DATA
> (i.e., attach another structure into the event) which would contain the
> 'magic' and then blob of data. I don't have strong preference.

I have a slight preference for the second.  It doesn't make much sense
to have a magic value in fanotify_event_info_error to decode a totally
separate structure.

> > I have three questions at this point:
> > 
> > 1) What's the maximum size of a fanotify event structure?  None of these
> > structures exceed 36 bytes, which I hope will fit in whatever size
> > constraints?
> 
> Whole event must fit into 4G, each event info needs to fit in 64k. At least
> these are the limits of the interface. Practically, it would be difficult
> and inefficient to manipulate such huge events... 

Ok.  I doubt we'll ever get close to a 4k page for a single fs object.

> > 2) If a program written for today's notification events sees a
> > fanotify_event_info_header from future-XFS with a header length that is
> > larger than FANOTIFY_INFO_ERROR_LEN, will it be able to react
> > appropriately?  Which is to say, ignore it on the grounds that the
> > length is unexpectedly large?
> 
> That is the expected behavior :). But I guess separate info type for
> fs-specific blob might be more foolproof in this sense - when parsing
> events, you are expected to just skip info_types you don't understand
> (based on 'len' and 'type' in the common header) and generally different
> events have different sets of infos attached to them so you mostly have to
> implement this logic to be able to process events.

Ok, good to hear this. :)

> > It /looks/ like this is the case; really I'm just fishing around here
> > to make sure nothing in the design of /this/ patchset would make it Very
> > Difficult(tm) to add more information later.
> > 
> > 3) Once we let filesystem implementations create their own extended
> > error notifications, should we have a "u32 magic" to aid in decoding?
> > Or even add it to fanotify_event_info_error now?
> 
> If we go via the 'separate info type' route, then the magic can go into
> that structure and there's no great use for 'magic' in
> fanotify_event_info_error.

Ok.  So far so good; now on to Amir's email...

--D

> 
> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
Darrick J. Wong Aug. 18, 2021, 12:16 a.m. UTC | #7
On Tue, Aug 17, 2021 at 01:08:06PM +0300, Amir Goldstein wrote:
> On Tue, Aug 17, 2021 at 12:05 PM Jan Kara <jack@suse.cz> wrote:
> >
> > On Mon 16-08-21 14:41:03, Darrick J. Wong wrote:
> > > On Thu, Aug 12, 2021 at 05:40:07PM -0400, Gabriel Krisman Bertazi wrote:
> > > > The Error info type is a record sent to users on FAN_FS_ERROR events
> > > > documenting the type of error.  It also carries an error count,
> > > > documenting how many errors were observed since the last reporting.
> > > >
> > > > Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> > > >
> > > > ---
> > > > Changes since v5:
> > > >   - Move error code here
> > > > ---
> > > >  fs/notify/fanotify/fanotify.c      |  1 +
> > > >  fs/notify/fanotify/fanotify.h      |  1 +
> > > >  fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++++++++
> > > >  include/uapi/linux/fanotify.h      |  7 ++++++
> > > >  4 files changed, 45 insertions(+)
> > >
> > > <snip>
> > >
> > > > diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> > > > index 16402037fc7a..80040a92e9d9 100644
> > > > --- a/include/uapi/linux/fanotify.h
> > > > +++ b/include/uapi/linux/fanotify.h
> > > > @@ -124,6 +124,7 @@ struct fanotify_event_metadata {
> > > >  #define FAN_EVENT_INFO_TYPE_FID            1
> > > >  #define FAN_EVENT_INFO_TYPE_DFID_NAME      2
> > > >  #define FAN_EVENT_INFO_TYPE_DFID   3
> > > > +#define FAN_EVENT_INFO_TYPE_ERROR  4
> > > >
> > > >  /* Variable length info record following event metadata */
> > > >  struct fanotify_event_info_header {
> > > > @@ -149,6 +150,12 @@ struct fanotify_event_info_fid {
> > > >     unsigned char handle[0];
> > > >  };
> > > >
> > > > +struct fanotify_event_info_error {
> > > > +   struct fanotify_event_info_header hdr;
> > > > +   __s32 error;
> > > > +   __u32 error_count;
> > > > +};
> > >
> > > My apologies for not having time to review this patchset since it was
> > > redesigned to use fanotify.  Someday it would be helpful to be able to
> > > export more detailed error reports from XFS, but as I'm not ready to
> > > move forward and write that today, I'll try to avoid derailling this at
> > > the last minute.
> >
> > I think we are not quite there and tweaking the passed structure is easy
> > enough so no worries. Eventually, passing some filesystem-specific blob
> > together with the event was the plan AFAIR. You're right now is a good
> > moment to think how exactly we want that passed.
> >
> > > Eventually, XFS might want to be able to report errors in file data,
> > > file metadata, allocation group metadata, and whole-filesystem metadata.
> > > Userspace can already gather reports from XFS about corruptions reported
> > > by the online fsck code (see xfs_health.c).
> >
> > Yes, although note that the current plan is that we currently have only one
> > error event queue, others are just added to error_count until the event is
> > fetched by userspace (on the grounds that the first error is usually the
> > most meaningful, the others are usually just cascading problems). But I'm
> > not sure if this scheme would be suitable for online fsck usecase since we
> > may discard even valid independent errors this way.
> >
> > > I /think/ we could subclass the file error structure that you've
> > > provided like so:
> > >
> > > struct fanotify_event_info_xfs_filesystem_error {
> > >       struct fanotify_event_info_error        base;
> > >
> > >       __u32 magic; /* 0x58465342 to identify xfs */
> > >       __u32 type; /* quotas, realtime bitmap, etc. */
> > > };
> > >
> > > struct fanotify_event_info_xfs_perag_error {
> > >       struct fanotify_event_info_error        base;
> > >
> > >       __u32 magic; /* 0x58465342 to identify xfs */
> > >       __u32 type; /* agf, agi, agfl, bno btree, ino btree, etc. */
> > >       __u32 agno; /* allocation group number */
> > > };
> > >
> > > struct fanotify_event_info_xfs_file_error {
> > >       struct fanotify_event_info_error        base;
> > >
> > >       __u32 magic; /* 0x58465342 to identify xfs */
> > >       __u32 type; /* extent map, dir, attr, etc. */
> > >       __u64 offset; /* file data offset, if applicable */
> > >       __u64 length; /* file data length, if applicable */
> > > };
> > >
> > > (A real XFS implementation might have one structure with the type code
> > > providing for a tagged union or something; I split it into three
> > > separate structs here to avoid confusing things.)
> >
> > The structure of fanotify event as passed to userspace generally is:
> >
> > struct fanotify_event_metadata {
> >         __u32 event_len;
> >         __u8 vers;
> >         __u8 reserved;
> >         __u16 metadata_len;
> >         __aligned_u64 mask;
> >         __s32 fd;
> >         __s32 pid;
> > };
> >
> > If event_len is > sizeof(struct fanotify_event_metadata), userspace is
> > expected to look for struct fanotify_event_info_header after struct
> > fanotify_event_metadata. struct fanotify_event_info_header looks like:
> >
> > struct fanotify_event_info_header {
> >         __u8 info_type;
> >         __u8 pad;
> >         __u16 len;
> > };
> >
> > Again if the end of this info (defined by 'len') is smaller than
> > 'event_len', there is next header with next payload of data. So for example
> > error event will have:
> >
> > struct fanotify_event_metadata
> > struct fanotify_event_info_error
> > struct fanotify_event_info_fid
> >
> > Now either we could add fs specific blob into fanotify_event_info_error
> > (but then it would be good to add 'magic' to fanotify_event_info_error now
> > and define that if 'len' is larger, fs-specific blob follows after fixed
> > data) or we can add another info type FAN_EVENT_INFO_TYPE_ERROR_FS_DATA
> > (i.e., attach another structure into the event) which would contain the
> > 'magic' and then blob of data. I don't have strong preference.
> >
> > > I have three questions at this point:
> > >
> > > 1) What's the maximum size of a fanotify event structure?  None of these
> > > structures exceed 36 bytes, which I hope will fit in whatever size
> > > constraints?
> >
> > Whole event must fit into 4G, each event info needs to fit in 64k. At least
> > these are the limits of the interface. Practically, it would be difficult
> > and inefficient to manipulate such huge events...
> >
> 
> Just keep in mind that the current scheme pre-allocates the single event slot
> on fanotify_mark() time and (I think) we agreed to pre-allocate
> sizeof(fsnotify_error_event) + MAX_HANDLE_SZ.
> If filesystems would want to store some variable length fs specific info,
> a future implementation will have to take that into account.

<nod> I /think/ for the fs and AG metadata we could preallocate these,
so long as fsnotify doesn't free them out from under us.  For inodes...
there are many more of those, so they'd have to be allocated
dynamically.

> > > 2) If a program written for today's notification events sees a
> > > fanotify_event_info_header from future-XFS with a header length that is
> > > larger than FANOTIFY_INFO_ERROR_LEN, will it be able to react
> > > appropriately?  Which is to say, ignore it on the grounds that the
> > > length is unexpectedly large?
> >
> > That is the expected behavior :). But I guess separate info type for
> > fs-specific blob might be more foolproof in this sense - when parsing
> > events, you are expected to just skip info_types you don't understand
> > (based on 'len' and 'type' in the common header) and generally different
> > events have different sets of infos attached to them so you mostly have to
> > implement this logic to be able to process events.
> >
> > > It /looks/ like this is the case; really I'm just fishing around here
> > > to make sure nothing in the design of /this/ patchset would make it Very
> > > Difficult(tm) to add more information later.
> > >
> > > 3) Once we let filesystem implementations create their own extended
> > > error notifications, should we have a "u32 magic" to aid in decoding?
> > > Or even add it to fanotify_event_info_error now?
> >
> > If we go via the 'separate info type' route, then the magic can go into
> > that structure and there's no great use for 'magic' in
> > fanotify_event_info_error.
> 
> My 0.02$:
> With current patch set, filesystem reports error using:
> fsnotify_sb_error(sb, inode, error)
> 
> The optional @inode argument is encoded to a filesystem opaque
> blob using  exportfs_encode_inode_fh(), recorded in the event
> as a blob and reported to userspace as a blob.
> 
> If filesystem would like to report a different type of opaque blob
> (e.g. xfs_perag_info), the interface should be extended to:
> fsnotify_sb_error(sb, inode, error, info, info_len)
> and the 'separate info type' route seems like the best and most natural
> way to deal with the case of information that is only emitted from
> a specific filesystem with a specific feature enabled (online fsck).

<nod> This seems reasonable to me.

> IOW, there is no need for fanotify_event_info_xfs_perag_error
> in fanotify UAPI if you ask me.
> 
> Regarding 'magic' in fanotify_event_info_error, I also don't see the
> need for that, because the event already has fsid which can be
> used to identify the filesystem in question.
> 
> Keep in mind that the value of handle_type inside struct file_handle
> inside struct fanotify_event_info_fid is not a universal classifier.
> Specifically, the type 0x81 means "XFS_FILEID_INO64_GEN"
> only in the context of XFS and it can mean something else in the
> context of another type of filesystem.

Can you pass the handle into the kernel to open an fd to the file mentioned
in the report?  I don't think userspace is supposed to know what's
inside a file handle, and it would be helpful if it didn't matter here
either. :)

> If we add a new info record fanotify_event_info_fs_private
> it could even be an alias to fanotify_event_info_fid with the only
> difference that the handle[0] member is not expected to be
> struct file_handle, but some other fs private struct.

I ... think I prefer it being a separate info blob.

--D

> 
> Thanks,
> Amir.
Amir Goldstein Aug. 18, 2021, 3:24 a.m. UTC | #8
[...]

> > Just keep in mind that the current scheme pre-allocates the single event slot
> > on fanotify_mark() time and (I think) we agreed to pre-allocate
> > sizeof(fsnotify_error_event) + MAX_HANDLE_SZ.
> > If filesystems would want to store some variable length fs specific info,
> > a future implementation will have to take that into account.
>
> <nod> I /think/ for the fs and AG metadata we could preallocate these,
> so long as fsnotify doesn't free them out from under us.

fs won't get notified when the event is freed, so fsnotify must
take ownership of the data structure.
I was thinking more along the lines of limiting maximum size for fs
specific info and pre-allocating that size for the event.

> For inodes...
> there are many more of those, so they'd have to be allocated
> dynamically.

The current scheme is that the size of the queue for error events
is one and the single slot is pre-allocated.
The reason for pre-allocating is the assumption that fsnotify_error()
could be called from contexts where memory allocation would be
inconvenient.
Therefore, we can store the encoded file handle of the first erroneous
inode, but we do not store any more events until the user reads this
one event.

> Hmm.  For handling accumulated errors, can we still access the
> fanotify_event_info_* object once we've handed it to fanotify?  If the
> user hasn't picked up the event yet, it might be acceptable to set more
> bits in the type mask and bump the error count.  In other words, every
> time userspace actually reads the event, it'll get the latest error
> state.  I /think/ that's where the design of this patchset is going,
> right?

Sort of.
fsnotify does have a concept of "merging" new event with an event
already in queue.

With most fsnotify events, merge only happens if the info related
to the new event (e.g. sb,inode) is the same as that of the queued
event and the "merge" is only in the event mask
(e.g. FS_OPEN|FS_CLOSE).

However, the current scheme for "merge" of an FS_ERROR event is only
bumping err_count, even if the new reported error or inode do not
match the error/inode in the queued event.

If we define error event subtypes (e.g. FS_ERROR_WRITEBACK,
FS_ERROR_METADATA), then the error event could contain
a field for subtype mask and user could read the subtype mask
along with the accumulated error count, but this cannot be
done by providing the filesystem access to modify an internal
fsnotify event, so those have to be generic UAPI defined subtypes.

If you think that would be useful, then we may want to consider
reserving the subtype mask field in fanotify_event_info_error in
advance.

>
> > > > 2) If a program written for today's notification events sees a
> > > > fanotify_event_info_header from future-XFS with a header length that is
> > > > larger than FANOTIFY_INFO_ERROR_LEN, will it be able to react
> > > > appropriately?  Which is to say, ignore it on the grounds that the
> > > > length is unexpectedly large?
> > >
> > > That is the expected behavior :). But I guess separate info type for
> > > fs-specific blob might be more foolproof in this sense - when parsing
> > > events, you are expected to just skip info_types you don't understand
> > > (based on 'len' and 'type' in the common header) and generally different
> > > events have different sets of infos attached to them so you mostly have to
> > > implement this logic to be able to process events.
> > >
> > > > It /looks/ like this is the case; really I'm just fishing around here
> > > > to make sure nothing in the design of /this/ patchset would make it Very
> > > > Difficult(tm) to add more information later.
> > > >
> > > > 3) Once we let filesystem implementations create their own extended
> > > > error notifications, should we have a "u32 magic" to aid in decoding?
> > > > Or even add it to fanotify_event_info_error now?
> > >
> > > If we go via the 'separate info type' route, then the magic can go into
> > > that structure and there's no great use for 'magic' in
> > > fanotify_event_info_error.
> >
> > My 0.02$:
> > With current patch set, filesystem reports error using:
> > fsnotify_sb_error(sb, inode, error)
> >
> > The optional @inode argument is encoded to a filesystem opaque
> > blob using  exportfs_encode_inode_fh(), recorded in the event
> > as a blob and reported to userspace as a blob.
> >
> > If filesystem would like to report a different type of opaque blob
> > (e.g. xfs_perag_info), the interface should be extended to:
> > fsnotify_sb_error(sb, inode, error, info, info_len)
> > and the 'separate info type' route seems like the best and most natural
> > way to deal with the case of information that is only emitted from
> > a specific filesystem with a specific feature enabled (online fsck).
>
> <nod> This seems reasonable to me.
>
> > IOW, there is no need for fanotify_event_info_xfs_perag_error
> > in fanotify UAPI if you ask me.
> >
> > Regarding 'magic' in fanotify_event_info_error, I also don't see the
> > need for that, because the event already has fsid which can be
> > used to identify the filesystem in question.
> >
> > Keep in mind that the value of handle_type inside struct file_handle
> > inside struct fanotify_event_info_fid is not a universal classifier.
> > Specifically, the type 0x81 means "XFS_FILEID_INO64_GEN"
> > only in the context of XFS and it can mean something else in the
> > context of another type of filesystem.
>
> Can you pass the handle into the kernel to open a fd to file mentioned
> in the report?  I don't think userspace is supposed to know what's
> inside a file handle, and it would be helpful if it didn't matter here
> either. :)
>

User gets a file handle and can do whatever users can do with file
handles... that is, open_by_handle_at() (if filesystem and inode are
still alive and healthy) and for less privileged users, compare with
result of name_to_handle_at() of another object.

Obviously, filesystem specialized tools could parse the file handle
to extract more information.

> > If we add a new info record fanotify_event_info_fs_private
> > it could even be an alias to fanotify_event_info_fid with the only
> > difference that the handle[0] member is not expected to be
> > struct file_handle, but some other fs private struct.
>
> I ... think I prefer it being a separate info blob.
>

Yes. That is what I meant.
Separate info record INFO_TYPE_ERROR_FS_DATA, whose info record
format is quite the same as that of INFO_TYPE_FID, but the blob is a
different type of blob.

Thanks,
Amir.
Jan Kara Aug. 18, 2021, 9:58 a.m. UTC | #9
On Wed 18-08-21 06:24:26, Amir Goldstein wrote:
> [...]
> 
> > > Just keep in mind that the current scheme pre-allocates the single event slot
> > > on fanotify_mark() time and (I think) we agreed to pre-allocate
> > > sizeof(fsnotify_error_event) + MAX_HANDLE_SZ.
> > > If filesystems would want to store some variable length fs specific info,
> > > a future implementation will have to take that into account.
> >
> > <nod> I /think/ for the fs and AG metadata we could preallocate these,
> > so long as fsnotify doesn't free them out from under us.
> 
> fs won't get notified when the event is freed, so fsnotify must
> take ownership on the data structure.
> I was thinking more along the lines of limiting maximum size for fs
> specific info and pre-allocating that size for the event.

Agreed. If there's a sensible upper bound, then preallocating this inside
fsnotify is likely the least problematic solution.

> > For inodes...
> > there are many more of those, so they'd have to be allocated
> > dynamically.
> 
> The current scheme is that the size of the queue for error events
> is one and the single slot is pre-allocated.
> The reason for pre-allocate is that the assumption is that fsnotify_error()
> could be called from contexts where memory allocation would be
> inconvenient.
> Therefore, we can store the encoded file handle of the first erroneous
> inode, but we do not store any more events until user read this
> one event.

Right. OTOH I can imagine allowing GFP_NOFS allocations in the error
context. At least for ext4 it would be workable (after all ext4 manages to
lock & modify superblock in its error handlers, GFP_NOFS allocation isn't
harder). But then if events are dynamically allocated there's still the
inconvenient question of what you are going to do if you need to report an fs
error and you hit ENOMEM. Just not sending the notification may have nasty
consequences and in the world of containerization and virtualization
tightly packed machines where ENOMEM happens aren't that unlikely. It is
just difficult to make assumptions about filesystems overall so we decided
to play it safe and preallocate the event.

Or, we could leave the allocation troubles for the filesystem and
fsnotify_sb_error() would be passed already allocated event (this way
attaching of fs-specific blobs to the event is handled as well) which it
would just queue. Plus we'd need to provide some helper to fill in generic
part of the event...

The disadvantage is that if there are filesystems / callsites needing
preallocated events, it would be painful for them. OTOH current two users -
ext4 & xfs - can handle allocation in the error path AFAIU.

Thinking about this some more, maybe we could have an event preallocated (like
a "rescue event"). Normally we would dynamically allocate (or get passed
from fs) the event and only if the allocation fails, we would queue the
rescue event to indicate to listeners that something bad happened, there
was an error but we could not fully report it.

But then, even if we'd go for dynamic event allocation by default, we need
to efficiently merge events since some fs failures (e.g. resulting in
journal abort in ext4) lead to basically all operations with the filesystem
to fail and that could easily swamp the notification system with useless
events. Current system with preallocated event nicely handles this
situation, it is questionable how to extend it for online fsck usecase
where we need to queue more than one event (but even there probably needs
to be some sensible upper-bound). I'll think about it...

> > Hmm.  For handling accumulated errors, can we still access the
> > fanotify_event_info_* object once we've handed it to fanotify?  If the
> > user hasn't picked up the event yet, it might be acceptable to set more
> > bits in the type mask and bump the error count.  In other words, every
> > time userspace actually reads the event, it'll get the latest error
> > state.  I /think/ that's where the design of this patchset is going,
> > right?
> 
> Sort of.
> fsnotify does have a concept of "merging" new event with an event
> already in queue.
> 
> With most fsnotify events, merge only happens if the info related
> to the new event (e.g. sb,inode) is the same as that of the queued
> event and the "merge" is only in the event mask
> (e.g. FS_OPEN|FS_CLOSE).
> 
> However, the current scheme for "merge" of an FS_ERROR event is only
> bumping err_count, even if the new reported error or inode do not
> match the error/inode in the queued event.
> 
> If we define error event subtypes (e.g. FS_ERROR_WRITEBACK,
> FS_ERROR_METADATA), then the error event could contain
> a field for subtype mask and user could read the subtype mask
> along with the accumulated error count, but this cannot be
> done by providing the filesystem access to modify an internal
> fsnotify event, so those have to be generic UAPI defined subtypes.
> 
> If you think that would be useful, then we may want to consider
> reserving the subtype mask field in fanotify_event_info_error in
> advance.

It depends on what exactly Darrick has in mind but I suspect we'd need a
fs-specific merge helper that would look at fs-specific blobs in the event
and decide whether events can be merged or not, possibly also handling the
merge by updating the blob. From the POV of fsnotify that would probably
mean merge callback in the event itself. But I guess this needs more
details from Darrick and maybe we don't need to decide this at this moment
since nobody is close to the point of having code needing to pass fs-blobs
with events.

								Honza
Darrick J. Wong Aug. 19, 2021, 3:58 a.m. UTC | #10
On Wed, Aug 18, 2021 at 11:58:18AM +0200, Jan Kara wrote:
> On Wed 18-08-21 06:24:26, Amir Goldstein wrote:
> > [...]
> > 
> > > > Just keep in mind that the current scheme pre-allocates the single event slot
> > > > on fanotify_mark() time and (I think) we agreed to pre-allocate
> > > > sizeof(fsnotify_error_event) + MAX_HANDLE_SZ.
> > > > If filesystems would want to store some variable length fs specific info,
> > > > a future implementation will have to take that into account.
> > >
> > > <nod> I /think/ for the fs and AG metadata we could preallocate these,
> > > so long as fsnotify doesn't free them out from under us.
> > 
> > fs won't get notified when the event is freed, so fsnotify must
> > take ownership on the data structure.
> > I was thinking more along the lines of limiting maximum size for fs
> > specific info and pre-allocating that size for the event.
> 
> Agreed. If there's a sensible upper bound, then preallocating this inside
> fsnotify is likely the least problematic solution.
> 
> > > For inodes...
> > > there are many more of those, so they'd have to be allocated
> > > dynamically.
> > 
> > The current scheme is that the size of the queue for error events
> > is one and the single slot is pre-allocated.
> > The reason for pre-allocate is that the assumption is that fsnotify_error()
> > could be called from contexts where memory allocation would be
> > inconvenient.
> > Therefore, we can store the encoded file handle of the first erroneous
> > inode, but we do not store any more events until the user reads this
> > one event.
> 
> Right. OTOH I can imagine allowing GFP_NOFS allocations in the error
> context. At least for ext4 it would be workable (after all ext4 manages to
> lock & modify superblock in its error handlers, GFP_NOFS allocation isn't
> harder). But then if events are dynamically allocated there's still the
> inconvenient question what are you going to do if you need to report fs
> error and you hit ENOMEM. Just not sending the notification may have nasty
> consequences and in the world of containerization and virtualization
> tightly packed machines where ENOMEM happens aren't that unlikely. It is
> just difficult to make assumptions about filesystems overall so we decided
> to be better safe and preallocate the event.
> 
> Or, we could leave the allocation troubles for the filesystem and
> fsnotify_sb_error() would be passed already allocated event (this way
> attaching of fs-specific blobs to the event is handled as well) which it
> would just queue. Plus we'd need to provide some helper to fill in generic
> part of the event...
> 
> The disadvantage is that if there are filesystems / callsites needing
> preallocated events, it would be painful for them. OTOH current two users -
> ext4 & xfs - can handle allocation in the error path AFAIU.
> 
> Thinking about this some more, maybe we could have event preallocated (like
> a "rescue event"). Normally we would dynamically allocate (or get passed
> from fs) the event and only if the allocation fails, we would queue the
> rescue event to indicate to listeners that something bad happened, there
> was error but we could not fully report it.

Yes.

> But then, even if we'd go for dynamic event allocation by default, we need
> to efficiently merge events since some fs failures (e.g. resulting in
> journal abort in ext4) lead to basically all operations with the filesystem
> to fail and that could easily swamp the notification system with useless
> events.

Hm.  Going out on a limb, I would guess that the majority of fs error
flood events happen if the storage fails catastrophically.  Assuming
that a catastrophic failure will quickly take the filesystem offline, I
would say that for XFS we should probably send one last "and then we
died" event and stop reporting after that.

> Current system with preallocated event nicely handles this
> situation, it is questionable how to extend it for online fsck usecase
> where we need to queue more than one event (but even there probably needs
> to be some sensible upper-bound). I'll think about it...

At least for XFS, I was figuring that xfs_scrub errors wouldn't be
reported via fsnotify since the repair tool is already running anyway.

> > > Hmm.  For handling accumulated errors, can we still access the
> > > fanotify_event_info_* object once we've handed it to fanotify?  If the
> > > user hasn't picked up the event yet, it might be acceptable to set more
> > > bits in the type mask and bump the error count.  In other words, every
> > > time userspace actually reads the event, it'll get the latest error
> > > state.  I /think/ that's where the design of this patchset is going,
> > > right?
> > 
> > Sort of.
> > fsnotify does have a concept of "merging" new event with an event
> > already in queue.
> > 
> > With most fsnotify events, merge only happens if the info related
> > to the new event (e.g. sb,inode) is the same as that of the queued
> > event and the "merge" is only in the event mask
> > (e.g. FS_OPEN|FS_CLOSE).
> > 
> > However, the current scheme for "merge" of an FS_ERROR event is only
> > bumping err_count, even if the new reported error or inode do not
> > match the error/inode in the queued event.
> > 
> > If we define error event subtypes (e.g. FS_ERROR_WRITEBACK,
> > FS_ERROR_METADATA), then the error event could contain
> > a field for subtype mask and user could read the subtype mask
> > along with the accumulated error count, but this cannot be
> > done by providing the filesystem access to modify an internal
> > fsnotify event, so those have to be generic UAPI defined subtypes.
> > 
> > If you think that would be useful, then we may want to consider
> > reserving the subtype mask field in fanotify_event_info_error in
> > advance.
> 
> It depends on what exactly Darrick has in mind but I suspect we'd need a
> fs-specific merge helper that would look at fs-specific blobs in the event
> and decide whether events can be merged or not, possibly also handling the
> merge by updating the blob.

Yes.  If the filesystem itself were allowed to manage the lifespan of
the fsnotify error event object then this would be trivial -- we'll own
the object, keep it updated as needed, and fsnotify can copy the
contents to userspace whenever convenient.

(This might be a naïve view of fsnotify...)

> From the POV of fsnotify that would probably
> mean merge callback in the event itself. But I guess this needs more
> details from Darrick and maybe we don't need to decide this at this moment
> since nobody is close to the point of having code needing to pass fs-blobs
> with events.

<nod> We ... probably don't need to decide this now.

--D

> 
> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
Gabriel Krisman Bertazi Aug. 24, 2021, 4:53 p.m. UTC | #11
Jan Kara <jack@suse.cz> writes:

> On Mon 16-08-21 14:41:03, Darrick J. Wong wrote:
>> On Thu, Aug 12, 2021 at 05:40:07PM -0400, Gabriel Krisman Bertazi wrote:
>> > The Error info type is a record sent to users on FAN_FS_ERROR events
>> > documenting the type of error.  It also carries an error count,
>> > documenting how many errors were observed since the last reporting.
>> > 
>> > Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
>> > 
>> > ---
>> > Changes since v5:
>> >   - Move error code here
>> > ---
>> >  fs/notify/fanotify/fanotify.c      |  1 +
>> >  fs/notify/fanotify/fanotify.h      |  1 +
>> >  fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++++++++
>> >  include/uapi/linux/fanotify.h      |  7 ++++++
>> >  4 files changed, 45 insertions(+)
>> 
>> <snip>
>> 
>> > diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
>> > index 16402037fc7a..80040a92e9d9 100644
>> > --- a/include/uapi/linux/fanotify.h
>> > +++ b/include/uapi/linux/fanotify.h
>> > @@ -124,6 +124,7 @@ struct fanotify_event_metadata {
>> >  #define FAN_EVENT_INFO_TYPE_FID		1
>> >  #define FAN_EVENT_INFO_TYPE_DFID_NAME	2
>> >  #define FAN_EVENT_INFO_TYPE_DFID	3
>> > +#define FAN_EVENT_INFO_TYPE_ERROR	4
>> >  
>> >  /* Variable length info record following event metadata */
>> >  struct fanotify_event_info_header {
>> > @@ -149,6 +150,12 @@ struct fanotify_event_info_fid {
>> >  	unsigned char handle[0];
>> >  };
>> >  
>> > +struct fanotify_event_info_error {
>> > +	struct fanotify_event_info_header hdr;
>> > +	__s32 error;
>> > +	__u32 error_count;
>> > +};
>> 
>> My apologies for not having time to review this patchset since it was
>> redesigned to use fanotify.  Someday it would be helpful to be able to
>> export more detailed error reports from XFS, but as I'm not ready to
>> move forward and write that today, I'll try to avoid derailing this at
>> the last minute.
>
> I think we are not quite there and tweaking the passed structure is easy
> enough so no worries. Eventually, passing some filesystem-specific blob
> together with the event was the plan AFAIR. You're right, now is a good
> moment to think how exactly we want that passed.
>
>> Eventually, XFS might want to be able to report errors in file data,
>> file metadata, allocation group metadata, and whole-filesystem metadata.
>> Userspace can already gather reports from XFS about corruptions reported
>> by the online fsck code (see xfs_health.c).
>
> Yes, although note that the current plan is that we currently have only one
> error event queue, others are just added to error_count until the event is
> fetched by userspace (on the grounds that the first error is usually the
> most meaningful, the others are usually just cascading problems). But I'm
> not sure if this scheme would be suitable for online fsck usecase since we
> may discard even valid independent errors this way.
>
>> I /think/ we could subclass the file error structure that you've
>> provided like so:
>> 
>> struct fanotify_event_info_xfs_filesystem_error {
>> 	struct fanotify_event_info_error	base;
>> 
>> 	__u32 magic; /* 0x58465342 to identify xfs */
>> 	__u32 type; /* quotas, realtime bitmap, etc. */
>> };
>> 
>> struct fanotify_event_info_xfs_perag_error {
>> 	struct fanotify_event_info_error	base;
>> 
>> 	__u32 magic; /* 0x58465342 to identify xfs */
>> 	__u32 type; /* agf, agi, agfl, bno btree, ino btree, etc. */
>> 	__u32 agno; /* allocation group number */
>> };
>> 
>> struct fanotify_event_info_xfs_file_error {
>> 	struct fanotify_event_info_error	base;
>> 
>> 	__u32 magic; /* 0x58465342 to identify xfs */
>> 	__u32 type; /* extent map, dir, attr, etc. */
>> 	__u64 offset; /* file data offset, if applicable */
>> 	__u64 length; /* file data length, if applicable */
>> };
>> 
>> (A real XFS implementation might have one structure with the type code
>> providing for a tagged union or something; I split it into three
>> separate structs here to avoid confusing things.)
>
> The structure of fanotify event as passed to userspace generally is:
>
> struct fanotify_event_metadata {
>         __u32 event_len;
>         __u8 vers;
>         __u8 reserved;
>         __u16 metadata_len;
>         __aligned_u64 mask;
>         __s32 fd;
>         __s32 pid;
> };
>
> If event_len is > sizeof(struct fanotify_event_metadata), userspace is
> expected to look for struct fanotify_event_info_header after struct
> fanotify_event_metadata. struct fanotify_event_info_header looks like:
>
> struct fanotify_event_info_header {
>         __u8 info_type;
>         __u8 pad;
>         __u16 len;
> };
>
> Again if the end of this info (defined by 'len') is smaller than
> 'event_len', there is next header with next payload of data. So for example
> error event will have:
>
> struct fanotify_event_metadata
> struct fanotify_event_info_error
> struct fanotify_event_info_fid
>
> Now either we could add fs specific blob into fanotify_event_info_error
> (but then it would be good to add 'magic' to fanotify_event_info_error now
> and define that if 'len' is larger, fs-specific blob follows after fixed
> data) or we can add another info type FAN_EVENT_INFO_TYPE_ERROR_FS_DATA
> (i.e., attach another structure into the event) which would contain the
> 'magic' and then blob of data. I don't have strong preference.

In the v1 of this patchset [1] I implemented the latter option, a new
info type that the filesystem could provide as a blob.  It was dropped
at Amir's request to leave it out of the discussion at that moment.  Should I
resuscitate it for the next iteration?  I believe it would address XFS's needs.

[1] https://lwn.net/ml/linux-fsdevel/20210426184201.4177978-12-krisman@collabora.com/
Darrick J. Wong Aug. 25, 2021, 4:09 a.m. UTC | #12
On Tue, Aug 24, 2021 at 12:53:24PM -0400, Gabriel Krisman Bertazi wrote:
> Jan Kara <jack@suse.cz> writes:
> 
> > On Mon 16-08-21 14:41:03, Darrick J. Wong wrote:
> >> On Thu, Aug 12, 2021 at 05:40:07PM -0400, Gabriel Krisman Bertazi wrote:
> >> > The Error info type is a record sent to users on FAN_FS_ERROR events
> >> > documenting the type of error.  It also carries an error count,
> >> > documenting how many errors were observed since the last reporting.
> >> > 
> >> > Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> >> > 
> >> > ---
> >> > Changes since v5:
> >> >   - Move error code here
> >> > ---
> >> >  fs/notify/fanotify/fanotify.c      |  1 +
> >> >  fs/notify/fanotify/fanotify.h      |  1 +
> >> >  fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++++++++
> >> >  include/uapi/linux/fanotify.h      |  7 ++++++
> >> >  4 files changed, 45 insertions(+)
> >> 
> >> <snip>
> >> 
> >> > diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> >> > index 16402037fc7a..80040a92e9d9 100644
> >> > --- a/include/uapi/linux/fanotify.h
> >> > +++ b/include/uapi/linux/fanotify.h
> >> > @@ -124,6 +124,7 @@ struct fanotify_event_metadata {
> >> >  #define FAN_EVENT_INFO_TYPE_FID		1
> >> >  #define FAN_EVENT_INFO_TYPE_DFID_NAME	2
> >> >  #define FAN_EVENT_INFO_TYPE_DFID	3
> >> > +#define FAN_EVENT_INFO_TYPE_ERROR	4
> >> >  
> >> >  /* Variable length info record following event metadata */
> >> >  struct fanotify_event_info_header {
> >> > @@ -149,6 +150,12 @@ struct fanotify_event_info_fid {
> >> >  	unsigned char handle[0];
> >> >  };
> >> >  
> >> > +struct fanotify_event_info_error {
> >> > +	struct fanotify_event_info_header hdr;
> >> > +	__s32 error;
> >> > +	__u32 error_count;
> >> > +};
> >> 
> >> My apologies for not having time to review this patchset since it was
> >> redesigned to use fanotify.  Someday it would be helpful to be able to
> >> export more detailed error reports from XFS, but as I'm not ready to
> >> move forward and write that today, I'll try to avoid derailing this at
> >> the last minute.
> >
> > I think we are not quite there and tweaking the passed structure is easy
> > enough so no worries. Eventually, passing some filesystem-specific blob
> > together with the event was the plan AFAIR. You're right, now is a good
> > moment to think how exactly we want that passed.
> >
> >> Eventually, XFS might want to be able to report errors in file data,
> >> file metadata, allocation group metadata, and whole-filesystem metadata.
> >> Userspace can already gather reports from XFS about corruptions reported
> >> by the online fsck code (see xfs_health.c).
> >
> > Yes, although note that the current plan is that we currently have only one
> > error event queue, others are just added to error_count until the event is
> > fetched by userspace (on the grounds that the first error is usually the
> > most meaningful, the others are usually just cascading problems). But I'm
> > not sure if this scheme would be suitable for online fsck usecase since we
> > may discard even valid independent errors this way.
> >
> >> I /think/ we could subclass the file error structure that you've
> >> provided like so:
> >> 
> >> struct fanotify_event_info_xfs_filesystem_error {
> >> 	struct fanotify_event_info_error	base;
> >> 
> >> 	__u32 magic; /* 0x58465342 to identify xfs */
> >> 	__u32 type; /* quotas, realtime bitmap, etc. */
> >> };
> >> 
> >> struct fanotify_event_info_xfs_perag_error {
> >> 	struct fanotify_event_info_error	base;
> >> 
> >> 	__u32 magic; /* 0x58465342 to identify xfs */
> >> 	__u32 type; /* agf, agi, agfl, bno btree, ino btree, etc. */
> >> 	__u32 agno; /* allocation group number */
> >> };
> >> 
> >> struct fanotify_event_info_xfs_file_error {
> >> 	struct fanotify_event_info_error	base;
> >> 
> >> 	__u32 magic; /* 0x58465342 to identify xfs */
> >> 	__u32 type; /* extent map, dir, attr, etc. */
> >> 	__u64 offset; /* file data offset, if applicable */
> >> 	__u64 length; /* file data length, if applicable */
> >> };
> >> 
> >> (A real XFS implementation might have one structure with the type code
> >> providing for a tagged union or something; I split it into three
> >> separate structs here to avoid confusing things.)
> >
> > The structure of fanotify event as passed to userspace generally is:
> >
> > struct fanotify_event_metadata {
> >         __u32 event_len;
> >         __u8 vers;
> >         __u8 reserved;
> >         __u16 metadata_len;
> >         __aligned_u64 mask;
> >         __s32 fd;
> >         __s32 pid;
> > };
> >
> > If event_len is > sizeof(struct fanotify_event_metadata), userspace is
> > expected to look for struct fanotify_event_info_header after struct
> > fanotify_event_metadata. struct fanotify_event_info_header looks like:
> >
> > struct fanotify_event_info_header {
> >         __u8 info_type;
> >         __u8 pad;
> >         __u16 len;
> > };
> >
> > Again if the end of this info (defined by 'len') is smaller than
> > 'event_len', there is next header with next payload of data. So for example
> > error event will have:
> >
> > struct fanotify_event_metadata
> > struct fanotify_event_info_error
> > struct fanotify_event_info_fid
> >
> > Now either we could add fs specific blob into fanotify_event_info_error
> > (but then it would be good to add 'magic' to fanotify_event_info_error now
> > and define that if 'len' is larger, fs-specific blob follows after fixed
> > data) or we can add another info type FAN_EVENT_INFO_TYPE_ERROR_FS_DATA
> > (i.e., attach another structure into the event) which would contain the
> > 'magic' and then blob of data. I don't have strong preference.
> 
> In the v1 of this patchset [1] I implemented the latter option, a new
> info type that the filesystem could provide as a blob.  It was dropped
> at Amir's request to leave it out of the discussion at that moment.  Should I
> resuscitate it for the next iteration?  I believe it would address XFS's needs.

I don't think it's necessary at this time.  We (XFS community) would
have a bit more work to do before we get to the point of needing those
sorts of hooks in upstream. :)

--D

> 
> [1] https://lwn.net/ml/linux-fsdevel/20210426184201.4177978-12-krisman@collabora.com/
> 
> -- 
> Gabriel Krisman Bertazi
diff mbox series

Patch

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index f5c16ac37835..b49a474c1d7f 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -745,6 +745,7 @@  static int fanotify_handle_error_event(struct fsnotify_iter_info *iter_info,
 	spin_unlock(&group->notification_lock);
 
 	fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR;
+	fee->error = report->error;
 	fee->fsid = fee->sb_mark->fsn_mark.connector->fsid;
 
 	fh_len = fanotify_encode_fh_len(inode);
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 158cf0c4b0bd..0cfe376c6fd9 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -220,6 +220,7 @@  FANOTIFY_NE(struct fanotify_event *event)
 
 struct fanotify_error_event {
 	struct fanotify_event fae;
+	s32 error; /* Error reported by the Filesystem. */
 	u32 err_count; /* Suppressed errors count */
 
 	struct fanotify_sb_mark *sb_mark; /* Back reference to the mark. */
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 1ab8f9d8b3ac..ca53159ce673 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -107,6 +107,8 @@  struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
 #define FANOTIFY_EVENT_ALIGN 4
 #define FANOTIFY_INFO_HDR_LEN \
 	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
+#define FANOTIFY_INFO_ERROR_LEN \
+	(sizeof(struct fanotify_event_info_error))
 
 static int fanotify_fid_info_len(int fh_len, int name_len)
 {
@@ -130,6 +132,9 @@  static size_t fanotify_event_len(struct fanotify_event *event,
 	if (!fid_mode)
 		return event_len;
 
+	if (fanotify_is_error_event(event->mask))
+		event_len += FANOTIFY_INFO_ERROR_LEN;
+
 	info = fanotify_event_info(event);
 	dir_fh_len = fanotify_event_dir_fh_len(event);
 	fh_len = fanotify_event_object_fh_len(event);
@@ -176,6 +181,7 @@  static struct fanotify_event *fanotify_dup_error_to_stack(
 	error_on_stack->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR;
 	error_on_stack->err_count = fee->err_count;
 	error_on_stack->sb_mark = fee->sb_mark;
+	error_on_stack->error = fee->error;
 
 	error_on_stack->fsid = fee->fsid;
 
@@ -342,6 +348,28 @@  static int process_access_response(struct fsnotify_group *group,
 	return -ENOENT;
 }
 
+static size_t copy_error_info_to_user(struct fanotify_event *event,
+				      char __user *buf, int count)
+{
+	struct fanotify_event_info_error info;
+	struct fanotify_error_event *fee = FANOTIFY_EE(event);
+
+	info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
+	info.hdr.pad = 0;
+	info.hdr.len = FANOTIFY_INFO_ERROR_LEN;
+
+	if (WARN_ON(count < info.hdr.len))
+		return -EFAULT;
+
+	info.error = fee->error;
+	info.error_count = fee->err_count;
+
+	if (copy_to_user(buf, &info, sizeof(info)))
+		return -EFAULT;
+
+	return info.hdr.len;
+}
+
 static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
 			     int info_type, const char *name, size_t name_len,
 			     char __user *buf, size_t count)
@@ -505,6 +533,14 @@  static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	if (f)
 		fd_install(fd, f);
 
+	if (fanotify_is_error_event(event->mask)) {
+		ret = copy_error_info_to_user(event, buf, count);
+		if (ret < 0)
+			goto out_close_fd;
+		buf += ret;
+		count -= ret;
+	}
+
 	/* Event info records order is: dir fid + name, child fid */
 	if (fanotify_event_dir_fh_len(event)) {
 		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index 16402037fc7a..80040a92e9d9 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -124,6 +124,7 @@  struct fanotify_event_metadata {
 #define FAN_EVENT_INFO_TYPE_FID		1
 #define FAN_EVENT_INFO_TYPE_DFID_NAME	2
 #define FAN_EVENT_INFO_TYPE_DFID	3
+#define FAN_EVENT_INFO_TYPE_ERROR	4
 
 /* Variable length info record following event metadata */
 struct fanotify_event_info_header {
@@ -149,6 +150,12 @@  struct fanotify_event_info_fid {
 	unsigned char handle[0];
 };
 
+struct fanotify_event_info_error {
+	struct fanotify_event_info_header hdr;
+	__s32 error;
+	__u32 error_count;
+};
+
 struct fanotify_response {
 	__s32 fd;
 	__u32 response;