diff mbox series

[RFC] ceph: Fix kernel crash in generic/397 test

Message ID 266c50167604c606c95a6efe575de5430c31168b.camel@ibm.com (mailing list archive)
State New
Headers show
Series [RFC] ceph: Fix kernel crash in generic/397 test | expand

Commit Message

Viacheslav Dubeyko Jan. 15, 2025, 12:49 a.m. UTC
Hello,

The generic/397 test generate kernel crash for the case of
encrypted inode with unaligned file size (for example, 33K
or 1K):

Jan 3 12:34:40 ceph-testing-0001 root: run xfstest generic/397
Jan 3 12:34:40 ceph-testing-0001 kernel: [ 877.737811] run fstests
generic/397 at 2025-01-03 12:34:40
Jan 3 12:34:40 ceph-testing-0001 systemd1: Started /usr/bin/bash c test
-w /proc/self/oom_score_adj && echo 250 > /proc/self/oom_score_adj;
exec ./tests/generic/397.
Jan 3 12:34:40 ceph-testing-0001 kernel: [ 877.875761] libceph: mon0
(2)127.0.0.1:40674 session established
Jan 3 12:34:40 ceph-testing-0001 kernel: [ 877.876130] libceph:
client4614 fsid 19b90bca-f1ae-47a6-93dd-0b03ee637949
Jan 3 12:34:40 ceph-testing-0001 kernel: [ 877.991965] libceph: mon0
(2)127.0.0.1:40674 session established
Jan 3 12:34:40 ceph-testing-0001 kernel: [ 877.992334] libceph:
client4617 fsid 19b90bca-f1ae-47a6-93dd-0b03ee637949
Jan 3 12:34:40 ceph-testing-0001 kernel: [ 878.017234] libceph: mon0
(2)127.0.0.1:40674 session established
Jan 3 12:34:40 ceph-testing-0001 kernel: [ 878.017594] libceph:
client4620 fsid 19b90bca-f1ae-47a6-93dd-0b03ee637949
Jan 3 12:34:40 ceph-testing-0001 kernel: [ 878.031394] xfs_io (pid
18988) is setting deprecated v1 encryption policy; recommend upgrading
to v2.
Jan 3 12:34:40 ceph-testing-0001 kernel: [ 878.054528] libceph: mon0
(2)127.0.0.1:40674 session established
Jan 3 12:34:40 ceph-testing-0001 kernel: [ 878.054892] libceph:
client4623 fsid 19b90bca-f1ae-47a6-93dd-0b03ee637949
Jan 3 12:34:40 ceph-testing-0001 kernel: [ 878.070287] libceph: mon0
(2)127.0.0.1:40674 session established
Jan 3 12:34:40 ceph-testing-0001 kernel: [ 878.070704] libceph:
client4626 fsid 19b90bca-f1ae-47a6-93dd-0b03ee637949
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.264586] libceph: mon0
(2)127.0.0.1:40674 session established
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.265258] libceph:
client4629 fsid 19b90bca-f1ae-47a6-93dd-0b03ee637949
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.374578] -----------[ cut
here ]------------
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.374586] kernel BUG at
net/ceph/messenger.c:1070!
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.375150] Oops: invalid
opcode: 0000 [#1] PREEMPT SMP NOPTI
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.378145] CPU: 2 UID: 0
PID: 4759 Comm: kworker/2:9 Not tainted 6.13.0-rc5+ #1
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.378969] Hardware name:
QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-
ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.380167] Workqueue: ceph-
msgr ceph_con_workfn
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.381639] RIP:
0010:ceph_msg_data_cursor_init+0x42/0x50
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.382152] Code: 89 17 48
8b 46 70 55 48 89 47 08 c7 47 18 00 00 00 00 48 89 e5 e8 de cc ff ff 5d
31 c0 31 d2 31 f6 31 ff c3 cc cc cc cc 0f 0b <0f> 0b 0f 0b 66 2e 0f 1f
84 00 00 00 00 00 90 90 90 90 90 90 90 90
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.383928] RSP:
0018:ffffb4ffc7cbbd28 EFLAGS: 00010287
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.384447] RAX:
ffffffff82bb9ac0 RBX: ffff981390c2f1f8 RCX: 0000000000000000
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.385129] RDX:
0000000000009000 RSI: ffff981288232b58 RDI: ffff981390c2f378
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.385839] RBP:
ffffb4ffc7cbbe18 R08: 0000000000000000 R09: 0000000000000000
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.386539] R10:
0000000000000000 R11: 0000000000000000 R12: ffff981390c2f030
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.387203] R13:
ffff981288232b58 R14: 0000000000000029 R15: 0000000000000001
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.387877] FS:
0000000000000000(0000) GS:ffff9814b7900000(0000) knlGS:0000000000000000
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.388663] CS: 0010 DS:
0000 ES: 0000 CR0: 0000000080050033
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.389212] CR2:
00005e106a0554e0 CR3: 0000000112bf0001 CR4: 0000000000772ef0
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.389921] DR0:
0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.390620] DR3:
0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.391307] PKRU: 55555554
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.391567] Call Trace:
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.391807] <TASK>
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.392021] ?
show_regs+0x71/0x90
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.392391] ? die+0x38/0xa0
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.392667] ?
do_trap+0xdb/0x100
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.392981] ?
do_error_trap+0x75/0xb0
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.393372] ?
ceph_msg_data_cursor_init+0x42/0x50
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.393842] ?
exc_invalid_op+0x53/0x80
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.394232] ?
ceph_msg_data_cursor_init+0x42/0x50
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.394694] ?
asm_exc_invalid_op+0x1b/0x20
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.395099] ?
ceph_msg_data_cursor_init+0x42/0x50
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.395583] ?
ceph_con_v2_try_read+0xd16/0x2220
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.396027] ?
_raw_spin_unlock+0xe/0x40
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.396428] ?
raw_spin_rq_unlock+0x10/0x40
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.396842] ?
finish_task_switch.isra.0+0x97/0x310
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.397338] ?
__schedule+0x44b/0x16b0
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.397738]
ceph_con_workfn+0x326/0x750
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.398121]
process_one_work+0x188/0x3d0
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.398522] ?
__pfx_worker_thread+0x10/0x10
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.398929]
worker_thread+0x2b5/0x3c0
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.399310] ?
__pfx_worker_thread+0x10/0x10
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.399727]
kthread+0xe1/0x120
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.400031] ?
__pfx_kthread+0x10/0x10
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.400431]
ret_from_fork+0x43/0x70
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.400771] ?
__pfx_kthread+0x10/0x10
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.401127]
ret_from_fork_asm+0x1a/0x30
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.401543] </TASK>
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.401760] Modules linked
in: hctr2 nhpoly1305_avx2 nhpoly1305_sse2 nhpoly1305 chacha_generic
chacha_x86_64 libchacha adiantum libpoly1305 essiv authenc mptcp_diag
xsk_diag tcp_diag udp_diag raw_diag inet_diag unix_diag af_packet_diag
netlink_diag intel_rapl_msr intel_rapl_common
intel_uncore_frequency_common skx_edac_common nfit kvm_intel kvm
crct10dif_pclmul crc32_pclmul polyval_clmulni polyval_generic
ghash_clmulni_intel sha256_ssse3 sha1_ssse3 aesni_intel joydev
crypto_simd cryptd rapl input_leds psmouse sch_fq_codel serio_raw bochs
i2c_piix4 floppy qemu_fw_cfg i2c_smbus mac_hid pata_acpi msr parport_pc
ppdev lp parport efi_pstore ip_tables x_tables
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.407319] ---[ end trace
0000000000000000 ]---
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.407775] RIP:
0010:ceph_msg_data_cursor_init+0x42/0x50
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.408317] Code: 89 17 48
8b 46 70 55 48 89 47 08 c7 47 18 00 00 00 00 48 89 e5 e8 de cc ff ff 5d
31 c0 31 d2 31 f6 31 ff c3 cc cc cc cc 0f 0b <0f> 0b 0f 0b 66 2e 0f 1f
84 00 00 00 00 00 90 90 90 90 90 90 90 90
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.410087] RSP:
0018:ffffb4ffc7cbbd28 EFLAGS: 00010287
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.410609] RAX:
ffffffff82bb9ac0 RBX: ffff981390c2f1f8 RCX: 0000000000000000
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.411318] RDX:
0000000000009000 RSI: ffff981288232b58 RDI: ffff981390c2f378
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.412014] RBP:
ffffb4ffc7cbbe18 R08: 0000000000000000 R09: 0000000000000000
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.412735] R10:
0000000000000000 R11: 0000000000000000 R12: ffff981390c2f030
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.413438] R13:
ffff981288232b58 R14: 0000000000000029 R15: 0000000000000001
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.414121] FS:
0000000000000000(0000) GS:ffff9814b7900000(0000) knlGS:0000000000000000
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.414935] CS: 0010 DS:
0000 ES: 0000 CR0: 0000000080050033
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.415516] CR2:
00005e106a0554e0 CR3: 0000000112bf0001 CR4: 0000000000772ef0
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.416211] DR0:
0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.416907] DR3:
0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Jan 3 12:34:41 ceph-testing-0001 kernel: [ 878.417630] PKRU: 55555554

BUG_ON(length > msg->data_length) triggers the issue:

(gdb) l *ceph_msg_data_cursor_init+0x42
0xffffffff823b45a2 is in ceph_msg_data_cursor_init
(net/ceph/messenger.c:1070).
1065
1066 void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor
*cursor,
1067 struct ceph_msg *msg, size_t length)
1068 {
1069 BUG_ON(!length);
1070 BUG_ON(length > msg->data_length);
1071 BUG_ON(!msg->num_data_items);
1072
1073 cursor->total_resid = length;
1074 cursor->data = msg->data;

The issue takes place because of this:
Jan 6 14:59:24 ceph-testing-0001 kernel: [ 202.628853] libceph: pid
144:net/ceph/messenger_v2.c:2034 prepare_sparse_read_data(): msg-
>data_length 33792, msg->sparse_read_total 36864
1070 BUG_ON(length > msg->data_length);
msg->sparse_read_total 36864 > msg->data_length 33792

The generic/397 test (xfstests) executes such steps:
(1) create encrypted files and directories;
(2) access the created files and folders with encryption key;
(3) access the created files and folders without encryption key.

We have issue because there is alignment of length before calling
ceph_osdc_new_request():

Jan 8 12:49:22 ceph-testing-0001 kernel: [ 301.522120] ceph: pid
8931:fs/ceph/crypto.h:148 ceph_fscrypt_adjust_off_and_len(): ino
0x100000009d5: encrypted 0x4000, len 33792
Jan 8 12:49:22 ceph-testing-0001 kernel: [ 301.523706] ceph: pid
8931:fs/ceph/crypto.h:155 ceph_fscrypt_adjust_off_and_len(): ino
0x100000009d5: encrypted 0x4000, len 36864

This patch uses unaligned size for retrieving file's content
from OSD and, then, aligned the size of sparse extent in the
case if inode is encrypted.

./check generic/397
FSTYP         -- ceph
PLATFORM      -- Linux/x86_64 ceph-testing-0001 6.13.0-rc7+ #58 SMP
PREEMPT_DYNAMIC Wed Jan 15 00:07:06 UTC 2025
MKFS_OPTIONS  -- 127.0.0.1:40629:/scratch
MOUNT_OPTIONS -- -o
name=fs,secret=<hidden>,ms_mode=crc,nowsync,copyfrom
127.0.0.1:<port>:/scratch /mnt/scratch

generic/397 1s ...  1s
Ran: generic/397
Passed all 1 tests

Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
---
 fs/ceph/addr.c | 30 ++++++++++++++++++++++++------
 fs/ceph/file.c | 20 +++++++++++++++++---
 2 files changed, 41 insertions(+), 9 deletions(-)

+		 */
+
 		req = ceph_osdc_new_request(osdc, &ci->i_layout,
 					ci->i_vino, read_off,
&read_len, 0, 1,
 					sparse ?
CEPH_OSD_OP_SPARSE_READ :
@@ -1111,6 +1114,9 @@ ssize_t __ceph_sync_read(struct inode *inode,
loff_t *ki_pos,
 			break;
 		}
 
+		/* determine new offset/length if encrypted */
+		ceph_fscrypt_adjust_off_and_len(inode, &read_off,
&read_len);
+
 		/* adjust len downward if the request truncated the
len */
 		if (off + len > read_off + read_len)
 			len = read_off + read_len - off;
@@ -1168,7 +1174,15 @@ ssize_t __ceph_sync_read(struct inode *inode,
loff_t *ki_pos,
 		}
 
 		if (IS_ENCRYPTED(inode)) {
+			struct ceph_sparse_extent *ext;
 			int fret;
+			int i;
+
+			for (i = 0; i < op->extent.sparse_ext_cnt;
i++) {
+				ext = &op->extent.sparse_ext[i];
+				ceph_fscrypt_adjust_off_and_len(inode,
+						&ext->off, &ext->len);
+			}
 
 			fret = ceph_fscrypt_decrypt_extents(inode,
pages,
 					read_off, op-
>extent.sparse_ext,
diff mbox series

Patch

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 85936f6d2bf7..5a7a698ad8e8 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -235,6 +235,15 @@  static void finish_netfs_read(struct
ceph_osd_request *req)
 		    subreq->rreq->origin != NETFS_DIO_READ)
 			__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq-
>flags);
 		if (IS_ENCRYPTED(inode) && err > 0) {
+			struct ceph_sparse_extent *ext;
+			int i;
+
+			for (i = 0; i < op->extent.sparse_ext_cnt;
i++) {
+				ext = &op->extent.sparse_ext[i];
+				ceph_fscrypt_adjust_off_and_len(inode,
+						&ext->off, &ext->len);
+			}
+
 			err = ceph_fscrypt_decrypt_extents(inode,
 					osd_data->pages, subreq-
>start,
 					op->extent.sparse_ext,
@@ -357,13 +366,12 @@  static void ceph_netfs_issue_read(struct
netfs_io_subrequest *subreq)
 	if (ceph_has_inline_data(ci) &&
ceph_netfs_issue_op_inline(subreq))
 		return;
 
-	// TODO: This rounding here is slightly dodgy.  It *should*
work, for
-	// now, as the cache only deals in blocks that are a multiple
of
-	// PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE.  What
needs to
-	// happen is for the fscrypt driving to be moved into netfslib
and the
-	// data in the cache also to be stored encrypted.
+	/*
+	 * It needs to provide the real legth (not aligned) for
network subsystem.
+	 * Otherwise, ceph_msg_data_cursor_init() triggers BUG_ON() in
the case
+	 * if msg->sparse_read_total > msg->data_length.
+	 */
 	len = subreq->len;
-	ceph_fscrypt_adjust_off_and_len(inode, &off, &len);
 
 	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino,
 			off, &len, 0, 1, sparse ?
CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
@@ -375,6 +383,16 @@  static void ceph_netfs_issue_read(struct
netfs_io_subrequest *subreq)
 		goto out;
 	}
 
+	// TODO: This rounding here is slightly dodgy.  It *should*
work, for
+	// now, as the cache only deals in blocks that are a multiple
of
+	// PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE.  What
needs to
+	// happen is for the fscrypt driving to be moved into netfslib
and the
+	// data in the cache also to be stored encrypted.
+	ceph_fscrypt_adjust_off_and_len(inode, &off, &len);
+
+	doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
+	      ceph_vinop(inode), subreq->start, subreq->len, len);
+
 	if (sparse) {
 		extent_cnt = __ceph_sparse_read_ext_count(inode, len);
 		err = ceph_alloc_sparse_ext_map(&req->r_ops[0],
extent_cnt);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 851d70200c6b..9446a5f001aa 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1093,12 +1093,15 @@  ssize_t __ceph_sync_read(struct inode *inode,
loff_t *ki_pos,
 		u64 read_len = len;
 		int extent_cnt;
 
-		/* determine new offset/length if encrypted */
-		ceph_fscrypt_adjust_off_and_len(inode, &read_off,
&read_len);
-
 		doutc(cl, "orig %llu~%llu reading %llu~%llu", off,
len,
 		      read_off, read_len);
 
+		/*
+		 * It needs to provide the real legth (not aligned)
for network subsystem.
+		 * Otherwise, ceph_msg_data_cursor_init() triggers
BUG_ON() in the case
+		 * if msg->sparse_read_total > msg->data_length.