diff mbox series

[5/5] xfs_scrub_all: implement retry and backoff for dbus calls

Message ID 172229849719.1350420.4990697396905953343.stgit@frogsfrogsfrogs (mailing list archive)
State Accepted, archived
Headers show
Series [1/5] xfs_scrub_all: encapsulate all the subprocess code in an object | expand

Commit Message

Darrick J. Wong July 30, 2024, 1:17 a.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

Calls to systemd across dbus are remote procedure calls, which means
that they're subject to transitory connection failures (e.g. systemd
re-executing itself).  We don't want to fail at the *first* sign of what
could be temporary trouble, so implement a limited retry with Fibonacci
backoff before we resort to invoking xfs_scrub as a subprocess.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 scrub/xfs_scrub_all.in |   43 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 3 deletions(-)
diff mbox series

Patch

diff --git a/scrub/xfs_scrub_all.in b/scrub/xfs_scrub_all.in
index f2e916513..5440e51c0 100644
--- a/scrub/xfs_scrub_all.in
+++ b/scrub/xfs_scrub_all.in
@@ -165,6 +165,22 @@  def path_to_serviceunit(path, scrub_media):
 	for line in proc.stdout:
 		return line.decode(sys.stdout.encoding).strip()
 
+def fibonacci(max_ret):
+	'''Yield fibonacci numbers (1, 1, 2, ...) that do not exceed max_ret.'''
+	if max_ret < 1:
+		return
+
+	x = 0
+	y = 1
+	yield 1
+
+	z = x + y
+	while z <= max_ret:
+		yield z
+		x = y
+		y = z
+		z = x + y
+
+
 class scrub_service(scrub_control):
 	'''Control object for xfs_scrub systemd service.'''
 	def __init__(self, mnt, scrub_media):
@@ -188,6 +204,25 @@  class scrub_service(scrub_control):
 		self.unit = dbus.Interface(svc_obj,
 				'org.freedesktop.systemd1.Unit')
 
+	def __dbusrun(self, lambda_fn):
+		'''Call lambda_fn and return its result.  After each dbus
+		exception, sleep with Fibonacci backoff, call self.bind(), and
+		retry; the last exception is raised once retries run out.'''
+		global debug
+
+		fatal_ex = None
+
+		for i in fibonacci(30):
+			try:
+				return lambda_fn()
+			except dbus.exceptions.DBusException as e:
+				if debug:
+					print(e)
+				fatal_ex = e
+				time.sleep(i)
+				self.bind()
+		raise fatal_ex
+
+
 	def state(self):
 		'''Retrieve the active state for a systemd service.  As of
 		systemd 249, this is supposed to be one of the following:
@@ -195,8 +230,10 @@  class scrub_service(scrub_control):
 		or "deactivating".  These strings are not localized.'''
 		global debug
 
+		l = lambda: self.prop.Get('org.freedesktop.systemd1.Unit',
+				'ActiveState')
 		try:
-			return self.prop.Get('org.freedesktop.systemd1.Unit', 'ActiveState')
+			return self.__dbusrun(l)
 		except Exception as e:
 			if debug:
 				print(e, file = sys.stderr)
@@ -231,7 +268,7 @@  class scrub_service(scrub_control):
 			print('starting %s' % self.unitname)
 
 		try:
-			self.unit.Start('replace')
+			self.__dbusrun(lambda: self.unit.Start('replace'))
 			return self.wait()
 		except Exception as e:
 			print(e, file = sys.stderr)
@@ -245,7 +282,7 @@  class scrub_service(scrub_control):
 			print('stopping %s' % self.unitname)
 
 		try:
-			self.unit.Stop('replace')
+			self.__dbusrun(lambda: self.unit.Stop('replace'))
 			return self.wait()
 		except Exception as e:
 			print(e, file = sys.stderr)