diff mbox series

[22/41] lustre: obdclass: try to skip corrupted llog records

Message ID 1617583870-32029-23-git-send-email-jsimmons@infradead.org (mailing list archive)
State New, archived
Headers show
Series lustre: sync to OpenSFS branch as of March 1 | expand

Commit Message

James Simmons April 5, 2021, 12:50 a.m. UTC
From: Alex Zhuravlev <bzzz@whamcloud.com>

if llog's header or record is found corrupted, then
ignore the remaining records and try with the next one.

WC-bug-id: https://jira.whamcloud.com/browse/LU-14098
Lustre-commit: 910eb97c1b43a44 ("LU-14098 obdclass: try to skip corrupted llog records")
Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/40754
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Mike Pershin <mpershin@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
---
 fs/lustre/obdclass/llog.c          | 76 ++++++++++++++++++++++++++++++--------
 fs/lustre/obdclass/llog_cat.c      | 14 +++----
 fs/lustre/obdclass/llog_internal.h |  5 +++
 3 files changed, 72 insertions(+), 23 deletions(-)
diff mbox series

Patch

diff --git a/fs/lustre/obdclass/llog.c b/fs/lustre/obdclass/llog.c
index e172ebc..7668d51 100644
--- a/fs/lustre/obdclass/llog.c
+++ b/fs/lustre/obdclass/llog.c
@@ -184,7 +184,7 @@  int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
 			     (llh->llh_flags & LLOG_F_IS_CAT &&
 			      flags & LLOG_F_IS_PLAIN))) {
 			CERROR("%s: llog type is %s but initializing %s\n",
-			       handle->lgh_ctxt->loc_obd->obd_name,
+			       loghandle2name(handle),
 			       llh->llh_flags & LLOG_F_IS_CAT ?
 			       "catalog" : "plain",
 			       flags & LLOG_F_IS_CAT ? "catalog" : "plain");
@@ -206,7 +206,7 @@  int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
 		if (unlikely(uuid &&
 			     !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) {
 			CERROR("%s: llog uuid mismatch: %s/%s\n",
-			       handle->lgh_ctxt->loc_obd->obd_name,
+			       loghandle2name(handle),
 			       (char *)uuid->uuid,
 			       (char *)llh->llh_tgtuuid.uuid);
 			rc = -EEXIST;
@@ -220,8 +220,8 @@  int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
 		llh->llh_flags |= LLOG_F_IS_FIXSIZE;
 	} else if (!(flags & LLOG_F_IS_PLAIN)) {
 		CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n",
-		       handle->lgh_ctxt->loc_obd->obd_name,
-		       flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN);
+		       loghandle2name(handle), flags, LLOG_F_IS_CAT,
+		       LLOG_F_IS_PLAIN);
 		rc = -EINVAL;
 	}
 	llh->llh_flags |= fmt;
@@ -234,6 +234,29 @@  int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
 }
 EXPORT_SYMBOL(llog_init_handle);
 
+int llog_verify_record(const struct llog_handle *llh, struct llog_rec_hdr *rec)
+{
+	int chunk_size = llh->lgh_hdr->llh_hdr.lrh_len;
+
+	if (rec->lrh_len == 0 || rec->lrh_len > chunk_size) {
+		CERROR("%s: record is too large: %d > %d\n",
+		       loghandle2name(llh), rec->lrh_len, chunk_size);
+		return -EINVAL;
+	}
+	if (rec->lrh_index >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr)) {
+		CERROR("%s: index is too high: %d\n",
+		       loghandle2name(llh), rec->lrh_index);
+		return -EINVAL;
+	}
+	if ((rec->lrh_type & LLOG_OP_MASK) != LLOG_OP_MAGIC) {
+		CERROR("%s: magic %x is bad\n",
+		       loghandle2name(llh), rec->lrh_type);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int llog_process_thread(void *arg)
 {
 	struct llog_process_info *lpi = arg;
@@ -247,6 +270,7 @@  static int llog_process_thread(void *arg)
 	int saved_index = 0;
 	int last_called_index = 0;
 	bool repeated = false;
+	bool refresh_idx = false;
 
 	if (!llh)
 		return -EINVAL;
@@ -380,12 +404,21 @@  static int llog_process_thread(void *arg)
 
 			repeated = false;
 
-			if (!rec->lrh_len || rec->lrh_len > chunk_size) {
-				CWARN("invalid length %d in llog record for index %d/%d\n",
-				      rec->lrh_len,
-				      rec->lrh_index, index);
-				rc = -EINVAL;
-				goto out;
+			rc = llog_verify_record(loghandle, rec);
+			if (rc) {
+				CERROR("%s: invalid record in llog "DFID" record for index %d/%d: rc = %d\n",
+				       loghandle2name(loghandle),
+                                       PFID(&loghandle->lgh_id.lgl_oi.oi_fid),
+				       rec->lrh_len, index, rc);
+				/*
+				 * the block seem to be corrupted, let's try
+				 * with the next one. reset rc to go to the
+				 * next chunk.
+				 */
+				refresh_idx = true;
+				index = 0;
+				rc = 0;
+				goto repeat;
 			}
 
 			if (rec->lrh_index < index) {
@@ -395,11 +428,22 @@  static int llog_process_thread(void *arg)
 			}
 
 			if (rec->lrh_index != index) {
-				CERROR("%s: Invalid record: index %u but expected %u\n",
-				       loghandle->lgh_ctxt->loc_obd->obd_name,
-				       rec->lrh_index, index);
-				rc = -ERANGE;
-				goto out;
+				/*
+				 * the last time we couldn't parse the block due
+				 * to corruption, thus has no idea about the
+				 * next index, take it from the block, once.
+				 */
+				if (refresh_idx) {
+					refresh_idx = false;
+					index = rec->lrh_index;
+				} else {
+					CERROR("%s: "DFID" Invalid record: index %u but expected %u\n",
+					       loghandle2name(loghandle),
+					       PFID(&loghandle->lgh_id.lgl_oi.oi_fid),
+					       rec->lrh_index, index);
+					rc = -ERANGE;
+					goto out;
+				}
 			}
 
 			CDEBUG(D_OTHER,
@@ -501,7 +545,7 @@  int llog_process_or_fork(const struct lu_env *env,
 		if (IS_ERR(task)) {
 			rc = PTR_ERR(task);
 			CERROR("%s: cannot start thread: rc = %d\n",
-			       loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+			       loghandle2name(loghandle), rc);
 			goto out_lpi;
 		}
 		wait_for_completion(&lpi->lpi_completion);
diff --git a/fs/lustre/obdclass/llog_cat.c b/fs/lustre/obdclass/llog_cat.c
index 9298808..b67e7a2b 100644
--- a/fs/lustre/obdclass/llog_cat.c
+++ b/fs/lustre/obdclass/llog_cat.c
@@ -80,7 +80,7 @@  static int llog_cat_id2handle(const struct lu_env *env,
 		    ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) {
 			if (cgl->lgl_ogen != logid->lgl_ogen) {
 				CWARN("%s: log " DFID " generation %x != %x\n",
-				      loghandle->lgh_ctxt->loc_obd->obd_name,
+				      loghandle2name(loghandle),
 				      PFID(&logid->lgl_oi.oi_fid),
 				      cgl->lgl_ogen, logid->lgl_ogen);
 				continue;
@@ -88,7 +88,7 @@  static int llog_cat_id2handle(const struct lu_env *env,
 			*res = llog_handle_get(loghandle);
 			if (!*res) {
 				CERROR("%s: log "DFID" refcount is zero!\n",
-				       loghandle->lgh_ctxt->loc_obd->obd_name,
+				       loghandle2name(loghandle),
 				       PFID(&logid->lgl_oi.oi_fid));
 				continue;
 			}
@@ -103,8 +103,8 @@  static int llog_cat_id2handle(const struct lu_env *env,
 		       LLOG_OPEN_EXISTS);
 	if (rc < 0) {
 		CERROR("%s: error opening log id " DFID ":%x: rc = %d\n",
-		       cathandle->lgh_ctxt->loc_obd->obd_name,
-		       PFID(&logid->lgl_oi.oi_fid), logid->lgl_ogen, rc);
+		       loghandle2name(cathandle), PFID(&logid->lgl_oi.oi_fid),
+		       logid->lgl_ogen, rc);
 		return rc;
 	}
 
@@ -155,7 +155,7 @@  static int llog_cat_process_common(const struct lu_env *env,
 	if (rec->lrh_type != le32_to_cpu(LLOG_LOGID_MAGIC)) {
 		rc = -EINVAL;
 		CWARN("%s: invalid record in catalog " DFID ":%x: rc = %d\n",
-		      cat_llh->lgh_ctxt->loc_obd->obd_name,
+		      loghandle2name(cat_llh),
 		      PFID(&cat_llh->lgh_id.lgl_oi.oi_fid),
 		      cat_llh->lgh_id.lgl_ogen, rc);
 
@@ -170,7 +170,7 @@  static int llog_cat_process_common(const struct lu_env *env,
 	rc = llog_cat_id2handle(env, cat_llh, llhp, &lir->lid_id);
 	if (rc) {
 		CWARN("%s: can't find llog handle " DFID ":%x: rc = %d\n",
-		      cat_llh->lgh_ctxt->loc_obd->obd_name,
+		      loghandle2name(cat_llh),
 		      PFID(&lir->lid_id.lgl_oi.oi_fid),
 		      lir->lid_id.lgl_ogen, rc);
 
@@ -235,7 +235,7 @@  static int llog_cat_process_or_fork(const struct lu_env *env,
 		struct llog_process_cat_data cd;
 
 		CWARN("%s: catlog " DFID " crosses index zero\n",
-		      cat_llh->lgh_ctxt->loc_obd->obd_name,
+		      loghandle2name(cat_llh),
 		      PFID(&cat_llh->lgh_id.lgl_oi.oi_fid));
 		/*startcat = 0 is default value for general processing */
 		if ((startcat != LLOG_CAT_FIRST &&
diff --git a/fs/lustre/obdclass/llog_internal.h b/fs/lustre/obdclass/llog_internal.h
index c34adfe..41ac4f0 100644
--- a/fs/lustre/obdclass/llog_internal.h
+++ b/fs/lustre/obdclass/llog_internal.h
@@ -74,4 +74,9 @@  static inline struct llog_rec_hdr *llog_rec_hdr_next(struct llog_rec_hdr *rec)
 {
 	return (struct llog_rec_hdr *)((char *)rec + rec->lrh_len);
 }
+
+static inline char *loghandle2name(const struct llog_handle *lgh)
+{
+	return lgh->lgh_ctxt->loc_obd->obd_name;
+}
 #endif