@@ -1530,10 +1530,161 @@ static int btrfs_r5l_write_empty_meta_block(struct btrfs_r5l_log *log, u64 pos,
return ret;
}
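+
+/*
+ * Context for scanning the log during recovery: @pos and @seq locate
+ * the meta block currently being checked, @total_size is the byte
+ * length of the meta/data/parity unit just parsed, and the two pages
+ * buffer the meta block and the payload I/O respectively.
+ */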
+struct btrfs_r5l_recover_ctx {
+ u64 pos;
+ u64 seq;
+ u64 total_size;
+ struct page *meta_page;
+ struct page *io_page;
+};
+
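+/*
+ * Read the meta block at @ctx->pos and sanity check its magic,
+ * position and sequence number. A mismatch means we have scanned
+ * past the end of the valid log.
+ */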
+static int btrfs_r5l_recover_load_meta(struct btrfs_r5l_log *log, struct btrfs_r5l_recover_ctx *ctx)
+{
+ struct btrfs_r5l_meta_block *mb;
+
+ btrfs_r5l_sync_page_io(log, log->dev, (ctx->pos >> 9), PAGE_SIZE, ctx->meta_page, REQ_OP_READ);
+
+ mb = kmap(ctx->meta_page);
+#ifdef BTRFS_DEBUG_R5LOG
+ trace_printk("ctx->pos %llu ctx->seq %llu pos %llu seq %llu\n", ctx->pos, ctx->seq, le64_to_cpu(mb->position), le64_to_cpu(mb->seq));
+#endif
+
+ if (le32_to_cpu(mb->magic) != BTRFS_R5LOG_MAGIC ||
+ le64_to_cpu(mb->position) != ctx->pos ||
+ le64_to_cpu(mb->seq) != ctx->seq) {
+#ifdef BTRFS_DEBUG_R5LOG
+ trace_printk("%s: magic mismatch, got %u expected %u\n", __func__, le32_to_cpu(mb->magic), BTRFS_R5LOG_MAGIC);
+#endif
+ kunmap(ctx->meta_page);
+ return -EINVAL;
+ }
+
+ ASSERT(le32_to_cpu(mb->meta_size) <= PAGE_SIZE);
+ kunmap(ctx->meta_page);
+
+ /* the meta block itself occupies the first page of this unit */
+ ctx->total_size = PAGE_SIZE;
+
+ return 0;
+}
+
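+/*
+ * Replay every payload described by the meta block: read each data or
+ * parity page back from the log device and write it out to its final
+ * location on the raid disk it belongs to.
+ */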
+static int btrfs_r5l_recover_load_data(struct btrfs_r5l_log *log, struct btrfs_r5l_recover_ctx *ctx)
+{
+ u64 offset;
+ struct btrfs_r5l_meta_block *mb;
+ u64 meta_size;
+ u64 io_offset;
+ struct btrfs_device *dev;
+
+ mb = kmap(ctx->meta_page);
+
+ io_offset = PAGE_SIZE;
+ offset = sizeof(struct btrfs_r5l_meta_block);
+ meta_size = le32_to_cpu(mb->meta_size);
+
+ while (offset < meta_size) {
+ struct btrfs_r5l_payload *payload = (void *)mb + offset;
+
+ /* read data from log disk and write to payload->location */
+#ifdef BTRFS_DEBUG_R5LOG
+ trace_printk("payload type %d flags %d size %u location 0x%llx devid %llu\n", le16_to_cpu(payload->type), le16_to_cpu(payload->flags), le32_to_cpu(payload->size), le64_to_cpu(payload->location), le64_to_cpu(payload->devid));
+#endif
+
+ dev = btrfs_find_device(log->fs_info, le64_to_cpu(payload->devid), NULL, NULL);
+ /* a missing raid disk leaves us nowhere to replay this payload */
+ ASSERT(dev && !dev->missing);
+
+ if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_DATA) {
+ /* data payloads carry exactly one page */
+ ASSERT(le32_to_cpu(payload->size) == 1);
+ btrfs_r5l_sync_page_io(log, log->dev, (ctx->pos + io_offset) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_READ);
+ btrfs_r5l_sync_page_io(log, dev, le64_to_cpu(payload->location) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE);
+ io_offset += PAGE_SIZE;
+ } else if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_PARITY) {
+ int i;
+
+ ASSERT(le32_to_cpu(payload->size) == 16);
+ /*
+ * liubo: parity pages are guaranteed to be contiguous,
+ * so we could use just one bio to hold all of them and
+ * flush them in one go.
+ */
+ for (i = 0; i < le32_to_cpu(payload->size); i++) {
+ u64 parity_off = le64_to_cpu(payload->location) + i * PAGE_SIZE;
+
+ btrfs_r5l_sync_page_io(log, log->dev, (ctx->pos + io_offset) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_READ);
+ btrfs_r5l_sync_page_io(log, dev, parity_off >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE);
+ io_offset += PAGE_SIZE;
+ }
+ } else {
+ ASSERT(0);
+ }
+
+ offset += sizeof(struct btrfs_r5l_payload);
+ }
+ kunmap(ctx->meta_page);
+
+ ctx->total_size += (io_offset - PAGE_SIZE);
+ return 0;
+}
+
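+/*
+ * Walk the log from the last checkpoint, replaying one
+ * meta/data/parity unit per iteration until a meta block fails the
+ * sanity check.
+ */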
+static int btrfs_r5l_recover_flush_log(struct btrfs_r5l_log *log, struct btrfs_r5l_recover_ctx *ctx)
+{
+ int ret;
+
+ while (1) {
+ ret = btrfs_r5l_recover_load_meta(log, ctx);
+ if (ret) {
+ /*
+ * The first meta block that fails the sanity check
+ * marks the end of the valid log; this is the
+ * expected way out of the scan, not an error.
+ */
+ ret = 0;
+ break;
+ }
+
+ ret = btrfs_r5l_recover_load_data(log, ctx);
+ if (ret)
+ break;
+
+ ctx->seq++;
+ ctx->pos = btrfs_r5l_ring_add(log, ctx->pos, ctx->total_size);
+ }
+
+ return ret;
+}
+
static void btrfs_r5l_write_super(struct btrfs_fs_info *fs_info, u64 cp);
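+/*
+ * Replay the valid portion of the log, then seal the log with an
+ * empty meta block so that stale units behind the new tail can never
+ * pass the sanity check after a future power loss (see the
+ * changelog).
+ */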
static int btrfs_r5l_recover_log(struct btrfs_r5l_log *log)
{
+ struct btrfs_r5l_recover_ctx *ctx;
+ u64 pos;
+ int ret;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_NOFS);
+ ASSERT(ctx);
+
+ ctx->pos = log->last_checkpoint;
+ ctx->seq = log->last_cp_seq;
+ ctx->meta_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ ASSERT(ctx->meta_page);
+ ctx->io_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ ASSERT(ctx->io_page);
+
+ ret = btrfs_r5l_recover_flush_log(log, ctx);
+ if (ret) {
+ __free_page(ctx->io_page);
+ __free_page(ctx->meta_page);
+ kfree(ctx);
+ return ret;
+ }
+
+ pos = ctx->pos;
+ log->next_checkpoint = ctx->pos;
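+
+ /*
+ * Leave a large gap in the sequence numbers so that stale units
+ * left behind the new empty meta block can never match the
+ * (position, seq) pair of anything written from now on.
+ */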
+ ctx->seq += 10000;
+ btrfs_r5l_write_empty_meta_block(log, ctx->pos, ctx->seq++);
+ ctx->pos = btrfs_r5l_ring_add(log, ctx->pos, PAGE_SIZE);
+
+ log->log_start = ctx->pos;
+ log->seq = ctx->seq;
+ /* last_checkpoint points to the empty meta block we just wrote. */
+ log->last_checkpoint = pos;
+ btrfs_r5l_write_super(log->fs_info, pos);
+
+#ifdef BTRFS_DEBUG_R5LOG
+ trace_printk("%s: log_start %llu seq %llu\n", __func__, log->log_start, log->seq);
+#endif
+ __free_page(ctx->meta_page);
+ __free_page(ctx->io_page);
+ kfree(ctx);
return 0;
}
This adds recovery of the raid5/6 log. We've stored a %journal_tail in the super_block, which indicates the position from which we need to replay data. So we scan the log from there and replay valid meta/data/parity units until we hit an invalid one. Replaying simply reads data/parity back from the raid5/6 log and writes it out to the raid disks where it belongs. Note that a whole meta/data/parity unit is discarded if it fails the sanity check in its meta block.

After recovery, we also append an empty meta block and update the %journal_tail in the super_block, in order to avoid the following situation: say the layout on the raid5/6 log is [valid A][invalid B][valid C], so block A is the only one we should replay. Recovery thus ends up pointing to block A since block B is invalid, then new writes come in and are appended after block A, so that block B is overwritten and becomes a valid meta/data/parity unit. If a power loss happens now, the next recovery starts again from block A, and since block B is now valid, it may replay the stale block C as well.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/raid56.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
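Note: the on-disk fields this hunk relies on, inferred purely from the le*_to_cpu() accessors above. The real definitions live earlier in the series; the field order below is only a guess.

struct btrfs_r5l_meta_block {
	__le32 magic;		/* BTRFS_R5LOG_MAGIC */
	__le32 meta_size;	/* bytes used in this block, <= PAGE_SIZE */
	__le64 seq;		/* must match the expected sequence number */
	__le64 position;	/* byte offset of this block in the log */
	/* followed by an array of struct btrfs_r5l_payload */
};

struct btrfs_r5l_payload {
	__le16 type;		/* R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */
	__le16 flags;
	__le32 size;		/* number of pages to replay */
	__le64 location;	/* byte offset on the target raid disk */
	__le64 devid;		/* raid device to replay onto */
};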