diff mbox series

[05/11] libceph: journaling: introduce api to replay uncommitted journal events

Message ID 1543841435-13652-6-git-send-email-dongsheng.yang@easystack.cn (mailing list archive)
State New, archived
Headers show
Series [01/11] libceph: support prefix and suffix in bio_iter | expand

Commit Message

Dongsheng Yang Dec. 3, 2018, 12:50 p.m. UTC
When opening a journal, we need to make sure the data and the journal are
consistent. To do that, call ceph_journaler_start_replay() to replay all events
that were recorded but not yet committed.

Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
---
 include/linux/ceph/journaler.h |   2 +
 net/ceph/journaler.c           | 535 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 537 insertions(+)
diff mbox series

Patch

diff --git a/include/linux/ceph/journaler.h b/include/linux/ceph/journaler.h
index 50e8c52..acc1c6f 100644
--- a/include/linux/ceph/journaler.h
+++ b/include/linux/ceph/journaler.h
@@ -159,4 +159,6 @@  struct ceph_journaler *ceph_journaler_create(struct ceph_osd_client *osdc,
 
 int ceph_journaler_get_cached_client(struct ceph_journaler *journaler, char *client_id,
 				     struct ceph_journaler_client **client_result);
+/*
+ * Replaying: fetch the journal objects and hand the entries that remain after
+ * the recorded commit positions to the journaler's entry handler.
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int ceph_journaler_start_replay(struct ceph_journaler *journaler);
 #endif
diff --git a/net/ceph/journaler.c b/net/ceph/journaler.c
index 8f2ed41..3b73725 100644
--- a/net/ceph/journaler.c
+++ b/net/ceph/journaler.c
@@ -511,3 +511,538 @@  int ceph_journaler_get_cached_client(struct ceph_journaler *journaler, char *cli
 	return ret;
 }
 EXPORT_SYMBOL(ceph_journaler_get_cached_client);
+
+// replaying
+/*
+ * Synchronously read up to @buf_len bytes, starting at offset @read_off, from
+ * the object identified by @oid/@oloc and copy the result into @buf.
+ *
+ * Returns the value reported by ceph_osdc_wait_request() (a non-negative value
+ * is used as the number of bytes copied into @buf), or a negative errno if the
+ * request could not be set up.
+ */
+static int ceph_journaler_obj_read_sync(struct ceph_journaler *journaler,
+			     struct ceph_object_id *oid,
+			     struct ceph_object_locator *oloc,
+			     void *buf, uint32_t read_off, uint64_t buf_len)
+{
+	struct ceph_osd_client *osdc = journaler->osdc;
+	int nr_pages = calc_pages_for(0, buf_len);
+	struct ceph_osd_request *req;
+	struct page **page_vec;
+	int rc;
+
+	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
+	if (req == NULL)
+		return -ENOMEM;
+
+	ceph_oid_copy(&req->r_base_oid, oid);
+	ceph_oloc_copy(&req->r_base_oloc, oloc);
+	req->r_flags = CEPH_OSD_FLAG_READ;
+
+	page_vec = ceph_alloc_page_vector(nr_pages, GFP_KERNEL);
+	if (IS_ERR(page_vec)) {
+		rc = PTR_ERR(page_vec);
+		goto put_req;
+	}
+
+	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, read_off, buf_len, 0, 0);
+	/*
+	 * NOTE(review): the final 'true' presumably hands ownership of the
+	 * page vector to the request, which is why it is never released
+	 * explicitly in this function - confirm.
+	 */
+	osd_req_op_extent_osd_data_pages(req, 0, page_vec, buf_len, 0, false,
+					 true);
+
+	rc = ceph_osdc_alloc_messages(req, GFP_KERNEL);
+	if (rc == 0) {
+		ceph_osdc_start_request(osdc, req, false);
+		rc = ceph_osdc_wait_request(osdc, req);
+		if (rc >= 0)
+			ceph_copy_from_page_vector(page_vec, buf, 0, rc);
+	}
+
+put_req:
+	ceph_osdc_put_request(req);
+	return rc;
+}
+
+/*
+ * Check whether the bytes in [@buf, @end) hold one complete journal entry with
+ * a matching crc.
+ *
+ * Returns true if they do. Otherwise returns false and sets *@bytes_needed:
+ *   - non-zero: the buffer ends too early for the part being checked, and
+ *     this many more bytes are needed for that part;
+ *   - zero: the preamble or the crc does not match.
+ */
+static bool entry_is_readable(struct ceph_journaler *journaler, void *buf,
+			      void *end, uint32_t *bytes_needed)
+{
+	void *entry_start = buf;
+	void *cursor = buf;
+	uint32_t avail;
+	uint32_t payload_len;
+	uint32_t computed_crc, stored_crc;
+
+	/* the fixed-size header must be fully present */
+	avail = end - cursor;
+	if (avail < HEADER_FIXED_SIZE) {
+		*bytes_needed = HEADER_FIXED_SIZE - avail;
+		return false;
+	}
+
+	if (ceph_decode_64(&cursor) != PREAMBLE) {
+		*bytes_needed = 0;
+		return false;
+	}
+
+	/* step over the rest of the fixed header, up to the payload length */
+	cursor += HEADER_FIXED_SIZE - sizeof(uint64_t);
+	avail = end - cursor;
+	if (avail < sizeof(uint32_t)) {
+		*bytes_needed = sizeof(uint32_t) - avail;
+		return false;
+	}
+
+	payload_len = ceph_decode_32(&cursor);
+	avail = end - cursor;
+	if (avail < payload_len) {
+		*bytes_needed = payload_len - avail;
+		return false;
+	}
+	cursor += payload_len;
+
+	/* the trailing crc */
+	avail = end - cursor;
+	if (avail < sizeof(uint32_t)) {
+		*bytes_needed = sizeof(uint32_t) - avail;
+		return false;
+	}
+
+	*bytes_needed = 0;
+	/* the crc covers everything from the preamble to the end of the payload */
+	computed_crc = crc32c(0, entry_start, cursor - entry_start);
+	stored_crc = ceph_decode_32(&cursor);
+	if (computed_crc == stored_crc)
+		return true;
+
+	pr_err("crc corrupted");
+	return false;
+}
+
+/*
+ * Pass @entry and its @commit_tid to the registered entry handler.
+ *
+ * Returns the handler's return value, or 0 when no handler is registered.
+ */
+static int playback_entry(struct ceph_journaler *journaler,
+			  struct ceph_journaler_entry *entry,
+			  uint64_t commit_tid)
+{
+	if (journaler->handle_entry == NULL)
+		return 0;
+
+	return journaler->handle_entry(journaler->entry_handler,
+				       entry, commit_tid);
+}
+
+/*
+ * Look up the entry tid recorded for @tag_tid in journaler->entry_tids.
+ *
+ * Returns true and stores the tid in *@entry_tid when @tag_tid is tracked;
+ * returns false and leaves *@entry_tid untouched otherwise.
+ */
+static bool get_last_entry_tid(struct ceph_journaler *journaler,
+			       uint64_t tag_tid, uint64_t *entry_tid)
+{
+	struct entry_tid *cur;
+	bool found = false;
+
+	spin_lock(&journaler->entry_tid_lock);
+	list_for_each_entry(cur, &journaler->entry_tids, node) {
+		if (cur->tag_tid != tag_tid)
+			continue;
+
+		*entry_tid = cur->entry_tid;
+		found = true;
+		break;
+	}
+	spin_unlock(&journaler->entry_tid_lock);
+
+	return found;
+}
+
+/*
+ * Record @entry_tid as the highest entry tid seen for @tag_tid.
+ *
+ * If @tag_tid is already tracked in journaler->entry_tids, its entry_tid is
+ * only ever raised, never lowered. Otherwise a new tracking node is appended
+ * to the list.
+ *
+ * The node is allocated with GFP_KERNEL, which may sleep, so the allocation
+ * happens with entry_tid_lock dropped and the list is searched again once the
+ * lock has been retaken.
+ *
+ * Returns 0 on success or -ENOMEM if a new node could not be allocated.
+ */
+static int reserve_entry_tid(struct ceph_journaler *journaler,
+			      uint64_t tag_tid, uint64_t entry_tid)
+{
+	struct entry_tid *pos;
+	struct entry_tid *new_tid = NULL;
+
+retry:
+	spin_lock(&journaler->entry_tid_lock);
+	list_for_each_entry(pos, &journaler->entry_tids, node) {
+		if (pos->tag_tid == tag_tid) {
+			if (pos->entry_tid < entry_tid)
+				pos->entry_tid = entry_tid;
+
+			spin_unlock(&journaler->entry_tid_lock);
+			/* only set if the tag showed up while we were allocating */
+			kfree(new_tid);
+			return 0;
+		}
+	}
+
+	if (!new_tid) {
+		/* never allocate with GFP_KERNEL while holding a spinlock */
+		spin_unlock(&journaler->entry_tid_lock);
+
+		new_tid = kzalloc(sizeof(*new_tid), GFP_KERNEL);
+		if (!new_tid)
+			return -ENOMEM;
+
+		/* the list may have changed while unlocked: search again */
+		goto retry;
+	}
+
+	new_tid->tag_tid = tag_tid;
+	new_tid->entry_tid = entry_tid;
+	INIT_LIST_HEAD(&new_tid->node);
+
+	list_add_tail(&new_tid->node, &journaler->entry_tids);
+	spin_unlock(&journaler->entry_tid_lock);
+
+	return 0;
+}
+
+/*
+ * Decode one journal entry starting at *@p.
+ *
+ * Fields are consumed in this order: 64-bit preamble, 8-bit version (only
+ * version 1 is accepted), 64-bit entry_tid, 64-bit tag_tid, the entry data
+ * (extracted with ceph_extract_encoded_string(), bounded by @end) and a
+ * 32-bit crc that must match the crc32c of all bytes preceding it.
+ *
+ * The fixed-size fields are decoded without bounds checks, so the caller must
+ * already have verified that a complete entry is available.
+ *
+ * Returns a newly allocated entry that owns entry->data, or NULL on any
+ * failure (bad preamble/version, allocation failure, data extraction failure,
+ * crc mismatch). *@p may have been advanced even when NULL is returned.
+ */
+static struct ceph_journaler_entry *journaler_entry_decode(void **p, void *end)
+{
+	struct ceph_journaler_entry *entry = NULL;
+	uint64_t preamble = 0;
+	uint8_t version = 0;
+	uint32_t crc = 0, crc_encoded = 0;
+	void *start = *p;
+
+	preamble = ceph_decode_64(p);
+	if (PREAMBLE != preamble)
+		return NULL;
+
+	version = ceph_decode_8(p);
+	if (version != 1)
+		return NULL;
+
+	entry = kzalloc(sizeof(struct ceph_journaler_entry), GFP_KERNEL);
+	if (!entry)
+		goto err;
+
+	INIT_LIST_HEAD(&entry->node);
+	entry->entry_tid = ceph_decode_64(p);
+	entry->tag_tid = ceph_decode_64(p);
+	entry->data = ceph_extract_encoded_string(p, end, &entry->data_len, GFP_NOIO);
+	/*
+	 * ceph_extract_encoded_string() reports failure with an ERR_PTR, not
+	 * with NULL; there is nothing to free in that case.
+	 */
+	if (IS_ERR(entry->data))
+		goto free_entry;
+
+	crc = crc32c(0, start, *p - start);
+	crc_encoded = ceph_decode_32(p);
+	if (crc != crc_encoded)
+		goto free_data;
+
+	return entry;
+
+free_data:
+	kfree(entry->data);
+free_entry:
+	kfree(entry);
+err:
+	return NULL;
+}
+
+// TODO refactor this function to make it more readable and more effective on memory usage.
+/*
+ * Read journal object @object_num and queue its entries for replay.
+ *
+ * The object is read in chunks of read_len bytes. Every complete entry found
+ * in a chunk is decoded and appended to the entry_list of the replayer slot
+ * (object_num % splay_width). When a chunk ends in the middle of an entry, the
+ * undecoded tail is moved to the front of a new buffer and the next chunk is
+ * read right behind it.
+ *
+ * Returns 0 once a chunk has been decoded completely or a read returned no
+ * data. Otherwise returns a negative errno: the read error itself (-ENOENT
+ * when the object does not exist), -ENOMEM, or -EIO when an entry is corrupt
+ * or cannot be decoded. Entries queued before a failure stay on the
+ * replayer's list.
+ */
+static int fetch(struct ceph_journaler *journaler, uint64_t object_num)
+{
+	struct ceph_object_id object_oid;
+	int ret = 0;
+	void *buf = NULL, *read_buf = NULL, *buf_p = NULL;
+	void *end = NULL;
+	/* widen before shifting so a large order cannot overflow an int */
+	uint64_t read_len = (uint64_t)2 << journaler->order;
+	uint32_t read_off = 0;
+	uint64_t buf_len = read_len;
+	bool position_found = false;
+	struct ceph_journaler_object_pos *pos;
+	struct object_replayer *obj_replayer = NULL;
+
+	obj_replayer = &journaler->obj_replayers[object_num % journaler->splay_width];
+	obj_replayer->object_num = object_num;
+	list_for_each_entry(pos, &journaler->client->object_positions, node) {
+		if (pos->object_num == object_num) {
+			position_found = true;
+			break;
+		}
+	}
+
+	/*
+	 * NOTE(review): when no position matches, obj_replayer->pos keeps
+	 * whatever value it had before - confirm this is intended.
+	 */
+	if (position_found)
+		obj_replayer->pos = pos;
+
+	ceph_oid_init(&object_oid);
+	ret = ceph_oid_aprintf(&object_oid, GFP_KERNEL, "%s%llu",
+				journaler->object_oid_prefix, object_num);
+	if (ret) {
+		pr_err("failed to initialize object_id : %d", ret);
+		return ret;
+	}
+
+	buf = vmalloc(buf_len);
+	if (!buf) {
+		pr_err("failed to vmalloc buf: %llu", buf_len);
+		ret = -ENOMEM;
+		goto err_free_object_oid;
+	}
+	/* buf: decode cursor, read_buf: where the next read lands, buf_p: allocation to free */
+	read_buf = buf;
+	buf_p = buf;
+
+refetch:
+	ret = ceph_journaler_obj_read_sync(journaler, &object_oid,
+					   &journaler->data_oloc, read_buf,
+					   read_off, read_len);
+	if (ret == -ENOENT) {
+		pr_err("no such object, %s: %d", object_oid.name, ret);
+		goto err_free_buf;
+	} else if (ret < 0) {
+		pr_err("failed to read: %d", ret);
+		goto err_free_buf;
+	} else if (ret == 0) {
+		/* nothing (more) to read: return 0 */
+		pr_err("no data: %d", ret);
+		goto err_free_buf;
+	}
+	read_off = read_off + ret;
+
+	end = read_buf + ret;
+	while (buf < end) {
+		uint32_t bytes_needed = 0;
+		struct ceph_journaler_entry *entry = NULL;
+
+		if (!entry_is_readable(journaler, buf, end, &bytes_needed)) {
+			uint64_t remain = end - buf;
+			pr_err("not readable");
+			if (bytes_needed != 0) {
+				/* partial entry: keep the tail and read more behind it */
+				void *new_buf = vmalloc(read_len + remain);
+				if (!new_buf) {
+					pr_err("failed to alloc new buf");
+					/* ret still holds the byte count of the last read */
+					ret = -ENOMEM;
+					goto err_free_buf;
+				}
+				memcpy(new_buf, buf, remain);
+				vfree(buf_p);
+				buf_p = new_buf;
+				buf = new_buf;
+				read_buf = buf + remain;
+				goto refetch;
+			} else {
+				pr_err("entry corruption");
+				ret = -EIO;
+				goto err_free_buf;
+			}
+		}
+		entry = journaler_entry_decode(&buf, end);
+		if (!entry) {
+			/*
+			 * The decoder does not say why it failed (bad data or
+			 * allocation failure); report a generic I/O error
+			 * rather than the byte count left in ret.
+			 */
+			ret = -EIO;
+			goto err_free_buf;
+		}
+
+		list_add_tail(&entry->node, &obj_replayer->entry_list);
+	}
+	ret = 0;
+
+err_free_buf:
+	vfree(buf_p);
+err_free_object_oid:
+	ceph_oid_destroy(&object_oid);
+	return ret;
+}
+
+/*
+ * Allocate a commit entry describing (@commit_tid, @object_num, @tag_tid,
+ * @entry_tid) and insert it into journaler->commit_entries under commit_lock.
+ *
+ * Returns 0 on success or -ENOMEM if the entry could not be allocated.
+ */
+static int add_commit_entry(struct ceph_journaler *journaler, uint64_t commit_tid,
+			    uint64_t object_num, uint64_t tag_tid, uint64_t entry_tid)
+{
+	struct commit_entry *new_entry;
+
+	new_entry = kzalloc(sizeof(struct commit_entry), GFP_KERNEL);
+	if (!new_entry)
+		return -ENOMEM;
+
+	RB_CLEAR_NODE(&new_entry->r_node);
+	new_entry->entry_tid = entry_tid;
+	new_entry->tag_tid = tag_tid;
+	new_entry->object_num = object_num;
+	new_entry->commit_tid = commit_tid;
+
+	spin_lock(&journaler->commit_lock);
+	insert_commit_entry(&journaler->commit_entries, new_entry);
+	spin_unlock(&journaler->commit_lock);
+
+	return 0;
+}
+
+/*
+ * Advance journaler->commit_tid by one and return the new value.
+ * No lock is taken here.
+ */
+static uint64_t allocate_commit_tid(struct ceph_journaler *journaler)
+{
+	journaler->commit_tid += 1;
+
+	return journaler->commit_tid;
+}
+
+/*
+ * Take the next entry to replay from the replayer slot at
+ * journaler->splay_offset.
+ *
+ * On success the entry is removed from the slot's list, splay_offset moves on
+ * to the next slot, the entry's tid is reserved, a commit tid is allocated and
+ * a commit entry is registered for it. If the slot ran empty, the next object
+ * of that slot (object_num + splay_width) is fetched; a missing object
+ * (-ENOENT) is not an error at this point.
+ *
+ * Returns 0 with *@entry and *@commit_tid set (the caller then owns the
+ * entry), -ENOENT if the current slot has no queued entry, -ENOMSG if the
+ * entry does not directly follow the last tid recorded for its tag, or the
+ * error of a failing helper. On failure the dequeued entry and its data are
+ * freed here and *@entry is left untouched.
+ */
+static int get_first_entry(struct ceph_journaler *journaler,
+			   struct ceph_journaler_entry **entry,
+			   uint64_t *commit_tid)
+{
+	struct object_replayer *obj_replayer = NULL;
+	struct ceph_journaler_entry *tmp_entry = NULL;
+	uint64_t last_entry_tid = 0;
+	uint64_t object_num;
+	int ret = 0;
+
+	obj_replayer = &journaler->obj_replayers[journaler->splay_offset];
+
+	if (list_empty(&obj_replayer->entry_list))
+		return -ENOENT;
+
+	tmp_entry = list_first_entry(&obj_replayer->entry_list,
+				     struct ceph_journaler_entry, node);
+	list_del(&tmp_entry->node);
+
+	/* only tags that already have a recorded tid are checked for gaps */
+	if (get_last_entry_tid(journaler, tmp_entry->tag_tid, &last_entry_tid) &&
+	    tmp_entry->entry_tid != last_entry_tid + 1) {
+		pr_err("missing prior journal entry, last_entry_tid: %llu",
+		       last_entry_tid);
+		ret = -ENOMSG;
+		goto free_entry;
+	}
+
+	journaler->active_tag_tid = tmp_entry->tag_tid;
+	journaler->splay_offset = (journaler->splay_offset + 1) % journaler->splay_width;
+
+	/*
+	 * fetch() overwrites obj_replayer->object_num, so remember the object
+	 * this entry came from before prefetching the next one.
+	 */
+	object_num = obj_replayer->object_num;
+	if (list_empty(&obj_replayer->entry_list)) {
+		ret = fetch(journaler, object_num + journaler->splay_width);
+		if (ret && ret != -ENOENT)
+			goto free_entry;
+	}
+
+	ret = reserve_entry_tid(journaler, tmp_entry->tag_tid, tmp_entry->entry_tid);
+	if (ret)
+		goto free_entry;
+
+	*commit_tid = allocate_commit_tid(journaler);
+	ret = add_commit_entry(journaler, *commit_tid, object_num,
+			       tmp_entry->tag_tid, tmp_entry->entry_tid);
+	if (ret)
+		goto free_entry;
+
+	*entry = tmp_entry;
+	return 0;
+
+free_entry:
+	/* the payload is a separate allocation made when the entry was decoded */
+	kfree(tmp_entry->data);
+	kfree(tmp_entry);
+	return ret;
+}
+
+/*
+ * Drop, from every replayer slot, the queued entries whose tag_tid equals
+ * journaler->active_tag_tid. Both the entry and its data buffer are freed.
+ */
+static void prune_tag(struct ceph_journaler *journaler)
+{
+	struct ceph_journaler_entry *entry, *next;
+	struct object_replayer *obj_replayer = NULL;
+	int i = 0;
+
+	for (i = 0; i < journaler->splay_width; i++) {
+		obj_replayer = &journaler->obj_replayers[i];
+		list_for_each_entry_safe(entry, next,
+					 &obj_replayer->entry_list, node) {
+			if (entry->tag_tid != journaler->active_tag_tid)
+				continue;
+
+			list_del(&entry->node);
+			/* the payload is a separate allocation; free it too */
+			kfree(entry->data);
+			kfree(entry);
+		}
+	}
+}
+
+/*
+ * Replay queued entries one after another until none is left.
+ *
+ * Each entry obtained from get_first_entry() is passed to playback_entry()
+ * and then freed. When get_first_entry() reports -ENOENT (the current slot is
+ * empty) the remaining entries of the active tag are pruned and replay ends
+ * successfully.
+ *
+ * Returns 0 on success, otherwise the first error reported by
+ * get_first_entry() or playback_entry().
+ */
+static int process_replay(struct ceph_journaler *journaler)
+{
+	struct ceph_journaler_entry *entry;
+	uint64_t commit_tid;
+	int r;
+
+	for (;;) {
+		entry = NULL;
+		r = get_first_entry(journaler, &entry, &commit_tid);
+		if (r == -ENOENT) {
+			prune_tag(journaler);
+			return 0;
+		}
+		/*
+		 * On failure get_first_entry() has already released the entry
+		 * it dequeued and has not stored anything in 'entry', so there
+		 * is nothing to free here.
+		 */
+		if (r)
+			return r;
+
+		r = playback_entry(journaler, entry, commit_tid);
+		/*
+		 * NOTE(review): entry->data is not freed here. Whether the
+		 * handler takes ownership of the payload cannot be told from
+		 * this function - confirm, and free it here if it does not.
+		 */
+		kfree(entry);
+		if (r)
+			return r;
+	}
+}
+
+/*
+ * Discard the entries that must not be replayed again.
+ *
+ * For every replayer slot that has a commit position, entries are removed
+ * from the front of the slot's list up to and including the entry whose
+ * (tag_tid, entry_tid) matches that position. The tid of each removed entry
+ * is reserved first. If no entry matches the position, every queued entry of
+ * the slot is removed. Slots without a commit position are left untouched.
+ *
+ * Returns 0 on success or the error from reserve_entry_tid(); in the latter
+ * case the entry being processed stays on the list.
+ */
+static int preprocess_replay(struct ceph_journaler *journaler)
+{
+	struct ceph_journaler_entry *entry, *next;
+	bool found_commit = false;
+	struct object_replayer *obj_replayer = NULL;
+	int i = 0;
+	int ret = 0;
+
+	for (i = 0; i < journaler->splay_width; i++) {
+		obj_replayer = &journaler->obj_replayers[i];
+
+		if (!obj_replayer->pos)
+			continue;
+
+		found_commit = false;
+		list_for_each_entry_safe(entry, next,
+					 &obj_replayer->entry_list, node) {
+			if (entry->tag_tid == obj_replayer->pos->tag_tid &&
+			    entry->entry_tid == obj_replayer->pos->entry_tid) {
+				found_commit = true;
+			} else if (found_commit) {
+				/* first entry after the commit position: keep the rest */
+				break;
+			}
+
+			ret = reserve_entry_tid(journaler, entry->tag_tid, entry->entry_tid);
+			if (ret)
+				return ret;
+			list_del(&entry->node);
+			/* the payload is a separate allocation; free it too */
+			kfree(entry->data);
+			kfree(entry);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Replay the journal entries that were recorded but not yet committed.
+ *
+ * Steps:
+ *   1. Under meta_lock, derive the starting splay_offset from the first
+ *      object position of the client and note, per slot, which object the
+ *      client's positions refer to.
+ *   2. Fetch one object per slot: the noted object, or object number i for a
+ *      slot whose noted value is 0. A missing object (-ENOENT) is tolerated.
+ *   3. Drop the entries up to each slot's commit position
+ *      (preprocess_replay()) and replay the rest (process_replay()).
+ *   4. Free whatever is still queued on the slots, on success and on failure.
+ *
+ * Returns 0 on success, -ENOMEM if the per-slot table cannot be allocated, or
+ * the first error reported by one of the steps above.
+ */
+int ceph_journaler_start_replay(struct ceph_journaler *journaler)
+{
+	struct ceph_journaler_object_pos *active_pos = NULL;
+	struct ceph_journaler_client *client = NULL;
+	uint64_t *fetch_objects = NULL;
+	uint64_t object_num;
+	int i = 0;
+	int ret = 0;
+
+	/* kcalloc() zeroes the table and checks the size multiplication */
+	fetch_objects = kcalloc(journaler->splay_width, sizeof(*fetch_objects),
+				GFP_KERNEL);
+	if (!fetch_objects)
+		return -ENOMEM;
+
+	spin_lock(&journaler->meta_lock);
+	client = journaler->client;
+	if (!list_empty(&client->object_positions)) {
+		active_pos = list_first_entry(&client->object_positions,
+					      struct ceph_journaler_object_pos, node);
+
+		journaler->splay_offset = (active_pos->object_num + 1) % journaler->splay_width;
+
+		list_for_each_entry(active_pos, &client->object_positions, node) {
+			fetch_objects[active_pos->object_num %
+				      journaler->splay_width] = active_pos->object_num;
+		}
+	}
+	spin_unlock(&journaler->meta_lock);
+
+	for (i = 0; i < journaler->splay_width; i++) {
+		/* 0 means "no position noted for this slot": start at object i */
+		if (fetch_objects[i] == 0)
+			object_num = i;
+		else
+			object_num = fetch_objects[i];
+
+		ret = fetch(journaler, object_num);
+		if (ret && ret != -ENOENT)
+			goto out;
+	}
+	ret = preprocess_replay(journaler);
+	if (ret)
+		goto out;
+
+	ret = process_replay(journaler);
+out:
+	for (i = 0; i < journaler->splay_width; i++) {
+		struct object_replayer *obj_replayer = &journaler->obj_replayers[i];
+		struct ceph_journaler_entry *entry = NULL, *next_entry = NULL;
+
+		spin_lock(&obj_replayer->lock);
+		list_for_each_entry_safe(entry, next_entry, &obj_replayer->entry_list, node) {
+			list_del(&entry->node);
+			/* the payload is a separate allocation; free it too */
+			kfree(entry->data);
+			kfree(entry);
+		}
+		spin_unlock(&obj_replayer->lock);
+	}
+	kfree(fetch_objects);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_journaler_start_replay);