
btrfs-progs: dduper - BTRFS offline deduplication tool

Message ID 20180824042440.GA7793@giis.co.in (mailing list archive)
State New, archived
Series btrfs-progs: dduper - BTRFS offline deduplication tool

Commit Message

Lakshmipathi Ganapathi Aug. 24, 2018, 4:24 a.m. UTC
dduper is an offline dedupe tool. It works by fetching checksums from the
BTRFS csum tree instead of reading whole file blocks and computing checksums.
The tool relies on output from the 'btrfs inspect-internal dump-csum' command.
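
Example usage (illustrative; the file paths below are placeholders, the
options are those defined by the script):

  # Dry-run: only report matched/unmatched chunks between two files
  python dduper --device /dev/sda3 --files /mnt/f1 /mnt/f2 --dry-run

  # Dedupe a directory recursively using 128KB chunks
  python dduper --device /dev/sda3 --dir /mnt/data --recurse --chunk-size 128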

Signed-off-by: Lakshmipathi.G <lakshmipathi.ganapathi@collabora.co.uk>
---
 dduper | 310 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 310 insertions(+)
 create mode 100644 dduper

Patch

diff --git a/dduper b/dduper
new file mode 100644
index 0000000..2170b11
--- /dev/null
+++ b/dduper
@@ -0,0 +1,310 @@ 
+#!/usr/bin/env python
+
+""" dduper - BTRFS Dedupe tool.
+
+This is a offline dedupe tool. Instead of reading whole file blocks and
+computing checksum, It works by fetching checksum from BTRFS csum tree.
+This hugely improves the performance.
+
+Authors: Lakshmipathi.G <lakshmipathi.ganapathi@collabora.co.uk>
+"""
+import argparse
+import errno
+import hashlib
+import numpy as np
+import math
+import os
+import pdb
+import struct
+import subprocess
+import sys
+
+from collections import OrderedDict
+from fcntl import ioctl
+from itertools import combinations
+from itertools import izip_longest
+from stat import *
+# Block size in KB; BTRFS checksums cover 4KB blocks
+blk_size = 4
+# Number of 4KB blocks per dedupe chunk (set in get_ele_size);
+# dump-csum prints 8 csums on a single output row
+no_of_chunks = 0
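+# FICLONERANGE ioctl request number (see ioctl_ficlonerange(2)):
+# _IOW(0x94, 13, struct file_clone_range) on Linux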
+FICLONERANGE = 0x4020940d
+
+device_name = None
+skip = 0
+chunk_sz = 0
+run_len = 0
+ele_sz = 0
+
+# Already deduped files
+processed_files = []
+
+
+# From https://stackoverflow.com/questions/434287
+def grouper(iterable, n, fillvalue=None):
+    args = [iter(iterable)] * n
+    return izip_longest(*args, fillvalue=fillvalue)
+
+
+def get_ele_size():
+
+    global no_of_chunks, run_len
+    if chunk_sz <= 0 or chunk_sz % 32 != 0:
+        print "Ensure chunk size is of multiple 32KB. (32,64,128 etc)"
+        sys.exit(-1)
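+    # A chunk of chunk_sz KB spans chunk_sz/4 csum entries (one per 4KB block);
+    # dump-csum prints 8 csums per output line, so ele_sz lines form one chunk.
+    # run_len is the dedupe chunk length in bytes.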
+    no_of_chunks = chunk_sz / blk_size
+    ele_sz = no_of_chunks / 8
+    run_len = no_of_chunks * blk_size * 1024
+    return ele_sz
+
+
+def get_hashes(out1):
+    """
+       For each list item, compute its hash and store its offset keyed by that hash.
+    """
+    global ele_sz
+
+    if ele_sz == 1:
+        od = OrderedDict()
+        for idx, ele in enumerate(out1):
+            v = []
+            k = hashlib.md5(str(ele)).hexdigest()
+            v.append(idx)
+            if k in od:
+                print "Collison with: "+str(k) + "at offset: "+str(v)
+            od[k] = v
+    else:
+        od = OrderedDict()
+        for idx, ele in enumerate(grouper(out1, ele_sz, 'x')):
+            v = []
+            k = hashlib.md5(str(ele)).hexdigest()
+            v.append(idx)
+            if k in od:
+                print "Collison with: "+str(k) + "at offset: "+str(v)
+            od[k] = v
+
+    return od
+
+
+def ioctl_ficlonerange(dst_fd, s):
+
+    try:
+        ioctl(dst_fd, FICLONERANGE, s)
+    except Exception as e:
+        print "error({0})".format(e.errno)
+
+
+def cmp_files(file1, file2):
+
+    md1 = subprocess.Popen(['md5sum', file1], stdout=subprocess.PIPE,
+            close_fds=True).stdout.read().split(" ")[0]
+    md2 = subprocess.Popen(['md5sum', file2], stdout=subprocess.PIPE,
+            close_fds=True).stdout.read().split(" ")[0]
+    if md1 == md2:
+        return 0
+    else:
+        return 1
+
+
+def do_dedupe(src_file, dst_file, dry_run):
+
+    bkup_file = dst_file + ".__superduper"
+    src_fd = os.open(src_file, os.O_RDONLY)
+    dst_fd = os.open(dst_file, os.O_WRONLY)
+    perfect_match = 0
+
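+    # Fetch per-block checksums for both files from the BTRFS csum tree via
+    # 'btrfs inspect-internal dump-csum'.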
+    out1 = subprocess.Popen(['btrfs', 'inspect-internal', 'dump-csum', src_file, device_name],
+            stdout=subprocess.PIPE, close_fds=True).stdout.readlines()
+    out2 = subprocess.Popen(['btrfs', 'inspect-internal', 'dump-csum', dst_file, device_name],
+            stdout=subprocess.PIPE, close_fds=True).stdout.readlines()
+    # todo: on a perfect match, remove dst_file from further operations
+    if out1 == out2:
+        print "Prefect match : ", src_file, dst_file
+        perfect_match = 1
+
+    src_dict = get_hashes(out1)
+    dst_dict = get_hashes(out2)
+    total_entry = len(src_dict) - 1  # index of the final (possibly short) chunk
+    file_size = os.path.getsize(src_file)
+
+    np1 = np.array([v for v in src_dict.keys()])
+    np2 = np.array([v for v in dst_dict.keys()])
+    matched_keys = np.intersect1d(np1, np2)
+    unmatched_keys = np.setdiff1d(np1, np2)
+
+    if dry_run == 0:
+        # todo: clear dict/np/list entries that are not used further
+        # todo: handle identical content within a single file
+
+        if matched_keys.size > 0:
+            if skip == 0:
+                subprocess.check_call(['cp', '--reflink=always', dst_file, bkup_file])
+            print "*" * 24
+            # print "matched regions"
+            for location in matched_keys:
+                    entry = src_dict[location][0]
+                    src_len = no_of_chunks * blk_size * 1024
+                    src_offset = src_dict[location][0] * src_len
+
+                    multi_dst_offsets = dst_dict[location]  # list
+                    for offset in multi_dst_offsets:
+                        dst_offset = offset * src_len
+
+                        if entry == total_entry:  # fix final ele
+                            src_len = file_size - src_offset
+                        # print "matching chunk : src offset:"+str(src_offset) +" src_len="+ str(src_len) +" dest_off="+ str(dst_offset)
+                        s = struct.pack("qQQQ", src_fd, src_offset, src_len, dst_offset)
+                        ioctl_ficlonerange(dst_fd, s)
+
+            print "Dedupe completed for " + src_file + ":" + dst_file
+            # Verify original unmodified file and newly deduped file both point to same contents
+            if skip == 0:
+                ret = cmp_files(dst_file, bkup_file)
+                if ret == 0:
+                    print "Dedupe validation successful " + src_file + ":" + dst_file
+                    # Removing temporary backup file path
+                    os.unlink(bkup_file)
+                else:
+                    print "WARNING: Dedupe for " + dst_file + " Resulted in corruption" + \
+                          " Backup file path " + bkup_file
+
+    # Close open fds
+    os.close(src_fd)
+    os.close(dst_fd)
+
+    print "Summary"
+    print "blk_size : %d chunksize : %d" % (blk_size, chunk_sz)
+    print src_file + " has " + str(len(src_dict)) + " chunks"
+    print dst_file + " has " + str(len(dst_dict)) + " chunks"
+    print "Matched chunks : " + str(len(matched_keys))
+    print "Unmatched chunks: " + str(len(unmatched_keys))
+    return perfect_match
+
+
+def validate_files(src_file, dst_file, processed_files):
+        global run_len
+        if src_file in processed_files:
+            return False
+        if dst_file in processed_files:
+            return False
+        src_stat = os.stat(src_file)
+        dst_stat = os.stat(dst_file)
+        # Verify they are unique regular files of sufficient size
+        if (S_ISREG(src_stat.st_mode) and S_ISREG(dst_stat.st_mode) and
+           (src_stat.st_ino != dst_stat.st_ino) and
+           (src_stat.st_size >= 4096) and
+           (dst_stat.st_size >= 4096) and
+           (src_stat.st_size >= run_len) and
+           (dst_stat.st_size >= run_len)):
+            return True
+        print "Skipped", src_file, dst_file, "not unique regular files or \
+               file size < 4kb or size < " + str(run_len)
+        return False
+
+
+def dedupe_files(file_list, dry_run):
+        ret = 0
+        global processed_files
+        if len(file_list) == 2:
+            src_file = file_list[0]
+            dst_file = file_list[1]
+            if validate_files(src_file, dst_file, processed_files) is True:
+                ret = do_dedupe(src_file, dst_file, dry_run)
+        elif len(file_list) > 2:
+            comb = combinations(file_list, 2)
+            for f in comb:
+                src_file = f[0]
+                dst_file = f[1]
+                if validate_files(src_file, dst_file, processed_files) is True:
+                    ret = do_dedupe(src_file, dst_file, dry_run)
+                    if ret == 1:
+                        # perfectly matching file found - do not reuse this file again
+                        processed_files.append(dst_file)
+        else:
+            print "Single file given"
+            return
+
+
+def validate_file(filename):
+        global run_len
+        file_stat = os.stat(filename)
+        # Verify it is a regular file of sufficient size
+        if (S_ISREG(file_stat.st_mode) and (file_stat.st_size >= 4096) and
+                (file_stat.st_size >= run_len)):
+            return True
+        print "Skipped", filename, \
+            "not a regular file, or file size < 4KB or < " + str(run_len)
+        return False
+
+
+def dedupe_dir(dir_path, dry_run, recurse):
+        file_list = []
+        if recurse == 1:
+            for path, dirs, files in os.walk(dir_path):
+                for filename in files:
+                    fn = os.path.join(path, filename)
+                    if validate_file(fn) is True:
+                        file_list.append(fn)
+        else:
+            for fi in os.listdir(dir_path):
+                if os.path.isfile(os.path.join(dir_path, fi)):
+                    fn = os.path.join(dir_path, fi)
+                    if validate_file(fn) is True:
+                        file_list.append(fn)
+        dedupe_files(file_list, dry_run)
+
+
+def main(results):
+
+    if results.file_list is not None:
+        dedupe_files(results.file_list, results.dry_run)
+
+    if results.dir_path is not None:
+        dedupe_dir(results.dir_path, results.dry_run, results.recurse)
+
+    return
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('-p', '--device', action='store', dest='device_name',
+                type=str, help='Device with BTRFS partition (ex: /dev/sda3) ',
+                required=True)
+
+    single = parser.add_mutually_exclusive_group()
+
+    single.add_argument('-d', '--dir', action='store', dest='dir_path',
+                type=str, help='Dedupe Given directory', required=False)
+
+    single.add_argument('-f', '--files', action='store', dest='file_list',
+                nargs='+', help='Dedupe List of files', type=str, required=False)
+
+    parser.add_argument('-r', '--recurse', action='store_true', dest='recurse',
+                help='Parse dir recursively (used along with -d)')
+
+    parser.add_argument('-D', '--dry-run', action='store_true', dest='dry_run',
+                help='Show summary of dedupe details')
+
+    parser.add_argument('-s', '--skip', action='store_true', dest='skip',
+                help='Will skip backup/validation process.')
+
+    parser.add_argument('-c', '--chunk-size', action='store', dest='chunk_sz',
+                type=int, default=32, help='Dedupe chunk size in KB', required=False)
+
+    parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.01',
+                 help="Show version info")
+
+    results = parser.parse_args()
+
+    if not (results.dir_path or results.file_list):
+            parser.error('No action requested, add --files or --dir')
+
+    device_name = results.device_name
+    skip = results.skip
+    chunk_sz = results.chunk_sz
+    ele_sz = get_ele_size()
+
+    main(results)