@@ -142,4 +142,6 @@ source "drivers/staging/sb105x/Kconfig"
source "drivers/staging/fwserial/Kconfig"
+source "drivers/staging/enhanceio/Kconfig"
+
endif # STAGING
@@ -63,3 +63,4 @@ obj-$(CONFIG_DRM_IMX) += imx-drm/
obj-$(CONFIG_DGRP) += dgrp/
obj-$(CONFIG_SB105X) += sb105x/
obj-$(CONFIG_FIREWIRE_SERIAL) += fwserial/
+obj-$(CONFIG_ENHANCEIO) += enhanceio/
new file mode 100644
@@ -0,0 +1,21 @@
+#
+# EnhanceIO caching solution by STEC INC.
+#
+
+config ENHANCEIO
+ tristate "Enable EnhanceIO"
+ depends on BLK_DEV
+ default n
+ ---help---
+ Based on Facebook's open source Flashcache project developed by
+ Mohan Srinivasan and hosted at "http://github.com", EnhanceIO is
+ a collection of (currently three) loadable kernel modules for
+  using SSDs as cache devices for traditional rotating hard disk drives.
+
+ The caching engine is a loadable kernel module ("enhanceio.ko")
+ implemented as a device mapper target. The cache replacement
+ policies are implemented as loadable kernel modules
+ ("enhanceio_fifo.ko", "enhanceio_lru.ko") that register with
+ the caching engine module.
+
+ If unsure, say N.
new file mode 100644
@@ -0,0 +1,16 @@
+#
+# Makefile for EnhanceIO block device caching.
+#
+obj-$(CONFIG_ENHANCEIO) += enhanceio.o enhanceio_lru.o enhanceio_fifo.o
+enhanceio-y += \
+ eio_conf.o \
+ eio_ioctl.o \
+ eio_main.o \
+ eio_mem.o \
+ eio_policy.o \
+ eio_procfs.o \
+ eio_setlru.o \
+ eio_subr.o \
+ eio_ttc.o
+enhanceio_fifo-y += eio_fifo.o
+enhanceio_lru-y += eio_lru.o
new file mode 100644
@@ -0,0 +1,1146 @@
+/*
+ * eio.h
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ * Saied Kazemi <skazemi@stec-inc.com>
+ * Added EnhanceIO-specific code.
+ * Siddharth Choudhuri <schoudhuri@stec-inc.com>
+ * Common data structures and definitions between Windows and Linux.
+ * Amit Kale <akale@stec-inc.com>
+ * Restructured much of the io code to split bio within map function instead
+ * of letting dm do it.
+ * Amit Kale <akale@stec-inc.com>
+ * Harish Pujari <hpujari@stec-inc.com>
+ * Designed and implemented the writeback caching mode
+ * Copyright 2010 Facebook, Inc.
+ * Author: Mohan Srinivasan (mohan@facebook.com)
+ *
+ * Based on DM-Cache:
+ * Copyright (C) International Business Machines Corp., 2006
+ * Author: Ming Zhao (mingzhao@ufl.edu)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/pagemap.h>
+#include <linux/random.h>
+#include <linux/hardirq.h>
+#include <linux/sysctl.h>
+#include <linux/version.h>
+#include <linux/reboot.h>
+#include <linux/delay.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/sort.h> /* required for eio_subr.c */
+#include <linux/kthread.h>
+#include <linux/jiffies.h>
+#include <linux/vmalloc.h> /* for sysinfo (mem) variables */
+#include <linux/mm.h>
+#include <scsi/scsi_device.h> /* required for SSD failure handling */
+/* resolve conflict with scsi/scsi_device.h */
+#ifdef QUEUED
+#undef QUEUED
+#endif
+
+#if defined(__KERNEL__) && !defined(CONFIG_PROC_FS)
+#error "EnhanceIO requires CONFIG_PROC_FS"
+#endif /* __KERNEL__ && !CONFIG_PROC_FS */
+
+
+#ifndef EIO_INC_H
+#define EIO_INC_H
+
+#define EIO_DBN_SET(dmc, index, dbn) ssdcache_dbn_set(dmc, index, dbn)
+#define EIO_DBN_GET(dmc, index) ssdcache_dbn_get(dmc, index)
+#define EIO_CACHE_STATE_SET(dmc, index, state) ssdcache_cache_state_set(dmc, index, state)
+#define EIO_CACHE_STATE_GET(dmc, index) ssdcache_cache_state_get(dmc, index)
+#define EIO_CACHE_STATE_OFF(dmc, index, bitmask) ssdcache_cache_state_off(dmc, index, bitmask)
+#define EIO_CACHE_STATE_ON(dmc, index, bitmask) ssdcache_cache_state_on(dmc, index, bitmask)
+
+/* Bit offsets for wait_on_bit_lock() */
+#define EIO_UPDATE_LIST 0
+#define EIO_HANDLE_REBOOT 1
+
+struct eio_control_s {
+ volatile unsigned long synch_flags;
+};
+
+int eio_wait_schedule(void *unused);
+
+
+struct eio_event {
+ struct task_struct *process; /* handle of the sleeping process */
+};
+
+typedef long int index_t;
+
+/*
+ * This file has three sections as follows:
+ *
+ * Section 1: User space only
+ * Section 2: User space and kernel
+ * Section 3: Kernel only
+ *
+ * Each section may contain its own subsections.
+ */
+
+/*
+ * Begin Section 1: User space only.
+ */
+
+
+/*
+ * End Section 1: User space only.
+ */
+
+/*
+ * Begin Section 2: User space and kernel.
+ */
+
+/* States of a cache block */
+#define INVALID 0x0001
+#define VALID 0x0002 /* Valid */
+#define DISKREADINPROG 0x0004 /* Read from disk in progress */
+#define DISKWRITEINPROG 0x0008 /* Write to disk in progress */
+#define CACHEREADINPROG 0x0010 /* Read from cache in progress */
+#define CACHEWRITEINPROG 0x0020 /* Write to cache in progress */
+#define DIRTY 0x0040 /* Dirty, needs writeback to disk */
+#define QUEUED 0x0080 /* Other requests are queued for this block */
+
+#define BLOCK_IO_INPROG (DISKREADINPROG | DISKWRITEINPROG | \
+ CACHEREADINPROG | CACHEWRITEINPROG)
+#define DIRTY_INPROG (VALID | DIRTY | CACHEWRITEINPROG) /* block being dirtied */
+#define CLEAN_INPROG (VALID | DIRTY | DISKWRITEINPROG) /* ongoing clean */
+#define ALREADY_DIRTY (VALID | DIRTY) /* block which is dirty to begin with for an I/O */
+
+/*
+ * This is a special state used only in the following scenario as
+ * part of device (SSD) failure handling:
+ *
+ * ------| dev fail |------| dev resume |------------
+ * ...-<--- Tf --><- Td -><---- Tr ---><-- Tn ---...
+ * |---- Normal ----|-- Degraded -------|-- Normal ---|
+ *
+ * Tf: Time during device failure.
+ * Td: Time after failure when the cache is in degraded mode.
+ * Tr: Time when the SSD comes back online.
+ * Tn: Time of normal operation after the SSD has come back online.
+ *
+ * When a failed SSD is added back again, it should be treated
+ * as a cold SSD.
+ *
+ * If Td is very small, then there can be IOs that were initiated
+ * before or during Tf, and did not finish until the end of Tr. From
+ * the IO's viewpoint, the SSD was there when the IO was initiated
+ * and it was there when the IO was finished. These IOs need special
+ * handling as described below.
+ *
+ * To add the SSD as a cold cache device, we initialize all blocks
+ * to INVALID, except for the ones that had IOs in progress before
+ * or during Tf. We mark such blocks as both VALID and INVALID.
+ * These blocks will be marked INVALID when the in-progress IOs finish.
+ */
+#define NO_SSD_IO_INPROG (VALID | INVALID)
+
+/*
+ * On Flash (cache metadata) Structures
+ */
+#define CACHE_MD_STATE_DIRTY 0x55daddee
+#define CACHE_MD_STATE_CLEAN 0xacceded1
+#define CACHE_MD_STATE_FASTCLEAN 0xcafebabf
+#define CACHE_MD_STATE_UNSTABLE 0xdeaddeee
+
+/* Do we have a read-only cache or a read-write cache? */
+#define CACHE_MODE_WB 1
+#define CACHE_MODE_RO 2
+#define CACHE_MODE_WT 3
+#define CACHE_MODE_FIRST CACHE_MODE_WB
+#define CACHE_MODE_LAST CACHE_MODE_WT
+#define CACHE_MODE_DEFAULT CACHE_MODE_WT
+
+#define DEV_PATHLEN 128
+#define EIO_SUPERBLOCK_SIZE 4096
+
+
+#define EIO_CLEAN_ABORT 0x00000000
+#define EIO_CLEAN_START 0x00000001
+#define EIO_CLEAN_KEEP 0x00000002
+
+/* EIO magic number */
+#define EIO_MAGIC 0xE10CAC6E
+#define EIO_BAD_MAGIC 0xBADCAC6E
+
+/* EIO version */
+#define EIO_SB_VERSION 3 /* kernel superblock version */
+#define EIO_SB_MAGIC_VERSION 3 /* version in which magic number was introduced */
+
+typedef union eio_superblock {
+ struct superblock_fields {
+ sector_t size; /* Cache size */
+ u_int32_t block_size; /* Cache block size */
+ u_int32_t assoc; /* Cache associativity */
+ u_int32_t cache_sb_state; /* Clean shutdown ? */
+ char cache_devname[DEV_PATHLEN];
+ sector_t cache_devsize;
+ char disk_devname[DEV_PATHLEN];
+ sector_t disk_devsize;
+ u_int32_t cache_version;
+ char cache_name[DEV_PATHLEN];
+ u_int32_t mode;
+ u_int32_t repl_policy;
+ u_int32_t cache_flags;
+ /*
+ * Version 1.1 superblock ends here.
+ * Don't modify any of the above fields.
+ */
+ u_int32_t magic; /* Has to be the 1st field after 1.1 superblock */
+ u_int32_t cold_boot; /* cache to be started as cold after boot */
+ char ssd_uuid[DEV_PATHLEN];
+ sector_t cache_md_start_sect; /* cache metadata start (8K aligned)*/
+ sector_t cache_data_start_sect; /* cache data start (8K aligned) */
+ u_int32_t dirty_high_threshold;
+ u_int32_t dirty_low_threshold;
+ u_int32_t dirty_set_high_threshold;
+ u_int32_t dirty_set_low_threshold;
+ u_int32_t time_based_clean_interval;
+ u_int32_t autoclean_threshold;
+ } sbf;
+ u_int8_t padding[EIO_SUPERBLOCK_SIZE];
+} eio_superblock_t;
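+
+/*
+ * Note: the padding member of the union above keeps the on-SSD superblock
+ * at exactly EIO_SUPERBLOCK_SIZE (4 KB), i.e. eight 512-byte sectors
+ * (EIO_SUPERBLOCK_SECTORS), regardless of how many fields
+ * struct superblock_fields grows to contain.
+ */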
+
+/*
+ * For EnhanceIO, we move the superblock from sector 0 to 128
+ * and give it a full 4K. Also, in addition to the single
+ * "red-zone" buffer that separates metadata sectors from the
+ * data sectors, we allocate extra sectors so that we can
+ * align the data sectors on a 4K boundary.
+ *
+ * 64K 4K variable variable 8K variable variable
+ * +--------+--+--------+---------+---+--------+---------+
+ * | unused |SB| align1 |metadata | Z | align2 | data... |
+ * +--------+--+--------+---------+---+--------+---------+
+ * <------------- dmc->md_sectors ------------>
+ */
+#define EIO_UNUSED_SECTORS 128
+#define EIO_SUPERBLOCK_SECTORS 8
+#define EIO_REDZONE_SECTORS 16
+#define EIO_START 0
+
+#define EIO_ALIGN1_SECTORS(index) (((index) % 16) ? (24 - ((index) % 16)) : 8)
+#define EIO_ALIGN2_SECTORS(index) (((index) % 16) ? (16 - ((index) % 16)) : 0)
+#define EIO_SUPERBLOCK_START (EIO_START + EIO_UNUSED_SECTORS)
+#define EIO_METADATA_START(hd_start_sect) (EIO_SUPERBLOCK_START + \
+ EIO_SUPERBLOCK_SECTORS + \
+ EIO_ALIGN1_SECTORS(hd_start_sect))
+
+#define EIO_EXTRA_SECTORS(start_sect, md_sects) (EIO_METADATA_START(start_sect) + \
+ EIO_REDZONE_SECTORS + \
+ EIO_ALIGN2_SECTORS(md_sects))
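+
+/*
+ * Worked example of the alignment math above (illustrative): for a cache
+ * device whose start sector on the underlying disk is 63,
+ * EIO_ALIGN1_SECTORS(63) == 24 - (63 % 16) == 9, so
+ * EIO_METADATA_START(63) == 128 + 8 + 9 == 145. Since 63 + 145 == 208 is
+ * a multiple of 16 sectors, the metadata begins on an 8K boundary of the
+ * underlying disk. For a start sector of 0, the metadata starts at
+ * 128 + 8 + 8 == 144, again a multiple of 16 sectors.
+ */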
+
+/*
+ * We do metadata updates only when a block transitions from DIRTY -> CLEAN
+ * or from CLEAN -> DIRTY. Consequently, on an unclean shutdown, we only
+ * pick up blocks that are marked (DIRTY | CLEAN), we clean these and stick
+ * them in the cache.
+ * On a clean shutdown, we will sync the state for every block, and we will
+ * load every block back into cache on a restart.
+ */
+struct flash_cacheblock {
+ sector_t dbn; /* Sector number of the cached block */
+#ifdef DO_CHECKSUM
+ u_int64_t checksum;
+#endif /* DO_CHECKSUM */
+ u_int32_t cache_state;
+};
+
+/* blksize in terms of no. of sectors */
+#define BLKSIZE_2K 4
+#define BLKSIZE_4K 8
+#define BLKSIZE_8K 16
+
+/*
+ * Give the number of pages to be allocated for an
+ * iosize of x, specified in bytes.
+ */
+#define IO_PAGE_COUNT(x) (((x) + (PAGE_SIZE - 1)) / PAGE_SIZE)
+
+/*
+ * Macro that calculates number of biovecs to be
+ * allocated depending on the iosize and cache
+ * block size.
+ */
+#define IO_BVEC_COUNT(x, blksize) ({ \
+ int count = IO_PAGE_COUNT(x); \
+ switch((blksize)) { \
+ case BLKSIZE_2K: \
+ count = count * 2; \
+ break; \
+ case BLKSIZE_4K: \
+ case BLKSIZE_8K: \
+ break; \
+ } \
+ count; \
+})
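+
+/*
+ * For example, a 16 KB request with 4 KB pages gives IO_PAGE_COUNT(16384)
+ * == 4; with a 2 KB cache block size (BLKSIZE_2K) the bvec count is
+ * doubled to 8, while 4 KB and 8 KB cache blocks use one bvec per page.
+ */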
+
+#define MD_MAX_NR_PAGES 16
+#define MD_BLOCKS_PER_PAGE ((PAGE_SIZE) / sizeof(struct flash_cacheblock))
+#define INDEX_TO_MD_PAGE(INDEX) ((INDEX) / MD_BLOCKS_PER_PAGE)
+#define INDEX_TO_MD_PAGE_OFFSET(INDEX) ((INDEX) % MD_BLOCKS_PER_PAGE)
+
+#define MD_BLOCKS_PER_SECTOR (512 / (sizeof(struct flash_cacheblock)))
+#define INDEX_TO_MD_SECTOR(INDEX) ((INDEX) / MD_BLOCKS_PER_SECTOR)
+#define INDEX_TO_MD_SECTOR_OFFSET(INDEX) ((INDEX) % MD_BLOCKS_PER_SECTOR)
+#define MD_BLOCKS_PER_CBLOCK(dmc) (MD_BLOCKS_PER_SECTOR * (dmc)->block_size)
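+
+/*
+ * Sizing sketch (assuming a 64-bit build without DO_CHECKSUM, where
+ * sizeof(struct flash_cacheblock) is 16 bytes): MD_BLOCKS_PER_SECTOR is
+ * 512 / 16 == 32 on-flash metadata entries per 512-byte sector, and
+ * MD_BLOCKS_PER_PAGE is 4096 / 16 == 256 entries per 4 KB page.
+ */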
+
+
+#define METADATA_IO_BLOCKSIZE (256 * 1024)
+#define METADATA_IO_BLOCKSIZE_SECT (METADATA_IO_BLOCKSIZE / 512)
+#define SECTORS_PER_PAGE ((PAGE_SIZE) / 512)
+
+/*
+ * Cache persistence.
+ */
+#define CACHE_RELOAD 1
+#define CACHE_CREATE 2
+#define CACHE_FORCECREATE 3
+
+/*
+ * Cache replacement policy.
+ */
+#define CACHE_REPL_FIFO 1
+#define CACHE_REPL_LRU 2
+#define CACHE_REPL_RANDOM 3
+#define CACHE_REPL_FIRST CACHE_REPL_FIFO
+#define CACHE_REPL_LAST CACHE_REPL_RANDOM
+#define CACHE_REPL_DEFAULT CACHE_REPL_FIFO
+
+/*
+ * Default cache parameters.
+ */
+#define DEFAULT_CACHE_ASSOC 512
+#define DEFAULT_CACHE_BLKSIZE 8 /* 4 KB */
+
+/*
+ * Valid commands that can be written to "control".
+ * NOTE: Update CACHE_CONTROL_FLAG_MAX value whenever a new control flag is added
+ */
+#define CACHE_CONTROL_FLAG_MAX 7
+#define CACHE_VERBOSE_OFF 0
+#define CACHE_VERBOSE_ON 1
+#define CACHE_WRITEBACK_ON 2 /* register write back variables */
+#define CACHE_WRITEBACK_OFF 3
+#define CACHE_INVALIDATE_ON 4 /* register invalidate variables */
+#define CACHE_INVALIDATE_OFF 5
+#define CACHE_FAST_REMOVE_ON 6 /* do not write MD when destroying cache */
+#define CACHE_FAST_REMOVE_OFF 7
+
+
+/*
+ * Bit definitions in "cache_flags". These are exported in Linux as
+ * hex in the "flags" output line of /proc/enhanceio/<cache_name>/config.
+ */
+
+#define CACHE_FLAGS_VERBOSE (1 << 0)
+#define CACHE_FLAGS_INVALIDATE (1 << 1)
+#define CACHE_FLAGS_FAST_REMOVE (1 << 2)
+#define CACHE_FLAGS_DEGRADED (1 << 3)
+#define CACHE_FLAGS_SSD_ADD_INPROG (1 << 4)
+#define CACHE_FLAGS_MD8 (1 << 5) /* using 8-byte metadata (instead of 4-byte md) */
+#define CACHE_FLAGS_FAILED (1 << 6)
+#define CACHE_FLAGS_STALE (1 << 7)
+#define CACHE_FLAGS_SHUTDOWN_INPROG (1 << 8)
+#define CACHE_FLAGS_MOD_INPROG (1 << 9) /* cache modification such as edit/delete in progress */
+#define CACHE_FLAGS_DELETED (1 << 10)
+#define CACHE_FLAGS_INCORE_ONLY (CACHE_FLAGS_DEGRADED | \
+ CACHE_FLAGS_SSD_ADD_INPROG | \
+ CACHE_FLAGS_FAILED | \
+ CACHE_FLAGS_SHUTDOWN_INPROG | \
+ CACHE_FLAGS_MOD_INPROG | \
+ CACHE_FLAGS_STALE | \
+ CACHE_FLAGS_DELETED) /* need a proper definition */
+
+/* flags that govern cold/warm enable after reboot */
+#define BOOT_FLAG_COLD_ENABLE (1 << 0) /* enable the cache as cold */
+#define BOOT_FLAG_FORCE_WARM (1 << 1) /* override the cold enable flag */
+
+typedef enum dev_notifier {
+ NOTIFY_INITIALIZER,
+ NOTIFY_SSD_ADD,
+ NOTIFY_SSD_REMOVED,
+ NOTIFY_SRC_REMOVED
+} dev_notifier_t;
+
+
+/*
+ * End Section 2: User space and kernel.
+ */
+
+/*
+ * Begin Section 3: Kernel only.
+ */
+#if defined(__KERNEL__)
+
+/*
+ * Subsection 3.1: Definitions.
+ */
+
+#define EIO_SB_VERSION 3 /* kernel superblock version */
+
+/* kcached/pending job states */
+#define READCACHE 1
+#define WRITECACHE 2
+#define READDISK 3
+#define WRITEDISK 4
+#define READFILL 5 /* Read Cache Miss Fill */
+#define INVALIDATE 6
+
+/* Cache persistence */
+#define CACHE_RELOAD 1
+#define CACHE_CREATE 2
+#define CACHE_FORCECREATE 3
+
+/* Sysctl defined */
+#define MAX_CLEAN_IOS_SET 2
+#define MAX_CLEAN_IOS_TOTAL 4
+
+/*
+ * Harish: TBD
+ * Rethink on max, min, default values
+ */
+#define DIRTY_HIGH_THRESH_DEF 30
+#define DIRTY_LOW_THRESH_DEF 10
+#define DIRTY_SET_HIGH_THRESH_DEF 100
+#define DIRTY_SET_LOW_THRESH_DEF 30
+
+#define CLEAN_FACTOR(sectors) ((sectors) >> 25) /* in 16 GB multiples */
+#define TIME_BASED_CLEAN_INTERVAL_DEF(dmc) (uint32_t)(CLEAN_FACTOR((dmc)->cache_size) ? \
+ CLEAN_FACTOR((dmc)->cache_size) : 1)
+#define TIME_BASED_CLEAN_INTERVAL_MAX 720 /* in minutes */
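+
+/*
+ * Example (assuming cache_size is in 512-byte sectors, per the 16 GB note
+ * above): a 32 GB cache is 2^26 sectors, so CLEAN_FACTOR() yields 2 and
+ * the default time based clean interval is 2; anything smaller than 16 GB
+ * falls back to the minimum default of 1.
+ */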
+
+#define AUTOCLEAN_THRESH_DEF 128 /* Number of I/Os which puts a hold on time based cleaning */
+#define AUTOCLEAN_THRESH_MAX 1024 /* Number of I/Os which puts a hold on time based cleaning */
+
+/* Inject a 5s delay between cleaning blocks and metadata */
+#define CLEAN_REMOVE_DELAY 5000
+
+/*
+ * Subsection 3.2: Data structures.
+ */
+
+/*
+ * Block checksums :
+ * Block checksums seem a good idea (especially for debugging, I found a couple
+ * of bugs with this), but in practice there are a number of issues with this
+ * in production.
+ * 1) If a flash write fails, there is no guarantee that the failure was atomic.
+ * Some sectors may have been written to flash. If so, the checksum we have
+ * is wrong. We could re-read the flash block and recompute the checksum, but
+ * the read could fail too.
+ * 2) On a node crash, we could have crashed between the flash data write and the
+ * flash metadata update (which updates the new checksum to flash metadata). When
+ * we reboot, the checksum we read from metadata is wrong. This is worked around
+ * by having the cache load recompute checksums after an unclean shutdown.
+ * 3) Checksums require 4 or 8 more bytes per block in terms of metadata overhead.
+ * Especially because the metadata is wired into memory.
+ * 4) Checksums force us to do a flash metadata IO on a block re-dirty. If we
+ * didn't maintain checksums, we could avoid the metadata IO on a re-dirty.
+ * Therefore in production we disable block checksums.
+ *
+ * Use the Makefile to enable/disable DO_CHECKSUM
+ */
+typedef void (*eio_notify_fn)(int error, void *context);
+
+/*
+ * 4-byte metadata support.
+ */
+
+#define EIO_MAX_SECTOR (((u_int64_t)1) << 40)
+
+struct md4 {
+ u_int16_t bytes1_2;
+ u_int8_t byte3;
+ u_int8_t cache_state;
+};
+
+struct cacheblock {
+ union {
+ u_int32_t u_i_md4;
+ struct md4 u_s_md4;
+ } md4_u;
+#ifdef DO_CHECKSUM
+ u_int64_t checksum;
+#endif /* DO_CHECKSUM */
+};
+
+#define md4_md md4_u.u_i_md4
+#define md4_cache_state md4_u.u_s_md4.cache_state
+#define EIO_MD4_DBN_BITS (32 - 8) /* 8 bits for state */
+#define EIO_MD4_DBN_MASK ((1 << EIO_MD4_DBN_BITS) - 1)
+#define EIO_MD4_INVALID (INVALID << EIO_MD4_DBN_BITS)
+#define EIO_MD4_CACHE_STATE(dmc, index) (dmc->cache[index].md4_cache_state)
+
+
+/*
+ * 8-byte metadata support.
+ */
+
+struct md8 {
+ u_int32_t bytes1_4;
+ u_int16_t bytes5_6;
+ u_int8_t byte7;
+ u_int8_t cache_state;
+};
+
+struct cacheblock_md8 {
+ union {
+ u_int64_t u_i_md8;
+ struct md8 u_s_md8;
+ } md8_u;
+#ifdef DO_CHECKSUM
+ u_int64_t checksum;
+#endif /* DO_CHECKSUM */
+};
+
+#define md8_md md8_u.u_i_md8
+#define md8_cache_state md8_u.u_s_md8.cache_state
+#define EIO_MD8_DBN_BITS (64 - 8) /* 8 bits for state */
+#define EIO_MD8_DBN_MASK ((((u_int64_t)1) << EIO_MD8_DBN_BITS) - 1)
+#define EIO_MD8_INVALID (((u_int64_t)INVALID) << EIO_MD8_DBN_BITS)
+#define EIO_MD8_CACHE_STATE(dmc, index) ((dmc)->cache_md8[index].md8_cache_state)
+#define EIO_MD8(dmc) CACHE_MD8_IS_SET(dmc)
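+
+/*
+ * In short: with 4-byte metadata the low EIO_MD4_DBN_BITS (24) bits of the
+ * integer view hold a "shrunk" block number (see eio_shrink_dbn() /
+ * eio_expand_dbn()) and the top 8 bits hold the cache state; with 8-byte
+ * metadata the low 56 bits hold the full dbn. Which layout is in use is
+ * recorded in CACHE_FLAGS_MD8 and tested through EIO_MD8(dmc).
+ */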
+
+/* Structure used for metadata update on-disk and in-core for writeback cache */
+struct mdupdate_request {
+ struct list_head list; /* to build mdrequest chain */
+ struct work_struct work; /* work structure */
+ struct cache_c *dmc; /* cache pointer */
+ index_t set; /* set index */
+ unsigned md_size; /* metadata size */
+ unsigned mdbvec_count; /* count of bvecs allocated. */
+ struct bio_vec *mdblk_bvecs; /* bvecs for updating md_blocks */
+ atomic_t holdcount; /* I/O hold count */
+ struct eio_bio *pending_mdlist; /* ebios pending for md update */
+ struct eio_bio *inprog_mdlist; /* ebios processed for md update */
+ int error; /* error during md update */
+ struct mdupdate_request *next; /* next mdreq in the mdreq list .Harish: TBD. Deprecate */
+};
+
+#define SETFLAG_CLEAN_INPROG 0x00000001 /* clean in progress on a set */
+#define SETFLAG_CLEAN_WHOLE 0x00000002 /* clean the set fully */
+
+/* Structure used for doing operations and storing cache set level info */
+struct cache_set {
+ struct list_head list;
+ u_int32_t nr_dirty; /* number of dirty blocks */
+ spinlock_t cs_lock; /* spin lock to protect struct fields */
+ struct rw_semaphore rw_lock; /* reader-writer lock used for clean */
+ unsigned int flags; /* misc cache set specific flags */
+ struct mdupdate_request *mdreq; /* metadata update request pointer */
+};
+
+struct eio_errors {
+ int disk_read_errors;
+ int disk_write_errors;
+ int ssd_read_errors;
+ int ssd_write_errors;
+ int memory_alloc_errors;
+ int no_cache_dev;
+ int no_source_dev;
+};
+
+/*
+ * Stats. Note that everything should be "atomic64_t" as
+ * code relies on it.
+ */
+#define SECTOR_STATS(statval, io_size) \
+ atomic64_add(to_sector(io_size), &statval);
+
+struct eio_stats {
+ atomic64_t reads; /* Number of reads */
+ atomic64_t writes; /* Number of writes */
+ atomic64_t read_hits; /* Number of cache hits */
+ atomic64_t write_hits; /* Number of write hits (includes dirty write hits) */
+ atomic64_t dirty_write_hits; /* Number of "dirty" write hits */
+ atomic64_t cached_blocks; /* Number of cached blocks */
+ atomic64_t rd_replace; /* Number of read cache replacements. Harish: TBD modify def doc */
+ atomic64_t wr_replace; /* Number of write cache replacements. Harish: TBD modify def doc */
+ atomic64_t noroom; /* No room in set */
+ atomic64_t cleanings; /* blocks cleaned Harish: TBD modify def doc */
+ atomic64_t md_write_dirty; /* Metadata sector writes dirtying block */
+ atomic64_t md_write_clean; /* Metadata sector writes cleaning block */
+ atomic64_t md_ssd_writes; /* How many md ssd writes did we do ? */
+ atomic64_t uncached_reads;
+ atomic64_t uncached_writes;
+ atomic64_t uncached_map_size;
+ atomic64_t uncached_map_uncacheable;
+ atomic64_t disk_reads;
+ atomic64_t disk_writes;
+ atomic64_t ssd_reads;
+ atomic64_t ssd_writes;
+ atomic64_t ssd_readfills;
+ atomic64_t ssd_readfill_unplugs;
+ atomic64_t readdisk;
+ atomic64_t writedisk;
+ atomic64_t readcache;
+ atomic64_t readfill;
+ atomic64_t writecache;
+ atomic64_t wrtime_ms; /* total write time in ms */
+ atomic64_t rdtime_ms; /* total read time in ms */
+ atomic64_t readcount; /* total reads received so far */
+ atomic64_t writecount; /* total writes received so far */
+};
+
+
+#define PENDING_JOB_HASH_SIZE 32
+#define PENDING_JOB_HASH(index) ((index) % PENDING_JOB_HASH_SIZE)
+#define SIZE_HIST (128 + 1)
+#define EIO_COPY_PAGES 1024 /* Number of pages for I/O */
+#define MIN_JOBS 1024
+#define MIN_EIO_IO 4096
+#define MIN_DMC_BIO_PAIR 8192
+
+
+/* Structure representing a sequence of sets (first to last set index) */
+struct set_seq {
+ index_t first_set;
+ index_t last_set;
+ struct set_seq *next;
+};
+
+/* EIO system control variables (tunables) */
+/*
+ * volatile is used here since the cost of strong synchronisation
+ * is not worth the benefits.
+ */
+struct eio_sysctl {
+ volatile uint32_t error_inject;
+ volatile int32_t fast_remove;
+ volatile int32_t zerostats;
+ volatile int32_t do_clean;
+ volatile uint32_t dirty_high_threshold;
+ volatile uint32_t dirty_low_threshold;
+ volatile uint32_t dirty_set_high_threshold;
+ volatile uint32_t dirty_set_low_threshold;
+ volatile uint32_t time_based_clean_interval; /* time after which dirty sets should clean */
+ volatile int32_t autoclean_threshold;
+ volatile int32_t mem_limit_pct;
+ volatile int32_t control;
+ volatile u_int64_t invalidate;
+};
+
+/* forward declaration */
+struct lru_ls;
+
+/* Replacement for 'struct dm_dev' */
+struct eio_bdev {
+ struct block_device *bdev;
+ fmode_t mode;
+ char name[16];
+};
+
+/* Replacement for 'struct dm_io_region' */
+struct eio_io_region {
+ struct block_device *bdev;
+ sector_t sector;
+ sector_t count; /* If zero, the region is ignored */
+};
+
+/*
+ * Cache context
+ */
+struct cache_c {
+ struct list_head cachelist;
+ make_request_fn *origmfn;
+ char dev_info; /* partition or whole device */
+
+ sector_t dev_start_sect;
+ sector_t dev_end_sect;
+ int cache_rdonly; /* protected by ttc_write lock */
+ struct eio_bdev *disk_dev; /* Source device */
+ struct eio_bdev *cache_dev; /* Cache device */
+ struct cacheblock *cache; /* Hash table for cache blocks */
+ struct cache_set *cache_sets;
+ struct cache_c *next_cache;
+ struct kcached_job *readfill_queue;
+ struct work_struct readfill_wq;
+
+ struct list_head cleanq; /* queue of sets awaiting clean */
+ struct eio_event clean_event; /* event to wait for, when cleanq is empty */
+ spinlock_t clean_sl; /* spinlock to protect cleanq etc */
+ void *clean_thread; /* OS specific thread object to handle cleanq */
+ int clean_thread_running; /* to indicate that clean thread is running */
+ atomic64_t clean_pendings; /* Number of sets pending to be cleaned */
+ struct bio_vec *clean_dbvecs; /* Data bvecs for clean set */
+ struct page **clean_mdpages; /* Metadata pages for clean set */
+ int dbvec_count;
+ int mdpage_count;
+ int clean_excess_dirty; /* Clean in progress to bring cache dirty blocks in limits */
+ atomic_t clean_index; /* set being cleaned, in case of force clean */
+
+ u_int64_t md_start_sect; /* Sector no. at which Metadata starts */
+ u_int64_t md_sectors; /* Numbers of metadata sectors, including header */
+ u_int64_t disk_size; /* Source size */
+ u_int64_t size; /* Cache size */
+ u_int32_t assoc; /* Cache associativity */
+ u_int32_t block_size; /* Cache block size */
+ u_int32_t block_shift; /* Cache block size in bits */
+ u_int32_t block_mask; /* Cache block mask */
+ u_int32_t consecutive_shift; /* Consecutive blocks size in bits */
+ u_int32_t persistence; /* Create | Force create | Reload */
+ u_int32_t mode; /* CACHE_MODE_{WB, RO, WT} */
+ u_int32_t cold_boot; /* Cache should be started as cold after boot */
+ u_int32_t bio_nr_pages; /* number of hardware sectors supported by SSD in terms of PAGE_SIZE */
+
+ spinlock_t cache_spin_lock;
+ long unsigned int cache_spin_lock_flags; /* See comments above spin_lock_irqsave_FLAGS */
+ atomic_t nr_jobs; /* Number of I/O jobs */
+
+ volatile u_int32_t cache_flags;
+ u_int32_t sb_state; /* Superblock state */
+ u_int32_t sb_version; /* Superblock version */
+
+ int readfill_in_prog;
+ struct eio_stats eio_stats; /* Run time stats */
+ struct eio_errors eio_errors; /* Error stats */
+ int max_clean_ios_set; /* Max cleaning IOs per set */
+ int max_clean_ios_total; /* Total max cleaning IOs */
+ int clean_inprog;
+ atomic64_t nr_dirty;
+ atomic64_t nr_ios;
+ atomic64_t size_hist[SIZE_HIST];
+
+ void *sysctl_handle_common;
+ void *sysctl_handle_writeback;
+ void *sysctl_handle_invalidate;
+
+ struct eio_sysctl sysctl_pending; /* sysctl values pending to become active */
+ struct eio_sysctl sysctl_active; /* sysctl currently active */
+
+ char cache_devname[DEV_PATHLEN];
+ char disk_devname[DEV_PATHLEN];
+ char cache_name[DEV_PATHLEN];
+ char cache_gendisk_name[DEV_PATHLEN]; /* Used for SSD failure checks */
+ char cache_srcdisk_name[DEV_PATHLEN]; /* Used for SRC failure checks */
+ char ssd_uuid[DEV_PATHLEN];
+
+ struct cacheblock_md8 *cache_md8;
+ sector_t cache_size; /* Cache size passed to ctr(), used by dmsetup info */
+ sector_t cache_dev_start_sect; /* starting sector of cache device */
+ u_int64_t index_zero; /* index of cache block with starting sector 0 */
+ u_int32_t num_sets; /* number of cache sets */
+ u_int32_t num_sets_bits; /* number of bits to encode "num_sets" */
+ u_int64_t num_sets_mask; /* mask value for bits in "num_sets" */
+
+ struct eio_policy *policy_ops; /* Cache block Replacement policy */
+ u_int32_t req_policy; /* Policy requested by the user */
+ u_int32_t random; /* Use for random replacement policy */
+ void *sp_cache_blk; /* Per cache-block data structure */
+ void *sp_cache_set; /* Per cache-set data structure */
+ struct lru_ls *dirty_set_lru; /* lru for dirty sets : lru_list_t */
+ spinlock_t dirty_set_lru_lock; /* spinlock for dirty set lru */
+ struct delayed_work clean_aged_sets_work; /* work item for clean_aged_sets */
+ int is_clean_aged_sets_sched; /* to know whether clean aged sets is scheduled */
+ struct workqueue_struct *mdupdate_q; /* Workqueue to handle md updates */
+ struct workqueue_struct *callback_q; /* Workqueue to handle io callbacks */
+};
+
+#define EIO_CACHE_IOSIZE 0
+
+#define EIO_ROUND_SECTOR(dmc, sector) ((sector) & (~(unsigned)((dmc)->block_size - 1)))
+#define EIO_ROUND_SET_SECTOR(dmc, sector) ((sector) & (~(unsigned)(((dmc)->block_size * (dmc)->assoc) - 1)))
+
+/*
+ * The bit definitions are exported to the user space and are in the very beginning of the file.
+ */
+#define CACHE_VERBOSE_IS_SET(dmc) (((dmc)->cache_flags & CACHE_FLAGS_VERBOSE) ? 1 : 0)
+#define CACHE_INVALIDATE_IS_SET(dmc) (((dmc)->cache_flags & CACHE_FLAGS_INVALIDATE) ? 1 : 0)
+#define CACHE_FAST_REMOVE_IS_SET(dmc) (((dmc)->cache_flags & CACHE_FLAGS_FAST_REMOVE) ? 1 : 0)
+#define CACHE_DEGRADED_IS_SET(dmc) (((dmc)->cache_flags & CACHE_FLAGS_DEGRADED) ? 1 : 0)
+#define CACHE_SSD_ADD_INPROG_IS_SET(dmc) (((dmc)->cache_flags & CACHE_FLAGS_SSD_ADD_INPROG) ? 1 : 0)
+#define CACHE_MD8_IS_SET(dmc) (((dmc)->cache_flags & CACHE_FLAGS_MD8) ? 1 : 0)
+#define CACHE_FAILED_IS_SET(dmc) (((dmc)->cache_flags & CACHE_FLAGS_FAILED) ? 1 : 0)
+#define CACHE_STALE_IS_SET(dmc) (((dmc)->cache_flags & CACHE_FLAGS_STALE) ? 1 : 0)
+
+/* Device failure handling. */
+#define CACHE_SRC_IS_ABSENT(dmc) (((dmc)->eio_errors.no_source_dev == 1) ? 1 : 0)
+
+#define AUTOCLEAN_THRESHOLD_CROSSED(dmc) \
+ ((atomic64_read(&(dmc)->nr_ios) > (int64_t)(dmc)->sysctl_active.autoclean_threshold) || \
+ ((dmc)->sysctl_active.autoclean_threshold == 0))
+
+#define DIRTY_CACHE_THRESHOLD_CROSSED(dmc) \
+ (((atomic64_read(&(dmc)->nr_dirty) - atomic64_read(&(dmc)->clean_pendings)) >= \
+ (int64_t)((dmc)->sysctl_active.dirty_high_threshold * (dmc)->size) / 100) && \
+ ((dmc)->sysctl_active.dirty_high_threshold > (dmc)->sysctl_active.dirty_low_threshold))
+
+
+#define DIRTY_SET_THRESHOLD_CROSSED(dmc, set) \
+ (((dmc)->cache_sets[(set)].nr_dirty >= (u_int32_t)((dmc)->sysctl_active.dirty_set_high_threshold * (dmc)->assoc)/100) && \
+ ((dmc)->sysctl_active.dirty_set_high_threshold > (dmc)->sysctl_active.dirty_set_low_threshold))
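+
+/*
+ * Example with the defaults defined above: dirty_high_threshold == 30
+ * means cache-wide cleaning starts once roughly 30% of the cache blocks
+ * are dirty (excluding those already pending clean), and with
+ * dirty_set_high_threshold == 100 and assoc == 512 an individual set is
+ * cleaned only when all 512 of its blocks are dirty.
+ */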
+
+
+/*
+ * Do not reverse the order of disk and cache! Code
+ * relies on this ordering. (Eg: eio_dm_io_async_bvec()).
+ */
+struct job_io_regions {
+ struct eio_io_region disk; /* has to be the first member */
+ struct eio_io_region cache; /* has to be the second member */
+};
+
+#define EB_MAIN_IO 1
+#define EB_SUBORDINATE_IO 2
+#define EB_INVAL 4
+#define GET_BIO_FLAGS(ebio) ((ebio)->eb_bc->bc_bio->bi_rw)
+#define VERIFY_BIO_FLAGS(ebio) VERIFY((ebio) && (ebio)->eb_bc && (ebio)->eb_bc->bc_bio)
+
+#define SET_BARRIER_FLAGS(rw_flags) (rw_flags |= (REQ_WRITE | REQ_FLUSH))
+
+struct eio_bio {
+ int eb_iotype;
+ struct bio_container *eb_bc;
+ unsigned eb_cacheset;
+ sector_t eb_sector; /* sector number */
+ unsigned eb_size; /* size in bytes */
+ struct bio_vec *eb_bv; /* bvec pointer */
+ unsigned eb_nbvec; /* number of bio_vecs */
+ int eb_dir; /* io direction */
+ struct eio_bio *eb_next; /* used for splitting reads */
+ index_t eb_index; /* for read bios */
+ atomic_t eb_holdcount; /* ebio hold count, currently used only for dirty block I/O */
+ struct bio_vec eb_rbv[0];
+};
+
+enum eio_io_dir {
+ EIO_IO_INVALID_DIR = 0,
+ CACHED_WRITE,
+ CACHED_READ,
+ UNCACHED_WRITE,
+ UNCACHED_READ,
+ UNCACHED_READ_AND_READFILL
+};
+
+/* ASK
+ * Container for all eio_bio corresponding to a given bio
+ */
+struct bio_container {
+ spinlock_t bc_lock; /* lock protecting the bc fields */
+ atomic_t bc_holdcount; /* number of ebios referencing bc */
+ struct bio *bc_bio; /* bio for the bc */
+ struct cache_c *bc_dmc; /* cache structure */
+ struct eio_bio *bc_mdlist; /* ebios waiting for md update */
+ int bc_mdwait; /* count of ebios that will do md update */
+ struct mdupdate_request *mdreqs; /* mdrequest structures required for md update */
+ struct set_seq *bc_setspan; /* sets spanned by the bc (used only for wb) */
+ struct set_seq bc_singlesspan; /* used (by wb) if bc spans a single set sequence */
+ enum eio_io_dir bc_dir; /* bc I/O direction */
+ int bc_error; /* error encountered during processing bc */
+ unsigned long bc_iotime; /* maintains i/o time in jiffies */
+ struct bio_container *bc_next; /* next bc in the chain */
+};
+
+/* structure used as callback context during synchronous I/O */
+struct sync_io_context {
+ struct rw_semaphore sio_lock;
+ unsigned long sio_error;
+};
+
+struct kcached_job {
+ struct list_head list;
+ struct work_struct work;
+ struct cache_c *dmc;
+ struct eio_bio *ebio;
+ struct job_io_regions job_io_regions;
+ index_t index;
+ int action;
+ int error;
+ struct flash_cacheblock *md_sector;
+ struct bio_vec md_io_bvec;
+ struct kcached_job *next;
+};
+
+struct ssd_rm_list {
+ struct cache_c *dmc;
+ int action;
+ dev_t devt;
+ dev_notifier_t note;
+ struct list_head list;
+};
+
+struct dbn_index_pair {
+ sector_t dbn;
+ index_t index;
+};
+
+/*
+ * Subsection 3.3: Function prototypes and definitions.
+ */
+
+struct kcached_job *eio_alloc_cache_job(void);
+void eio_free_cache_job(struct kcached_job *job);
+struct kcached_job *pop(struct list_head *jobs);
+void push(struct list_head *jobs, struct kcached_job *job);
+void do_work(struct work_struct *unused);
+void update_job_cacheregion(struct kcached_job *job, struct cache_c *dmc, struct eio_bio* bio);
+void push_io(struct kcached_job *job);
+void push_md_io(struct kcached_job *job);
+void push_md_complete(struct kcached_job *job);
+void push_uncached_io_complete(struct kcached_job *job);
+int eio_io_empty(void);
+int eio_md_io_empty(void);
+int eio_md_complete_empty(void);
+void eio_md_write_done(struct kcached_job *job);
+void eio_ssderror_diskread(struct kcached_job *job);
+void eio_md_write(struct kcached_job *job);
+void eio_md_write_kickoff(struct kcached_job *job);
+void eio_do_readfill(struct work_struct *work);
+void eio_comply_dirty_thresholds(struct cache_c *dmc, index_t set);
+void eio_clean_all(struct cache_c *dmc);
+void eio_clean_for_reboot(struct cache_c *dmc);
+void eio_clean_aged_sets(struct work_struct *work);
+#ifndef SSDCACHE
+void eio_reclaim_lru_movetail(struct cache_c *dmc, index_t index, struct eio_policy *);
+#endif /* !SSDCACHE */
+int eio_io_sync_vm(struct cache_c *dmc, struct eio_io_region *where, int rw, struct bio_vec *bvec, int nbvec);
+int eio_io_sync_pages(struct cache_c *dmc, struct eio_io_region *where, int rw,
+ struct page **pages, int num_bvecs);
+void eio_update_sync_progress(struct cache_c *dmc);
+void eio_plug_cache_device(struct cache_c *dmc);
+void eio_unplug_cache_device(struct cache_c *dmc);
+void eio_plug_disk_device(struct cache_c *dmc);
+void eio_unplug_disk_device(struct cache_c *dmc);
+int dm_io_async_bvec(unsigned int num_regions, struct eio_io_region *where, int rw,
+ struct bio_vec *bvec, eio_notify_fn fn, void *context);
+void eio_put_cache_device(struct cache_c *dmc);
+void eio_suspend_caching(struct cache_c *dmc, dev_notifier_t note);
+void eio_resume_caching(struct cache_c *dmc, char *dev);
+int eio_ctr_ssd_add(struct cache_c *dmc, char *dev);
+
+/* procfs */
+void eio_module_procfs_init(void);
+void eio_module_procfs_exit(void);
+void eio_procfs_ctr(struct cache_c *dmc);
+void eio_procfs_dtr(struct cache_c *dmc);
+
+int eio_sb_store(struct cache_c *dmc);
+
+int eio_md_destroy(struct dm_target *tip, char *namep, char *srcp, char *cachep, int force);
+
+/* eio_conf.c */
+extern int eio_ctr(struct dm_target *ti, unsigned int argc, char **argv);
+extern void eio_dtr(struct dm_target *ti);
+extern int eio_md_destroy(struct dm_target *tip, char *namep, char *srcp, char *cachep, int force);
+extern int eio_ctr_ssd_add(struct cache_c *dmc, char *dev);
+
+/* thread related functions */
+void * eio_create_thread(int (*func)(void *), void *context, char *name);
+void eio_thread_exit(long exit_code);
+void eio_wait_thread_exit(void *thrdptr, int *notifier);
+
+
+/* eio_main.c */
+extern int eio_map(struct cache_c *, struct request_queue *, struct bio *);
+extern void eio_md_write_done(struct kcached_job *job);
+extern void eio_ssderror_diskread(struct kcached_job *job);
+extern void eio_md_write(struct kcached_job *job);
+extern void eio_md_write_kickoff(struct kcached_job *job);
+extern void eio_do_readfill(struct work_struct *work);
+extern void eio_check_dirty_thresholds(struct cache_c *dmc, index_t set);
+extern void eio_clean_all(struct cache_c *dmc);
+extern int eio_clean_thread_proc(void *context);
+extern void eio_touch_set_lru(struct cache_c *dmc, index_t set);
+extern void eio_inval_range(struct cache_c *dmc, sector_t iosector,
+ unsigned iosize);
+extern int eio_invalidate_sanity_check(struct cache_c *dmc, u_int64_t iosector,
+ u_int64_t *iosize);
+/*
+ * Invalidates all cached blocks without waiting for them to complete
+ * Should be called with incoming IO suspended
+ */
+extern int eio_invalidate_cache(struct cache_c *dmc);
+
+/* eio_mem.c */
+extern int eio_mem_init(struct cache_c *dmc);
+extern u_int32_t eio_hash_block(struct cache_c *dmc, sector_t dbn);
+extern unsigned int eio_shrink_dbn(struct cache_c *dmc, sector_t dbn);
+extern sector_t eio_expand_dbn(struct cache_c *dmc, u_int64_t index);
+extern void eio_invalidate_md(struct cache_c *dmc, u_int64_t index);
+extern void eio_md4_dbn_set(struct cache_c *dmc, u_int64_t index, u_int32_t dbn_24);
+extern void eio_md8_dbn_set(struct cache_c *dmc, u_int64_t index, sector_t dbn);
+
+/* eio_procfs.c */
+extern void eio_module_procfs_init(void);
+extern void eio_module_procfs_exit(void);
+extern void eio_procfs_ctr(struct cache_c *dmc);
+extern void eio_procfs_dtr(struct cache_c *dmc);
+extern int eio_version_query(size_t buf_sz, char *bufp);
+
+/* eio_subr.c */
+extern void eio_free_cache_job(struct kcached_job *job);
+extern void eio_do_work(struct work_struct *unused);
+extern struct kcached_job *eio_new_job(struct cache_c *dmc, struct eio_bio* bio, index_t index);
+extern void eio_push_ssdread_failures(struct kcached_job *job);
+extern void eio_push_md_io(struct kcached_job *job);
+extern void eio_push_md_complete(struct kcached_job *job);
+extern void eio_push_uncached_io_complete(struct kcached_job *job);
+extern int eio_io_empty(void);
+extern int eio_io_sync_vm(struct cache_c *dmc, struct eio_io_region *where, int rw, struct bio_vec *bvec, int nbvec);
+extern void eio_unplug_cache_device(struct cache_c *dmc);
+extern void eio_put_cache_device(struct cache_c *dmc);
+extern void eio_suspend_caching(struct cache_c *dmc, dev_notifier_t note);
+extern void eio_resume_caching(struct cache_c *dmc, char *dev);
+
+static __inline__ void
+EIO_DBN_SET(struct cache_c *dmc, u_int64_t index, sector_t dbn)
+{
+ if (EIO_MD8(dmc))
+ eio_md8_dbn_set(dmc, index, dbn);
+ else
+ eio_md4_dbn_set(dmc, index, eio_shrink_dbn(dmc, dbn));
+ if (dbn == 0)
+ dmc->index_zero = index;
+}
+
+static __inline__ u_int64_t
+EIO_DBN_GET(struct cache_c *dmc, u_int64_t index)
+{
+ if (EIO_MD8(dmc))
+ return dmc->cache_md8[index].md8_md & EIO_MD8_DBN_MASK;
+
+ return eio_expand_dbn(dmc, index);
+}
+
+static __inline__ void
+EIO_CACHE_STATE_SET(struct cache_c *dmc, u_int64_t index, u_int8_t cache_state)
+{
+ if (EIO_MD8(dmc))
+ EIO_MD8_CACHE_STATE(dmc, index) = cache_state;
+ else
+ EIO_MD4_CACHE_STATE(dmc, index) = cache_state;
+}
+
+static __inline__ u_int8_t
+EIO_CACHE_STATE_GET(struct cache_c *dmc, u_int64_t index)
+{
+ u_int8_t cache_state;
+
+ if (EIO_MD8(dmc))
+ cache_state = EIO_MD8_CACHE_STATE(dmc, index);
+ else
+ cache_state = EIO_MD4_CACHE_STATE(dmc, index);
+ return cache_state;
+}
+
+static __inline__ void
+EIO_CACHE_STATE_OFF(struct cache_c *dmc, index_t index, u_int8_t bitmask)
+{
+ u_int8_t cache_state = EIO_CACHE_STATE_GET(dmc, index);
+ cache_state &= ~bitmask;
+ EIO_CACHE_STATE_SET(dmc, index, cache_state);
+}
+
+static __inline__ void
+EIO_CACHE_STATE_ON(struct cache_c *dmc, index_t index, u_int8_t bitmask)
+{
+ u_int8_t cache_state = EIO_CACHE_STATE_GET(dmc, index);
+ cache_state |= bitmask;
+ EIO_CACHE_STATE_SET(dmc, index, cache_state);
+}
+
+void eio_set_warm_boot(void);
+#endif /* defined(__KERNEL__) */
+
+#include "eio_ioctl.h"
+
+/* resolve conflict with scsi/scsi_device.h */
+#ifdef __KERNEL__
+#ifdef VERIFY
+#undef VERIFY
+#endif
+#define ENABLE_VERIFY
+#ifdef ENABLE_VERIFY
+/* Like ASSERT() but always compiled in */
+#define VERIFY(x) do { \
+ if (unlikely(!(x))) { \
+ dump_stack(); \
+ panic("VERIFY: assertion (%s) failed at %s (%d)\n", \
+ #x, __FILE__ , __LINE__); \
+ } \
+} while(0)
+#else /* ENABLE_VERIFY */
+#define VERIFY(x) do { } while (0)
+#endif /* ENABLE_VERIFY */
+
+extern sector_t eio_get_device_size(struct eio_bdev *);
+extern sector_t eio_get_device_start_sect(struct eio_bdev *);
+#endif /* __KERNEL__ */
+
+
+#define EIO_INIT_EVENT(ev) \
+ do { \
+ (ev)->process = NULL; \
+ } while (0)
+
+/* Assumes that the macro gets called under the same spinlock as in wait event */
+#define EIO_SET_EVENT_AND_UNLOCK(ev, sl, flags) \
+ do { \
+ struct task_struct *p = NULL; \
+ if ((ev)->process) { \
+ (p) = (ev)->process; \
+ (ev)->process = NULL; \
+ } \
+ spin_unlock_irqrestore((sl), flags); \
+ if (p) { \
+ (void)wake_up_process(p); \
+ } \
+ } while (0)
+
+/* Assumes that the spin lock sl is taken while calling this macro */
+#define EIO_WAIT_EVENT(ev, sl, flags) \
+ do { \
+ (ev)->process = current; \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ spin_unlock_irqrestore((sl), flags); \
+ (void)schedule_timeout(10 * HZ); \
+ spin_lock_irqsave((sl), flags); \
+ (ev)->process = NULL; \
+ } while (0)
+
+#define EIO_CLEAR_EVENT(ev) \
+ do { \
+ (ev)->process = NULL; \
+ } while (0)
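+
+/*
+ * Usage sketch (illustrative only, not lifted from eio_main.c): the waiter
+ * holds the spinlock, checks its condition and calls EIO_WAIT_EVENT(),
+ * which drops the lock, sleeps for up to 10s and re-acquires the lock; the
+ * waker, under the same lock, calls EIO_SET_EVENT_AND_UNLOCK(), which
+ * releases the lock and wakes the recorded task, if any.
+ *
+ *    spin_lock_irqsave(&dmc->clean_sl, flags);
+ *    while (list_empty(&dmc->cleanq))
+ *            EIO_WAIT_EVENT(&dmc->clean_event, &dmc->clean_sl, flags);
+ *    ... process dmc->cleanq ...
+ *    spin_unlock_irqrestore(&dmc->clean_sl, flags);
+ */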
+
+
+#include "eio_setlru.h"
+#include "eio_policy.h"
+#define EIO_CACHE(dmc) (EIO_MD8(dmc) ? (void *)dmc->cache_md8 : (void *)dmc->cache)
+
+
+
+#endif /* !EIO_INC_H */
+
+
new file mode 100644
@@ -0,0 +1,2537 @@
+/*
+ * eio_conf.c
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ * Made EnhanceIO specific changes.
+ * Saied Kazemi <skazemi@stec-inc.com>
+ * Siddharth Choudhuri <schoudhuri@stec-inc.com>
+ * Amit Kale <akale@stec-inc.com>
+ * Restructured much of the io code to split bio within map function instead
+ * of letting dm do it.
+ * Simplified queued logic for write through.
+ * Amit Kale <akale@stec-inc.com>
+ * Harish Pujari <hpujari@stec-inc.com>
+ * Designed and implemented the writeback caching mode
+ *
+ * Copyright 2010 Facebook, Inc.
+ * Author: Mohan Srinivasan (mohan@facebook.com)
+ *
+ * Based on DM-Cache:
+ * Copyright (C) International Business Machines Corp., 2006
+ * Author: Ming Zhao (mingzhao@ufl.edu)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "eio.h"
+#include "eio_ttc.h"
+
+#define KMEM_CACHE_JOB "eio-kcached-jobs"
+#define KMEM_EIO_IO "eio-io-context"
+#define KMEM_DMC_BIO_PAIR "eio-dmc-bio-pair"
+/* #define KMEM_CACHE_PENDING_JOB "eio-pending-jobs" */
+
+static struct cache_c *cache_list_head = NULL;
+struct work_struct _kcached_wq;
+
+static struct kmem_cache *_job_cache;
+struct kmem_cache *_io_cache; /* cache of eio_context objects */
+mempool_t *_job_pool;
+mempool_t *_io_pool; /* pool of eio_context object */
+
+atomic_t nr_cache_jobs;
+
+extern int eio_reboot_notified;
+
+LIST_HEAD(ssd_rm_list);
+int ssd_rm_list_not_empty;
+spinlock_t ssd_rm_list_lock;
+
+struct eio_control_s *eio_control;
+
+int eio_force_warm_boot;
+static int eio_notify_reboot(struct notifier_block *nb, unsigned long action, void *x);
+void eio_stop_async_tasks(struct cache_c *dmc);
+static int eio_notify_ssd_rm(struct notifier_block *nb, unsigned long action, void *x);
+
+/*
+ * The notifiers are registered in descending order of priority and
+ * executed in descending order of priority. We should be run before
+ * any notifiers of SSDs or other block devices. Typically, devices
+ * use a priority of 0.
+ * XXX - If in the future we happen to use a md device as the cache
+ * block device, we have a problem because md uses a priority of
+ * INT_MAX as well. But we want to run before the md's reboot notifier !
+ */
+static struct notifier_block eio_reboot_notifier = {
+ .notifier_call = eio_notify_reboot,
+ .next = NULL,
+ .priority = INT_MAX, /* should be > ssd pri's and disk dev pri's */
+};
+
+static struct notifier_block eio_ssd_rm_notifier = {
+ .notifier_call = eio_notify_ssd_rm,
+ .next = NULL,
+ .priority = 0,
+};
+
+
+int
+eio_wait_schedule(void *unused)
+{
+
+ schedule();
+ return 0;
+}
+
+/*
+ * Check whether the configured share of system RAM (mem_limit_pct) can
+ * accommodate the requested memory; the check is skipped if the limit is
+ * unset or out of range. Returns 0 for failure and 1 for success.
+ * For example, with mem_limit_pct set to 75 and 8 GB of free RAM,
+ * requests smaller than 6 GB succeed.
+ */
+static inline int
+eio_mem_available(struct cache_c *dmc, size_t size)
+{
+ struct sysinfo si;
+
+
+ if (unlikely(dmc->sysctl_active.mem_limit_pct <= 0 || dmc->sysctl_active.mem_limit_pct >= 100))
+ return 1;
+
+ si_meminfo(&si);
+ return (((si.freeram << PAGE_SHIFT) * dmc->sysctl_active.mem_limit_pct) / 100) > size;
+}
+
+/* create a new thread and call the specified function */
+void *
+eio_create_thread(int (*func)(void *), void *context, char *name)
+{
+ return kthread_run(func, context, name);
+}
+
+/* wait for the given thread to exit */
+void
+eio_wait_thread_exit(void *thrdptr, int *running)
+{
+ while (*running) {
+ msleep(1);
+ }
+
+ /* do_exit() would be called within the thread func itself */
+
+ return;
+}
+
+/* thread exit self */
+void
+eio_thread_exit(long exit_code)
+{
+ do_exit(exit_code);
+}
+
+
+
+inline int
+eio_policy_init(struct cache_c *dmc)
+{
+ int error = 0;
+
+
+ if (dmc->req_policy == 0)
+ dmc->req_policy = CACHE_REPL_DEFAULT;
+
+ if (dmc->req_policy == CACHE_REPL_RANDOM) {
+ dmc->policy_ops = NULL;
+ pr_info("Setting replacement policy to random");
+ } else {
+ dmc->policy_ops = eio_get_policy(dmc->req_policy);
+ if (dmc->policy_ops == NULL) {
+ dmc->req_policy = CACHE_REPL_RANDOM;
+ pr_err("policy_init: Cannot find requested policy, defaulting to random");
+ error = -ENOMEM;
+ } else {
+ /* Back pointer to reference dmc from policy_ops */
+ dmc->policy_ops->sp_dmc = dmc;
+ pr_info("Setting replacement policy to %s (%d)", (dmc->policy_ops->sp_name == CACHE_REPL_FIFO) ? "fifo" : "lru",
+ dmc->policy_ops->sp_name);
+ }
+ }
+ return error;
+}
+
+static int
+eio_jobs_init(void)
+{
+
+ _job_cache = _io_cache = NULL;
+ _job_pool = _io_pool = NULL;
+
+ _job_cache = kmem_cache_create(KMEM_CACHE_JOB,
+ sizeof(struct kcached_job),
+ __alignof__(struct kcached_job),
+ 0, NULL);
+ if (!_job_cache)
+ return -ENOMEM;
+
+ _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
+ mempool_free_slab, _job_cache);
+ if (!_job_pool)
+ goto out;
+
+ _io_cache = kmem_cache_create(KMEM_EIO_IO,
+ sizeof(struct eio_context),
+ __alignof__(struct eio_context),
+ 0, NULL);
+ if (!_io_cache)
+ goto out;
+
+ _io_pool = mempool_create(MIN_EIO_IO, mempool_alloc_slab,
+ mempool_free_slab, _io_cache);
+ if (!_io_pool)
+ goto out;
+
+ return 0;
+
+out:
+ if (_io_pool)
+ mempool_destroy(_io_pool);
+ if (_io_cache)
+ kmem_cache_destroy(_io_cache);
+ if (_job_pool)
+ mempool_destroy(_job_pool);
+ if (_job_cache)
+ kmem_cache_destroy(_job_cache);
+
+ _job_pool = _io_pool = NULL;
+ _job_cache = _io_cache = NULL;
+ return -ENOMEM;
+}
+
+static void
+eio_jobs_exit(void)
+{
+
+ mempool_destroy(_io_pool);
+ mempool_destroy(_job_pool);
+ kmem_cache_destroy(_io_cache);
+ kmem_cache_destroy(_job_cache);
+
+ _job_pool = _io_pool = NULL;
+ _job_cache = _io_cache = NULL;
+}
+
+
+static int
+eio_kcached_init(struct cache_c *dmc)
+{
+
+ /* init_waitqueue_head(&dmc->destroyq); */
+ atomic_set(&dmc->nr_jobs, 0);
+ return 0;
+}
+
+
+static void
+eio_kcached_client_destroy(struct cache_c *dmc)
+{
+
+ /* Wait for all IOs */
+ //wait_event(dmc->destroyq, !atomic_read(&dmc->nr_jobs));
+}
+
+/* Store the cache superblock on ssd */
+int
+eio_sb_store(struct cache_c *dmc)
+{
+ eio_superblock_t *sb = NULL;
+ struct eio_io_region where;
+ int error;
+
+ struct bio_vec *sb_pages;
+ int nr_pages;
+ int page_count, page_index;
+
+ if ((unlikely(CACHE_FAILED_IS_SET(dmc)) || CACHE_DEGRADED_IS_SET(dmc)) &&
+ (!CACHE_SSD_ADD_INPROG_IS_SET(dmc))) {
+ pr_err("sb_store: Cannot write superblock for cache \"%s\", in degraded/failed mode.\n",
+ dmc->cache_name);
+ return -ENODEV;
+ }
+
+ page_count = 0;
+ nr_pages = EIO_SUPERBLOCK_SIZE / PAGE_SIZE;
+ VERIFY(nr_pages != 0);
+
+ sb_pages = eio_alloc_pages(nr_pages, &page_count);
+ if (sb_pages == NULL) {
+ pr_err("sb_store: System memory too low.\n");
+ return -ENOMEM;
+ }
+
+ VERIFY(page_count == nr_pages);
+
+ nr_pages = page_count;
+ page_index = 0;
+ sb = (eio_superblock_t *)kmap(sb_pages[page_index].bv_page);
+
+ sb->sbf.cache_sb_state = dmc->sb_state;
+ sb->sbf.block_size = dmc->block_size;
+ sb->sbf.size = dmc->size;
+ sb->sbf.assoc = dmc->assoc;
+ sb->sbf.cache_md_start_sect = dmc->md_start_sect;
+ sb->sbf.cache_data_start_sect = dmc->md_sectors;
+ strncpy(sb->sbf.disk_devname, dmc->disk_devname, DEV_PATHLEN);
+ strncpy(sb->sbf.cache_devname, dmc->cache_devname, DEV_PATHLEN);
+ strncpy(sb->sbf.ssd_uuid, dmc->ssd_uuid, DEV_PATHLEN - 1);
+ sb->sbf.cache_devsize = to_sector(eio_get_device_size(dmc->cache_dev));
+ sb->sbf.disk_devsize = to_sector(eio_get_device_size(dmc->disk_dev));
+ sb->sbf.cache_version = dmc->sb_version;
+ strncpy(sb->sbf.cache_name, dmc->cache_name, DEV_PATHLEN);
+ sb->sbf.cache_name[DEV_PATHLEN-1] = '\0';
+ sb->sbf.mode = dmc->mode;
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ sb->sbf.repl_policy = dmc->req_policy;
+ sb->sbf.cache_flags = dmc->cache_flags & ~CACHE_FLAGS_INCORE_ONLY;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ if (dmc->sb_version) {
+ sb->sbf.magic = EIO_MAGIC;
+ } else {
+ sb->sbf.magic = EIO_BAD_MAGIC;
+ }
+
+ sb->sbf.cold_boot = dmc->cold_boot;
+ if (sb->sbf.cold_boot && eio_force_warm_boot) {
+ sb->sbf.cold_boot |= BOOT_FLAG_FORCE_WARM;
+ }
+
+ sb->sbf.dirty_high_threshold = dmc->sysctl_active.dirty_high_threshold;
+ sb->sbf.dirty_low_threshold = dmc->sysctl_active.dirty_low_threshold;
+ sb->sbf.dirty_set_high_threshold = dmc->sysctl_active.dirty_set_high_threshold;
+ sb->sbf.dirty_set_low_threshold = dmc->sysctl_active.dirty_set_low_threshold;
+ sb->sbf.time_based_clean_interval = dmc->sysctl_active.time_based_clean_interval;
+ sb->sbf.autoclean_threshold = dmc->sysctl_active.autoclean_threshold;
+
+ /* write out to ssd */
+ where.bdev = dmc->cache_dev->bdev;
+ where.sector = EIO_SUPERBLOCK_START;
+ where.count = to_sector(EIO_SUPERBLOCK_SIZE);
+ error = eio_io_sync_vm(dmc, &where, WRITE, sb_pages, nr_pages);
+ if (error) {
+ pr_err("sb_store: Could not write out superblock to sector %lu (error %d) for cache \"%s\".\n",
+ where.sector, error, dmc->cache_name);
+ }
+
+ /* free the allocated pages here */
+ if (sb_pages) {
+ kunmap(sb_pages[0].bv_page);
+ for (page_index = 0; page_index < nr_pages; page_index++)
+ put_page(sb_pages[page_index].bv_page);
+ kfree(sb_pages);
+ sb_pages = NULL;
+ }
+
+ return error;
+}
+
+/*
+ * Write out the metadata one sector at a time.
+ * Then dump out the superblock.
+ */
+int
+eio_md_store(struct cache_c *dmc)
+{
+ struct flash_cacheblock *next_ptr;
+ struct eio_io_region where;
+ sector_t i;
+ int j, k;
+ int num_valid = 0, num_dirty = 0;
+ int error;
+ int write_errors = 0;
+ sector_t sectors_written = 0, sectors_expected = 0; /* debug */
+ int slots_written = 0; /* How many cache slots did we fill in this MD io block ? */
+
+ struct bio_vec *pages;
+ int nr_pages;
+ int page_count, page_index;
+ void **pg_virt_addr;
+
+
+ if (unlikely(CACHE_FAILED_IS_SET(dmc)) || unlikely(CACHE_DEGRADED_IS_SET(dmc))) {
+ pr_err("md_store: Cannot write metadata in failed/degraded mode for cache \"%s\".",
+ dmc->cache_name);
+ return -ENODEV;
+ }
+
+ if (CACHE_FAST_REMOVE_IS_SET(dmc)) {
+ if (CACHE_VERBOSE_IS_SET(dmc)) {
+ pr_info("Skipping writing out metadata to cache");
+ }
+ if (!dmc->sb_version) {
+
+ /*
+ * In case of delete, flush the superblock
+ * irrespective of fast_remove being set.
+ */
+
+ goto sb_store;
+ }
+ return 0;
+ }
+
+ if (!eio_mem_available(dmc, METADATA_IO_BLOCKSIZE_SECT)) {
+ pr_err("md_store: System memory too low for allocating metadata IO buffers");
+ return -ENOMEM;
+ }
+
+ page_count = 0;
+ pages = eio_alloc_pages(dmc->bio_nr_pages, &page_count);
+ if (pages == NULL) {
+ pr_err("eio_md_store: System memory too low.");
+ return -ENOMEM;
+ }
+
+ /* get the exact number of pages allocated */
+ nr_pages = page_count;
+ where.bdev = dmc->cache_dev->bdev;
+ where.sector = dmc->md_start_sect;
+ slots_written = 0;
+ page_index = 0;
+
+ pg_virt_addr = kmalloc(nr_pages * (sizeof (void *)), GFP_KERNEL);
+ if (pg_virt_addr == NULL) {
+ pr_err("eio_md_store: System memory too low.");
+ for (k = 0; k < nr_pages; k++)
+ put_page(pages[k].bv_page);
+ kfree(pages);
+ return -ENOMEM;
+ }
+
+ for (k = 0; k < nr_pages; k++)
+ pg_virt_addr[k] = kmap(pages[k].bv_page);
+
+ next_ptr = (struct flash_cacheblock *)pg_virt_addr[page_index];
+ j = MD_BLOCKS_PER_PAGE;
+
+ pr_info("Writing out metadata to cache device. Please wait...");
+
+ for (i = 0 ; i < dmc->size ; i++) {
+ if (EIO_CACHE_STATE_GET(dmc, (index_t)i) & VALID)
+ num_valid++;
+ if (EIO_CACHE_STATE_GET(dmc, (index_t)i) & DIRTY)
+ num_dirty++;
+ next_ptr->dbn = EIO_DBN_GET(dmc, i);
+ next_ptr->cache_state = EIO_CACHE_STATE_GET(dmc, (index_t)i) &
+ (INVALID | VALID | DIRTY);
+
+ next_ptr++;
+ slots_written++;
+ j--;
+ if (j == 0) {
+ /*
+ * Filled the page, goto the next page.
+ */
+ page_index++;
+
+ if (slots_written == (int)(MD_BLOCKS_PER_PAGE * nr_pages)) {
+ /*
+ * Wrote out an entire metadata IO block, write the block to the ssd.
+ */
+ where.count = slots_written / MD_BLOCKS_PER_SECTOR;
+ slots_written = 0;
+ page_index = 0;
+ sectors_written += where.count; /* debug */
+
+ error = eio_io_sync_vm(dmc, &where, WRITE, pages, nr_pages);
+
+ if (error) {
+ write_errors++;
+ pr_err("md_store: Could not write out metadata to sector %lu (error %d)",
+ where.sector, error);
+ }
+ where.sector += where.count; /* Advance offset */
+ }
+ /* Move next slot pointer into next sector */
+ next_ptr = (struct flash_cacheblock *)pg_virt_addr[page_index];
+ j = MD_BLOCKS_PER_PAGE;
+ }
+ }
+
+ if (next_ptr != (struct flash_cacheblock *)pg_virt_addr[0]) {
+ /* Write the remaining last page out */
+ VERIFY(slots_written > 0);
+
+ where.count = slots_written / MD_BLOCKS_PER_SECTOR;
+
+ if (slots_written % MD_BLOCKS_PER_SECTOR)
+ where.count++;
+
+ sectors_written += where.count;
+
+ /*
+ * It may happen that we are at the beginning of the next page
+ * and did not fill up any slots in this page. Verify this condition
+ * and set page_index accordingly.
+ */
+
+ if (next_ptr != (struct flash_cacheblock *)pg_virt_addr[page_index]) {
+ unsigned offset;
+
+ slots_written = slots_written % MD_BLOCKS_PER_PAGE;
+
+ /*
+ * We have some extra slots written at this page_index.
+ * Let us try to zero out the remaining page size before submitting
+ * this page.
+ */
+ offset = slots_written * (sizeof(struct flash_cacheblock));
+ memset(pg_virt_addr[page_index] + offset, 0, PAGE_SIZE - offset);
+
+ page_index++;
+ }
+
+ error = eio_io_sync_vm(dmc, &where, WRITE, pages, page_index);
+ /* XXX: should we call eio_sb_store() on error ?? */
+ if (error) {
+ write_errors++;
+ pr_err("md_store: Could not write out metadata to sector %lu (error %d)",
+ where.sector, error);
+ }
+ }
+
+ /* Debug Tests */
+ sectors_expected = dmc->size / MD_BLOCKS_PER_SECTOR;
+ if (dmc->size % MD_BLOCKS_PER_SECTOR)
+ sectors_expected++;
+ VERIFY(sectors_expected == sectors_written);
+ /* XXX: should we call eio_sb_store() on error ?? */
+ if (sectors_expected != sectors_written) {
+ pr_err("md_store: Sector mismatch! sectors_expected=%ld, sectors_written=%ld\n",
+ sectors_expected, sectors_written);
+ }
+
+ for (k = 0; k < nr_pages; k++)
+ kunmap(pages[k].bv_page);
+ kfree(pg_virt_addr);
+
+ if (pages)
+ for (k = 0; k < nr_pages; k++)
+ put_page(pages[k].bv_page);
+ kfree(pages);
+ pages = NULL;
+
+ if (write_errors == 0) {
+ if (num_dirty == 0) {
+ dmc->sb_state = CACHE_MD_STATE_CLEAN;
+ } else {
+ dmc->sb_state = CACHE_MD_STATE_FASTCLEAN;
+ }
+ } else {
+ dmc->sb_state = CACHE_MD_STATE_UNSTABLE;
+ }
+
+sb_store:
+ error = eio_sb_store(dmc);
+ if (error) {
+ /* Harish: TBD. should we return error */
+ write_errors++;
+ pr_err("md_store: superblock store failed(error %d)", error);
+ }
+ if (!dmc->sb_version && CACHE_FAST_REMOVE_IS_SET(dmc)) {
+ return 0;
+ }
+
+ if (write_errors == 0) {
+ pr_info("Metadata saved on the cache device");
+ } else {
+ pr_info("CRITICAL: There were %d errors in saving metadata on cache device", write_errors);
+ if (num_dirty)
+ pr_info("CRITICAL: %d dirty blocks could not be written out", num_dirty);
+ }
+
+ pr_info("Valid blocks: %d, Dirty blocks: %d, Metadata sectors: %lu",
+ num_valid, num_dirty, (long unsigned int)dmc->md_sectors);
+
+ return 0;
+}
+
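+/*
+ * Create (or re-create) the on-SSD cache metadata. For a cold cache every
+ * in-core entry is invalidated and the resulting metadata is written out,
+ * followed by the superblock.
+ */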
+static int
+eio_md_create(struct cache_c *dmc, int force, int cold)
+{
+ struct flash_cacheblock *next_ptr;
+ eio_superblock_t *header;
+ struct eio_io_region where;
+ sector_t i;
+ int j, error;
+ sector_t cache_size, dev_size;
+ sector_t order;
+ sector_t sectors_written = 0, sectors_expected = 0; /* debug */
+ int slots_written = 0; /* How many cache slots did we fill in this MD io block ? */
+
+ struct bio_vec *header_page = NULL; /* Header page */
+ struct bio_vec *pages = NULL; /* Metadata pages */
+ int nr_pages = 0;
+ int page_count, page_index;
+ int ret = 0, k;
+ void **pg_virt_addr = NULL;
+
+	/* Allocate a single page for the superblock header. */
+ page_count = 0;
+ header_page = eio_alloc_pages(1, &page_count);
+ if (header_page == NULL) {
+ pr_err("eio_md_create: System memory too low.");
+ return -ENOMEM;
+ }
+
+	VERIFY(page_count == 1);
+ header = (eio_superblock_t *)kmap(header_page[0].bv_page);
+
+ /*
+ * Apart from normal cache creation, eio_md_create() is also called when
+ * the SSD is added as part of eio_resume_caching(). At this point,
+ * the CACHE_FLAGS_DEGRADED is set, but we do want to write to the md area.
+ * Therefore, if the CACHE_FLAGS_SSD_ADD_INPROG is set, then proceed instead
+ * of returning -ENODEV.
+ */
+ if ((unlikely(CACHE_FAILED_IS_SET(dmc)) || unlikely(CACHE_DEGRADED_IS_SET(dmc)))
+ && (!CACHE_SSD_ADD_INPROG_IS_SET(dmc))) {
+ pr_err("md_create: Cannot write metadata in failed/degraded mode for cache \"%s\".\n",
+ dmc->cache_name);
+ ret = -ENODEV;
+ goto free_header;
+ }
+
+ where.bdev = dmc->cache_dev->bdev;
+ where.sector = EIO_SUPERBLOCK_START;
+ where.count = to_sector(EIO_SUPERBLOCK_SIZE);
+ error = eio_io_sync_vm(dmc, &where, READ, header_page, 1);
+ if (error) {
+ pr_err("md_create: Could not read superblock sector %lu error %d for cache \"%s\".\n",
+ where.sector, error, dmc->cache_name);
+ ret = -EINVAL;
+ goto free_header;
+ }
+
+ if (!force &&
+ ((header->sbf.cache_sb_state == CACHE_MD_STATE_DIRTY) ||
+ (header->sbf.cache_sb_state == CACHE_MD_STATE_CLEAN) ||
+ (header->sbf.cache_sb_state == CACHE_MD_STATE_FASTCLEAN))) {
+ pr_err("md_create: Existing cache detected, use force to re-create.\n");
+ ret = -EINVAL;
+ goto free_header;
+ }
+
+ /*
+ * Compute the size of the metadata including header.
+ * and here we also are making sure that metadata and userdata
+ * on SSD is aligned at 8K boundary.
+ *
+ * Note dmc->size is in raw sectors
+ */
+ dmc->md_start_sect = EIO_METADATA_START(dmc->cache_dev_start_sect);
+ dmc->md_sectors = INDEX_TO_MD_SECTOR(dmc->size / (sector_t)dmc->block_size);
+ dmc->md_sectors += EIO_EXTRA_SECTORS(dmc->cache_dev_start_sect, dmc->md_sectors);
+ dmc->size -= dmc->md_sectors; /* total sectors available for cache */
+ dmc->size /= dmc->block_size;
+ dmc->size = (dmc->size / (sector_t)dmc->assoc) * (sector_t)dmc->assoc;
+ /* Recompute since dmc->size was possibly trunc'ed down */
+ dmc->md_sectors = INDEX_TO_MD_SECTOR(dmc->size);
+ dmc->md_sectors += EIO_EXTRA_SECTORS(dmc->cache_dev_start_sect, dmc->md_sectors);
+
+ if ((error = eio_mem_init(dmc)) == -1) {
+ ret = -EINVAL;
+ goto free_header;
+ }
+ if ((unlikely(CACHE_FAILED_IS_SET(dmc)) || unlikely(CACHE_DEGRADED_IS_SET(dmc)))
+ && (!CACHE_SSD_ADD_INPROG_IS_SET(dmc))) {
+ pr_err("md_create: Cannot write metadata in failed/degraded mode for cache \"%s\".\n",
+ dmc->cache_name);
+ ret = -ENODEV;
+ goto free_header;
+ }
+ dev_size = to_sector(eio_get_device_size(dmc->cache_dev));
+ cache_size = dmc->md_sectors + (dmc->size * dmc->block_size);
+ if (cache_size > dev_size) {
+ pr_err("md_create: Requested cache size exceeds the cache device's capacity (%lu > %lu)",
+ cache_size, dev_size);
+ ret = -EINVAL;
+ goto free_header;
+ }
+
+ order = dmc->size * (EIO_MD8(dmc) ? sizeof (struct cacheblock_md8) : sizeof (struct cacheblock));
+ i = EIO_MD8(dmc) ? sizeof (struct cacheblock_md8) : sizeof (struct cacheblock);
+ pr_info("Allocate %luKB (%luB per) mem for %lu-entry cache " \
+ "(capacity:%luMB, associativity:%u, block size:%u bytes)",
+ order >> 10, i, (long unsigned int)dmc->size,
+ (cache_size >> (20-SECTOR_SHIFT)), dmc->assoc, dmc->block_size << SECTOR_SHIFT);
+
+ if (!eio_mem_available(dmc, order) && !CACHE_SSD_ADD_INPROG_IS_SET(dmc)) {
+ pr_err("md_create: System memory too low for allocating cache metadata.\n");
+ ret = -ENOMEM;
+ goto free_header;
+ }
+
+ /*
+ * If we are called due to SSD add, the memory was already allocated
+ * as part of cache creation (i.e., eio_ctr()) in the past.
+ */
+ if (!CACHE_SSD_ADD_INPROG_IS_SET(dmc)) {
+ if (EIO_MD8(dmc))
+ dmc->cache_md8 = (struct cacheblock_md8 *)vmalloc((size_t)order);
+ else
+ dmc->cache = (struct cacheblock *)vmalloc((size_t)order);
+ if ((EIO_MD8(dmc) && !dmc->cache_md8) || (!EIO_MD8(dmc) && !dmc->cache)) {
+ pr_err("md_create: Unable to allocate cache md for cache \"%s\".\n",
+ dmc->cache_name);
+ ret = -ENOMEM;
+ goto free_header;
+ }
+ }
+ if (eio_repl_blk_init(dmc->policy_ops) != 0) {
+ pr_err("md_create: Unable to allocate memory for policy cache block for cache \"%s\".\n",
+ dmc->cache_name);
+ ret = -ENOMEM;
+ goto free_header;
+ }
+
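+	/*
+	 * For a cold cache, invalidate every in-core metadata entry and then
+	 * write the (all-invalid) metadata out to the SSD.
+	 */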
+ if (cold) {
+ int retry = 0;
+ do {
+ for (i = 0; i < dmc->size; i++) {
+ if (CACHE_SSD_ADD_INPROG_IS_SET(dmc)) {
+ u_int8_t cache_state = EIO_CACHE_STATE_GET(dmc, i);
+ if (cache_state & BLOCK_IO_INPROG) {
+ /* sleep for 1 sec and retry */
+ msleep(1000);
+ break;
+ }
+ }
+ eio_invalidate_md(dmc, i);
+ }
+ } while ((retry++ < 10) && (i < dmc->size));
+
+ if (i < dmc->size) {
+ pr_err("md_create: Cache \"%s\" is not in quiesce state. Can't proceed to resume.\n",
+ dmc->cache_name);
+ ret = -EBUSY;
+ goto free_header;
+ }
+
+ /* Allocate pages of the order dmc->bio_nr_pages */
+ page_count = 0;
+ pages = eio_alloc_pages(dmc->bio_nr_pages, &page_count);
+ if (!pages) {
+ pr_err("md_create: Unable to allocate pages for cache \"%s\".\n",
+ dmc->cache_name);
+ pr_err("md_create: Could not write out cache metadata.\n");
+ ret = -ENOMEM;
+ goto free_header;
+ }
+
+ /* nr_pages is used for freeing the pages */
+ nr_pages = page_count;
+
+ where.bdev = dmc->cache_dev->bdev;
+ where.sector = dmc->md_start_sect;
+ slots_written = 0;
+ page_index = 0;
+
+ pg_virt_addr = kmalloc(nr_pages * (sizeof (void *)), GFP_KERNEL);
+ if (pg_virt_addr == NULL) {
+ pr_err("md_create: System memory too low.\n");
+ for (k = 0; k < nr_pages; k++)
+ put_page(pages[k].bv_page);
+ kfree(pages);
+ ret = -ENOMEM;
+ goto free_header;
+ }
+
+ for (k = 0; k < nr_pages; k++)
+ pg_virt_addr[k] = kmap(pages[k].bv_page);
+
+ next_ptr = (struct flash_cacheblock *)pg_virt_addr[page_index];
+ j = MD_BLOCKS_PER_PAGE;
+
+ for (i = 0 ; i < dmc->size ; i++) {
+ next_ptr->dbn = EIO_DBN_GET(dmc, i);
+ next_ptr->cache_state = EIO_CACHE_STATE_GET(dmc, (index_t)i) &
+ (INVALID | VALID | DIRTY);
+ next_ptr++;
+ slots_written++;
+ j--;
+
+ if (j == 0) {
+
+ page_index++;
+
+ if ((unsigned)slots_written == MD_BLOCKS_PER_PAGE * nr_pages) {
+
+ where.count = slots_written / MD_BLOCKS_PER_SECTOR;
+ slots_written = 0;
+ page_index = 0;
+ sectors_written += where.count; /* debug */
+ error = eio_io_sync_vm(dmc, &where, WRITE, pages, nr_pages);
+
+ if (error) {
+ if (!CACHE_SSD_ADD_INPROG_IS_SET(dmc))
+ vfree(EIO_CACHE(dmc));
+ pr_err("md_create: Could not write cache metadata sector %lu error %d.\n for cache \"%s\".\n",
+ where.sector, error, dmc->cache_name);
+ ret = -EIO;
+ goto free_md;
+ }
+ where.sector += where.count; /* Advance offset */
+ }
+
+ /* Move next slot pointer into next page */
+ next_ptr = (struct flash_cacheblock *)pg_virt_addr[page_index];
+ j = MD_BLOCKS_PER_PAGE;
+ }
+ }
+
+ if (next_ptr != (struct flash_cacheblock *)pg_virt_addr[0]) {
+ /* Write the remaining last page out */
+ VERIFY(slots_written > 0);
+
+ where.count = slots_written / MD_BLOCKS_PER_SECTOR;
+
+ if (slots_written % MD_BLOCKS_PER_SECTOR)
+ where.count++;
+
+ sectors_written += where.count;
+
+ if (next_ptr != (struct flash_cacheblock *)pg_virt_addr[page_index]) {
+ unsigned offset;
+
+ slots_written = slots_written % MD_BLOCKS_PER_PAGE;
+
+ /*
+ * We have some extra slots written at this page_index.
+ * Let us try to zero out the remaining page size before submitting
+ * this page.
+ */
+ offset = slots_written * (sizeof(struct flash_cacheblock));
+ memset(pg_virt_addr[page_index] + offset, 0, PAGE_SIZE - offset);
+
+				page_index++;
+ }
+
+ error = eio_io_sync_vm(dmc, &where, WRITE, pages, page_index);
+ if (error) {
+ if (!CACHE_SSD_ADD_INPROG_IS_SET(dmc))
+ vfree((void *)EIO_CACHE(dmc));
+ pr_err("md_create: Could not write cache metadata sector %lu error %d for cache \"%s\".\n",
+ where.sector, error, dmc->cache_name);
+ ret = -EIO;
+ goto free_md;
+ }
+ }
+
+ /* Debug Tests */
+ sectors_expected = dmc->size / MD_BLOCKS_PER_SECTOR;
+ if (dmc->size % MD_BLOCKS_PER_SECTOR)
+ sectors_expected++;
+ if (sectors_expected != sectors_written) {
+ pr_err("md_create: Sector mismatch! sectors_expected=%ld, sectors_written=%ld for cache \"%s\".\n",
+ sectors_expected, sectors_written, dmc->cache_name);
+ ret = -EIO;
+ goto free_md;
+ }
+ } /* if cold ends here */
+
+ /* Write the superblock */
+
+ if ((unlikely(CACHE_FAILED_IS_SET(dmc)) || unlikely(CACHE_DEGRADED_IS_SET(dmc)))
+ && (!CACHE_SSD_ADD_INPROG_IS_SET(dmc))) {
+ pr_err("md_create: Cannot write metadata in failed/degraded mode for cache \"%s\".\n",
+ dmc->cache_name);
+ vfree((void *)EIO_CACHE(dmc));
+ ret = -ENODEV;
+ goto free_md;
+ }
+
+ dmc->sb_state = CACHE_MD_STATE_DIRTY;
+ dmc->sb_version = EIO_SB_VERSION;
+ error = eio_sb_store(dmc);
+ if (error) {
+ if (!CACHE_SSD_ADD_INPROG_IS_SET(dmc))
+ vfree((void *)EIO_CACHE(dmc));
+ pr_err("md_create: Could not write cache superblock sector(error %d) for cache \"%s\"\n",
+ error, dmc->cache_name);
+ ret = -EIO;
+ goto free_md;
+ }
+
+free_md:
+ for (k = 0; k < nr_pages; k++)
+ kunmap(pages[k].bv_page);
+ kfree(pg_virt_addr);
+
+ /* Free metadata pages here. */
+ if (pages) {
+ for (k = 0; k < nr_pages; k++)
+ put_page(pages[k].bv_page);
+ kfree(pages);
+ pages = NULL;
+ }
+
+free_header:
+ /* Free header page here */
+ if (header_page) {
+ kunmap(header_page[0].bv_page);
+ put_page(header_page[0].bv_page);
+ kfree(header_page);
+ header_page = NULL;
+ }
+
+ return ret;
+}
+
+
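+/*
+ * Load the on-SSD cache metadata and rebuild the in-core cache block state.
+ * After an unclean shutdown only the DIRTY blocks are restored.
+ */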
+static int
+eio_md_load(struct cache_c *dmc)
+{
+ struct flash_cacheblock *meta_data_cacheblock, *next_ptr;
+ eio_superblock_t *header;
+ struct eio_io_region where;
+ int i;
+ index_t j, slots_read;
+ sector_t size;
+ int clean_shutdown;
+ int dirty_loaded = 0;
+ sector_t order, data_size;
+ int num_valid = 0;
+ int error;
+ sector_t sectors_read = 0, sectors_expected = 0; /* Debug */
+ int force_warm_boot = 0;
+
+ struct bio_vec *header_page, *pages;
+ int nr_pages, page_count, page_index;
+ int ret = 0;
+ void **pg_virt_addr;
+
+ page_count = 0;
+ header_page = eio_alloc_pages(1, &page_count);
+ if (header_page == NULL) {
+ pr_err ("md_load: Unable to allocate memory");
+ return -ENOMEM;
+ }
+
+ VERIFY(page_count == 1);
+ header = (eio_superblock_t *)kmap(header_page[0].bv_page);
+
+ if (CACHE_FAILED_IS_SET(dmc) || CACHE_DEGRADED_IS_SET(dmc)) {
+ pr_err("md_load: Cannot load metadata in failed / degraded mode");
+ ret = -ENODEV;
+ goto free_header;
+ }
+
+ where.bdev = dmc->cache_dev->bdev;
+ where.sector = EIO_SUPERBLOCK_START;
+ where.count = to_sector(EIO_SUPERBLOCK_SIZE);
+ error = eio_io_sync_vm(dmc, &where, READ, header_page, 1);
+ if (error) {
+ pr_err("md_load: Could not read cache superblock sector %lu error %d",
+ where.sector, error);
+ ret = -EINVAL;
+ goto free_header;
+ }
+
+ /* check ondisk superblock version */
+ if (header->sbf.cache_version != EIO_SB_VERSION) {
+ pr_info("md_load: Cache superblock mismatch detected."\
+ " (current: %u, ondisk: %u)", EIO_SB_VERSION,
+ header->sbf.cache_version);
+
+ if (header->sbf.cache_version == 0) {
+ pr_err("md_load: Can't enable cache %s. Either "\
+ "superblock version is invalid or cache has"\
+ " been deleted", header->sbf.cache_name);
+ ret = 1;
+ goto free_header;
+ }
+
+ if (header->sbf.cache_version > EIO_SB_VERSION) {
+ pr_err("md_load: Can't enable cache %s with newer "\
+ " superblock version.", header->sbf.cache_name);
+ ret = 1;
+ goto free_header;
+ }
+
+ if (header->sbf.mode == CACHE_MODE_WB) {
+ pr_err("md_load: Can't enable write-back cache %s" \
+ " with newer superblock version.",
+ header->sbf.cache_name);
+ ret = 1;
+ goto free_header;
+ } else if ((header->sbf.mode == CACHE_MODE_RO) ||
+ (header->sbf.mode == CACHE_MODE_WT)) {
+ dmc->persistence = CACHE_FORCECREATE;
+ pr_info("md_load: Can't enable cache, recreating"\
+ " cache %s with newer superblock version.",
+ header->sbf.cache_name);
+ ret = 0;
+ goto free_header;
+ }
+ }
+
+ /* check ondisk magic number */
+
+ if (header->sbf.cache_version >= EIO_SB_MAGIC_VERSION &&
+ header->sbf.magic != EIO_MAGIC) {
+ pr_err("md_load: Magic number mismatch in superblock detected."\
+ " (current: %u, ondisk: %u)", EIO_MAGIC,
+ header->sbf.magic);
+ ret = 1;
+ goto free_header;
+ }
+
+ dmc->sb_version = EIO_SB_VERSION;
+
+ /*
+ * Harish: TBD
+	 * For writeback, we should treat the metadata as corrupted only when
+	 * the dirty block count is non-zero and the header state is unexpected.
+	 * Otherwise a bad write during the last shutdown could leave data
+	 * inaccessible in the writeback case.
+ */
+ if (!((header->sbf.cache_sb_state == CACHE_MD_STATE_DIRTY) ||
+ (header->sbf.cache_sb_state == CACHE_MD_STATE_CLEAN) ||
+ (header->sbf.cache_sb_state == CACHE_MD_STATE_FASTCLEAN))) {
+ pr_err("md_load: Corrupt cache superblock");
+ ret = -EINVAL;
+ goto free_header;
+ }
+
+ if (header->sbf.cold_boot & BOOT_FLAG_FORCE_WARM) {
+ force_warm_boot = 1;
+ header->sbf.cold_boot &= ~BOOT_FLAG_FORCE_WARM;
+ }
+
+ /*
+ * Determine if we can start as cold or hot cache
+ * - if cold_boot is set(unless force_warm_boot), start as cold cache
+ * - else if it is unclean shutdown, start as cold cache
+ * cold cache will still treat the dirty blocks as hot
+ */
+ if (dmc->cold_boot != header->sbf.cold_boot) {
+ pr_info("superblock(%u) and config(%u) cold boot values do not match. Relying on config",
+ header->sbf.cold_boot, dmc->cold_boot);
+ }
+ if (dmc->cold_boot && !force_warm_boot) {
+ pr_info("Cold boot is set, starting as if unclean shutdown(only dirty blocks will be hot)");
+ clean_shutdown = 0;
+ } else {
+ if (header->sbf.cache_sb_state == CACHE_MD_STATE_DIRTY) {
+ pr_info("Unclean shutdown detected");
+ pr_info("Only dirty blocks exist in cache");
+ clean_shutdown = 0;
+ } else if (header->sbf.cache_sb_state == CACHE_MD_STATE_CLEAN) {
+ pr_info("Slow (clean) shutdown detected");
+ pr_info("Only clean blocks exist in cache");
+ clean_shutdown = 1;
+ } else if (header->sbf.cache_sb_state == CACHE_MD_STATE_FASTCLEAN) {
+ pr_info("Fast (clean) shutdown detected");
+ pr_info("Both clean and dirty blocks exist in cache");
+ clean_shutdown = 1;
+ } else {
+ /* Harish: Won't reach here, but TBD may change the previous if condition */
+ pr_info("cache state is %d. Treating as unclean shutdown",
+ header->sbf.cache_sb_state);
+ pr_info("Only dirty blocks exist in cache");
+ clean_shutdown = 0;
+ }
+ }
+
+ if (!dmc->mode)
+ dmc->mode = header->sbf.mode;
+ if (!dmc->req_policy)
+ dmc->req_policy = header->sbf.repl_policy;
+
+ if (!dmc->cache_flags)
+ dmc->cache_flags = header->sbf.cache_flags;
+
+ (void)eio_policy_init(dmc);
+
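+	/* Restore the cache geometry and tunables from the on-disk superblock. */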
+ dmc->block_size = header->sbf.block_size;
+ dmc->block_shift = ffs(dmc->block_size) - 1;
+ dmc->block_mask = dmc->block_size - 1;
+ dmc->size = header->sbf.size;
+ dmc->cache_size = header->sbf.cache_devsize;
+ dmc->assoc = header->sbf.assoc;
+ dmc->consecutive_shift = ffs(dmc->assoc) - 1;
+ dmc->md_start_sect = header->sbf.cache_md_start_sect;
+ dmc->md_sectors = header->sbf.cache_data_start_sect;
+ dmc->sysctl_active.dirty_high_threshold = header->sbf.dirty_high_threshold;
+ dmc->sysctl_active.dirty_low_threshold = header->sbf.dirty_low_threshold;
+ dmc->sysctl_active.dirty_set_high_threshold = header->sbf.dirty_set_high_threshold;
+ dmc->sysctl_active.dirty_set_low_threshold = header->sbf.dirty_set_low_threshold;
+ dmc->sysctl_active.time_based_clean_interval = header->sbf.time_based_clean_interval;
+ dmc->sysctl_active.autoclean_threshold = header->sbf.autoclean_threshold;
+
+ if ((i = eio_mem_init(dmc)) == -1) {
+ pr_err("eio_md_load: Failed to initialize memory.");
+ ret = -EINVAL;
+ goto free_header;
+ }
+
+ order = dmc->size * ((i == 1) ? sizeof (struct cacheblock_md8) : sizeof (struct cacheblock));
+ data_size = dmc->size * dmc->block_size;
+ size = EIO_MD8(dmc) ? sizeof (struct cacheblock_md8) : sizeof (struct cacheblock);
+ pr_info("Allocate %luKB (%ldB per) mem for %lu-entry cache " \
+ "(capacity:%luMB, associativity:%u, block size:%u bytes)",
+ order >> 10, size, (long unsigned int)dmc->size,
+ (long unsigned int)(dmc->md_sectors + data_size) >> (20-SECTOR_SHIFT),
+ dmc->assoc, dmc->block_size << SECTOR_SHIFT);
+
+ if (EIO_MD8(dmc))
+ dmc->cache_md8 = (struct cacheblock_md8 *)vmalloc((size_t)order);
+ else
+ dmc->cache = (struct cacheblock *)vmalloc((size_t)order);
+
+ if ((EIO_MD8(dmc) && !dmc->cache_md8) || (!EIO_MD8(dmc) && !dmc->cache)) {
+ pr_err("md_load: Unable to allocate memory");
+		ret = -ENOMEM;
+		goto free_header;
+ }
+
+ if (eio_repl_blk_init(dmc->policy_ops) != 0) {
+ vfree((void *)EIO_CACHE(dmc));
+ pr_err("md_load: Unable to allocate memory for policy cache block");
+ ret = -EINVAL;
+ goto free_header;
+ }
+
+ /* Allocate pages of the order dmc->bio_nr_pages */
+ page_count = 0;
+ pages = eio_alloc_pages(dmc->bio_nr_pages, &page_count);
+ if (!pages) {
+ pr_err("md_create: unable to allocate pages");
+ pr_err("md_create: Could not write out cache metadata");
+ vfree((void *)EIO_CACHE(dmc));
+ ret = -ENOMEM;
+ goto free_header;
+ }
+
+ /* nr_pages is used for freeing the pages */
+ nr_pages = page_count;
+
+ pg_virt_addr = kmalloc(nr_pages * (sizeof (void *)), GFP_KERNEL);
+ if (pg_virt_addr == NULL) {
+ pr_err("eio_md_store: System memory too low.");
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i].bv_page);
+ kfree(pages);
+ ret = -ENOMEM;
+ goto free_header;
+ }
+
+ for (i = 0; i < nr_pages; i++)
+ pg_virt_addr[i] = kmap(pages[i].bv_page);
+
+ /*
+	 * Read the metadata in chunks of up to nr_pages pages at a time and
+	 * load up the in-core metadata structs.
+ */
+
+ page_index = 0;
+ page_count = 0;
+ meta_data_cacheblock = (struct flash_cacheblock *)pg_virt_addr[page_index];
+
+ where.bdev = dmc->cache_dev->bdev;
+ where.sector = dmc->md_start_sect;
+ size = dmc->size;
+ i = 0;
+ while (size > 0) {
+ slots_read = min((long)size, ((long)MD_BLOCKS_PER_PAGE * nr_pages));
+
+ if (slots_read % MD_BLOCKS_PER_SECTOR)
+ where.count = 1 + (slots_read / MD_BLOCKS_PER_SECTOR);
+ else
+ where.count = slots_read / MD_BLOCKS_PER_SECTOR;
+
+ if (slots_read % MD_BLOCKS_PER_PAGE)
+ page_count = 1 + (slots_read / MD_BLOCKS_PER_PAGE);
+ else
+ page_count = slots_read / MD_BLOCKS_PER_PAGE;
+
+ sectors_read += where.count; /* Debug */
+ error = eio_io_sync_vm(dmc, &where, READ, pages, page_count);
+ if (error) {
+ vfree((void *)EIO_CACHE(dmc));
+ pr_err("md_load: Could not read cache metadata sector %lu error %d",
+ where.sector, error);
+ ret = -EIO;
+ goto free_md;
+ }
+
+ where.sector += where.count;
+ next_ptr = meta_data_cacheblock;
+
+ for (j = 0, page_index = 0 ; j < slots_read ; j++) {
+
+ if ((j % MD_BLOCKS_PER_PAGE) == 0)
+ next_ptr = (struct flash_cacheblock *)pg_virt_addr[page_index++];
+
+			/* If unclean shutdown, only the DIRTY blocks are loaded. */
+ if (clean_shutdown || (next_ptr->cache_state & DIRTY)) {
+
+ if (next_ptr->cache_state & DIRTY)
+ dirty_loaded++;
+
+ EIO_CACHE_STATE_SET(dmc, i, (u_int8_t)next_ptr->cache_state & ~QUEUED);
+
+ VERIFY((EIO_CACHE_STATE_GET(dmc, i) & (VALID | INVALID))
+ != (VALID | INVALID));
+
+ if (EIO_CACHE_STATE_GET(dmc, i) & VALID)
+ num_valid++;
+ EIO_DBN_SET(dmc, i, next_ptr->dbn);
+ } else {
+ eio_invalidate_md(dmc, i);
+ }
+ next_ptr++;
+ i++;
+ }
+ size -= slots_read;
+ }
+
+ /*
+ * If the cache contains dirty data, the only valid mode is write back.
+ */
+ if (dirty_loaded && dmc->mode != CACHE_MODE_WB) {
+ vfree((void *)EIO_CACHE(dmc));
+ pr_err("md_load: Cannot use %s mode because dirty data exists in the cache", \
+ (dmc->mode == CACHE_MODE_RO) ? "read only" : "write through");
+ ret = -EINVAL;
+ goto free_md;
+ }
+
+ /* Debug Tests */
+ sectors_expected = dmc->size / MD_BLOCKS_PER_SECTOR;
+ if (dmc->size % MD_BLOCKS_PER_SECTOR)
+ sectors_expected++;
+ if (sectors_expected != sectors_read) {
+ pr_err("md_load: Sector mismatch! sectors_expected=%ld, sectors_read=%ld\n",
+ sectors_expected, sectors_read);
+ vfree((void *)EIO_CACHE(dmc));
+ ret = -EIO;
+ goto free_md;
+ }
+
+ /* Before we finish loading, we need to dirty the superblock and write it out */
+ dmc->sb_state = CACHE_MD_STATE_DIRTY;
+ error = eio_sb_store(dmc);
+ if (error) {
+ vfree((void *)EIO_CACHE(dmc));
+ pr_err("md_load: Could not write cache superblock sector(error %d)", error);
+ ret = 1;
+ goto free_md;
+ }
+
+free_md:
+ for (i = 0; i < nr_pages; i++)
+ kunmap(pages[i].bv_page);
+ kfree(pg_virt_addr);
+
+ if (pages) {
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i].bv_page);
+ kfree(pages);
+ pages = NULL;
+ }
+
+free_header:
+ /* Free header page here */
+ if (header_page) {
+ kunmap(header_page[0].bv_page);
+ put_page(header_page[0].bv_page);
+ kfree(header_page);
+ header_page = NULL;
+ }
+
+ pr_info("Cache metadata loaded from disk with %d valid %d dirty blocks",
+ num_valid, dirty_loaded);
+ return ret;
+}
+
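+/*
+ * Release the replacement policy instance along with its per-block and
+ * per-set data.
+ */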
+void
+eio_policy_free(struct cache_c *dmc)
+{
+
+ if (dmc->policy_ops != NULL) {
+ eio_put_policy(dmc->policy_ops);
+ vfree(dmc->policy_ops);
+ }
+ if (dmc->sp_cache_blk != NULL)
+ vfree(dmc->sp_cache_blk);
+ if (dmc->sp_cache_set != NULL)
+ vfree(dmc->sp_cache_set);
+
+ dmc->policy_ops = NULL;
+ dmc->sp_cache_blk = dmc->sp_cache_set = NULL;
+ return;
+}
+
+static int
+eio_clean_thread_init(struct cache_c *dmc)
+{
+ INIT_LIST_HEAD(&dmc->cleanq);
+ spin_lock_init(&dmc->clean_sl);
+ EIO_INIT_EVENT(&dmc->clean_event);
+ return eio_start_clean_thread(dmc);
+}
+
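+/*
+ * Handle an SSD add/remove notification delivered from user space for the
+ * named cache.
+ */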
+int
+eio_handle_ssd_message(char *cache_name, char *ssd_name, dev_notifier_t note)
+{
+ struct cache_c *dmc;
+
+ dmc = eio_cache_lookup(cache_name);
+ if (NULL == dmc) {
+ pr_err("eio_handle_ssd_message: cache %s does not exist", cache_name);
+ return -EINVAL;
+ }
+
+ switch(note) {
+
+ case NOTIFY_SSD_ADD:
+ /* Making sure that CACHE state is not active */
+ if (CACHE_FAILED_IS_SET(dmc) || CACHE_DEGRADED_IS_SET(dmc))
+ eio_resume_caching(dmc, ssd_name);
+ else
+ pr_err("eio_handle_ssd_message: SSD_ADD event called for ACTIVE cache \"%s\", ignoring!!!",
+ dmc->cache_name);
+ break;
+
+ case NOTIFY_SSD_REMOVED:
+ eio_suspend_caching(dmc, note);
+ break;
+
+ default:
+ pr_err("Wrong notifier passed for eio_handle_ssd_message\n");
+ }
+
+ return 0;
+}
+
+static void
+eio_init_ssddev_props(struct cache_c *dmc)
+{
+ struct request_queue *rq;
+ uint32_t max_hw_sectors, max_nr_pages;
+ uint32_t nr_pages = 0;
+
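+	/*
+	 * Size metadata IOs by the smaller of the queue's maximum hardware
+	 * transfer (expressed in pages) and the bio vector limit of the
+	 * cache device.
+	 */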
+ rq = bdev_get_queue(dmc->cache_dev->bdev);
+ max_hw_sectors = to_bytes(queue_max_hw_sectors(rq)) / PAGE_SIZE;
+ max_nr_pages = (u_int32_t)bio_get_nr_vecs(dmc->cache_dev->bdev);
+ nr_pages = min_t(u_int32_t, max_hw_sectors, max_nr_pages);
+ dmc->bio_nr_pages = nr_pages;
+
+ /*
+ * If the cache device is not a physical device (eg: lv), then
+ * driverfs_dev will be null and we make cache_gendisk_name a null
+	 * string. In this case eio_notify_ssd_rm() cannot detect device
+	 * removal, so we have to rely on user-space udev for the
+	 * notification.
+ */
+
+ if (dmc->cache_dev && dmc->cache_dev->bdev &&
+ dmc->cache_dev->bdev->bd_disk &&
+ dmc->cache_dev->bdev->bd_disk->driverfs_dev) {
+ strncpy(dmc->cache_gendisk_name,
+ dev_name(dmc->cache_dev->bdev->bd_disk->driverfs_dev),
+ DEV_PATHLEN);
+ } else {
+ dmc->cache_gendisk_name[0] = '\0';
+ }
+}
+
+static void
+eio_init_srcdev_props(struct cache_c *dmc)
+{
+ /* Same applies for source device as well. */
+ if (dmc->disk_dev && dmc->disk_dev->bdev &&
+ dmc->disk_dev->bdev->bd_disk &&
+ dmc->disk_dev->bdev->bd_disk->driverfs_dev) {
+ strncpy(dmc->cache_srcdisk_name,
+ dev_name(dmc->disk_dev->bdev->bd_disk->driverfs_dev),
+ DEV_PATHLEN);
+ } else {
+ dmc->cache_srcdisk_name[0] = '\0';
+ }
+}
+
+
+int
+eio_cache_create(cache_rec_short_t *cache)
+{
+ struct cache_c *dmc;
+ struct cache_c **nodepp;
+ unsigned int consecutive_blocks;
+ u_int64_t i;
+ index_t prev_set;
+ index_t cur_set;
+ sector_t order;
+ int error = -EINVAL;
+ uint32_t persistence = 0;
+ fmode_t mode = (FMODE_READ | FMODE_WRITE);
+ char *strerr = NULL;
+
+ dmc = (struct cache_c *)kzalloc(sizeof(*dmc), GFP_KERNEL);
+ if (dmc == NULL) {
+ strerr = "Failed to allocate memory for cache context";
+ error = -ENOMEM;
+ goto bad;
+ }
+
+ /*
+ * Source device.
+ */
+
+ error = eio_ttc_get_device(cache->cr_src_devname, mode, &dmc->disk_dev);
+ if (error) {
+ strerr = "get_device for source device failed";
+ goto bad1;
+ }
+ if (NULL == dmc->disk_dev) {
+ error = -EINVAL;
+ strerr = "Failed to lookup source device";
+ goto bad1;
+ }
+ if ((dmc->disk_size = to_sector(eio_get_device_size(dmc->disk_dev))) >= EIO_MAX_SECTOR) {
+ strerr = "Source device too big to support";
+ error = -EFBIG;
+ goto bad2;
+ }
+ strncpy(dmc->disk_devname, cache->cr_src_devname, DEV_PATHLEN);
+
+ /*
+ * Cache device.
+ */
+
+ error = eio_ttc_get_device(cache->cr_ssd_devname, mode, &dmc->cache_dev);
+ if (error) {
+ strerr = "get_device for cache device failed";
+ goto bad2;
+ }
+ if (NULL == dmc->cache_dev) {
+ error = -EINVAL;
+ strerr = "Failed to lookup source device";
+ goto bad2;
+ }
+ if (dmc->disk_dev == dmc->cache_dev) {
+ error = -EINVAL;
+ strerr = "Same devices specified";
+ goto bad3;
+ }
+ strncpy(dmc->cache_devname, cache->cr_ssd_devname, DEV_PATHLEN);
+
+ if (cache->cr_name[0] != '\0') {
+ strncpy(dmc->cache_name, cache->cr_name,
+ sizeof (dmc->cache_name));
+ /* make sure it is zero terminated */
+ dmc->cache_name[sizeof (dmc->cache_name) - 1] = '\x00';
+ } else {
+ strerr = "Need cache name";
+ error = -EINVAL;
+ goto bad3;
+ }
+
+
+ strncpy(dmc->ssd_uuid, cache->cr_ssd_uuid, DEV_PATHLEN - 1);
+
+ dmc->cache_dev_start_sect = eio_get_device_start_sect(dmc->cache_dev);
+ error = eio_do_preliminary_checks(dmc);
+ if (error) {
+ if (error == -EINVAL)
+ strerr = "Either Source and Cache devices belong to "
+ "same device or a cache already exists on"
+ " specified source device";
+ else if(error == -EEXIST)
+ strerr = "Cache already exists";
+ goto bad3;
+ }
+
+ eio_init_ssddev_props(dmc);
+ eio_init_srcdev_props(dmc);
+
+ /*
+ * Initialize the io callback queue.
+ */
+
+ dmc->callback_q = create_singlethread_workqueue("eio_callback");
+ if (!dmc->callback_q) {
+ error = -ENOMEM;
+ strerr = "Failed to initialize callback workqueue";
+ goto bad4;
+ }
+ error = eio_kcached_init(dmc);
+ if (error) {
+ strerr = "Failed to initialize kcached";
+ goto bad4;
+ }
+
+ /*
+ * We read policy before reading other args. The reason is that
+ * if there is a policy module loaded, we first need dmc->p_ops to be
+ * allocated so that it is non NULL. Once p_ops is !NULL, cache_blk_init
+ * and cache_set_init can set their pointers to dmc->p_ops->xxx
+ *
+ * policy_ops == NULL is not really an error. It just means that there
+ * is no registered policy and therefore we use EIO_REPL_RANDOM (random)
+ * as the replacement policy.
+ */
+
+ /* We do a kzalloc for dmc, but being extra careful here */
+ dmc->sp_cache_blk = NULL;
+ dmc->sp_cache_set = NULL;
+ dmc->policy_ops = NULL;
+ if (cache->cr_policy) {
+ dmc->req_policy = cache->cr_policy;
+ if (dmc->req_policy && (dmc->req_policy < CACHE_REPL_FIRST ||
+ dmc->req_policy > CACHE_REPL_LAST)) {
+ strerr = "Invalid cache policy";
+ error = -EINVAL;
+ goto bad5;
+ }
+ }
+
+ /*
+ * We need to determine the requested cache mode before we call
+	 * eio_md_load because it examines dmc->mode. The cache mode is
+	 * set as follows:
+	 * 1. For a "reload" operation:
+	 *    - if mode is not provided as an argument,
+	 *      it is read from the superblock.
+	 *    - if mode is provided as an argument,
+	 *      eio_md_load verifies that it is valid.
+	 * 2. For a "create" operation:
+	 *    - if mode is not provided, it is set to CACHE_MODE_DEFAULT.
+	 *    - if mode is provided, it is validated and set.
+ */
+ if (cache->cr_mode) {
+ dmc->mode = cache->cr_mode;
+ if (dmc->mode && (dmc->mode < CACHE_MODE_FIRST ||
+ dmc->mode > CACHE_MODE_LAST)) {
+ strerr = "Invalid cache mode";
+ error = -EINVAL;
+ goto bad5;
+ }
+ }
+
+ dmc->cold_boot = cache->cr_cold_boot;
+ if ((dmc->cold_boot != 0) && (dmc->cold_boot != BOOT_FLAG_COLD_ENABLE)) {
+ strerr = "Invalid cold boot option";
+ error = -EINVAL;
+ goto bad5;
+ }
+
+ if (cache->cr_persistence) {
+ persistence = cache->cr_persistence;
+ if (persistence < CACHE_RELOAD ||
+ persistence > CACHE_FORCECREATE) {
+ pr_err("ctr: persistence = %d", persistence);
+ strerr = "Invalid cache persistence";
+ error = -EINVAL;
+ goto bad5;
+ }
+ dmc->persistence = persistence;
+ }
+ if (persistence == CACHE_RELOAD) {
+ if (eio_md_load(dmc)) {
+ strerr = "Failed to reload cache";
+ error = -EINVAL;
+ goto bad5;
+ }
+
+ /*
+ * "eio_md_load" will reset "dmc->persistence" from
+ * CACHE_RELOAD to CACHE_FORCECREATE in the case of
+ * cache superblock version mismatch and cache mode
+ * is Read-Only or Write-Through.
+ */
+ if (dmc->persistence != persistence) {
+ persistence = dmc->persistence;
+ }
+ }
+
+ /*
+ * Now that we're back from "eio_md_load" in the case of a reload,
+ * we're ready to finish setting up the mode and policy.
+ */
+ if (dmc->mode == 0) {
+ dmc->mode = CACHE_MODE_DEFAULT;
+ pr_info("Setting mode to default");
+ } else {
+ pr_info("Setting mode to %s ",
+ (dmc->mode == CACHE_MODE_WB) ? "write back" :
+ ((dmc->mode == CACHE_MODE_RO) ? "read only" :
+ "write through"));
+ }
+
+ /* eio_policy_init() is already called from within eio_md_load() */
+ if (persistence != CACHE_RELOAD)
+ (void)eio_policy_init(dmc);
+
+ if (cache->cr_flags) {
+ int flags;
+ flags = cache->cr_flags;
+ if (flags == 0)
+ dmc->cache_flags &= ~CACHE_FLAGS_INVALIDATE;
+ else if (flags == 1) {
+ dmc->cache_flags |= CACHE_FLAGS_INVALIDATE;
+ pr_info("Enabling invalidate API");
+ } else
+ pr_info("Ignoring unknown flags value: %u", flags);
+ }
+
+ if (persistence == CACHE_RELOAD)
+ goto init; /* Skip reading cache parameters from command line */
+
+ if (cache->cr_blksize && cache->cr_ssd_sector_size) {
+ dmc->block_size = cache->cr_blksize / cache->cr_ssd_sector_size;
+ if (dmc->block_size & (dmc->block_size - 1)) {
+ strerr = "Invalid block size";
+ error = -EINVAL;
+ goto bad5;
+ }
+ if (dmc->block_size == 0)
+ dmc->block_size = DEFAULT_CACHE_BLKSIZE;
+ } else
+ dmc->block_size = DEFAULT_CACHE_BLKSIZE;
+ dmc->block_shift = ffs(dmc->block_size) - 1;
+ dmc->block_mask = dmc->block_size - 1;
+
+ /*
+ * dmc->size is specified in sectors here, and converted to blocks later
+ *
+	 * Prefer the cache size obtained in the kernel. We accept the
+	 * user-passed size only when the kernel cannot determine it.
+	 * User space may use a different API or round the value, so the
+	 * kernel-obtained size is preferred. If the device fails and comes
+	 * back, we rely on the size obtained in the kernel and expect it to
+	 * equal the one used when the cache was created, so we should
+	 * ideally always use the kernel size.
+ */
+ dmc->size = to_sector(eio_get_device_size(dmc->cache_dev));
+ if (dmc->size == 0) {
+ if (cache->cr_ssd_dev_size && cache->cr_ssd_sector_size) {
+ dmc->size = cache->cr_ssd_dev_size / cache->cr_ssd_sector_size;
+ }
+
+ if (dmc->size == 0) {
+ strerr = "Invalid cache size or can't be fetched";
+ error = -EINVAL;
+ goto bad5;
+ }
+ }
+
+ dmc->cache_size = dmc->size;
+
+ if (cache->cr_assoc) {
+ dmc->assoc = cache->cr_assoc;
+ if ((dmc->assoc & (dmc->assoc - 1)) ||
+ dmc->assoc > EIO_MAX_ASSOC ||
+ dmc->size < dmc->assoc) {
+ strerr = "Invalid cache associativity";
+ error = -EINVAL;
+ goto bad5;
+ }
+ if (dmc->assoc == 0)
+ dmc->assoc = DEFAULT_CACHE_ASSOC;
+ } else
+ dmc->assoc = DEFAULT_CACHE_ASSOC;
+
+ /*
+ * initialize to an invalid index
+ */
+
+ dmc->index_zero = dmc->assoc + 1;
+
+ /*
+ * Although it's very unlikely, we need to make sure that
+ * for the given associativity and block size our source
+ * device will have less than 4 billion sets.
+ */
+
+ i = to_sector(eio_get_device_size(dmc->disk_dev)) /
+ (dmc->assoc * dmc->block_size);
+ if (i >= (((u_int64_t)1) << 32)) {
+ strerr = "Too many cache sets to support";
+ goto bad5;
+ }
+
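+	/*
+	 * A cache set is a group of "assoc" consecutive blocks; record the
+	 * shift used to map a block index to its set.
+	 */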
+ consecutive_blocks = dmc->assoc;
+ dmc->consecutive_shift = ffs(consecutive_blocks) - 1;
+
+ /* Initialize persistent thresholds */
+ dmc->sysctl_active.dirty_high_threshold = DIRTY_HIGH_THRESH_DEF;
+ dmc->sysctl_active.dirty_low_threshold = DIRTY_LOW_THRESH_DEF;
+ dmc->sysctl_active.dirty_set_high_threshold = DIRTY_SET_HIGH_THRESH_DEF;
+ dmc->sysctl_active.dirty_set_low_threshold = DIRTY_SET_LOW_THRESH_DEF;
+ dmc->sysctl_active.autoclean_threshold = AUTOCLEAN_THRESH_DEF;
+ dmc->sysctl_active.time_based_clean_interval = TIME_BASED_CLEAN_INTERVAL_DEF(dmc);
+
+ spin_lock_init(&dmc->cache_spin_lock);
+ if (persistence == CACHE_CREATE) {
+ error = eio_md_create(dmc,/* force */ 0, /* cold */ 1);
+ if (error) {
+ strerr = "Failed to create cache";
+ goto bad5;
+ }
+ } else {
+ error = eio_md_create(dmc,/* force */ 1, /* cold */ 1);
+ if (error) {
+ strerr = "Failed to force create cache";
+ goto bad5;
+ }
+ }
+
+init:
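+	/* Allocate and initialize the per-set metadata (one struct cache_set per set). */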
+ order = (dmc->size >> dmc->consecutive_shift) *
+ sizeof(struct cache_set);
+
+ if (!eio_mem_available(dmc, order)) {
+ strerr = "System memory too low"
+ " for allocating cache set metadata";
+ error = -ENOMEM;
+ vfree((void *)EIO_CACHE(dmc));
+ goto bad5;
+ }
+
+ dmc->cache_sets = (struct cache_set *)vmalloc((size_t)order);
+ if (!dmc->cache_sets) {
+ strerr = "Failed to allocate memory";
+ error = -ENOMEM;
+ vfree((void *)EIO_CACHE(dmc));
+ goto bad5;
+ }
+
+ for (i = 0 ; i < (dmc->size >> dmc->consecutive_shift) ; i++) {
+ dmc->cache_sets[i].nr_dirty = 0;
+ spin_lock_init(&dmc->cache_sets[i].cs_lock);
+ init_rwsem(&dmc->cache_sets[i].rw_lock);
+ dmc->cache_sets[i].mdreq = NULL;
+ dmc->cache_sets[i].flags = 0;
+ }
+ error = eio_repl_sets_init(dmc->policy_ops);
+ if (error < 0) {
+ strerr = "Failed to allocate memory for cache policy";
+ vfree((void *)dmc->cache_sets);
+ vfree((void *)EIO_CACHE(dmc));
+ goto bad5;
+ }
+ eio_policy_lru_pushblks(dmc->policy_ops);
+
+
+ if (dmc->mode == CACHE_MODE_WB) {
+ error = eio_allocate_wb_resources(dmc);
+ if (error) {
+ vfree((void *)dmc->cache_sets);
+ vfree((void *)EIO_CACHE(dmc));
+ goto bad5;
+ }
+ }
+
+ dmc->sysctl_active.error_inject = 0;
+ dmc->sysctl_active.fast_remove = 0;
+ dmc->sysctl_active.zerostats = 0;
+ dmc->sysctl_active.do_clean = 0;
+
+ atomic_set(&dmc->clean_index, 0);
+
+ atomic64_set(&dmc->nr_ios, 0);
+
+ /*
+ * sysctl_mem_limit_pct [0 - 100]. Before doing a vmalloc()
+ * make sure that the allocation size requested is less than
+ * sysctl_mem_limit_pct percentage of the free RAM available
+ * in the system. This is to avoid OOM errors in Linux.
+ * 0 => do the vmalloc without checking system memory.
+ */
+
+ dmc->sysctl_active.mem_limit_pct = 75;
+
+ (void)wait_on_bit_lock((void *)&eio_control->synch_flags, EIO_UPDATE_LIST,
+ eio_wait_schedule, TASK_UNINTERRUPTIBLE);
+ dmc->next_cache = cache_list_head;
+ cache_list_head = dmc;
+ clear_bit(EIO_UPDATE_LIST,(void *)&eio_control->synch_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit((void *)&eio_control->synch_flags, EIO_UPDATE_LIST);
+
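+	/*
+	 * Rebuild the cached/dirty block counters and the dirty set LRU from
+	 * the loaded cache block states.
+	 */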
+ prev_set = -1;
+ for (i = 0 ; i < dmc->size ; i++) {
+ if (EIO_CACHE_STATE_GET(dmc, i) & VALID)
+ atomic64_inc(&dmc->eio_stats.cached_blocks);
+ if (EIO_CACHE_STATE_GET(dmc, i) & DIRTY) {
+ dmc->cache_sets[i / dmc->assoc].nr_dirty++;
+ atomic64_inc(&dmc->nr_dirty);
+ cur_set = i / dmc->assoc;
+ if (prev_set != cur_set) {
+ /* Move the given set at the head of the set LRU list */
+ eio_touch_set_lru(dmc, cur_set);
+ prev_set = cur_set;
+ }
+ }
+ }
+
+ INIT_WORK(&dmc->readfill_wq, eio_do_readfill);
+
+ /*
+ * invalid index, but signifies cache successfully built
+ */
+
+ dmc->index_zero = dmc->assoc;
+
+ eio_procfs_ctr(dmc);
+
+ /*
+ * Activate Application Transparent Caching.
+ */
+
+ error = eio_ttc_activate(dmc);
+ if (error) {
+ goto bad6;
+ }
+
+ /*
+ * In future if anyone adds code here and something fails,
+ * do call eio_ttc_deactivate(dmc) as part of cleanup.
+ */
+
+ return 0;
+
+bad6:
+ eio_procfs_dtr(dmc);
+ if (dmc->mode == CACHE_MODE_WB) {
+ eio_stop_async_tasks(dmc);
+ eio_free_wb_resources(dmc);
+ }
+ vfree((void *)dmc->cache_sets);
+ vfree((void *)EIO_CACHE(dmc));
+
+ (void)wait_on_bit_lock((void *)&eio_control->synch_flags, EIO_UPDATE_LIST,
+ eio_wait_schedule, TASK_UNINTERRUPTIBLE);
+ nodepp = &cache_list_head;
+ while (*nodepp != NULL) {
+ if (*nodepp == dmc) {
+ *nodepp = dmc->next_cache;
+ break;
+ }
+ nodepp = &((*nodepp)->next_cache);
+ }
+ clear_bit(EIO_UPDATE_LIST, (void *)&eio_control->synch_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit((void *)&eio_control->synch_flags, EIO_UPDATE_LIST);
+bad5:
+ eio_kcached_client_destroy(dmc);
+bad4:
+bad3:
+ eio_put_cache_device(dmc);
+bad2:
+ eio_ttc_put_device(&dmc->disk_dev);
+bad1:
+ eio_policy_free(dmc);
+ kfree(dmc);
+bad:
+ if (strerr)
+ pr_err("Cache creation failed: %s.\n", strerr);
+ return error;
+}
+
+/*
+ * Destroy the cache mapping.
+ */
+
+int
+eio_cache_delete(char *cache_name, int do_delete)
+{
+ struct cache_c *dmc;
+ struct cache_c **nodepp;
+ int ret, error;
+ int restart_async_task;
+
+ ret = 0;
+ restart_async_task = 0;
+
+ dmc = eio_cache_lookup(cache_name);
+ if (NULL == dmc) {
+ pr_err("cache delete: cache \"%s\" doesn't exist.", cache_name);
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ if (dmc->cache_flags & CACHE_FLAGS_SHUTDOWN_INPROG) {
+ pr_err("cache_delete: system shutdown in progress, cannot "
+ "delete cache %s", cache_name);
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ return -EINVAL;
+ }
+ if (dmc->cache_flags & CACHE_FLAGS_MOD_INPROG) {
+ pr_err("cache_delete: simultaneous edit/delete operation on cache"
+ " %s is not permitted", cache_name);
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ return -EINVAL;
+ }
+ dmc->cache_flags |= CACHE_FLAGS_MOD_INPROG;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+
+ /*
+ * Earlier attempt to delete failed.
+ * Allow force deletes only for FAILED caches.
+ */
+ if (unlikely(CACHE_STALE_IS_SET(dmc))) {
+ if (likely(CACHE_FAILED_IS_SET(dmc))) {
+ pr_err("cache_delete: Cache \"%s\" is in STALE state. Force deleting!!!",
+ dmc->cache_name);
+ goto force_delete;
+ } else {
+ if (atomic64_read(&dmc->nr_dirty) != 0) {
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->cache_flags &= ~CACHE_FLAGS_MOD_INPROG;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ pr_err("cache_delete: Stale Cache detected with dirty blocks=%ld.\n",
+ atomic64_read(&dmc->nr_dirty));
+ pr_err("cache_delete: Cache \"%s\" wont be deleted. Deleting will result in data corruption.\n",
+ dmc->cache_name);
+ return -EINVAL;
+ }
+ }
+ }
+
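+	/* Quiesce the cache: stop the clean thread and any scheduled time based cleaning. */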
+ eio_stop_async_tasks(dmc);
+
+ /*
+ * Deactivate Application Transparent Caching.
+	 * For a wb cache, finish_nr_dirty may take a long time. A normal
+	 * cache delete must succeed only after finish_nr_dirty has
+	 * completely finished.
+ */
+
+ if (eio_ttc_deactivate(dmc, 0)) {
+
+ /* If deactivate fails; only option is to delete cache. */
+ pr_err("cache_delete: Failed to deactivate the cache \"%s\".",
+ dmc->cache_name);
+ if (CACHE_FAILED_IS_SET(dmc))
+ pr_err("cache_delete: Use -f option to delete the cache \"%s\".",
+ dmc->cache_name);
+ ret = -EPERM;
+ dmc->cache_flags |= CACHE_FLAGS_STALE;
+
+ /* Restart async tasks. */
+ restart_async_task = 1;
+ goto out;
+ }
+
+ if (!CACHE_FAILED_IS_SET(dmc))
+ VERIFY(dmc->sysctl_active.fast_remove || (atomic64_read(&dmc->nr_dirty) == 0));
+
+ /*
+ * If ttc_deactivate succeeded... proceed with cache delete.
+ * Dont entertain device failure hereafter.
+ */
+ if (unlikely(CACHE_FAILED_IS_SET(dmc)) ||
+ unlikely(CACHE_DEGRADED_IS_SET(dmc))) {
+ pr_err("cache_delete: Cannot update metadata of cache \"%s\" in failed/degraded mode.",
+ dmc->cache_name);
+ } else {
+ eio_md_store(dmc);
+ }
+
+force_delete:
+ eio_procfs_dtr(dmc);
+
+ if (CACHE_STALE_IS_SET(dmc)) {
+ pr_info("Force deleting cache \"%s\"!!!.", dmc->cache_name);
+ eio_ttc_deactivate(dmc, 1);
+ }
+
+ eio_free_wb_resources(dmc);
+ vfree((void *)EIO_CACHE(dmc));
+ vfree((void *)dmc->cache_sets);
+ eio_ttc_put_device(&dmc->disk_dev);
+ eio_put_cache_device(dmc);
+ (void)wait_on_bit_lock((void *)&eio_control->synch_flags, EIO_UPDATE_LIST,
+ eio_wait_schedule, TASK_UNINTERRUPTIBLE);
+ nodepp = &cache_list_head;
+ while (*nodepp != NULL) {
+ if (*nodepp == dmc) {
+ *nodepp = dmc->next_cache;
+ break;
+ }
+ nodepp = &((*nodepp)->next_cache);
+ }
+ clear_bit(EIO_UPDATE_LIST, &eio_control->synch_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit((void *)&eio_control->synch_flags, EIO_UPDATE_LIST);
+
+out:
+ if (restart_async_task) {
+ VERIFY(dmc->clean_thread == NULL);
+ error = eio_start_clean_thread(dmc);
+ if (error)
+ pr_err("cache_delete: Failed to restart async tasks. error=%d\n", error);
+ }
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->cache_flags &= ~CACHE_FLAGS_MOD_INPROG;
+ if (!ret) {
+ dmc->cache_flags |= CACHE_FLAGS_DELETED;
+ }
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+
+ if (!ret) {
+ eio_policy_free(dmc);
+
+ /*
+ * We don't need synchronisation since at this point the dmc is
+		 * no longer accessible via lookup.
+ */
+
+ if (!(dmc->cache_flags & CACHE_FLAGS_SHUTDOWN_INPROG)) {
+ kfree(dmc);
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Reconstruct a degraded cache after the SSD is added.
+ * This function mimics the constructor eio_ctr() except
+ * for code that does not require re-initialization.
+ */
+int
+eio_ctr_ssd_add(struct cache_c *dmc, char *dev)
+{
+ int r = 0;
+ struct eio_bdev *prev_cache_dev;
+ u_int32_t prev_persistence = dmc->persistence;
+ fmode_t mode = (FMODE_READ | FMODE_WRITE);
+
+ /* verify if source device is present */
+ VERIFY(dmc->eio_errors.no_source_dev == 0);
+
+ /* mimic relevant portions from eio_ctr() */
+
+ prev_cache_dev = dmc->cache_dev;
+ r = eio_ttc_get_device(dev, mode, &dmc->cache_dev);
+ if (r) {
+ dmc->cache_dev = prev_cache_dev;
+ pr_err("ctr_ssd_add: Failed to lookup cache device %s", dev);
+ return -EINVAL;
+ }
+ /*
+ * For Linux, we have to put the old SSD device now because
+ * we did not do so during SSD removal.
+ */
+ eio_ttc_put_device(&prev_cache_dev);
+
+ /* sanity check */
+ if (dmc->cache_size != to_sector(eio_get_device_size(dmc->cache_dev))) {
+ pr_err("ctr_ssd_add: Cache device size has changed, expected (%lu) found (%lu) \
+ continuing in degraded mode", dmc->cache_size, \
+ to_sector(eio_get_device_size(dmc->cache_dev)));
+ r = -EINVAL;
+ goto out;
+ }
+
+ /* sanity check for cache device start sector */
+ if (dmc->cache_dev_start_sect != eio_get_device_start_sect(dmc->cache_dev)) {
+ pr_err("ctr_ssd_add: Cache device starting sector changed, \
+ expected (%lu) found (%lu) continuing in \
+ degraded mode", dmc->cache_dev_start_sect, \
+ eio_get_device_start_sect(dmc->cache_dev));
+ r = -EINVAL;
+ goto out;
+ }
+
+ strncpy(dmc->cache_devname, dev, DEV_PATHLEN);
+ eio_init_ssddev_props(dmc);
+ dmc->size = dmc->cache_size; /* dmc->size will be recalculated in eio_md_create() */
+
+ /*
+ * In case of writeback mode, trust the content of SSD and reload the MD.
+ */
+ dmc->persistence = CACHE_FORCECREATE;
+
+ eio_policy_free(dmc);
+ (void)eio_policy_init(dmc);
+
+ r = eio_md_create(dmc, /* force */1,/* cold */ (dmc->mode != CACHE_MODE_WB));
+ if (r) {
+ pr_err("ctr_ssd_add: Failed to create md, continuing in degraded mode");
+ goto out;
+ }
+
+ r = eio_repl_sets_init(dmc->policy_ops);
+ if (r < 0) {
+ pr_err("ctr_ssd_add: Failed to allocate memory for cache policy");
+ goto out;
+ }
+ eio_policy_lru_pushblks(dmc->policy_ops);
+ if (dmc->mode != CACHE_MODE_WB) {
+ /* Cold cache will reset the stats */
+ memset(&dmc->eio_stats, 0, sizeof(dmc->eio_stats));
+ }
+
+ return 0;
+out:
+ dmc->persistence = prev_persistence;
+
+ return r;
+}
+
+/*
+ * Stop the async tasks for a cache(threads, scheduled works).
+ * Used during the cache remove
+ */
+void
+eio_stop_async_tasks(struct cache_c *dmc)
+{
+ unsigned long flags = 0;
+
+ if (dmc->clean_thread) {
+ dmc->sysctl_active.fast_remove = 1;
+ spin_lock_irqsave(&dmc->clean_sl, flags);
+ EIO_SET_EVENT_AND_UNLOCK(&dmc->clean_event, &dmc->clean_sl,
+ flags);
+ eio_wait_thread_exit(dmc->clean_thread, &dmc->clean_thread_running);
+ EIO_CLEAR_EVENT(&dmc->clean_event);
+ dmc->clean_thread = NULL;
+ }
+
+ dmc->sysctl_active.fast_remove = CACHE_FAST_REMOVE_IS_SET(dmc) ? 1 : 0;
+
+ if (dmc->mode == CACHE_MODE_WB) {
+ /*
+ * Prevent new I/Os to schedule the time based cleaning.
+ * Cancel existing delayed work
+ */
+ dmc->sysctl_active.time_based_clean_interval = 0;
+ cancel_delayed_work_sync(&dmc->clean_aged_sets_work);
+ }
+}
+
+
+
+int
+eio_start_clean_thread(struct cache_c *dmc)
+{
+ VERIFY(dmc->clean_thread == NULL);
+ VERIFY(dmc->mode == CACHE_MODE_WB);
+ VERIFY(dmc->clean_thread_running == 0);
+ VERIFY(!(dmc->sysctl_active.do_clean & EIO_CLEAN_START));
+
+ dmc->clean_thread = eio_create_thread(eio_clean_thread_proc,
+ (void *)dmc, "eio_clean_thread");
+ if (!dmc->clean_thread) {
+ return -EFAULT;
+ }
+ return 0;
+}
+
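+/*
+ * Allocate the per-cache writeback resources: data bio_vecs and metadata
+ * pages used for cleaning, the dirty set lru, the clean thread and the
+ * mdupdate workqueue.
+ */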
+int
+eio_allocate_wb_resources(struct cache_c *dmc)
+{
+ int nr_bvecs, nr_pages;
+ unsigned iosize;
+ int ret;
+
+ VERIFY(dmc->clean_dbvecs == NULL);
+ VERIFY(dmc->clean_mdpages == NULL);
+ VERIFY(dmc->dbvec_count == 0);
+ VERIFY(dmc->mdpage_count == 0);
+
+ /* Data page allocations are done in terms of "bio_vec" structures */
+ iosize = (dmc->block_size * dmc->assoc) << SECTOR_SHIFT;
+ nr_bvecs = IO_BVEC_COUNT(iosize, dmc->block_size);
+ dmc->clean_dbvecs = (struct bio_vec *)kmalloc(sizeof(struct bio_vec) * nr_bvecs, GFP_KERNEL);
+ if (dmc->clean_dbvecs == NULL) {
+ pr_err("cache_create: Failed to allocated memory.\n");
+ ret = -ENOMEM;
+ goto errout;
+ }
+ /* Allocate pages for each bio_vec */
+ ret = eio_alloc_wb_bvecs(dmc->clean_dbvecs, nr_bvecs, dmc->block_size);
+ if (ret) {
+ goto errout;
+ }
+ VERIFY(dmc->clean_dbvecs != NULL);
+ dmc->dbvec_count = nr_bvecs;
+
+ /* Metadata page allocations are done in terms of pages only */
+ iosize = dmc->assoc * sizeof(struct flash_cacheblock);
+ nr_pages = IO_PAGE_COUNT(iosize);
+ dmc->clean_mdpages = (struct page **)kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL);
+ if (dmc->clean_mdpages == NULL) {
+ pr_err("cache_create: Failed to allocated memory.\n");
+ ret = -ENOMEM;
+ eio_free_wb_bvecs(dmc->clean_dbvecs, dmc->dbvec_count, dmc->block_size);
+ goto errout;
+ }
+ ret = eio_alloc_wb_pages(dmc->clean_mdpages, nr_pages);
+ if (ret) {
+ eio_free_wb_bvecs(dmc->clean_dbvecs, dmc->dbvec_count, dmc->block_size);
+ goto errout;
+ }
+ VERIFY(dmc->clean_mdpages != NULL);
+ dmc->mdpage_count = nr_pages;
+
+ /*
+ * For writeback cache:
+ * 1. Initialize the time based clean work queue
+ * 2. Initialize the dirty set lru
+ * 3. Initialize clean thread
+ */
+
+ /*
+ * Reset dmc->is_clean_aged_sets_sched.
+ * Time based clean will be enabled in eio_touch_set_lru()
+ * only when dmc->is_clean_aged_sets_sched is zero and
+ * dmc->sysctl_active.time_based_clean_interval > 0.
+ */
+
+ dmc->is_clean_aged_sets_sched = 0;
+ INIT_DELAYED_WORK(&dmc->clean_aged_sets_work, eio_clean_aged_sets);
+ dmc->dirty_set_lru = NULL;
+ ret = lru_init(&dmc->dirty_set_lru, (dmc->size >> dmc->consecutive_shift));
+ if (ret == 0) {
+ spin_lock_init(&dmc->dirty_set_lru_lock);
+ ret = eio_clean_thread_init(dmc);
+ }
+ VERIFY(dmc->mdupdate_q == NULL);
+ dmc->mdupdate_q = create_singlethread_workqueue("eio_mdupdate");
+ if (!dmc->mdupdate_q) {
+ ret = -ENOMEM;
+ }
+
+ if (ret < 0) {
+ pr_err("cache_create: Failed to initialize dirty lru set or"
+ "clean/mdupdate thread for wb cache.\n");
+ if (dmc->dirty_set_lru) {
+ lru_uninit(dmc->dirty_set_lru);
+ dmc->dirty_set_lru = NULL;
+ }
+
+ eio_free_wb_pages(dmc->clean_mdpages, dmc->mdpage_count);
+ eio_free_wb_bvecs(dmc->clean_dbvecs, dmc->dbvec_count, dmc->block_size);
+ goto errout;
+ }
+
+ goto out;
+
+errout:
+ if (dmc->clean_mdpages) {
+ kfree(dmc->clean_mdpages);
+ dmc->clean_mdpages = NULL;
+ dmc->mdpage_count = 0;
+ }
+ if (dmc->clean_dbvecs) {
+ kfree(dmc->clean_dbvecs);
+ dmc->clean_dbvecs = NULL;
+ dmc->dbvec_count = 0;
+ }
+
+out:
+ return ret;
+}
+
+void
+eio_free_wb_resources(struct cache_c *dmc)
+{
+
+ if (dmc->mdupdate_q) {
+ flush_workqueue(dmc->mdupdate_q);
+ destroy_workqueue(dmc->mdupdate_q);
+ dmc->mdupdate_q = NULL;
+ }
+ if (dmc->dirty_set_lru) {
+ lru_uninit(dmc->dirty_set_lru);
+ dmc->dirty_set_lru = NULL;
+ }
+ if (dmc->clean_mdpages) {
+ eio_free_wb_pages(dmc->clean_mdpages, dmc->mdpage_count);
+ kfree(dmc->clean_mdpages);
+ dmc->clean_mdpages = NULL;
+ }
+ if (dmc->clean_dbvecs) {
+ eio_free_wb_bvecs(dmc->clean_dbvecs, dmc->dbvec_count, dmc->block_size);
+ kfree(dmc->clean_dbvecs);
+ dmc->clean_dbvecs = NULL;
+ }
+
+ dmc->dbvec_count = dmc->mdpage_count = 0;
+ return;
+}
+
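+/*
+ * Reboot notifier: optionally drain dirty blocks (when cold boot is
+ * configured) and persist the cache metadata before the system goes down.
+ */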
+static int
+eio_notify_reboot(struct notifier_block *this,
+ unsigned long code, void *x)
+{
+ struct cache_c *dmc;
+
+
+ if (eio_reboot_notified == EIO_REBOOT_HANDLING_DONE) {
+ return NOTIFY_DONE;
+ }
+
+ (void)wait_on_bit_lock((void *)&eio_control->synch_flags, EIO_HANDLE_REBOOT,
+ eio_wait_schedule, TASK_UNINTERRUPTIBLE);
+ if (eio_reboot_notified == EIO_REBOOT_HANDLING_DONE) {
+ clear_bit(EIO_HANDLE_REBOOT, (void *)&eio_control->synch_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit((void *)&eio_control->synch_flags, EIO_HANDLE_REBOOT);
+ return NOTIFY_DONE;
+ }
+ VERIFY(eio_reboot_notified == 0);
+ eio_reboot_notified = EIO_REBOOT_HANDLING_INPROG;
+
+ (void)wait_on_bit_lock((void *)&eio_control->synch_flags, EIO_UPDATE_LIST,
+ eio_wait_schedule, TASK_UNINTERRUPTIBLE);
+ for (dmc = cache_list_head; dmc != NULL; dmc = dmc->next_cache) {
+ if (unlikely(CACHE_FAILED_IS_SET(dmc)) || unlikely(CACHE_DEGRADED_IS_SET(dmc))) {
+ pr_err("notify_reboot: Cannot sync in failed / degraded mode");
+ continue;
+ }
+ if (dmc->cold_boot && atomic64_read(&dmc->nr_dirty) && !eio_force_warm_boot) {
+ pr_info("Cold boot set for cache %s: Draining dirty blocks: %ld",
+ dmc->cache_name, atomic64_read(&dmc->nr_dirty));
+ eio_clean_for_reboot(dmc);
+ }
+ eio_md_store(dmc);
+ }
+ clear_bit(EIO_UPDATE_LIST, (void *)&eio_control->synch_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit((void *)&eio_control->synch_flags, EIO_UPDATE_LIST);
+
+ eio_reboot_notified = EIO_REBOOT_HANDLING_DONE;
+ clear_bit(EIO_HANDLE_REBOOT, (void *)&eio_control->synch_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit((void *)&eio_control->synch_flags, EIO_HANDLE_REBOOT);
+ return NOTIFY_DONE;
+}
+
+
+/*
+ * The SSD add/remove is handled using udev from the user space. The driver
+ * is notified from the user space via dmsetup message. Both device addition
+ * and removal events are handled in the driver by eio_handle_message().
+ *
+ * The device remove has a special case. The time from when the device is removed
+ * until the driver gets notified from user space could be a few msec or a couple
+ * of seconds. During this time, any IO to the SSD fails. While this
+ * is handled gracefully, the logs can get filled with IO error messages.
+ *
+ * In order to cover that gap, we handle the device removal within the kernel
+ * using this function. Note that using the scsi notifier function in the kernel
+ * (vs. receiving the message from user space) minimizes the notification delay
+ * from the time the SSD is removed until the driver is notified. This cannot,
+ * however, make this delay zero. Therefore, there will be a small window during
+ * which eio_io_callback() may fail on CACHEWRITE action.
+ *
+ * We still need the user space (udev) method of handling for the following
+ * reasons:
+ * (i) This notifier is only for a scsi device.
+ * (ii) The add/remove feature in user space can also be used to dynamically
+ * turn the cache on and off.
+ *
+ * This notifier is used only when SSD is removed. The add event can
+ * be caught using the BUS_NOTIFY_ADD_DEVICE in action. However, we only
+ * get a scsi handle and do not have a reference to our device pointer.
+ */
+static int
+eio_notify_ssd_rm(struct notifier_block *nb, unsigned long action, void *data)
+{
+ struct device *dev = data;
+ struct cache_c *dmc;
+ const char *device_name;
+ size_t len;
+ unsigned long int flags = 0;
+ struct ssd_rm_list *ssd_list_ptr;
+ unsigned check_src = 0, check_ssd = 0;
+ dev_notifier_t notify = NOTIFY_INITIALIZER;
+
+
+ if (likely(action != BUS_NOTIFY_DEL_DEVICE))
+ return 0;
+
+ if (unlikely(dev == NULL)) {
+ pr_info("notify_cache_dev: device is NULL!");
+ return 0;
+ }
+
+ if (!scsi_is_sdev_device(dev))
+ return 0;
+
+ if ((device_name = dev_name(dev)) == NULL)
+ return 0;
+ len = strlen(device_name);
+
+ /* push to a list for future processing as we could be in an interrupt context */
+ for (dmc = cache_list_head; dmc != NULL; dmc = dmc->next_cache) {
+ notify = NOTIFY_INITIALIZER;
+ check_src = ('\0' == dmc->cache_srcdisk_name[0] ? 0 : 1);
+ check_ssd = ('\0' == dmc->cache_gendisk_name[0] ? 0 : 1);
+
+ if (check_src == 0 && check_ssd == 0)
+ continue;
+
+ /*Check if source dev name or ssd dev name is available or not. */
+ if (check_ssd && 0 == strncmp(device_name, dmc->cache_gendisk_name, len)) {
+ pr_info("SSD Removed for cache name %s", dmc->cache_name);
+ notify = NOTIFY_SSD_REMOVED;
+ }
+
+ if (check_src && 0 == strncmp(device_name, dmc->cache_srcdisk_name, len)) {
+ pr_info("SRC Removed for cache name %s", dmc->cache_name);
+ notify = NOTIFY_SRC_REMOVED;
+ }
+
+ if (notify == NOTIFY_INITIALIZER)
+ continue;
+
+ ssd_list_ptr = kmalloc(sizeof (struct ssd_rm_list), GFP_ATOMIC);
+ if (unlikely(ssd_list_ptr == NULL)) {
+ pr_err("Cannot allocate memory for ssd_rm_list");
+ return -ENOMEM;
+ }
+ ssd_list_ptr->dmc = dmc;
+ ssd_list_ptr->action = action;
+ ssd_list_ptr->devt = dev->devt;
+ ssd_list_ptr->note = notify;
+ spin_lock_irqsave(&ssd_rm_list_lock, flags);
+ list_add_tail(&ssd_list_ptr->list, &ssd_rm_list);
+ ssd_rm_list_not_empty = 1;
+ spin_unlock_irqrestore(&ssd_rm_list_lock, flags);
+ }
+
+ spin_lock_irqsave(&ssd_rm_list_lock, flags);
+ if (ssd_rm_list_not_empty) {
+ spin_unlock_irqrestore(&ssd_rm_list_lock, flags);
+ schedule_work(&_kcached_wq);
+ } else {
+ spin_unlock_irqrestore(&ssd_rm_list_lock, flags);
+ }
+
+ return 0;
+}
+
+/*
+ * Initiate a cache target.
+ */
+static int __init
+eio_init(void)
+{
+ int r;
+ extern struct bus_type scsi_bus_type;
+
+
+ if (sizeof (sector_t) != 8 || sizeof (index_t) != 8) {
+ pr_err("init: EnhanceIO runs only in 64-bit architectures");
+ return -EPERM;
+ }
+
+ eio_ttc_init();
+ r = eio_create_misc_device();
+ if (r) {
+ return r;
+ }
+
+ r = eio_jobs_init();
+ if (r) {
+ (void)eio_delete_misc_device();
+ return r;
+ }
+ atomic_set(&nr_cache_jobs, 0);
+ INIT_WORK(&_kcached_wq, eio_do_work);
+
+ eio_module_procfs_init();
+ eio_control = kmalloc(sizeof *eio_control, GFP_KERNEL);
+ if (eio_control == NULL) {
+ pr_err("init: Cannot allocate memory for eio_control");
+ (void)eio_delete_misc_device();
+ return -ENOMEM;
+ }
+ eio_control->synch_flags = 0;
+
+ register_reboot_notifier(&eio_reboot_notifier);
+ r = bus_register_notifier(&scsi_bus_type, &eio_ssd_rm_notifier);
+ if (r) {
+ pr_err("init: bus register notifier failed %d", r);
+ (void)eio_delete_misc_device();
+ }
+ return r;
+}
+
+
+/*
+ * Destroy a cache target.
+ */
+static void
+eio_exit(void)
+{
+ int r;
+ extern struct bus_type scsi_bus_type;
+
+
+ unregister_reboot_notifier(&eio_reboot_notifier);
+ r = bus_unregister_notifier(&scsi_bus_type, &eio_ssd_rm_notifier);
+ if (r)
+ pr_err("exit: Bus unregister notifier failed %d", r);
+
+ eio_jobs_exit();
+ eio_module_procfs_exit();
+ if (eio_control) {
+ eio_control->synch_flags = 0;
+ kfree(eio_control);
+ eio_control = NULL;
+ }
+ (void)eio_delete_misc_device();
+}
+
+
+/*
+ * eio_get_device_size
+ */
+sector_t
+eio_get_device_size(struct eio_bdev *dev)
+{
+
+ return dev->bdev->bd_inode->i_size;
+}
+
+/*
+ * To get starting sector of the device
+ */
+sector_t
+eio_get_device_start_sect(struct eio_bdev *dev)
+{
+
+ if (dev == NULL || dev->bdev == NULL || dev->bdev->bd_part == NULL)
+ return 0;
+
+ return dev->bdev->bd_part->start_sect;
+}
+
+module_init(eio_init);
+module_exit(eio_exit);
+
+MODULE_DESCRIPTION(DM_NAME " STEC EnhanceIO target");
+MODULE_AUTHOR("STEC, Inc. based on code by Facebook");
+
+MODULE_LICENSE("GPL");
+
new file mode 100644
@@ -0,0 +1,265 @@
+/*
+ * eio_fifo.c
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ * Made EnhanceIO specific changes.
+ * Saied Kazemi <skazemi@stec-inc.com>
+ * Siddharth Choudhuri <schoudhuri@stec-inc.com>
+ *
+ * Copyright 2010 Facebook, Inc.
+ * Author: Mohan Srinivasan (mohan@facebook.com)
+ *
+ * Based on DM-Cache:
+ * Copyright (C) International Business Machines Corp., 2006
+ * Author: Ming Zhao (mingzhao@ufl.edu)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "eio.h"
+/* Generic policy functions prototypes */
+int eio_fifo_init(struct cache_c *);
+void eio_fifo_exit(void);
+int eio_fifo_cache_sets_init(struct eio_policy *);
+int eio_fifo_cache_blk_init(struct eio_policy *);
+void eio_fifo_find_reclaim_dbn(struct eio_policy *, index_t, index_t *);
+int eio_fifo_clean_set(struct eio_policy *, index_t, int);
+/* Per policy instance initialization */
+struct eio_policy *eio_fifo_instance_init(void);
+
+
+/* Per cache set data structure */
+struct eio_fifo_cache_set {
+ index_t set_fifo_next;
+ index_t set_clean_next;
+};
+
+
+/*
+ * Context that captures the FIFO replacement policy
+ */
+static struct eio_policy_header eio_fifo_ops = {
+ .sph_name = CACHE_REPL_FIFO,
+ .sph_instance_init = eio_fifo_instance_init,
+
+};
+
+
+/*
+ * Initialize FIFO policy.
+ */
+int
+eio_fifo_init(struct cache_c *dmc)
+{
+
+ return 0;
+}
+
+
+/*
+ * Initialize FIFO data structure called from ctr.
+ */
+int
+eio_fifo_cache_sets_init(struct eio_policy *p_ops)
+{
+ int i;
+ sector_t order;
+ struct cache_c *dmc = p_ops->sp_dmc;
+ struct eio_fifo_cache_set *cache_sets;
+
+
+ pr_info("Initializing fifo cache sets\n");
+ order = (dmc->size >> dmc->consecutive_shift) * sizeof (struct eio_fifo_cache_set);
+
+ dmc->sp_cache_set = (struct eio_fifo_cache_set *)vmalloc((size_t) order);
+ if (dmc->sp_cache_set == NULL)
+ return -ENOMEM;
+
+ cache_sets = (struct eio_fifo_cache_set *)dmc->sp_cache_set;
+
+ for (i = 0; i < (int)(dmc->size >> dmc->consecutive_shift); i++) {
+ cache_sets[i].set_fifo_next = i * dmc->assoc;
+ cache_sets[i].set_clean_next = i * dmc->assoc;
+ }
+
+ return 0;
+}
+
+
+/*
+ * The actual function that returns a victim block in index.
+ */
+void
+eio_fifo_find_reclaim_dbn(struct eio_policy *p_ops, index_t start_index, index_t *index)
+{
+ index_t end_index;
+ int slots_searched = 0;
+ index_t i;
+ index_t set;
+ struct eio_fifo_cache_set *cache_sets;
+ struct cache_c *dmc = p_ops->sp_dmc;
+
+
+ set = start_index / dmc->assoc;
+ end_index = start_index + dmc->assoc;
+ cache_sets = (struct eio_fifo_cache_set *)dmc->sp_cache_set;
+
+ i = cache_sets[set].set_fifo_next;
+ while (slots_searched < (int) dmc->assoc) {
+ VERIFY(i >= start_index);
+ VERIFY(i < end_index);
+ if (EIO_CACHE_STATE_GET(dmc, i) == VALID) {
+ *index = i;
+ break;
+ }
+ slots_searched++;
+ i++;
+ if (i == end_index)
+ i = start_index;
+ }
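+ /* Remember the slot after the last one examined as the next FIFO start */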
+ i++;
+ if (i == end_index)
+ i = start_index;
+ cache_sets[set].set_fifo_next = i;
+}
+
+
+/*
+ * Go through the entire set and clean.
+ */
+int
+eio_fifo_clean_set(struct eio_policy *p_ops, index_t set, int to_clean)
+{
+ index_t i;
+ int scanned = 0, nr_writes = 0;
+ index_t start_index;
+ index_t end_index;
+ struct eio_fifo_cache_set *cache_sets;
+ struct cache_c *dmc;
+
+
+ dmc = p_ops->sp_dmc;
+ cache_sets = (struct eio_fifo_cache_set *)dmc->sp_cache_set;
+ start_index = set * dmc->assoc;
+ end_index = start_index + dmc->assoc;
+ i = cache_sets[set].set_clean_next;
+
+ while ((scanned < (int)dmc->assoc) && (nr_writes < to_clean)) {
+ if ((EIO_CACHE_STATE_GET(dmc, i) & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) {
+ EIO_CACHE_STATE_ON(dmc, i, DISKWRITEINPROG);
+ nr_writes++;
+ }
+ scanned++;
+ i++;
+ if (i == end_index)
+ i = start_index;
+ }
+ cache_sets[set].set_clean_next = i;
+
+ return nr_writes;
+}
+
+
+/*
+ * FIFO is per set, so do nothing on a per block init.
+ */
+int
+eio_fifo_cache_blk_init(struct eio_policy *p_ops)
+{
+
+ return 0;
+}
+
+
+/*
+ * Allocate a new instance of eio_policy per dmc
+ */
+struct eio_policy *
+eio_fifo_instance_init(void)
+{
+ struct eio_policy *new_instance;
+
+
+ new_instance = (struct eio_policy *)vmalloc(sizeof (struct eio_policy));
+ if (new_instance == NULL) {
+ pr_err("eio_fifo_instance_init: vmalloc failed");
+ return NULL;
+ }
+
+ /* Initialize the FIFO specific functions and variables */
+ new_instance->sp_name = CACHE_REPL_FIFO;
+ new_instance->sp_policy.lru = NULL;
+ new_instance->sp_repl_init = eio_fifo_init;
+ new_instance->sp_repl_exit = eio_fifo_exit;
+ new_instance->sp_repl_sets_init = eio_fifo_cache_sets_init;
+ new_instance->sp_repl_blk_init = eio_fifo_cache_blk_init;
+ new_instance->sp_find_reclaim_dbn = eio_fifo_find_reclaim_dbn;
+ new_instance->sp_clean_set = eio_fifo_clean_set;
+ new_instance->sp_dmc = NULL;
+
+ try_module_get(THIS_MODULE);
+
+ pr_info("eio_fifo_instance_init: created new instance of FIFO");
+
+ return new_instance;
+}
+
+
+/*
+ * Cleanup an instance of eio_policy (called from dtr).
+ */
+void
+eio_fifo_exit(void)
+{
+
+ module_put(THIS_MODULE);
+}
+
+
+static
+int __init
+fifo_register(void)
+{
+ int ret;
+
+
+ ret = eio_register_policy(&eio_fifo_ops);
+ if (ret != 0)
+ pr_info("eio_fifo already registered");
+
+ return ret;
+}
+
+
+static
+void __exit
+fifo_unregister(void)
+{
+ int ret;
+
+
+ ret = eio_unregister_policy(&eio_fifo_ops);
+ if (ret != 0)
+ pr_err("eio_fifo unregister failed");
+}
+
+module_init(fifo_register);
+module_exit(fifo_unregister);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("FIFO policy for EnhanceIO");
+MODULE_AUTHOR("STEC, Inc. based on code by Facebook");
new file mode 100644
@@ -0,0 +1,160 @@
+/*
+ * eio_ioctl.c
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ * Made EnhanceIO specific changes.
+ * Saied Kazemi <skazemi@stec-inc.com>
+ * Siddharth Choudhuri <schoudhuri@stec-inc.com>
+ *
+ * Copyright 2010 Facebook, Inc.
+ * Author: Mohan Srinivasan (mohan@facebook.com)
+ *
+ * Based on DM-Cache:
+ * Copyright (C) International Business Machines Corp., 2006
+ * Author: Ming Zhao (mingzhao@ufl.edu)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "eio.h"
+#include "eio_ttc.h"
+
+long
+eio_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
+{
+ int error = 0;
+ cache_rec_short_t *cache;
+ uint64_t ncaches;
+ dev_notifier_t note;
+ int do_delete = 0;
+
+
+ switch (cmd) {
+ case EIO_IOC_CREATE:
+ case EIO_IOC_ENABLE:
+
+ cache = vmalloc(sizeof (cache_rec_short_t));
+ if (!cache) {
+ return -ENOMEM;
+ }
+ if (copy_from_user(cache, (void __user *)arg,
+ sizeof (cache_rec_short_t))) {
+ vfree(cache);
+ return -EFAULT;
+ }
+ error = eio_cache_create(cache);
+ vfree(cache);
+ break;
+
+ case EIO_IOC_DELETE:
+ do_delete = 1;
+
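+ /* fall through - delete shares the disable path, with do_delete set */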
+ case EIO_IOC_DISABLE:
+
+ cache = vmalloc(sizeof (cache_rec_short_t));
+ if (!cache) {
+ return -ENOMEM;
+ }
+ if (copy_from_user(cache, (void __user *)arg,
+ sizeof (cache_rec_short_t))) {
+ vfree(cache);
+ return -EFAULT;
+ }
+ error = eio_cache_delete(cache->cr_name, do_delete);
+ vfree(cache);
+ break;
+
+ case EIO_IOC_EDIT:
+ cache = vmalloc(sizeof (cache_rec_short_t));
+ if (!cache) {
+ return -ENOMEM;
+ }
+
+ if (copy_from_user(cache, (void __user *)arg,
+ sizeof (cache_rec_short_t))) {
+ vfree(cache);
+ return -EFAULT;
+ }
+ error = eio_cache_edit(cache->cr_name,
+ (u_int32_t)cache->cr_mode,
+ (u_int32_t)cache->cr_policy);
+ vfree(cache);
+ break;
+
+ case EIO_IOC_NCACHES:
+ ncaches = eio_get_cache_count();
+ if (copy_to_user((uint64_t __user *)arg, &ncaches,
+ sizeof (uint64_t))) {
+ return -EFAULT;
+ }
+ break;
+
+ case EIO_IOC_CACHE_LIST:
+ error = eio_get_cache_list((unsigned long __user *)arg);
+ break;
+
+ case EIO_IOC_SET_WARM_BOOT:
+ eio_set_warm_boot();
+ break;
+
+ case EIO_IOC_SSD_ADD:
+ cache = vmalloc(sizeof (cache_rec_short_t));
+ if (!cache)
+ return -ENOMEM;
+
+ if (copy_from_user(cache, (void __user *)arg,
+ sizeof (cache_rec_short_t))) {
+ vfree(cache);
+ return -EFAULT;
+ }
+ note = NOTIFY_SSD_ADD;
+ error = eio_handle_ssd_message(cache->cr_name, cache->cr_ssd_devname, note);
+ vfree(cache);
+
+ break;
+
+ case EIO_IOC_SSD_REMOVE:
+ cache = vmalloc(sizeof (cache_rec_short_t));
+ if (!cache)
+ return -ENOMEM;
+ if (copy_from_user(cache, (void __user *)arg,
+ sizeof (cache_rec_short_t))) {
+ vfree(cache);
+ return -EFAULT;
+ }
+ note = NOTIFY_SSD_REMOVED;
+ error = eio_handle_ssd_message(cache->cr_name, cache->cr_ssd_devname, note);
+ vfree(cache);
+ break;
+
+ case EIO_IOC_SRC_ADD:
+ pr_debug("EIO_IOC_SRC_ADD called\n");
+ break;
+
+ case EIO_IOC_NOTIFY_REBOOT:
+ eio_reboot_handling();
+ break;
+
+ default:
+ error = -EINVAL;
+ }
+ return error;
+}
+
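+/*
+ * Forwarding wrapper around eio_ioctl(), used as the 32-bit compat ioctl
+ * entry point.
+ */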
+long
+eio_compact_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
+{
+ return eio_ioctl(filp, cmd, arg);
+}
+
new file mode 100644
@@ -0,0 +1,90 @@
+/*
+ * eio_ioctl.h
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ * Made EnhanceIO specific changes.
+ * Saied Kazemi <skazemi@stec-inc.com>
+ * Siddharth Choudhuri <schoudhuri@stec-inc.com>
+ *
+ * Copyright 2010 Facebook, Inc.
+ * Author: Mohan Srinivasan (mohan@facebook.com)
+ *
+ * Based on DM-Cache:
+ * Copyright (C) International Business Machines Corp., 2006
+ * Author: Ming Zhao (mingzhao@ufl.edu)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#ifndef EIO_IOCTL_H
+#define EIO_IOCTL_H
+
+#define EIO_DEVPATH "/dev/eiodev"
+#define MISC_DEVICE "eiodev"
+
+#define CACHE_NAME_LEN 31
+#define CACHE_NAME_SZ (CACHE_NAME_LEN + 1)
+
+#define NAME_LEN 127
+#define NAME_SZ (NAME_LEN + 1)
+
+
+#define EIO_IOC ('E' << 8)
+
+typedef enum eio_ioc {
+ EIO_IOC_CREATE = EIO_IOC,
+ EIO_IOC_DELETE,
+ EIO_IOC_ENABLE,
+ EIO_IOC_DISABLE,
+ EIO_IOC_EDIT,
+ EIO_IOC_NCACHES,
+ EIO_IOC_CACHE_LIST,
+ EIO_IOC_SSD_ADD,
+ EIO_IOC_SSD_REMOVE,
+ EIO_IOC_SRC_ADD,
+ EIO_IOC_SRC_REMOVE,
+ EIO_IOC_NOTIFY_REBOOT,
+ EIO_IOC_SET_WARM_BOOT,
+ EIO_IOC_UNUSED
+} eio_ioc_t;
+
+typedef struct cache_rec_short {
+ char cr_name[CACHE_NAME_SZ];
+ char cr_src_devname[NAME_SZ];
+ char cr_ssd_devname[NAME_SZ];
+ char cr_ssd_uuid[NAME_SZ];
+ uint64_t cr_src_dev_size;
+ uint64_t cr_ssd_dev_size;
+ uint32_t cr_src_sector_size;
+ uint32_t cr_ssd_sector_size;
+ uint32_t cr_flags; /* CACHE_FLAGS_INV* etc. */
+ char cr_policy;
+ char cr_mode;
+ char cr_persistence;
+ char cr_cold_boot;
+ uint64_t cr_blksize;
+ uint64_t cr_assoc;
+} cache_rec_short_t;
+
+typedef struct cache_list {
+ uint64_t ncaches;
+ cache_rec_short_t *cachelist;
+} cache_list_t;
+
+#ifdef __KERNEL__
+long eio_ioctl(struct file *filp, unsigned cmd, unsigned long arg);
+long eio_compact_ioctl(struct file *filp, unsigned cmd, unsigned long arg);
+#endif /* __KERNEL__ */
+
+#endif /* !EIO_IOCTL_H */
new file mode 100644
@@ -0,0 +1,342 @@
+/*
+ * eio_lru.c
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ * Made EnhanceIO specific changes.
+ * Saied Kazemi <skazemi@stec-inc.com>
+ * Siddharth Choudhuri <schoudhuri@stec-inc.com>
+ *
+ * Copyright 2010 Facebook, Inc.
+ * Author: Mohan Srinivasan (mohan@facebook.com)
+ *
+ * Based on DM-Cache:
+ * Copyright (C) International Business Machines Corp., 2006
+ * Author: Ming Zhao (mingzhao@ufl.edu)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "eio.h"
+/* Generic policy functions prototypes */
+int eio_lru_init(struct cache_c *);
+void eio_lru_exit(void);
+int eio_lru_cache_sets_init(struct eio_policy *);
+int eio_lru_cache_blk_init(struct eio_policy *);
+void eio_lru_find_reclaim_dbn(struct eio_policy *, index_t, index_t *);
+int eio_lru_clean_set(struct eio_policy *, index_t, int);
+/* Per policy instance initialization */
+struct eio_policy *eio_lru_instance_init(void);
+
+/* LRU specific policy functions prototype */
+void eio_lru_pushblks(struct eio_policy *);
+void eio_reclaim_lru_movetail(struct cache_c *, index_t, struct eio_policy *);
+
+
+/* Per cache set data structure */
+struct eio_lru_cache_set {
+ u_int16_t lru_head, lru_tail;
+};
+
+/* Per cache block data structure */
+struct eio_lru_cache_block {
+ u_int16_t lru_prev, lru_next;
+};
+
+/* LRU specifc data structures */
+static struct eio_lru eio_lru = {
+ .sl_lru_pushblks = eio_lru_pushblks,
+ .sl_reclaim_lru_movetail = eio_reclaim_lru_movetail,
+};
+
+/*
+ * Context that captures the LRU replacement policy
+ */
+static struct eio_policy_header eio_lru_ops = {
+ .sph_name = CACHE_REPL_LRU,
+ .sph_instance_init = eio_lru_instance_init,
+};
+
+
+/*
+ * Intialize LRU. Called from ctr.
+ */
+int
+eio_lru_init(struct cache_c *dmc)
+{
+
+ return 0;
+}
+
+
+/*
+ * Initialize per set LRU data structures.
+ */
+int
+eio_lru_cache_sets_init(struct eio_policy *p_ops)
+{
+ sector_t order;
+ int i;
+ struct cache_c *dmc = p_ops->sp_dmc;
+ struct eio_lru_cache_set *cache_sets;
+
+
+ order = (dmc->size >> dmc->consecutive_shift) * sizeof (struct eio_lru_cache_set);
+
+ dmc->sp_cache_set = (struct eio_lru_cache_set *)vmalloc((size_t)order);
+ if (dmc->sp_cache_set == NULL)
+ return -ENOMEM;
+
+ cache_sets = (struct eio_lru_cache_set *)dmc->sp_cache_set;
+
+ for (i = 0 ; i < (int)(dmc->size >> dmc->consecutive_shift) ; i++) {
+ cache_sets[i].lru_tail = EIO_LRU_NULL;
+ cache_sets[i].lru_head = EIO_LRU_NULL;
+ }
+ pr_info("Initialized %d sets in LRU", i);
+
+ return 0;
+}
+
+
+/*
+ * Initialize per block LRU data structures
+ */
+int
+eio_lru_cache_blk_init(struct eio_policy *p_ops)
+{
+ sector_t order;
+ struct cache_c *dmc = p_ops->sp_dmc;
+
+
+ order = dmc->size * sizeof (struct eio_lru_cache_block);
+
+ dmc->sp_cache_blk = (struct eio_lru_cache_block *)vmalloc((size_t)order);
+ if (dmc->sp_cache_blk == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+
+/*
+ * Allocate a new instance of eio_policy per dmc
+ */
+struct eio_policy *
+eio_lru_instance_init(void)
+{
+ struct eio_policy *new_instance;
+
+
+ new_instance = (struct eio_policy *)vmalloc(sizeof (struct eio_policy));
+ if (new_instance == NULL) {
+ pr_err("eio_lru_instance_init: vmalloc failed");
+ return NULL;
+ }
+
+ /* Initialize the LRU specific functions and variables */
+ new_instance->sp_name = CACHE_REPL_LRU;
+ new_instance->sp_policy.lru = &eio_lru;
+ new_instance->sp_repl_init = eio_lru_init;
+ new_instance->sp_repl_exit = eio_lru_exit;
+ new_instance->sp_repl_sets_init = eio_lru_cache_sets_init;
+ new_instance->sp_repl_blk_init = eio_lru_cache_blk_init;
+ new_instance->sp_find_reclaim_dbn = eio_lru_find_reclaim_dbn;
+ new_instance->sp_clean_set = eio_lru_clean_set;
+ new_instance->sp_dmc = NULL;
+
+ try_module_get(THIS_MODULE);
+
+ pr_info("eio_lru_instance_init: created new instance of LRU");
+
+ return new_instance;
+}
+
+
+/*
+ * Cleanup an instance of eio_policy (called from dtr).
+ */
+void
+eio_lru_exit(void)
+{
+
+ module_put(THIS_MODULE);
+}
+
+
+/*
+ * Find a victim block to evict and return it in index.
+ */
+void
+eio_lru_find_reclaim_dbn(struct eio_policy *p_ops,
+ index_t start_index, index_t *index)
+{
+ index_t lru_rel_index;
+ struct eio_lru_cache_set *lru_sets;
+ struct eio_lru_cache_block *lru_blk;
+ struct cache_c *dmc = p_ops->sp_dmc;
+ index_t set;
+
+
+ set = start_index / dmc->assoc;
+ lru_sets = (struct eio_lru_cache_set *)(dmc->sp_cache_set);
+
+ lru_rel_index = lru_sets[set].lru_head;
+ while (lru_rel_index != EIO_LRU_NULL) {
+ lru_blk = ((struct eio_lru_cache_block *)dmc->sp_cache_blk + lru_rel_index + start_index);
+ if (EIO_CACHE_STATE_GET(dmc, (lru_rel_index + start_index)) == VALID) {
+ VERIFY((lru_blk - (struct eio_lru_cache_block *)dmc->sp_cache_blk) ==
+ (lru_rel_index + start_index));
+ *index = lru_rel_index + start_index;
+ eio_reclaim_lru_movetail(dmc, *index, p_ops);
+ break;
+ }
+ lru_rel_index = lru_blk->lru_next;
+ }
+
+ return;
+}
+
+
+/*
+ * Go through the entire set and clean.
+ */
+int
+eio_lru_clean_set(struct eio_policy *p_ops, index_t set, int to_clean)
+{
+ struct cache_c *dmc = p_ops->sp_dmc;
+ index_t lru_rel_index;
+ int nr_writes = 0;
+ struct eio_lru_cache_set *lru_cache_sets;
+ struct eio_lru_cache_block *lru_cacheblk;
+ index_t dmc_idx;
+ index_t start_index;
+
+
+ lru_cache_sets = (struct eio_lru_cache_set *)dmc->sp_cache_set;
+ start_index = set * dmc->assoc;
+ lru_rel_index = lru_cache_sets[set].lru_head;
+
+ while ((lru_rel_index != EIO_LRU_NULL) && (nr_writes < to_clean)) {
+ dmc_idx = lru_rel_index + start_index;
+ lru_cacheblk = ((struct eio_lru_cache_block *)dmc->sp_cache_blk + lru_rel_index + start_index);
+ VERIFY((lru_cacheblk - (struct eio_lru_cache_block *)dmc->sp_cache_blk) == (lru_rel_index + start_index));
+ if ((EIO_CACHE_STATE_GET(dmc, dmc_idx) & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) {
+ EIO_CACHE_STATE_ON(dmc, dmc_idx, DISKWRITEINPROG);
+ nr_writes++;
+ }
+ lru_rel_index = lru_cacheblk->lru_next;
+ }
+
+ return nr_writes;
+}
+
+
+/*
+ * LRU specific functions.
+ */
+void
+eio_reclaim_lru_movetail(struct cache_c *dmc, index_t index, struct eio_policy *p_ops)
+{
+ index_t set = index / dmc->assoc;
+ index_t start_index = set * dmc->assoc;
+ index_t my_index = index - start_index;
+ struct eio_lru_cache_block *cacheblk;
+ struct eio_lru_cache_set *cache_sets;
+ struct eio_lru_cache_block *blkptr;
+
+
+ cacheblk = (((struct eio_lru_cache_block *)(dmc->sp_cache_blk))+index);
+ cache_sets = (struct eio_lru_cache_set *)dmc->sp_cache_set;
+ blkptr = (struct eio_lru_cache_block *)(dmc->sp_cache_blk);
+
+ /* Remove from LRU */
+ if (likely((cacheblk->lru_prev != EIO_LRU_NULL) ||
+ (cacheblk->lru_next != EIO_LRU_NULL))) {
+ if (cacheblk->lru_prev != EIO_LRU_NULL)
+ blkptr[cacheblk->lru_prev + start_index].lru_next =
+ cacheblk->lru_next;
+ else
+ cache_sets[set].lru_head = cacheblk->lru_next;
+ if (cacheblk->lru_next != EIO_LRU_NULL)
+ blkptr[cacheblk->lru_next + start_index].lru_prev =
+ cacheblk->lru_prev;
+ else
+ cache_sets[set].lru_tail = cacheblk->lru_prev;
+ }
+ /* And add it to LRU Tail */
+ cacheblk->lru_next = EIO_LRU_NULL;
+ cacheblk->lru_prev = cache_sets[set].lru_tail;
+ if (cache_sets[set].lru_tail == EIO_LRU_NULL)
+ cache_sets[set].lru_head = (u_int16_t)my_index;
+ else
+ blkptr[cache_sets[set].lru_tail + start_index].lru_next =
+ (u_int16_t)my_index;
+ cache_sets[set].lru_tail = (u_int16_t)my_index;
+}
+
+
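+/*
+ * Reset every cache block's LRU links and move each block to the tail of
+ * its set's LRU list.
+ */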
+void
+eio_lru_pushblks(struct eio_policy *p_ops)
+{
+ struct cache_c *dmc = p_ops->sp_dmc;
+ struct eio_lru_cache_block *cache_block;
+ int i;
+
+
+ cache_block = dmc->sp_cache_blk;
+ for (i = 0 ; i < (int)dmc->size ; i++) {
+ cache_block[i].lru_prev = EIO_LRU_NULL;
+ cache_block[i].lru_next = EIO_LRU_NULL;
+ eio_reclaim_lru_movetail(dmc, i, p_ops);
+ }
+ return;
+}
+
+
+static
+int __init
+lru_register(void)
+{
+ int ret;
+
+
+ ret = eio_register_policy(&eio_lru_ops);
+ if (ret != 0)
+ pr_info("eio_lru already registered");
+
+ return ret;
+}
+
+
+static
+void __exit
+lru_unregister(void)
+{
+ int ret;
+
+
+ ret = eio_unregister_policy(&eio_lru_ops);
+ if (ret != 0)
+ pr_err("eio_lru unregister failed");
+}
+
+module_init(lru_register);
+module_exit(lru_unregister);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LRU policy for EnhanceIO");
+MODULE_AUTHOR("STEC, Inc. based on code by Facebook");
+
new file mode 100644
@@ -0,0 +1,3503 @@
+/*
+ * eio_main.c
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ * Made EnhanceIO specific changes.
+ * Saied Kazemi <skazemi@stec-inc.com>
+ * Siddharth Choudhuri <schoudhuri@stec-inc.com>
+ * Amit Kale <akale@stec-inc.com>
+ * Restructured much of the io code to split bio within map function instead
+ * of letting dm do it.
+ * Simplified queued logic for write through.
+ * Created per-cache spinlocks for reducing contention in IO codepath.
+ * Amit Kale <akale@stec-inc.com>
+ * Harish Pujari <hpujari@stec-inc.com>
+ * Designed and implemented the writeback caching mode
+ * Copyright 2010 Facebook, Inc.
+ * Author: Mohan Srinivasan (mohan@facebook.com)
+ *
+ * Based on DM-Cache:
+ * Copyright (C) International Business Machines Corp., 2006
+ * Author: Ming Zhao (mingzhao@ufl.edu)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "eio.h"
+#include "eio_ttc.h"
+
+#define CTRACE(X) { }
+
+
+/*
+ * TODO List :
+ * 1) sysctls : Create per-cache device sysctls instead of global sysctls.
+ * 2) Management of non cache pids : Needs improvement. Remove registration
+ * on process exits (with a pseudo filesystem'ish approach perhaps)?
+ * 3) Breaking up the cache spinlock : Right now contention on the spinlock
+ * is not a problem. Might need change in future.
+ * 4) Use the standard linked list manipulation macros instead of rolling our own.
+ * 5) Fix a security hole : A malicious process with 'ro' access to a file can
+ * potentially corrupt file data. This can be fixed by copying the data on a
+ * cache read miss.
+ */
+
+static int eio_read_peek(struct cache_c *dmc, struct eio_bio *ebio);
+static int eio_write_peek(struct cache_c *dmc, struct eio_bio *ebio);
+static void eio_read(struct cache_c *dmc, struct bio_container *bc,
+ struct eio_bio *ebegin);
+static void eio_write(struct cache_c *dmc, struct bio_container *bc,
+ struct eio_bio *ebegin);
+static int eio_inval_block(struct cache_c *dmc, sector_t iosector);
+static void eio_enqueue_readfill(struct cache_c *dmc,
+ struct kcached_job *job);
+static int eio_acquire_set_locks(struct cache_c *dmc,
+ struct bio_container *bc);
+static int eio_release_io_resources(struct cache_c *dmc,
+ struct bio_container *bc);
+static void eio_clean_set(struct cache_c *dmc, index_t set, int whole, int force);
+static void eio_do_mdupdate(struct work_struct *work);
+static void eio_mdupdate_callback(int error, void *context);
+static void eio_enq_mdupdate(struct bio_container *bc);
+static void eio_uncached_read_done(struct kcached_job *job);
+static void eio_addto_cleanq(struct cache_c *dmc, index_t set, int whole);
+static int eio_alloc_mdreqs(struct cache_c *, struct bio_container *);
+static void eio_check_dirty_set_thresholds(struct cache_c *dmc, index_t set);
+static void eio_check_dirty_cache_thresholds(struct cache_c *dmc);
+static void eio_post_mdupdate(struct work_struct *work);
+static void eio_post_io_callback(struct work_struct *work);
+
+extern int eio_force_warm_boot;
+
+extern struct work_struct _kcached_wq;
+
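+/* Attach an eio_bio to its bio_container and take a hold on the container. */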
+static void
+bc_addfb(struct bio_container *bc, struct eio_bio *ebio)
+{
+
+ atomic_inc(&bc->bc_holdcount);
+
+ ebio->eb_bc = bc;
+}
+
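+/*
+ * Drop a hold on the bio_container. The last put updates the read/write
+ * latency stats and completes the original bio.
+ */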
+static void
+bc_put(struct bio_container *bc, unsigned int doneio)
+{
+ struct cache_c *dmc;
+ int data_dir;
+ long elapsed;
+
+ if (atomic_dec_and_test(&bc->bc_holdcount)) {
+ if (bc->bc_dmc->mode == CACHE_MODE_WB) {
+ eio_release_io_resources(bc->bc_dmc, bc);
+ }
+ bc->bc_bio->bi_size = 0;
+ dmc = bc->bc_dmc;
+
+ /* update iotime for latency */
+ data_dir = bio_data_dir(bc->bc_bio);
+ elapsed = (long)jiffies_to_msecs(jiffies - bc->bc_iotime);
+
+ if (data_dir == READ)
+ atomic64_add(elapsed, &dmc->eio_stats.rdtime_ms);
+ else
+ atomic64_add(elapsed, &dmc->eio_stats.wrtime_ms);
+
+ bio_endio(bc->bc_bio, bc->bc_error);
+ atomic64_dec(&bc->bc_dmc->nr_ios);
+ kfree(bc);
+ }
+}
+
+static void
+eb_endio(struct eio_bio *ebio, int error)
+{
+
+ VERIFY(ebio->eb_bc);
+
+ //Propagate only main io errors and sizes
+ if (ebio->eb_iotype == EB_MAIN_IO) {
+ if (error)
+ ebio->eb_bc->bc_error = error;
+ bc_put(ebio->eb_bc, ebio->eb_size);
+ } else
+ bc_put(ebio->eb_bc, 0);
+ ebio->eb_bc = NULL;
+ kfree(ebio);
+}
+
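+/*
+ * Wrap a page list in an eio_io_request and submit it via eio_do_io().
+ * In degraded mode, I/O is refused unless it targets the source disk.
+ */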
+static int
+eio_io_async_pages(struct cache_c *dmc, struct eio_io_region *where, int rw,
+ struct page **pages, unsigned nr_pages, eio_notify_fn fn, void *context,
+ int hddio)
+{
+ struct eio_io_request req;
+ int error = 0;
+
+ memset((char *)&req, 0, sizeof req);
+
+ if (unlikely(CACHE_DEGRADED_IS_SET(dmc))) {
+ if (where->bdev != dmc->disk_dev->bdev) {
+ pr_err("eio_io_async_pages: Cache is in degraded mode.\n");
+ pr_err("eio_io_async_pages: Cannot issue i/o to ssd device.\n");
+ return -ENODEV;
+ }
+ }
+
+ req.mtype = EIO_PAGES;
+ req.dptr.plist = pages;
+ req.num_bvecs = nr_pages;
+ req.notify = fn;
+ req.context = context;
+ req.hddio = hddio;
+
+ error = eio_do_io(dmc, where, rw, &req);
+
+ return error;
+}
+
+static int
+eio_io_async_bvec(struct cache_c *dmc, struct eio_io_region *where, int rw,
+ struct bio_vec *pages, unsigned nr_bvecs, eio_notify_fn fn,
+ void *context, int hddio)
+{
+ struct eio_io_request req;
+ int error = 0;
+
+ memset((char *)&req, 0, sizeof req);
+
+ if (unlikely(CACHE_DEGRADED_IS_SET(dmc))) {
+ if (where->bdev != dmc->disk_dev->bdev) {
+ pr_err("eio_io_async_bvec: Cache is in degraded mode.\n");
+ pr_err("eio_io_async_bvec: Cannot issue i/o to ssd device.\n");
+ return -ENODEV;
+ }
+ }
+
+ req.mtype = EIO_BVECS;
+ req.dptr.pages = pages;
+ req.num_bvecs = nr_bvecs;
+ req.notify = fn;
+ req.context = context;
+ req.hddio = hddio;
+
+ error = eio_do_io(dmc, where, rw, &req);
+
+ return error;
+}
+
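+/*
+ * Walk the chain of anchored ebios and settle their cache block state:
+ * clear the in-progress flags, invalidate where requested (or where a
+ * QUEUED invalidation is pending), and end each ebio once no further
+ * I/O is outstanding on it.
+ */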
+static void
+eio_flag_abios(struct cache_c *dmc, struct eio_bio *abio, int invalidated)
+{
+ struct eio_bio *nbio;
+
+ while (abio) {
+ int invalidate;
+ unsigned long flags;
+ int cwip_on = 0;
+ int dirty_on = 0;
+ int callendio = 0;
+ nbio = abio->eb_next;
+
+ VERIFY(!(abio->eb_iotype & EB_INVAL) || abio->eb_index == -1);
+ invalidate = !invalidated && (abio->eb_iotype & EB_INVAL);
+
+ spin_lock_irqsave(&dmc->cache_sets[abio->eb_cacheset].cs_lock, flags);
+
+ if (abio->eb_index != -1) {
+ if (EIO_CACHE_STATE_GET(dmc, abio->eb_index) & DIRTY) {
+ dirty_on = 1;
+ }
+
+ if (unlikely(EIO_CACHE_STATE_GET(dmc, abio->eb_index) & CACHEWRITEINPROG)) {
+ cwip_on = 1;
+ }
+ }
+
+ if (dirty_on) {
+ /*
+ * For dirty blocks, we don't change the cache state flags.
+ * We however, need to end the ebio, if this was the last
+ * hold on it.
+ */
+ if (atomic_dec_and_test(&abio->eb_holdcount)) {
+ callendio = 1;
+ /* We shouldn't reach here when the DIRTY_INPROG flag
+ * is set on the cache block. It should either have been
+ * cleared to become DIRTY or INVALID elsewhere.
+ */
+ VERIFY(EIO_CACHE_STATE_GET(dmc, abio->eb_index) != DIRTY_INPROG);
+ }
+ } else if (abio->eb_index != -1) {
+ if (invalidate) {
+ if (cwip_on) {
+ EIO_CACHE_STATE_ON(dmc, abio->eb_index, QUEUED);
+ } else {
+ EIO_CACHE_STATE_SET(dmc, abio->eb_index, INVALID);
+ atomic64_dec_if_positive(&dmc->eio_stats.cached_blocks);
+ }
+ } else {
+ if (cwip_on)
+ EIO_CACHE_STATE_OFF(dmc, abio->eb_index, DISKWRITEINPROG);
+ else {
+ if (EIO_CACHE_STATE_GET(dmc, abio->eb_index) & QUEUED) {
+ EIO_CACHE_STATE_SET(dmc, abio->eb_index, INVALID);
+ atomic64_dec_if_positive(&dmc->eio_stats.cached_blocks);
+ } else {
+ EIO_CACHE_STATE_SET(dmc, abio->eb_index, VALID);
+ }
+ }
+ }
+ } else {
+ VERIFY(invalidated || invalidate);
+ if (invalidate)
+ eio_inval_block(dmc, abio->eb_sector);
+ }
+ spin_unlock_irqrestore(&dmc->cache_sets[abio->eb_cacheset].cs_lock, flags);
+ if (!cwip_on && (!dirty_on || callendio))
+ eb_endio(abio, 0);
+ abio = nbio;
+ }
+}
+
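+/*
+ * Completion handler for a disk read issued after an SSD read failure.
+ * The cache block is invalidated whether or not the disk read succeeded.
+ */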
+static void
+eio_disk_io_callback(int error, void *context)
+{
+ struct kcached_job *job;
+ struct eio_bio *ebio;
+ struct cache_c *dmc;
+ unsigned long flags;
+ unsigned eb_cacheset;
+
+ flags = 0;
+ job = (struct kcached_job *)context;
+ dmc = job->dmc;
+ ebio = job->ebio;
+
+ VERIFY(ebio != NULL);
+ eb_cacheset = ebio->eb_cacheset;
+
+
+ if (unlikely(error))
+ dmc->eio_errors.disk_read_errors++;
+
+ spin_lock_irqsave(&dmc->cache_sets[eb_cacheset].cs_lock, flags);
+ /* Invalidate the cache block */
+ EIO_CACHE_STATE_SET(dmc, ebio->eb_index, INVALID);
+ atomic64_dec_if_positive(&dmc->eio_stats.cached_blocks);
+ spin_unlock_irqrestore(&dmc->cache_sets[eb_cacheset].cs_lock, flags);
+
+ if (unlikely(error))
+ pr_err("disk_io_callback: io error %d block %lu action %d",
+ error, job->job_io_regions.disk.sector, job->action);
+
+ eb_endio(ebio, error);
+ ebio = NULL;
+ job->ebio = NULL;
+ eio_free_cache_job(job);
+ job = NULL;
+}
+
+static void
+eio_uncached_read_done(struct kcached_job *job)
+{
+ struct eio_bio *ebio = job->ebio;
+ struct cache_c *dmc = job->dmc;
+ struct eio_bio *iebio;
+ struct eio_bio *nebio;
+ unsigned long flags = 0;
+
+ if (ebio->eb_bc->bc_dir == UNCACHED_READ) {
+ VERIFY(ebio != NULL);
+ iebio = ebio->eb_next;
+ while (iebio != NULL) {
+ nebio = iebio->eb_next;
+ if (iebio->eb_index != -1) {
+ spin_lock_irqsave(&dmc->cache_sets[iebio->eb_cacheset].cs_lock, flags);
+ if (unlikely(EIO_CACHE_STATE_GET(dmc, iebio->eb_index) & QUEUED)) {
+ EIO_CACHE_STATE_SET(dmc, iebio->eb_index, INVALID);
+ atomic64_dec_if_positive(&dmc->eio_stats.cached_blocks);
+ } else if (EIO_CACHE_STATE_GET(dmc, iebio->eb_index) & CACHEREADINPROG) {
+ //turn off the cache read in prog flag
+ EIO_CACHE_STATE_OFF(dmc, iebio->eb_index, BLOCK_IO_INPROG);
+ } else {
+ //Should never reach here
+ VERIFY(0);
+ }
+ spin_unlock_irqrestore(&dmc->cache_sets[iebio->eb_cacheset].cs_lock, flags);
+ }
+ eb_endio(iebio, 0);
+ iebio = nebio;
+ }
+ eb_endio(ebio, 0);
+ eio_free_cache_job(job);
+ } else if (ebio->eb_bc->bc_dir == UNCACHED_READ_AND_READFILL) {
+ /*
+ * Kick off the READFILL. It will also do a read
+ * from SSD, in case of ALREADY_DIRTY block
+ */
+ job->action = READFILL;
+ eio_enqueue_readfill(dmc, job);
+ } else {
+ /* Should never reach here for uncached read */
+ VERIFY(0);
+ }
+}
+
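+/*
+ * I/O completion callback: record the error and defer the actual state
+ * handling to the per-cache callback workqueue.
+ */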
+static void
+eio_io_callback(int error, void *context)
+{
+ struct kcached_job *job = (struct kcached_job *)context;
+ struct cache_c *dmc = job->dmc;
+
+ job->error = error;
+ INIT_WORK(&job->work, eio_post_io_callback);
+ queue_work(dmc->callback_q, &job->work);
+ return;
+}
+
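+/*
+ * Workqueue half of the I/O completion path. Updates the cache block
+ * state according to the job action (WRITEDISK/READDISK/READCACHE/
+ * READFILL/WRITECACHE) and ends the ebio when appropriate.
+ */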
+static void
+eio_post_io_callback(struct work_struct *work)
+{
+ struct kcached_job *job;
+ struct cache_c *dmc;
+ struct eio_bio *ebio;
+ unsigned long flags = 0;
+ index_t index;
+ unsigned eb_cacheset;
+ u_int8_t cstate;
+ int callendio = 0;
+ int error;
+
+ job = container_of(work, struct kcached_job, work);
+ dmc = job->dmc;
+ index = job->index;
+ error = job->error;
+
+ VERIFY(index != -1 || job->action == WRITEDISK || job->action == READDISK);
+ ebio = job->ebio;
+ VERIFY(ebio != NULL);
+ VERIFY(ebio->eb_bc);
+
+ eb_cacheset = ebio->eb_cacheset;
+ if (error)
+ pr_err("io_callback: io error %d block %lu action %d",
+ error, job->job_io_regions.disk.sector, job->action);
+
+ switch (job->action) {
+ case WRITEDISK:
+
+ atomic64_inc(&dmc->eio_stats.writedisk);
+ if (unlikely(error))
+ dmc->eio_errors.disk_write_errors++;
+ if (unlikely(error) || (ebio->eb_iotype & EB_INVAL))
+ eio_inval_range(dmc, ebio->eb_sector, ebio->eb_size);
+ if (ebio->eb_next)
+ eio_flag_abios(dmc, ebio->eb_next,
+ error || (ebio->eb_iotype & EB_INVAL));
+ eb_endio(ebio, error);
+ job->ebio = NULL;
+ eio_free_cache_job(job);
+ return;
+
+ case READDISK:
+
+ if (unlikely(error) || unlikely(ebio->eb_iotype & EB_INVAL)
+ || CACHE_DEGRADED_IS_SET(dmc)) {
+ if (error)
+ dmc->eio_errors.disk_read_errors++;
+ eio_inval_range(dmc, ebio->eb_sector, ebio->eb_size);
+ eio_flag_abios(dmc, ebio->eb_next, 1);
+ } else if (ebio->eb_next) {
+ eio_uncached_read_done(job);
+ return;
+ }
+ eb_endio(ebio, error);
+ job->ebio = NULL;
+ eio_free_cache_job(job);
+ return;
+
+ case READCACHE:
+
+ //atomic64_inc(&dmc->eio_stats.readcache);
+ //SECTOR_STATS(dmc->eio_stats.ssd_reads, ebio->eb_size);
+ VERIFY(EIO_DBN_GET(dmc, index) == EIO_ROUND_SECTOR(dmc,ebio->eb_sector));
+ cstate = EIO_CACHE_STATE_GET(dmc, index);
+ /* We shouldn't reach here for DIRTY_INPROG blocks. */
+ VERIFY(cstate != DIRTY_INPROG);
+ if (unlikely(error)) {
+ dmc->eio_errors.ssd_read_errors++;
+ /* Retry read from HDD for non-DIRTY blocks. */
+ if (cstate != ALREADY_DIRTY) {
+ spin_lock_irqsave(&dmc->cache_sets[eb_cacheset].cs_lock, flags);
+ EIO_CACHE_STATE_OFF(dmc, ebio->eb_index,
+ CACHEREADINPROG);
+ EIO_CACHE_STATE_ON(dmc, ebio->eb_index,
+ DISKREADINPROG);
+ spin_unlock_irqrestore(&dmc->cache_sets[eb_cacheset].cs_lock, flags);
+
+ eio_push_ssdread_failures(job);
+ schedule_work(&_kcached_wq);
+
+ return;
+ }
+ }
+ callendio = 1;
+ break;
+
+ case READFILL:
+
+ //atomic64_inc(&dmc->eio_stats.readfill);
+ //SECTOR_STATS(dmc->eio_stats.ssd_writes, ebio->eb_size);
+ VERIFY(EIO_DBN_GET(dmc, index) == ebio->eb_sector);
+ if (unlikely(error))
+ dmc->eio_errors.ssd_write_errors++;
+ if (!(EIO_CACHE_STATE_GET(dmc, index) & CACHEWRITEINPROG)) {
+ printk(KERN_DEBUG "DISKWRITEINPROG absent in READFILL sector %llu io size %u\n",
+ (unsigned long long)ebio->eb_sector, ebio->eb_size);
+ }
+ callendio = 1;
+ break;
+
+ case WRITECACHE:
+
+ //SECTOR_STATS(dmc->eio_stats.ssd_writes, ebio->eb_size);
+ //atomic64_inc(&dmc->eio_stats.writecache);
+ cstate = EIO_CACHE_STATE_GET(dmc, index);
+ VERIFY(EIO_DBN_GET(dmc, index) == EIO_ROUND_SECTOR(dmc,ebio->eb_sector));
+ /* CWIP is a must for WRITECACHE, except when it is DIRTY */
+ VERIFY(cstate & (CACHEWRITEINPROG | DIRTY));
+ if (likely(error == 0)) {
+ /* If it is a DIRTY inprog block, proceed for metadata update */
+ if (cstate == DIRTY_INPROG) {
+ eio_md_write(job);
+ return;
+ }
+ } else {
+ /* TODO: ask if this if condition is required */
+ if (dmc->mode == CACHE_MODE_WT)
+ dmc->eio_errors.disk_write_errors++;
+ dmc->eio_errors.ssd_write_errors++;
+ }
+ job->ebio = NULL;
+ break;
+
+ default:
+ pr_err("io_callback: invalid action %d", job->action);
+ return;
+ }
+
+ spin_lock_irqsave(&dmc->cache_sets[eb_cacheset].cs_lock, flags);
+
+ cstate = EIO_CACHE_STATE_GET(dmc, index);
+ VERIFY(!(cstate & INVALID));
+
+ if (unlikely((job->action == WRITECACHE) && !(cstate & DISKWRITEINPROG))) {
+ /*
+ * Can reach here in 2 cases:
+ * 1. Uncached write case, where WRITEDISK has finished first
+ * 2. Cached write case
+ *
+ * For DIRTY or DIRTY inprog cases, use eb holdcount to determine
+ * if end ebio can be called. This is because, we don't set DWIP etc
+ * flags on those and we have to avoid double end ebio call
+ */
+ VERIFY((cstate != DIRTY_INPROG) || error);
+ callendio = 1;
+ if ((cstate & DIRTY) && !atomic_dec_and_test(&ebio->eb_holdcount)) {
+ callendio = 0;
+ }
+ }
+
+ if (cstate & DISKWRITEINPROG) {
+ /* uncached write and WRITEDISK is not yet finished */
+ VERIFY(!(cstate & DIRTY)); /* For dirty blocks, we can't have DWIP flag */
+ if (error) {
+ EIO_CACHE_STATE_ON(dmc, index, QUEUED);
+ }
+ EIO_CACHE_STATE_OFF(dmc, index, CACHEWRITEINPROG);
+ } else if (unlikely(error || (cstate & QUEUED))) {
+ /* Error or QUEUED is set: mark block as INVALID for non-DIRTY blocks */
+ if (cstate != ALREADY_DIRTY) {
+ EIO_CACHE_STATE_SET(dmc, index, INVALID);
+ atomic64_dec_if_positive(&dmc->eio_stats.cached_blocks);
+ }
+ } else if (cstate & VALID) {
+ EIO_CACHE_STATE_OFF(dmc, index, BLOCK_IO_INPROG);
+ /*
+ * If we have NO_SSD_IO_INPROG flag set, then this block needs to be
+ * invalidated. There are three things that can happen -- (i) error,
+ * (ii) IOs are queued on this block, and (iii) success.
+ *
+ * If there was an error or if the QUEUED bit was set, then the logic
+ * in the if part will take care of setting the block to INVALID.
+ * Therefore, this is the success path where we invalidate if need be.
+ */
+
+ /*
+ * Harish: TBD
+ * NO_SSD_IO_INPROG needs to be handled differently in case the block is DIRTY
+ */
+ if ((cstate & NO_SSD_IO_INPROG) == NO_SSD_IO_INPROG) {
+ EIO_CACHE_STATE_OFF(dmc, index, VALID);
+ }
+ }
+
+ spin_unlock_irqrestore(&dmc->cache_sets[eb_cacheset].cs_lock, flags);
+
+ if (callendio) {
+ eb_endio(ebio, error);
+ }
+
+ eio_free_cache_job(job);
+ job = NULL;
+
+}
+
+/*
+ * This function processes the kcached_job that
+ * needs to be scheduled on disk after ssd read failures.
+ */
+void
+eio_ssderror_diskread(struct kcached_job *job)
+{
+ struct cache_c *dmc;
+ struct eio_bio *ebio;
+ index_t index;
+ int error;
+ unsigned long flags = 0;
+
+ dmc = job->dmc;
+ error = 0;
+
+ /*
+ * 1. Extract the ebio which needs to be scheduled on disk.
+ * 2. Verify cache block state is VALID
+ * 3. Make sure that the cache state in not IOINPROG
+ */
+ /* Reset the ssd read error in the job. */
+ job->error = 0;
+ ebio = job->ebio;
+ index = ebio->eb_index;
+
+ VERIFY(index != -1);
+
+ spin_lock_irqsave(&dmc->cache_sets[index / dmc->assoc].cs_lock, flags);
+ VERIFY(EIO_CACHE_STATE_GET(dmc, index) & DISKREADINPROG);
+ spin_unlock_irqrestore(&dmc->cache_sets[index / dmc->assoc].cs_lock, flags);
+
+ VERIFY(ebio->eb_dir == READ);
+
+ atomic64_inc(&dmc->eio_stats.readdisk);
+ SECTOR_STATS(dmc->eio_stats.disk_reads, ebio->eb_size);
+ job->action = READDISK;
+
+ error = eio_io_async_bvec(dmc, &job->job_io_regions.disk, ebio->eb_dir,
+ ebio->eb_bv, ebio->eb_nbvec,
+ eio_disk_io_callback, job, 1);
+
+ /*
+ * In case of disk i/o submission error clear ebio and kcached_job.
+ * This would return the actual read that was issued on ssd.
+ */
+ if (error)
+ goto out;
+
+ return;
+
+out:
+ /* We failed to submit the I/O to dm layer. The corresponding
+ * block should be marked as INVALID by turning off already set
+ * flags.
+ */
+ spin_lock_irqsave(&dmc->cache_sets[index / dmc->assoc].cs_lock, flags);
+ EIO_CACHE_STATE_SET(dmc, ebio->eb_index, INVALID);
+ spin_unlock_irqrestore(&dmc->cache_sets[index / dmc->assoc].cs_lock, flags);
+
+ atomic64_dec_if_positive(&dmc->eio_stats.cached_blocks);
+
+ eb_endio(ebio, error);
+ ebio = NULL;
+ job->ebio = NULL;
+ eio_free_cache_job(job);
+}
+
+/* Adds clean set request to clean queue. */
+static void
+eio_addto_cleanq(struct cache_c *dmc, index_t set, int whole)
+{
+ unsigned long flags = 0;
+
+ spin_lock_irqsave(&dmc->cache_sets[set].cs_lock, flags);
+
+ if (dmc->cache_sets[set].flags & SETFLAG_CLEAN_INPROG) {
+ /* Clean already in progress, just add to clean pendings */
+ spin_unlock_irqrestore(&dmc->cache_sets[set].cs_lock, flags);
+ return;
+ }
+
+ dmc->cache_sets[set].flags |= SETFLAG_CLEAN_INPROG;
+ if (whole) {
+ dmc->cache_sets[set].flags |= SETFLAG_CLEAN_WHOLE;
+ }
+
+ spin_unlock_irqrestore(&dmc->cache_sets[set].cs_lock, flags);
+
+ spin_lock_irqsave(&dmc->clean_sl, flags);
+ list_add_tail(&dmc->cache_sets[set].list, &dmc->cleanq);
+ atomic64_inc(&dmc->clean_pendings);
+ EIO_SET_EVENT_AND_UNLOCK(&dmc->clean_event, &dmc->clean_sl, flags);
+ return;
+}
+
+/*
+ * Clean thread loops forever in this, waiting for
+ * new clean set requests in the clean queue.
+ */
+int
+eio_clean_thread_proc(void *context)
+{
+ struct cache_c *dmc = (struct cache_c *)context;
+ unsigned long flags = 0;
+ u_int64_t systime;
+ index_t index;
+
+ /* Sync makes sense only for writeback cache */
+ VERIFY(dmc->mode == CACHE_MODE_WB);
+
+ dmc->clean_thread_running = 1;
+
+ /*
+ * Using sysctl_fast_remove to stop the clean thread
+ * works for now. Should have another flag specifically
+ * for such notification.
+ */
+ for ( ; !dmc->sysctl_active.fast_remove; ) {
+ LIST_HEAD(setlist);
+ struct cache_set *set;
+
+ eio_comply_dirty_thresholds(dmc, -1);
+
+ if (dmc->sysctl_active.do_clean) {
+ /* pause the periodic clean */
+ cancel_delayed_work_sync(&dmc->clean_aged_sets_work);
+
+ /* clean all the sets */
+ eio_clean_all(dmc);
+
+ /* resume the periodic clean */
+ spin_lock_irqsave(&dmc->dirty_set_lru_lock, flags);
+ dmc->is_clean_aged_sets_sched = 0;
+ if (dmc->sysctl_active.time_based_clean_interval && atomic64_read(&dmc->nr_dirty)) {
+ /* There is a potential race here: a sysctl could change
+ * time_based_clean_interval to 0. However, strong
+ * synchronisation is not necessary here.
+ */
+ schedule_delayed_work(&dmc->clean_aged_sets_work,
+ dmc->sysctl_active.time_based_clean_interval * 60 * HZ);
+ dmc->is_clean_aged_sets_sched = 1;
+ }
+ spin_unlock_irqrestore(&dmc->dirty_set_lru_lock, flags);
+ }
+
+ if (dmc->sysctl_active.fast_remove) {
+ break;
+ }
+
+ spin_lock_irqsave(&dmc->clean_sl, flags);
+
+ while (!((!list_empty(&dmc->cleanq)) || dmc->sysctl_active.fast_remove ||
+ dmc->sysctl_active.do_clean)) {
+ EIO_WAIT_EVENT(&dmc->clean_event, &dmc->clean_sl, flags);
+ }
+
+ /*
+ * Move cleanq elements to a private list for processing.
+ */
+
+ list_splice_init(&dmc->cleanq, &setlist);
+
+ spin_unlock_irqrestore(&dmc->clean_sl, flags);
+
+ systime = jiffies;
+ while (!list_empty(&setlist)) {
+ set = list_entry((&setlist)->next, struct cache_set, list);
+ list_del(&set->list);
+ index = set - dmc->cache_sets;
+ if (!(dmc->sysctl_active.fast_remove)) {
+ eio_clean_set(dmc, index,
+ set->flags & SETFLAG_CLEAN_WHOLE, 0);
+ } else {
+
+ /*
+ * Since we are not cleaning the set, we should
+ * put the set back in the lru list so that
+ * it is picked up at a later point.
+ * We also need to clear the clean inprog flag
+ * otherwise this set would never be cleaned.
+ */
+
+ spin_lock_irqsave(&dmc->cache_sets[index].cs_lock, flags);
+ dmc->cache_sets[index].flags &=
+ ~(SETFLAG_CLEAN_INPROG | SETFLAG_CLEAN_WHOLE);
+ spin_unlock_irqrestore(&dmc->cache_sets[index].cs_lock, flags);
+ spin_lock_irqsave(&dmc->dirty_set_lru_lock, flags);
+ lru_touch(dmc->dirty_set_lru, index, systime);
+ spin_unlock_irqrestore(&dmc->dirty_set_lru_lock, flags);
+ }
+ atomic64_dec(&dmc->clean_pendings);
+ }
+ }
+
+ /* notifier for cache delete that the clean thread has stopped running */
+ dmc->clean_thread_running = 0;
+
+ eio_thread_exit(0);
+
+ //Should never reach here
+ return 0;
+}
+
+/*
+ * Cache miss support. We read the data from disk, write it to the ssd.
+ * To avoid doing 1 IO at a time to the ssd, when the IO is kicked off,
+ * we enqueue it to a "readfill" queue in the cache in cache sector order.
+ * The worker thread can then issue all of these IOs and do 1 unplug to
+ * start them all.
+ *
+ */
+static void
+eio_enqueue_readfill(struct cache_c *dmc, struct kcached_job *job)
+{
+ unsigned long flags = 0;
+ struct kcached_job **j1, *next;
+ int do_schedule = 0;
+
+
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ /* Insert job in sorted order of cache sector */
+ j1 = &dmc->readfill_queue;
+ while (*j1 != NULL && (*j1)->job_io_regions.cache.sector <
+ job->job_io_regions.cache.sector)
+ j1 = &(*j1)->next;
+ next = *j1;
+ *j1 = job;
+ job->next = next;
+ do_schedule = (dmc->readfill_in_prog == 0);
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ if (do_schedule)
+ schedule_work(&dmc->readfill_wq);
+}
+
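+/*
+ * Worker that drains the sorted readfill queue: for each anchored ebio it
+ * issues an SSD write of the data read from disk, or an SSD read when the
+ * block is ALREADY_DIRTY, and finally unplugs the cache device.
+ */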
+void
+eio_do_readfill(struct work_struct *work)
+{
+ struct kcached_job *job, *joblist;
+ struct eio_bio *ebio;
+ unsigned long flags = 0;
+ struct kcached_job *nextjob = NULL;
+ struct cache_c *dmc = container_of(work, struct cache_c, readfill_wq);
+
+
+
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ if (dmc->readfill_in_prog)
+ goto out;
+ dmc->readfill_in_prog = 1;
+ while (dmc->readfill_queue != NULL) {
+ joblist = dmc->readfill_queue;
+ dmc->readfill_queue = NULL;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ for (job = joblist ; job != NULL ; job = nextjob) {
+ struct eio_bio *iebio;
+ struct eio_bio *next;
+
+ nextjob = job->next; /* save for later because 'job' will be freed */
+ VERIFY(job->action == READFILL);
+ /* Write to cache device */
+ ebio = job->ebio;
+ iebio = ebio->eb_next;
+ VERIFY(iebio);
+ /* other iebios are anchored on this bio. Create
+ * jobs for them and then issue ios
+ */
+ do {
+ struct kcached_job *job;
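+ /* note: shadows the outer 'job'; the outer job is freed after this loop */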
+ int err;
+ unsigned long flags;
+ index_t index;
+ next = iebio->eb_next;
+ index = iebio->eb_index;
+ if (index == -1) {
+ CTRACE("eio_do_readfill:1\n");
+ /* Any INPROG(including DIRTY_INPROG) case would fall here */
+ eb_endio(iebio, 0);
+ iebio = NULL;
+ } else {
+ spin_lock_irqsave(&dmc->cache_sets[iebio->eb_cacheset].cs_lock, flags);
+ /* If this block was already valid, we don't need to write it */
+ if (unlikely(EIO_CACHE_STATE_GET(dmc, index) & QUEUED)) {
+ //An invalidation request is queued. Can't do anything
+ CTRACE("eio_do_readfill:2\n");
+ EIO_CACHE_STATE_SET(dmc, index, INVALID);
+ spin_unlock_irqrestore(&dmc->cache_sets[iebio->eb_cacheset].cs_lock, flags);
+ atomic64_dec_if_positive(&dmc->eio_stats.cached_blocks);
+ eb_endio(iebio, 0);
+ iebio = NULL;
+ } else if ((EIO_CACHE_STATE_GET(dmc, index) & (VALID | DISKREADINPROG))
+ == (VALID | DISKREADINPROG) ) {
+ /* Do readfill. */
+ EIO_CACHE_STATE_SET(dmc, index, VALID | CACHEWRITEINPROG);
+ VERIFY(EIO_DBN_GET(dmc, index) == iebio->eb_sector);
+ spin_unlock_irqrestore(&dmc->cache_sets[iebio->eb_cacheset].cs_lock, flags);
+ job = eio_new_job(dmc, iebio, iebio->eb_index);
+ if (unlikely(job == NULL)) {
+ err = -ENOMEM;
+ } else {
+ err = 0;
+ job->action = READFILL;
+ atomic_inc(&dmc->nr_jobs);
+ SECTOR_STATS(dmc->eio_stats.ssd_readfills, iebio->eb_size);
+ SECTOR_STATS(dmc->eio_stats.ssd_writes, iebio->eb_size);
+ atomic64_inc(&dmc->eio_stats.readfill);
+ atomic64_inc(&dmc->eio_stats.writecache);
+ err = eio_io_async_bvec(dmc, &job->job_io_regions.cache, WRITE,
+ iebio->eb_bv, iebio->eb_nbvec,
+ eio_io_callback, job, 0);
+ }
+ if (err) {
+ pr_err("eio_do_readfill: IO submission failed, block %llu", EIO_DBN_GET(dmc, index));
+ spin_lock_irqsave(&dmc->cache_sets[iebio->eb_cacheset].cs_lock, flags);
+ EIO_CACHE_STATE_SET(dmc, iebio->eb_index, INVALID);
+ spin_unlock_irqrestore(&dmc->cache_sets[iebio->eb_cacheset].cs_lock, flags);
+ atomic64_dec_if_positive(&dmc->eio_stats.cached_blocks);
+ eb_endio(iebio, err);
+
+ if (job) {
+ eio_free_cache_job(job);
+ job = NULL;
+ }
+ }
+ } else if (EIO_CACHE_STATE_GET(dmc, index) == ALREADY_DIRTY) {
+
+ spin_unlock_irqrestore(&dmc->cache_sets[iebio->eb_cacheset].cs_lock, flags);
+
+ /*
+ * DIRTY block handling:
+ * Read the dirty data from the cache block to update
+ * the data buffer already read from the disk
+ */
+ job = eio_new_job(dmc, iebio, iebio->eb_index);
+ if (unlikely(job == NULL)) {
+ err = -ENOMEM;
+ } else {
+ job->action = READCACHE;
+ SECTOR_STATS(dmc->eio_stats.ssd_reads, iebio->eb_size);
+ atomic64_inc(&dmc->eio_stats.readcache);
+ err = eio_io_async_bvec(dmc, &job->job_io_regions.cache, READ,
+ iebio->eb_bv, iebio->eb_nbvec,
+ eio_io_callback, job, 0);
+ }
+
+ if (err) {
+ pr_err("eio_do_readfill: dirty block read IO submission failed, block %llu",
+ EIO_DBN_GET(dmc, index));
+ /* can't invalidate the DIRTY block, just return error */
+ eb_endio(iebio, err);
+ if (job) {
+ eio_free_cache_job(job);
+ job = NULL;
+ }
+ }
+ } else if ((EIO_CACHE_STATE_GET(dmc, index) & (VALID | CACHEREADINPROG))
+ == (VALID|CACHEREADINPROG) ) {
+ //turn off the cache read in prog flag
+ //don't need to write the cache block
+ CTRACE("eio_do_readfill:3\n");
+ EIO_CACHE_STATE_OFF(dmc, index, BLOCK_IO_INPROG);
+ spin_unlock_irqrestore(&dmc->cache_sets[iebio->eb_cacheset].cs_lock, flags);
+ eb_endio(iebio, 0);
+ iebio = NULL;
+ } else {
+ panic("Unknown condition");
+ spin_unlock_irqrestore(&dmc->cache_sets[iebio->eb_cacheset].cs_lock, flags);
+ }
+ }
+ iebio = next;
+ } while (iebio);
+ eb_endio(ebio, 0);
+ ebio = NULL;
+ eio_free_cache_job(job);
+ }
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ }
+ dmc->readfill_in_prog = 0;
+out:
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ atomic64_inc(&dmc->eio_stats.ssd_readfill_unplugs);
+ eio_unplug_cache_device(dmc);
+}
+
+
+/*
+ * Map a block from the source device to a block in the cache device.
+ */
+static u_int32_t
+hash_block(struct cache_c *dmc, sector_t dbn)
+{
+ u_int32_t set_number;
+
+ set_number = eio_hash_block(dmc, dbn);
+ return set_number;
+}
+
+
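+/*
+ * Search the set for a VALID block matching dbn; on a hit, *index is set
+ * and the block is moved to the tail of the replacement policy LRU unless
+ * I/O is already in progress on it. *index is -1 on a miss.
+ */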
+static void
+find_valid_dbn(struct cache_c *dmc, sector_t dbn,
+ index_t start_index, index_t *index)
+{
+ index_t i;
+ index_t end_index = start_index + dmc->assoc;
+
+ for (i = start_index ; i < end_index ; i++) {
+ if ((EIO_CACHE_STATE_GET(dmc, i) & VALID) && EIO_DBN_GET(dmc, i) == dbn) {
+ *index = i;
+ if ((EIO_CACHE_STATE_GET(dmc, i) & BLOCK_IO_INPROG) == 0)
+ eio_policy_reclaim_lru_movetail(dmc, i, dmc->policy_ops);
+ return;
+ }
+ }
+ *index = -1;
+}
+
+
+static index_t
+find_invalid_dbn(struct cache_c *dmc, index_t start_index)
+{
+ index_t i;
+ index_t end_index = start_index + dmc->assoc;
+
+ /* Find INVALID slot that we can reuse */
+ for (i = start_index ; i < end_index ; i++) {
+ if (EIO_CACHE_STATE_GET(dmc, i) == INVALID) {
+ eio_policy_reclaim_lru_movetail(dmc, i, dmc->policy_ops);
+ return i;
+ }
+ }
+ return -1;
+}
+
+
+/* Search for a slot that we can reclaim */
+static void
+find_reclaim_dbn(struct cache_c *dmc, index_t start_index, index_t *index)
+{
+ int i;
+ index_t idx;
+
+
+ if (dmc->policy_ops == NULL) {
+ /*
+ * "start_index" should already be the beginning index of the set.
+ * We're just being cautious here.
+ */
+ start_index = (start_index / dmc->assoc) * dmc->assoc;
+ for (i = 0; i < (int)dmc->assoc; i++) {
+ idx = dmc->random++ % dmc->assoc;
+ if (EIO_CACHE_STATE_GET(dmc, start_index + idx) == VALID) {
+ *index = start_index + idx;
+ return;
+ }
+ }
+ } else
+ eio_find_reclaim_dbn(dmc->policy_ops, start_index, index);
+}
+
+void
+eio_set_warm_boot(void)
+{
+ eio_force_warm_boot = 1;
+ return;
+}
+
+/*
+ * dbn is the starting sector.
+ */
+static int
+eio_lookup(struct cache_c *dmc, struct eio_bio *ebio, index_t *index)
+{
+ sector_t dbn = EIO_ROUND_SECTOR(dmc, ebio->eb_sector);
+ u_int32_t set_number;
+ index_t invalid, oldest_clean = -1;
+ index_t start_index;
+
+
+ //ASK it is assumed that the lookup is being done for a single block
+ set_number = hash_block(dmc, dbn);
+ start_index = dmc->assoc * set_number;
+ find_valid_dbn(dmc, dbn, start_index, index);
+ if (*index >= 0) {
+ /* We found the exact range of blocks we are looking for */
+ return VALID;
+ }
+
+ invalid = find_invalid_dbn(dmc, start_index);
+ if (invalid == -1) {
+ /* We didn't find an invalid entry, search for oldest valid entry */
+ find_reclaim_dbn(dmc, start_index, &oldest_clean);
+ }
+ /*
+ * Cache miss :
+ * We can't choose an entry marked INPROG, but choose the oldest
+ * INVALID or the oldest VALID entry.
+ */
+ *index = start_index + dmc->assoc;
+ if (invalid != -1) {
+ *index = invalid;
+ return INVALID;
+ } else if (oldest_clean != -1) {
+ *index = oldest_clean;
+ return VALID;
+ }
+ return -1;
+}
+
+/* Do metadata update for a set */
+static void
+eio_do_mdupdate(struct work_struct *work)
+{
+ struct mdupdate_request *mdreq;
+ struct cache_set *set;
+ struct cache_c *dmc;
+ unsigned long flags;
+ index_t i;
+ index_t start_index;
+ index_t end_index;
+ index_t min_index;
+ index_t max_index;
+ struct flash_cacheblock *md_blocks;
+ struct eio_bio *ebio;
+ u_int8_t cstate;
+ struct eio_io_region region;
+ unsigned pindex;
+ int error, j;
+ index_t blk_index;
+ int k;
+ void *pg_virt_addr[2] = {NULL};
+ u_int8_t sector_bits[2] = {0};
+ int startbit, endbit;
+ int rw_flags = 0;
+
+ mdreq = container_of(work, struct mdupdate_request, work);
+ dmc = mdreq->dmc;
+ set = &dmc->cache_sets[mdreq->set];
+
+ mdreq->error = 0;
+ VERIFY(mdreq->mdblk_bvecs);
+
+ /*
+ * md_size = dmc->assoc * sizeof(struct flash_cacheblock);
+ * Currently, md_size is 8192 bytes, mdpage_count is 2 pages maximum.
+ */
+
+ VERIFY(mdreq->mdbvec_count && mdreq->mdbvec_count <= 2);
+ VERIFY((dmc->assoc == 512) || mdreq->mdbvec_count == 1);
+ for (k = 0; k < (int)mdreq->mdbvec_count; k++)
+ pg_virt_addr[k] = kmap(mdreq->mdblk_bvecs[k].bv_page);
+
+ spin_lock_irqsave(&set->cs_lock, flags);
+
+ start_index = mdreq->set * dmc->assoc;
+ end_index = start_index + dmc->assoc;
+
+ pindex = 0;
+ md_blocks = (struct flash_cacheblock *)pg_virt_addr[pindex];
+ j = MD_BLOCKS_PER_PAGE;
+
+ /* initialize the md blocks to write */
+ for (i = start_index; i < end_index; i++) {
+ cstate = EIO_CACHE_STATE_GET(dmc, i);
+ md_blocks->dbn = EIO_DBN_GET(dmc, i);
+ if (cstate == ALREADY_DIRTY) {
+ md_blocks->cache_state =
+ (VALID | DIRTY);
+ } else {
+ md_blocks->cache_state = INVALID;
+ }
+ md_blocks++;
+ j--;
+
+ if ((j == 0) && (++pindex < mdreq->mdbvec_count)) {
+ md_blocks = (struct flash_cacheblock *)pg_virt_addr[pindex];
+ j = MD_BLOCKS_PER_PAGE;
+ }
+
+ }
+
+ /* Update the md blocks with the pending mdlist */
+ min_index = start_index;
+ max_index = start_index;
+
+ pindex = 0;
+ md_blocks = (struct flash_cacheblock *)pg_virt_addr[pindex];
+
+ ebio = mdreq->pending_mdlist;
+ while (ebio) {
+ VERIFY(EIO_CACHE_STATE_GET(dmc, ebio->eb_index) ==
+ DIRTY_INPROG);
+
+ blk_index = ebio->eb_index - start_index;
+ pindex = INDEX_TO_MD_PAGE(blk_index);
+ blk_index = INDEX_TO_MD_PAGE_OFFSET(blk_index);
+ sector_bits[pindex] |= (1 << INDEX_TO_MD_SECTOR(blk_index));
+
+ md_blocks = (struct flash_cacheblock *)pg_virt_addr[pindex];
+ md_blocks[blk_index].cache_state = (VALID | DIRTY);
+
+ if (min_index > ebio->eb_index) {
+ min_index = ebio->eb_index;
+ }
+
+ if (max_index < ebio->eb_index) {
+ max_index = ebio->eb_index;
+ }
+
+ ebio = ebio->eb_next;
+ }
+
+ /*
+ * Below code may be required when selective pages need to be
+ * submitted for metadata update. Currently avoiding the optimization
+ * for correctness validation.
+ */
+
+ /*
+ min_cboff = (min_index - start_index) / MD_BLOCKS_PER_CBLOCK(dmc);
+ max_cboff = (max_index - start_index) / MD_BLOCKS_PER_CBLOCK(dmc);
+ write_size = ((uint32_t)(max_cboff - min_cboff + 1)) << dmc->block_shift;
+ VERIFY(write_size && (write_size <= to_sector(mdreq->md_size)));
+ */
+
+ /* Move the pending mdlist to inprog list */
+ mdreq->inprog_mdlist = mdreq->pending_mdlist;
+ mdreq->pending_mdlist = NULL;
+
+ spin_unlock_irqrestore(&set->cs_lock, flags);
+
+ for (k = 0; k < (int)mdreq->mdbvec_count; k++)
+ kunmap(mdreq->mdblk_bvecs[k].bv_page);
+
+ /*
+ * Initiate the I/O to SSD for on-disk md update.
+ * Harish: TBD. Optimize to write only the affected blocks
+ */
+
+ region.bdev = dmc->cache_dev->bdev;
+ /*region.sector = dmc->md_start_sect + INDEX_TO_MD_SECTOR(start_index) +
+ (min_cboff << dmc->block_shift); */
+
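+	/*
+	 * sector_bits[] records which metadata sectors within each page were
+	 * updated. For every page with updates, issue a single write covering
+	 * the contiguous range from the first to the last touched sector.
+	 */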
+ atomic_set(&mdreq->holdcount, 1);
+ for (i = 0; i < mdreq->mdbvec_count; i++) {
+ if (!sector_bits[i]) {
+ continue;
+ }
+ startbit = -1;
+ j = 0;
+ while (startbit == -1) {
+ if (sector_bits[i] & (1 << j)) {
+ startbit = j;
+ }
+ j++;
+ }
+ endbit = -1;
+ j = 7;
+ while (endbit == -1) {
+ if (sector_bits[i] & (1 << j)) {
+ endbit = j;
+ }
+ j--;
+ }
+ VERIFY(startbit <= endbit && startbit >= 0 && startbit <= 7 &&
+ endbit >= 0 && endbit <= 7);
+ VERIFY(dmc->assoc != 128 || endbit <= 3);
+ region.sector = dmc->md_start_sect + INDEX_TO_MD_SECTOR(start_index) +
+ i * SECTORS_PER_PAGE + startbit;
+ region.count = endbit - startbit + 1;
+ mdreq->mdblk_bvecs[i].bv_offset = to_bytes(startbit);
+ mdreq->mdblk_bvecs[i].bv_len = to_bytes(region.count);
+
+ VERIFY(region.sector <= (dmc->md_start_sect + INDEX_TO_MD_SECTOR(end_index)));
+ atomic64_inc(&dmc->eio_stats.md_ssd_writes);
+ SECTOR_STATS(dmc->eio_stats.ssd_writes, to_bytes(region.count));
+ atomic_inc(&mdreq->holdcount);
+
+		/*
+		 * Set REQ_SYNC to give the metadata
+		 * writes a higher priority.
+		 */
+		rw_flags = WRITE | REQ_SYNC;
+ error = eio_io_async_bvec(dmc, ®ion, rw_flags,
+ &mdreq->mdblk_bvecs[i], 1,
+ eio_mdupdate_callback, work, 0);
+ if (error && !(mdreq->error)) {
+ mdreq->error = error;
+ }
+ }
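+	/*
+	 * Drop the initial hold taken above. If all the metadata writes have
+	 * already completed (or none were issued), schedule the post-update
+	 * work here instead of from the I/O callback.
+	 */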
+ if (atomic_dec_and_test(&mdreq->holdcount)) {
+ INIT_WORK(&mdreq->work, eio_post_mdupdate);
+ queue_work(dmc->mdupdate_q, &mdreq->work);
+ }
+}
+
+/* Callback function for ondisk metadata update */
+static void
+eio_mdupdate_callback(int error, void *context)
+{
+ struct work_struct *work = (struct work_struct *)context;
+ struct mdupdate_request *mdreq;
+
+ mdreq = container_of(work, struct mdupdate_request, work);
+ if (error && !(mdreq->error)) {
+ mdreq->error = error;
+ }
+ if (!atomic_dec_and_test(&mdreq->holdcount)) {
+ return;
+ }
+ INIT_WORK(&mdreq->work, eio_post_mdupdate);
+ queue_work(mdreq->dmc->mdupdate_q, &mdreq->work);
+}
+
+static void
+eio_post_mdupdate(struct work_struct *work)
+{
+ struct mdupdate_request *mdreq;
+ struct cache_set *set;
+ struct cache_c *dmc;
+ unsigned long flags;
+ struct eio_bio *ebio;
+ struct eio_bio *nebio;
+ int more_pending_mdupdates = 0;
+ int error;
+ index_t set_index;
+
+ mdreq = container_of(work, struct mdupdate_request, work);
+
+ dmc = mdreq->dmc;
+ VERIFY(dmc);
+ set_index = mdreq->set;
+ set = &dmc->cache_sets[set_index];
+ error = mdreq->error;
+
+ /* Update in-core cache metadata */
+
+ spin_lock_irqsave(&set->cs_lock, flags);
+
+ /*
+ * Update dirty inprog blocks.
+ * On error, convert them to INVALID
+ * On success, convert them to ALREADY_DIRTY
+ */
+ ebio = mdreq->inprog_mdlist;
+ while (ebio) {
+ VERIFY(EIO_CACHE_STATE_GET(dmc, ebio->eb_index) == DIRTY_INPROG);
+ if (unlikely(error)) {
+ EIO_CACHE_STATE_SET(dmc, ebio->eb_index, INVALID);
+ atomic64_dec_if_positive(&dmc->eio_stats.cached_blocks);
+ } else {
+ EIO_CACHE_STATE_SET(dmc, ebio->eb_index, ALREADY_DIRTY);
+ set->nr_dirty++;
+ atomic64_inc(&dmc->nr_dirty);
+ atomic64_inc(&dmc->eio_stats.md_write_dirty);
+ }
+ ebio = ebio->eb_next;
+ }
+
+ /*
+ * If there are more pending requests for md update,
+ * need to pick up those using the current mdreq.
+ */
+ if (mdreq->pending_mdlist) {
+ more_pending_mdupdates = 1;
+ } else {
+ /* No request pending, we can free the mdreq */
+ set->mdreq = NULL;
+ }
+
+ /*
+ * After we unlock the set, we need to end the I/Os,
+ * which were processed as part of this md update
+ */
+
+ ebio = mdreq->inprog_mdlist;
+ mdreq->inprog_mdlist = NULL;
+
+ spin_unlock_irqrestore(&set->cs_lock, flags);
+
+ /* End the processed I/Os */
+ while (ebio) {
+ nebio = ebio->eb_next;
+ eb_endio(ebio, error);
+ ebio = nebio;
+ }
+
+ /*
+ * if dirty block was added
+ * 1. update the cache set lru list
+ * 2. check and initiate cleaning if thresholds are crossed
+ */
+ if (!error) {
+ eio_touch_set_lru(dmc, set_index);
+ eio_comply_dirty_thresholds(dmc, set_index);
+ }
+
+ if (more_pending_mdupdates) {
+ /*
+ * Schedule work to process the new
+ * pending mdupdate requests
+ */
+ INIT_WORK(&mdreq->work, eio_do_mdupdate);
+ queue_work(dmc->mdupdate_q, &mdreq->work);
+ } else {
+ /*
+ * No more pending mdupdates.
+ * Free the mdreq.
+ */
+ if (mdreq->mdblk_bvecs) {
+ eio_free_wb_bvecs(mdreq->mdblk_bvecs, mdreq->mdbvec_count,
+ SECTORS_PER_PAGE);
+ kfree(mdreq->mdblk_bvecs);
+ }
+
+ kfree(mdreq);
+ }
+}
+
+/* Enqueue metadata update for marking dirty blocks on-disk/in-core */
+static void
+eio_enq_mdupdate(struct bio_container *bc)
+{
+ unsigned long flags = 0;
+ index_t set_index;
+ struct eio_bio *ebio;
+ struct cache_c *dmc = bc->bc_dmc;
+ struct cache_set *set = NULL;
+ struct mdupdate_request *mdreq;
+ int do_schedule;
+
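+	/*
+	 * bc_mdlist is sorted by cache set. Process each run of ebios that
+	 * belong to the same set under that set's lock: either start a new
+	 * mdreq (and schedule the metadata update) or append to the pending
+	 * list of the mdreq already in progress for that set.
+	 */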
+ ebio = bc->bc_mdlist;
+ set_index = -1;
+ do_schedule = 0;
+ while (ebio) {
+ if (ebio->eb_cacheset != set_index) {
+ set_index = ebio->eb_cacheset;
+ set = &dmc->cache_sets[set_index];
+ spin_lock_irqsave(&set->cs_lock, flags);
+ }
+ VERIFY(ebio->eb_cacheset == set_index);
+
+ bc->bc_mdlist = ebio->eb_next;
+
+ if (!set->mdreq) {
+ /* Pick up one mdreq from bc */
+ mdreq = bc->mdreqs;
+ VERIFY(mdreq != NULL);
+ bc->mdreqs = bc->mdreqs->next;
+ mdreq->next = NULL;
+ mdreq->pending_mdlist = ebio;
+ mdreq->dmc = dmc;
+ mdreq->set = set_index;
+ set->mdreq = mdreq;
+ ebio->eb_next = NULL;
+ do_schedule = 1;
+ } else {
+ mdreq = set->mdreq;
+ VERIFY(mdreq != NULL);
+ ebio->eb_next = mdreq->pending_mdlist;
+ mdreq->pending_mdlist = ebio;
+ }
+
+ ebio = bc->bc_mdlist;
+ if (!ebio || ebio->eb_cacheset != set_index) {
+ spin_unlock_irqrestore(&set->cs_lock, flags);
+ if (do_schedule) {
+ INIT_WORK(&mdreq->work, eio_do_mdupdate);
+ queue_work(dmc->mdupdate_q, &mdreq->work);
+ do_schedule = 0;
+ }
+ }
+ }
+
+ VERIFY(bc->bc_mdlist == NULL);
+}
+
+/* Kick-off a cache metadata update for marking the blocks dirty */
+void
+eio_md_write(struct kcached_job *job)
+{
+ struct eio_bio *ebio = job->ebio;
+ struct eio_bio *nebio;
+ struct eio_bio *pebio;
+ struct bio_container *bc = ebio->eb_bc;
+ unsigned long flags;
+ int enqueue = 0;
+
+ /*
+ * ebios are stored in ascending order of cache sets.
+ */
+
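+	/*
+	 * Insert the ebio into bc_mdlist, keeping the ascending cache-set
+	 * order. When the last expected metadata write arrives (bc_mdwait
+	 * drops to zero), kick off the on-disk metadata update.
+	 */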
+ spin_lock_irqsave(&bc->bc_lock, flags);
+ VERIFY(bc->bc_mdwait > 0);
+ nebio = bc->bc_mdlist;
+ pebio = NULL;
+ while (nebio) {
+ if (nebio->eb_cacheset > ebio->eb_cacheset) {
+ break;
+ }
+ pebio = nebio;
+ nebio = nebio->eb_next;
+ }
+ ebio->eb_next = nebio;
+ if (!pebio) {
+ bc->bc_mdlist = ebio;
+ } else {
+ pebio->eb_next = ebio;
+ }
+ bc->bc_mdwait--;
+ if (bc->bc_mdwait == 0) {
+ enqueue = 1;
+ }
+ spin_unlock_irqrestore(&bc->bc_lock, flags);
+
+ eio_free_cache_job(job);
+
+ if (enqueue) {
+ eio_enq_mdupdate(bc);
+ }
+}
+
+/* Ensure cache level dirty thresholds compliance. If required, trigger cache-wide clean */
+static void
+eio_check_dirty_cache_thresholds(struct cache_c *dmc)
+{
+ if (DIRTY_CACHE_THRESHOLD_CROSSED(dmc)) {
+ int64_t required_cleans;
+ int64_t enqueued_cleans;
+ u_int64_t set_time;
+ index_t set_index;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dmc->clean_sl, flags);
+ if (atomic64_read(&dmc->clean_pendings) || dmc->clean_excess_dirty) {
+			/* Excess dirty block cleaning is already in progress */
+ spin_unlock_irqrestore(&dmc->clean_sl, flags);
+ return;
+ }
+ dmc->clean_excess_dirty = 1;
+ spin_unlock_irqrestore(&dmc->clean_sl, flags);
+
+ /* Clean needs to be triggered on the cache */
+ required_cleans = atomic64_read(&dmc->nr_dirty) -
+ ((dmc->sysctl_active.dirty_low_threshold * dmc->size)/100);
+ enqueued_cleans = 0;
+
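+		/*
+		 * Walk the dirty set LRU from its head, enqueueing whole sets
+		 * for cleaning until enough dirty blocks have been queued to
+		 * bring the cache below its low dirty threshold.
+		 */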
+ spin_lock_irqsave(&dmc->dirty_set_lru_lock, flags);
+ do {
+ lru_rem_head(dmc->dirty_set_lru, &set_index, &set_time);
+ if (set_index == LRU_NULL) {
+ break;
+ }
+
+ enqueued_cleans += dmc->cache_sets[set_index].nr_dirty;
+ spin_unlock_irqrestore(&dmc->dirty_set_lru_lock, flags);
+ eio_addto_cleanq(dmc, set_index, 1);
+ spin_lock_irqsave(&dmc->dirty_set_lru_lock, flags);
+ } while (enqueued_cleans <= required_cleans);
+ spin_unlock_irqrestore(&dmc->dirty_set_lru_lock, flags);
+ spin_lock_irqsave(&dmc->clean_sl, flags);
+ dmc->clean_excess_dirty = 0;
+ spin_unlock_irqrestore(&dmc->clean_sl, flags);
+ }
+}
+
+/* Ensure set level dirty thresholds compliance. If required, trigger set clean */
+static void
+eio_check_dirty_set_thresholds(struct cache_c *dmc, index_t set)
+{
+ if (DIRTY_SET_THRESHOLD_CROSSED(dmc, set)) {
+ eio_addto_cleanq(dmc, set, 0);
+ return;
+ }
+}
+
+/* Ensure various cache thresholds compliance. If required trigger clean */
+void
+eio_comply_dirty_thresholds(struct cache_c *dmc, index_t set)
+{
+ /*
+ * 1. Don't trigger new cleanings if
+ * - cache is not wb
+ * - autoclean threshold is crossed
+ * - fast remove in progress is set
+ * - cache is in failed mode.
+ * 2. Initiate set-wide clean, if set level dirty threshold is crossed
+ * 3. Initiate cache-wide clean, if cache level dirty threshold is crossed
+ */
+
+ if (unlikely(CACHE_FAILED_IS_SET(dmc))) {
+ pr_debug("eio_comply_dirty_thresholds: Cache %s is in failed mode.\n",
+ dmc->cache_name);
+ return;
+ }
+
+
+ if (AUTOCLEAN_THRESHOLD_CROSSED(dmc) || (dmc->mode != CACHE_MODE_WB)) {
+ return;
+ }
+
+ if (set != -1) {
+ eio_check_dirty_set_thresholds(dmc, set);
+ }
+ eio_check_dirty_cache_thresholds(dmc);
+}
+
+/* Do read from cache */
+static void
+eio_cached_read(struct cache_c *dmc, struct eio_bio* ebio, int rw_flags)
+{
+ struct kcached_job *job;
+ index_t index = ebio->eb_index;
+ int err = 0;
+
+
+ job = eio_new_job(dmc, ebio, index);
+
+ if (unlikely(job == NULL)) {
+ err = -ENOMEM;
+ } else {
+ job->action = READCACHE; /* Fetch data from cache */
+ atomic_inc(&dmc->nr_jobs);
+
+ SECTOR_STATS(dmc->eio_stats.read_hits, ebio->eb_size);
+ SECTOR_STATS(dmc->eio_stats.ssd_reads, ebio->eb_size);
+ atomic64_inc(&dmc->eio_stats.readcache);
+ err = eio_io_async_bvec(dmc, &job->job_io_regions.cache, rw_flags,
+ ebio->eb_bv, ebio->eb_nbvec,
+ eio_io_callback, job, 0);
+
+
+ }
+ if (err) {
+ unsigned long flags;
+ pr_err("eio_cached_read: IO submission failed, block %llu", EIO_DBN_GET(dmc, index));
+ spin_lock_irqsave(&dmc->cache_sets[ebio->eb_cacheset].cs_lock, flags);
+ /*
+ * For already DIRTY block, invalidation is too costly, skip it.
+ * For others, mark the block as INVALID and return error.
+ */
+ if (EIO_CACHE_STATE_GET(dmc, ebio->eb_index) != ALREADY_DIRTY) {
+ EIO_CACHE_STATE_SET(dmc, ebio->eb_index, INVALID);
+ atomic64_dec_if_positive(&dmc->eio_stats.cached_blocks);
+ }
+ spin_unlock_irqrestore(&dmc->cache_sets[ebio->eb_cacheset].cs_lock, flags);
+ eb_endio(ebio, err);
+ ebio = NULL;
+ if (job) {
+ job->ebio = NULL;
+ eio_free_cache_job(job);
+ job = NULL;
+ }
+ }
+}
+
+/*
+ * Invalidate any colliding blocks if they are !BUSY and !DIRTY. In BUSY case,
+ * we need to wait until the underlying IO is finished, and then proceed with
+ * the invalidation, so a QUEUED flag is added.
+ */
+static int
+eio_inval_block_set_range(struct cache_c *dmc, int set, sector_t iosector,
+ unsigned iosize, int multiblk)
+{
+ int start_index, end_index, i;
+ sector_t endsector = iosector + to_sector(iosize);
+
+
+ start_index = dmc->assoc * set;
+ end_index = start_index + dmc->assoc;
+ for (i = start_index ; i < end_index ; i++) {
+ sector_t start_dbn;
+ sector_t end_dbn;
+
+ if (EIO_CACHE_STATE_GET(dmc, i) & INVALID)
+ continue;
+ start_dbn = EIO_DBN_GET(dmc, i);
+ end_dbn = start_dbn + dmc->block_size;
+
+ if (!(endsector <= start_dbn || iosector >= end_dbn)) {
+
+ if (!(EIO_CACHE_STATE_GET(dmc, i) & (BLOCK_IO_INPROG | DIRTY | QUEUED))) {
+ EIO_CACHE_STATE_SET(dmc, i, INVALID);
+ atomic64_dec_if_positive(&dmc->eio_stats.cached_blocks);
+ if (multiblk)
+ continue;
+ return 0;
+ }
+
+ /* Skip queued flag for DIRTY(inprog or otherwise) blocks. */
+ if (!(EIO_CACHE_STATE_GET(dmc, i) & (DIRTY | QUEUED))) {
+ /* BLOCK_IO_INPROG is set. Set QUEUED flag */
+ EIO_CACHE_STATE_ON(dmc, i, QUEUED);
+ }
+
+ if (!multiblk)
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int
+eio_invalidate_sanity_check(struct cache_c *dmc, u_int64_t iosector,
+ u_int64_t *num_sectors)
+{
+ u_int64_t disk_size;
+
+	/*
+	 * Sanity check the arguments
+	 */
+	if (unlikely(*num_sectors == 0)) {
+		pr_info("invalidate_sector_range: nothing to do because number of sectors specified is zero");
+ return -EINVAL;
+ }
+
+ disk_size = to_sector(eio_get_device_size(dmc->disk_dev));
+ if (iosector >= disk_size) {
+ pr_err("eio_inval_range: nothing to do because starting sector is past last sector (%lu > %lu)",
+ (long unsigned int)iosector, (long unsigned int)disk_size);
+ return -EINVAL;
+ }
+
+ if ((iosector + (*num_sectors)) > disk_size) {
+		pr_info("eio_inval_range: trimming range because there are fewer sectors to invalidate than requested. (%lu < %lu)",
+ (long unsigned int)(disk_size - iosector), (long unsigned int)*num_sectors);
+ *num_sectors = (disk_size - iosector);
+ }
+
+ return 0;
+}
+
+
+#if defined (VMCACHE)
+int
+eio_invalidate_sector_range(char *cache_name, u_int64_t iosector, u_int64_t num_sectors)
+{
+ struct cache_c *dmc;
+ int ret;
+
+ dmc = eio_find_cache(cache_name);
+
+ if (dmc == NULL) {
+ pr_err("invalidate_sector_range: cache object with name=%s does not exist.",
+ cache_name);
+ return -EINVAL;
+ }
+
+ ret = eio_invalidate_sanity_check(dmc, iosector, &num_sectors);
+
+ if (ret == 0)
+ eio_inval_range(dmc, iosector, (unsigned)to_bytes(num_sectors));
+ else
+ return ret;
+
+ if (CACHE_VERBOSE_IS_SET(dmc)) {
+		pr_info("eio_inval_range: Invalidated %lu sectors starting at sector=%lu",
+			(long unsigned int)num_sectors, (long unsigned int)iosector);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(eio_invalidate_sector_range);
+#endif /* VMCACHE */
+
+void
+eio_inval_range(struct cache_c *dmc, sector_t iosector, unsigned iosize)
+{
+ u_int32_t bset;
+ sector_t snum;
+ sector_t snext;
+ unsigned ioinset;
+ unsigned long flags;
+ int totalsshift = dmc->block_shift + dmc->consecutive_shift;
+
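+	/*
+	 * Split the I/O range at cache set boundaries (totalsshift is the
+	 * log2 of sectors per set) and invalidate each set's overlapping
+	 * portion under that set's lock.
+	 */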
+ snum = iosector;
+ while (iosize) {
+ bset = hash_block(dmc, snum);
+ snext = ((snum >> totalsshift) + 1) << totalsshift;
+ ioinset = (unsigned)to_bytes(snext - snum);
+ if (ioinset > iosize)
+ ioinset = iosize;
+ spin_lock_irqsave(&dmc->cache_sets[bset].cs_lock, flags);
+ eio_inval_block_set_range(dmc, bset, snum, ioinset, 1);
+ spin_unlock_irqrestore(&dmc->cache_sets[bset].cs_lock, flags);
+ snum = snext;
+ iosize -= ioinset;
+ }
+}
+
+/*
+ * Invalidates all cached blocks without waiting for them to complete
+ * Should be called with incoming IO suspended
+ */
+int
+eio_invalidate_cache(struct cache_c *dmc)
+{
+ u_int64_t i = 0;
+ unsigned long flags = 0;
+ sector_t disk_dev_size = to_bytes(eio_get_device_size(dmc->disk_dev));
+
+
+ /* invalidate the whole cache */
+ for (i = 0 ; i < (dmc->size >> dmc->consecutive_shift) ; i++) {
+ spin_lock_irqsave (&dmc->cache_sets[i].cs_lock, flags);
+		/* TODO: apply a proper fix for the cast to disk_dev_size */
+ (void) eio_inval_block_set_range(dmc, (int) i, 0,
+ (unsigned)disk_dev_size, 0);
+ spin_unlock_irqrestore(&dmc->cache_sets[i].cs_lock, flags);
+ } /* end - for all cachesets (i) */
+
+	return 0;	/* we may need to return different statuses in the future */
+} /* eio_invalidate_cache */
+
+static int
+eio_inval_block(struct cache_c *dmc, sector_t iosector)
+{
+ u_int32_t bset;
+ int queued;
+
+
+	/* Chop off the lower bits of iosector */
+ iosector = EIO_ROUND_SECTOR(dmc, iosector);
+ bset = hash_block(dmc, iosector);
+ queued = eio_inval_block_set_range(dmc, bset, iosector,
+ (unsigned)to_bytes(dmc->block_size), 0);
+
+ return queued;
+}
+
+/* Serving write I/Os, that involves both SSD and HDD */
+static int
+eio_uncached_write(struct cache_c *dmc, struct eio_bio *ebio)
+{
+ struct kcached_job *job;
+ int err = 0;
+ index_t index = ebio->eb_index;
+ unsigned long flags = 0;
+ u_int8_t cstate;
+
+ if (index == -1) {
+ /*
+		 * Nothing to do if a cache block is not allocated.
+		 * Ensure invalidation of the block at the end.
+ */
+ ebio->eb_iotype |= EB_INVAL;
+ return 0;
+ }
+
+ spin_lock_irqsave(&dmc->cache_sets[ebio->eb_cacheset].cs_lock, flags);
+ cstate = EIO_CACHE_STATE_GET(dmc, index);
+ VERIFY(cstate & (DIRTY | CACHEWRITEINPROG));
+ if (cstate == ALREADY_DIRTY) {
+ /*
+		 * Treat a cache write failure on a dirty block as
+		 * an I/O failure for the entire I/O.
+		 * TODO: can we live without this restriction?
+ */
+ ebio->eb_iotype = EB_MAIN_IO;
+
+ /*
+ * We don't set inprog flag on dirty block.
+ * In lieu of the inprog flag, we are using the
+ * eb_holdcount for dirty block, so that the
+ * endio can be called, only when the write to disk
+ * and the write to cache both complete for the ebio
+ */
+ atomic_inc(&ebio->eb_holdcount);
+ } else {
+ /* ensure DISKWRITEINPROG for uncached write on non-DIRTY blocks */
+ EIO_CACHE_STATE_ON(dmc, index, DISKWRITEINPROG);
+ }
+
+ spin_unlock_irqrestore(&dmc->cache_sets[ebio->eb_cacheset].cs_lock, flags);
+
+ job = eio_new_job(dmc, ebio, index);
+ if (unlikely(job == NULL)) {
+ err = -ENOMEM;
+ } else {
+ job->action = WRITECACHE;
+ SECTOR_STATS(dmc->eio_stats.ssd_writes, ebio->eb_size);
+ atomic64_inc(&dmc->eio_stats.writecache);
+ err = eio_io_async_bvec(dmc, &job->job_io_regions.cache, WRITE,
+ ebio->eb_bv, ebio->eb_nbvec,
+ eio_io_callback, job, 0);
+ }
+
+ if (err) {
+ pr_err("eio_uncached_write: IO submission failed, block %llu",
+ EIO_DBN_GET(dmc, index));
+ spin_lock_irqsave(&dmc->cache_sets[ebio->eb_cacheset].cs_lock, flags);
+ if (EIO_CACHE_STATE_GET(dmc, ebio->eb_index) == ALREADY_DIRTY) {
+ /*
+ * Treat I/O failure on a DIRTY block as failure of entire I/O.
+			 * TODO: better error handling could invalidate the dirty
+			 * block if the cache write failed but the disk write succeeded.
+ */
+ ebio->eb_bc->bc_error = err;
+ } else {
+ /* Mark the block as INVALID for non-DIRTY block. */
+ EIO_CACHE_STATE_SET(dmc, ebio->eb_index, INVALID);
+ atomic64_dec_if_positive(&dmc->eio_stats.cached_blocks);
+ /* Set the INVAL flag to ensure block is marked invalid at the end */
+ ebio->eb_iotype |= EB_INVAL;
+ ebio->eb_index = -1;
+ }
+ spin_unlock_irqrestore(&dmc->cache_sets[ebio->eb_cacheset].cs_lock, flags);
+ if (job) {
+ job->ebio = NULL;
+ eio_free_cache_job(job);
+ job = NULL;
+ }
+ }
+
+ return err;
+}
+
+/* Serving write I/Os that can be fulfilled just by SSD */
+static int
+eio_cached_write(struct cache_c *dmc, struct eio_bio *ebio, int rw_flags)
+{
+ struct kcached_job *job;
+ int err = 0;
+ index_t index = ebio->eb_index;
+ unsigned long flags = 0;
+ u_int8_t cstate;
+
+ /*
+ * WRITE (I->DV)
+ * WRITE (V->DV)
+ * WRITE (V1->DV2)
+ * WRITE (DV->DV)
+ */
+
+ /* Possible only in writeback caching mode */
+ VERIFY(dmc->mode == CACHE_MODE_WB);
+
+	/*
+	 * TODO: the spinlock/unlock here may not be needed.
+	 */
+ spin_lock_irqsave(&dmc->cache_sets[ebio->eb_cacheset].cs_lock, flags);
+ cstate = EIO_CACHE_STATE_GET(dmc, index);
+ if (!(cstate & DIRTY)) {
+ VERIFY(cstate & CACHEWRITEINPROG);
+ /* make sure the block is marked DIRTY inprogress */
+ EIO_CACHE_STATE_SET(dmc, index, DIRTY_INPROG);
+ }
+ spin_unlock_irqrestore(&dmc->cache_sets[ebio->eb_cacheset].cs_lock, flags);
+
+ job = eio_new_job(dmc, ebio, index);
+ if (unlikely(job == NULL)) {
+ err = -ENOMEM;
+ } else {
+ job->action = WRITECACHE;
+
+ SECTOR_STATS(dmc->eio_stats.ssd_writes, ebio->eb_size);
+ atomic64_inc(&dmc->eio_stats.writecache);
+ VERIFY((rw_flags & 1) == WRITE);
+ err = eio_io_async_bvec(dmc, &job->job_io_regions.cache, rw_flags,
+ ebio->eb_bv, ebio->eb_nbvec,
+ eio_io_callback, job, 0);
+
+ }
+
+ if (err) {
+ pr_err("eio_cached_write: IO submission failed, block %llu", EIO_DBN_GET(dmc, index));
+ spin_lock_irqsave(&dmc->cache_sets[ebio->eb_cacheset].cs_lock, flags);
+ cstate = EIO_CACHE_STATE_GET(dmc, index);
+ if (cstate == DIRTY_INPROG) {
+ /* A DIRTY(inprog) block should be invalidated on error */
+ EIO_CACHE_STATE_SET(dmc, ebio->eb_index, INVALID);
+ atomic64_dec_if_positive(&dmc->eio_stats.cached_blocks);
+ } else {
+			/* For an already DIRTY block there is no option but to return the error. */
+ VERIFY(cstate == ALREADY_DIRTY);
+ }
+ spin_unlock_irqrestore(&dmc->cache_sets[ebio->eb_cacheset].cs_lock, flags);
+ eb_endio(ebio, err);
+ ebio = NULL;
+ if (job) {
+ job->ebio = NULL;
+ eio_free_cache_job(job);
+ job = NULL;
+ }
+ }
+
+ return err;
+}
+
+
+static struct eio_bio *
+eio_new_ebio(struct cache_c *dmc, struct bio *bio, unsigned *presidual_biovec, sector_t snum,
+ int iosize, struct bio_container *bc, int iotype)
+{
+ struct eio_bio *ebio;
+ int residual_biovec = *presidual_biovec;
+ int numbvecs = 0;
+ int ios;
+
+
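+	/*
+	 * If the previous split ended partway through a biovec, build a
+	 * private biovec array (eb_rbv) for this ebio. Otherwise point
+	 * eb_bv directly into the original bio's biovec array and just
+	 * record how far this slice advances bi_idx and the residual.
+	 */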
+ if (residual_biovec) {
+ int bvecindex = bio->bi_idx;
+ int rbvindex;
+
+ /* Calculate the number of bvecs required */
+ ios = iosize;
+ while (ios > 0) {
+ int len;
+
+ if (ios == iosize)
+ len = bio->bi_io_vec[bvecindex].bv_len - residual_biovec;
+ else
+ len = bio->bi_io_vec[bvecindex].bv_len;
+
+ numbvecs++;
+ if (len > ios)
+ len = ios;
+ ios -= len;
+ bvecindex++;
+ }
+ ebio = kmalloc(sizeof (struct eio_bio) + numbvecs * sizeof (struct bio_vec), GFP_NOWAIT);
+
+ if (!ebio)
+ return ERR_PTR(-ENOMEM);
+
+ rbvindex = 0;
+ ios = iosize;
+ while (ios > 0) {
+ ebio->eb_rbv[rbvindex].bv_page = bio->bi_io_vec[bio->bi_idx].bv_page;
+ ebio->eb_rbv[rbvindex].bv_offset = bio->bi_io_vec[bio->bi_idx].bv_offset + residual_biovec;
+ ebio->eb_rbv[rbvindex].bv_len = bio->bi_io_vec[bio->bi_idx].bv_len - residual_biovec;
+ if (ebio->eb_rbv[rbvindex].bv_len > (unsigned)ios) {
+ residual_biovec += ios;
+ ebio->eb_rbv[rbvindex].bv_len = ios;
+ } else {
+ residual_biovec = 0;
+ bio->bi_idx++;
+ }
+ ios -= ebio->eb_rbv[rbvindex].bv_len;
+ rbvindex++;
+ }
+ VERIFY(rbvindex == numbvecs);
+ ebio->eb_bv = ebio->eb_rbv;
+ } else {
+ ebio = kmalloc(sizeof (struct eio_bio), GFP_NOWAIT);
+
+ if (!ebio)
+ return ERR_PTR(-ENOMEM);
+ ebio->eb_bv = bio->bi_io_vec + bio->bi_idx;
+ ios = iosize;
+ while (ios > 0) {
+ numbvecs++;
+ if ((unsigned)ios < bio->bi_io_vec[bio->bi_idx].bv_len) {
+ residual_biovec = ios;
+ ios = 0;
+ } else {
+ ios -= bio->bi_io_vec[bio->bi_idx].bv_len;
+ bio->bi_idx++;
+ }
+ }
+ }
+ VERIFY(ios == 0);
+ VERIFY(numbvecs != 0);
+ *presidual_biovec = residual_biovec;
+
+ ebio->eb_sector = snum;
+ ebio->eb_cacheset = hash_block(dmc, snum);
+ ebio->eb_size = iosize;
+ ebio->eb_dir = bio_data_dir(bio);
+ ebio->eb_next = NULL;
+ ebio->eb_index = -1;
+ ebio->eb_iotype = iotype;
+ ebio->eb_nbvec = numbvecs;
+
+ bc_addfb(bc, ebio);
+
+ /* Always set the holdcount for eb to 1, to begin with. */
+ atomic_set(&ebio->eb_holdcount, 1);
+
+ return ebio;
+}
+
+/* Issues HDD I/O */
+static void
+eio_disk_io(struct cache_c *dmc, struct bio *bio,
+ struct eio_bio *anchored_bios, struct bio_container *bc,
+ int force_inval)
+{
+ struct eio_bio *ebio;
+ struct kcached_job *job;
+ int residual_biovec = 0;
+ int error = 0;
+
+	/* Disk I/O happens on the whole bio; reset bi_idx */
+ bio->bi_idx = 0;
+ ebio = eio_new_ebio(dmc, bio, &residual_biovec, bio->bi_sector, bio->bi_size, bc, EB_MAIN_IO);
+
+ if (unlikely(IS_ERR(ebio))) {
+ bc->bc_error = error = PTR_ERR(ebio);
+ ebio = NULL;
+ goto errout;
+ }
+
+ if (force_inval)
+ ebio->eb_iotype |= EB_INVAL;
+	ebio->eb_next = anchored_bios;	/* Anchor the ebio list to this super bio */
+ job = eio_new_job(dmc, ebio, -1);
+
+
+ if (unlikely(job == NULL)) {
+ error = -ENOMEM;
+ goto errout;
+ }
+ atomic_inc(&dmc->nr_jobs);
+ if (ebio->eb_dir == READ) {
+ job->action = READDISK;
+ SECTOR_STATS(dmc->eio_stats.disk_reads, bio->bi_size);
+ atomic64_inc(&dmc->eio_stats.readdisk);
+ } else {
+ job->action = WRITEDISK;
+ SECTOR_STATS(dmc->eio_stats.disk_writes, bio->bi_size);
+ atomic64_inc(&dmc->eio_stats.writedisk);
+ }
+
+
+ /*
+ * Pass the original bio flags as is, while doing
+ * read / write to HDD.
+ */
+ VERIFY_BIO_FLAGS(ebio);
+ error = eio_io_async_bvec(dmc, &job->job_io_regions.disk,
+ GET_BIO_FLAGS(ebio),
+ ebio->eb_bv, ebio->eb_nbvec,
+ eio_io_callback, job, 1);
+
+ if (error) {
+ job->ebio = NULL;
+ eio_free_cache_job(job);
+ goto errout;
+ }
+ return;
+
+errout:
+ eio_inval_range(dmc, bio->bi_sector, bio->bi_size);
+ eio_flag_abios(dmc, anchored_bios, error);
+
+ if (ebio)
+ eb_endio(ebio, error);
+ return;
+}
+
+/* Given a sector number and bio size, return the cache I/O size */
+static unsigned int
+eio_get_iosize(struct cache_c *dmc, sector_t snum, unsigned int biosize)
+{
+ unsigned int iosize;
+ unsigned int swithinblock = snum & (dmc->block_size - 1);
+
+
+	/* Check whether the I/O starts at a cache block boundary */
+ if (swithinblock)
+ iosize = (unsigned int)to_bytes(dmc->block_size - swithinblock);
+ else
+ iosize = (unsigned int)to_bytes(dmc->block_size);
+ if (iosize > biosize)
+ iosize = biosize;
+ return iosize;
+}
+
+/* Insert a new set sequence in sorted order to existing set sequence list */
+static int
+insert_set_seq(struct set_seq **seq_list, index_t first_set, index_t last_set)
+{
+ struct set_seq *cur_seq = NULL;
+ struct set_seq *prev_seq = NULL;
+ struct set_seq *new_seq = NULL;
+
+ VERIFY((first_set != -1) && (last_set != -1) && (last_set >= first_set));
+
+ for (cur_seq = *seq_list; cur_seq; prev_seq = cur_seq, cur_seq = cur_seq->next) {
+ if (first_set > cur_seq->last_set) {
+ /* go for the next seq in the sorted seq list */
+ continue;
+ }
+
+ if (last_set < cur_seq->first_set) {
+ /* break here to insert the new seq to seq list at this point */
+ break;
+ }
+
+ /*
+ * There is an overlap of the new seq with the current seq.
+ * Adjust the first_set field of the current seq to consume
+ * the overlap.
+ */
+ if (first_set < cur_seq->first_set) {
+ cur_seq->first_set = first_set;
+ }
+
+ if (last_set <= cur_seq->last_set) {
+ /* The current seq now fully encompasses the first and last sets */
+ return 0;
+ }
+
+		/* Advance first_set to start where the current seq left off */
+ first_set = cur_seq->last_set + 1;
+ }
+
+ new_seq = kmalloc(sizeof(struct set_seq), GFP_NOWAIT);
+ if (new_seq == NULL) {
+ return -ENOMEM;
+ }
+ new_seq->first_set = first_set;
+ new_seq->last_set = last_set;
+ if (prev_seq) {
+ new_seq->next = prev_seq->next;
+ prev_seq->next = new_seq;
+ } else {
+ new_seq->next = *seq_list;
+ *seq_list = new_seq;
+ }
+
+ return 0;
+}
+
+/* Acquire read/shared lock for the sets covering the entire I/O range */
+static int
+eio_acquire_set_locks(struct cache_c *dmc, struct bio_container *bc)
+{
+ struct bio *bio = bc->bc_bio;
+ sector_t round_sector;
+ sector_t end_sector;
+ sector_t set_size;
+ index_t cur_set;
+ index_t first_set;
+ index_t last_set;
+ index_t i;
+ struct set_seq *cur_seq;
+ struct set_seq *next_seq;
+ int error;
+
+ /*
+ * Find first set using start offset of the I/O and lock it.
+ * Find next sets by adding the set offsets to the previous set
+ * Identify all the sequences of set numbers that need locking.
+ * Keep the sequences in sorted list.
+ * For each set in each sequence
+ * - acquire read lock on the set.
+ */
+
+ round_sector = EIO_ROUND_SET_SECTOR(dmc, bio->bi_sector);
+ set_size = dmc->block_size * dmc->assoc;
+ end_sector = bio->bi_sector + to_sector(bio->bi_size);
+ first_set = -1;
+ last_set = -1;
+ bc->bc_setspan = NULL;
+
+ while (round_sector < end_sector) {
+ cur_set = hash_block(dmc, round_sector);
+ if (first_set == -1) {
+ first_set = cur_set;
+ last_set = cur_set;
+ } else if (cur_set == (last_set + 1)) {
+ last_set = cur_set;
+ } else {
+ /*
+			 * Add the (first, last) set sequence to the sorted seq list
+			 * and reinitialize the first and last set
+ */
+ error = insert_set_seq(&bc->bc_setspan, first_set, last_set);
+ if (error) {
+ goto err_out;
+ }
+ first_set = cur_set;
+ last_set = cur_set;
+ }
+
+ round_sector += set_size;
+ }
+
+ /* Add the remaining first, last set sequence */
+
+ VERIFY((first_set != -1) && (last_set == cur_set));
+
+ if (bc->bc_setspan == NULL) {
+ /* No sequence was added, can use singlespan */
+ cur_seq = &bc->bc_singlesspan;
+ cur_seq->first_set = first_set;
+ cur_seq->last_set = last_set;
+ cur_seq->next = NULL;
+ bc->bc_setspan = cur_seq;
+ } else {
+ error = insert_set_seq(&bc->bc_setspan, first_set, last_set);
+ if (error) {
+ goto err_out;
+ }
+ }
+
+ /* Acquire read locks on the sets in the set span */
+ for (cur_seq = bc->bc_setspan; cur_seq; cur_seq = cur_seq->next) {
+ for (i = cur_seq->first_set; i <= cur_seq->last_set; i++) {
+ down_read(&dmc->cache_sets[i].rw_lock);
+ }
+ }
+
+ return 0;
+
+err_out:
+
+ /* Free the seqs in the seq list, unless it is just the local seq */
+ if (bc->bc_setspan != &bc->bc_singlesspan) {
+ for (cur_seq = bc->bc_setspan; cur_seq; cur_seq = next_seq) {
+ next_seq = cur_seq->next;
+ kfree(cur_seq);
+ }
+ }
+ return error;
+}
+
+
+/*
+ * Allocate mdreq and md_blocks for each set.
+ */
+static int
+eio_alloc_mdreqs(struct cache_c *dmc, struct bio_container *bc)
+{
+ index_t i;
+ struct mdupdate_request *mdreq;
+ int nr_bvecs, ret;
+ struct set_seq *cur_seq;
+
+ bc->mdreqs = NULL;
+
+ for (cur_seq = bc->bc_setspan; cur_seq; cur_seq = cur_seq->next) {
+ for (i = cur_seq->first_set; i <= cur_seq->last_set; i++) {
+ mdreq = kzalloc(sizeof(*mdreq), GFP_NOWAIT);
+ if (mdreq) {
+ mdreq->md_size = dmc->assoc * sizeof(struct flash_cacheblock);
+ nr_bvecs = IO_BVEC_COUNT(mdreq->md_size, SECTORS_PER_PAGE);
+
+ mdreq->mdblk_bvecs = (struct bio_vec *)kmalloc(
+ sizeof(struct bio_vec) * nr_bvecs, GFP_KERNEL);
+				if (mdreq->mdblk_bvecs) {
+
+ ret = eio_alloc_wb_bvecs(mdreq->mdblk_bvecs, nr_bvecs,
+ SECTORS_PER_PAGE);
+ if (ret) {
+						pr_err("eio_alloc_mdreqs: failed to allocate pages\n");
+ kfree(mdreq->mdblk_bvecs);
+ mdreq->mdblk_bvecs = NULL;
+ }
+ mdreq->mdbvec_count = nr_bvecs;
+ }
+ }
+
+ if (unlikely((mdreq == NULL) || (mdreq->mdblk_bvecs == NULL))) {
+ struct mdupdate_request *nmdreq;
+
+ mdreq = bc->mdreqs;
+ while (mdreq) {
+ nmdreq = mdreq->next;
+ if (mdreq->mdblk_bvecs) {
+ eio_free_wb_bvecs(mdreq->mdblk_bvecs, mdreq->mdbvec_count,
+ SECTORS_PER_PAGE);
+ kfree(mdreq->mdblk_bvecs);
+ }
+ kfree(mdreq);
+ mdreq = nmdreq;
+ }
+ bc->mdreqs = NULL;
+ return -ENOMEM;
+ } else {
+ mdreq->next = bc->mdreqs;
+ bc->mdreqs = mdreq;
+ }
+ }
+ }
+
+ return 0;
+
+}
+
+/*
+ * Release:
+ * 1. the set locks covering the entire I/O range
+ * 2. any previously allocated memory for md update
+ */
+static int
+eio_release_io_resources(struct cache_c *dmc, struct bio_container *bc)
+{
+ index_t i;
+ struct mdupdate_request *mdreq;
+ struct mdupdate_request *nmdreq;
+ struct set_seq *cur_seq;
+ struct set_seq *next_seq;
+
+ /* Release read locks on the sets in the set span */
+ for (cur_seq = bc->bc_setspan; cur_seq; cur_seq = cur_seq->next) {
+ for (i = cur_seq->first_set; i <= cur_seq->last_set; i++) {
+ up_read(&dmc->cache_sets[i].rw_lock);
+ }
+ }
+
+ /* Free the seqs in the set span, unless it is single span */
+ if (bc->bc_setspan != &bc->bc_singlesspan) {
+ for (cur_seq = bc->bc_setspan; cur_seq; cur_seq = next_seq) {
+ next_seq = cur_seq->next;
+ kfree(cur_seq);
+ }
+ }
+
+ mdreq = bc->mdreqs;
+ while (mdreq) {
+ nmdreq = mdreq->next;
+ if (mdreq->mdblk_bvecs) {
+ eio_free_wb_bvecs(mdreq->mdblk_bvecs, mdreq->mdbvec_count,
+ SECTORS_PER_PAGE);
+ kfree(mdreq->mdblk_bvecs);
+ }
+ kfree(mdreq);
+ mdreq = nmdreq;
+ }
+ bc->mdreqs = NULL;
+
+ return 0;
+}
+
+/*
+ * Decide the mapping and perform necessary cache operations for a bio request.
+ */
+int
+eio_map(struct cache_c *dmc, struct request_queue *rq,
+ struct bio *bio)
+{
+ sector_t sectors = to_sector(bio->bi_size);
+ struct eio_bio *ebio = NULL;
+ struct bio_container *bc;
+ sector_t snum;
+ unsigned int iosize;
+ unsigned int totalio;
+ unsigned int biosize;
+ unsigned int residual_biovec;
+ unsigned int force_uncached = 0;
+ int data_dir = bio_data_dir(bio);
+
+	/* ebio list */
+ struct eio_bio *ebegin = NULL;
+ struct eio_bio *eend = NULL;
+ struct eio_bio *enext = NULL;
+
+ VERIFY(bio->bi_idx == 0);
+
+ if (bio_rw_flagged(bio, REQ_DISCARD)) {
+ pr_debug("eio_map: Discard IO received. Invalidate incore start=%lu totalsectors=%d.\n",
+ (unsigned long)bio->bi_sector, (int)to_sector(bio->bi_size));
+ bio_endio(bio, 0);
+ pr_err("eio_map: I/O with Discard flag received. Discard flag is not supported.\n");
+ return 0;
+ }
+
+ if (unlikely(dmc->cache_rdonly)) {
+ if (data_dir != READ) {
+ bio_endio(bio, -EPERM);
+ pr_debug("eio_map: cache is read only, write not permitted\n");
+ return 0;
+ }
+ }
+
+ if (sectors < SIZE_HIST)
+ atomic64_inc(&dmc->size_hist[sectors]);
+
+ if (data_dir == READ) {
+ SECTOR_STATS(dmc->eio_stats.reads, bio->bi_size);
+ atomic64_inc(&dmc->eio_stats.readcount);
+ } else {
+ SECTOR_STATS(dmc->eio_stats.writes, bio->bi_size);
+ atomic64_inc(&dmc->eio_stats.writecount);
+ }
+
+ /*
+	 * Cache FAILED mode is treated as a hard failure.
+	 * Don't allow I/Os to go through.
+ */
+ if (unlikely(CACHE_FAILED_IS_SET(dmc))) {
+		/* TODO: confirm that once FAILED is set, it is never reset */
+ /* Source device is not available. */
+ CTRACE("eio_map:2 source device is not present. Cache is in Failed state\n");
+ bio_endio(bio, -ENODEV);
+ bio = NULL;
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ /* WB cache will never be in degraded mode. */
+ if (unlikely(CACHE_DEGRADED_IS_SET(dmc))) {
+ VERIFY(dmc->mode != CACHE_MODE_WB);
+ force_uncached = 1;
+ }
+
+ /*
+ * Process zero sized bios by passing original bio flags
+ * to both HDD and SSD.
+ */
+ if (bio->bi_size == 0) {
+ eio_process_zero_size_bio(dmc, bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ /* Create a bio container */
+
+ bc = kzalloc(sizeof (struct bio_container), GFP_NOWAIT);
+ if (!bc) {
+ bio_endio(bio, -ENOMEM);
+ return DM_MAPIO_SUBMITTED;
+ }
+ bc->bc_iotime = jiffies;
+ bc->bc_bio = bio;
+ bc->bc_dmc = dmc;
+ spin_lock_init(&bc->bc_lock);
+ atomic_set(&bc->bc_holdcount, 1);
+ bc->bc_error = 0;
+
+ snum = bio->bi_sector;
+ totalio = bio->bi_size;
+ biosize = bio->bi_size;
+ residual_biovec = 0;
+
+ if (dmc->mode == CACHE_MODE_WB) {
+ int ret;
+ /*
+ * For writeback, the app I/O and the clean I/Os
+ * need to be exclusive for a cache set. Acquire shared
+ * lock on the cache set for app I/Os and exclusive
+ * lock on the cache set for clean I/Os.
+ */
+ if ((ret = eio_acquire_set_locks(dmc, bc)) != 0) {
+ bio_endio(bio, ret);
+ kfree(bc);
+ return DM_MAPIO_SUBMITTED;
+ }
+ }
+
+ atomic64_inc(&dmc->nr_ios);
+
+ /*
+ * Prepare for I/O processing.
+ * - Allocate ebios.
+ * - For reads, identify if we need to do uncached read
+ * - If force uncached I/O is set, invalidate the cache blocks for the I/O
+ */
+
+ if (force_uncached) {
+ eio_inval_range(dmc, snum, totalio);
+ } else {
+ while (biosize) {
+ iosize = eio_get_iosize(dmc, snum, biosize);
+
+ if (IS_ERR(ebio = eio_new_ebio(dmc, bio, &residual_biovec,
+ snum, iosize, bc, EB_SUBORDINATE_IO))) {
+ bc->bc_error = -ENOMEM;
+ break;
+ }
+
+ /* Anchor this ebio on ebio list. Preserve the order */
+ if (ebegin) {
+ eend->eb_next = ebio;
+ } else {
+ ebegin = ebio;
+ }
+ eend = ebio;
+
+ biosize -= iosize;
+ snum += to_sector(iosize);
+ }
+ }
+
+ if (bc->bc_error) {
+ /* Error. Do ebio and bc cleanup. */
+ ebio = ebegin;
+ while (ebio) {
+ enext = ebio->eb_next;
+ eb_endio(ebio, bc->bc_error);
+ ebio = enext;
+ }
+
+ /* By now, the bc_holdcount must be 1 */
+ VERIFY(atomic_read(&bc->bc_holdcount) == 1);
+
+ /* Goto out to cleanup the bc(in bc_put()) */
+ goto out;
+ }
+
+ /*
+ * Start processing of the ebios.
+ *
+ * Note: don't return error from this point on.
+ * Error handling would be done as part of
+ * the processing of the ebios internally.
+ */
+ if (force_uncached) {
+ VERIFY(dmc->mode != CACHE_MODE_WB);
+ if (data_dir == READ) {
+ atomic64_inc(&dmc->eio_stats.uncached_reads);
+ } else {
+ atomic64_inc(&dmc->eio_stats.uncached_writes);
+ }
+ eio_disk_io(dmc, bio, ebegin, bc, 1);
+ } else if (data_dir == READ) {
+
+ /* read io processing */
+ eio_read(dmc, bc, ebegin);
+ } else {
+ /* write io processing */
+ eio_write(dmc, bc, ebegin);
+ }
+
+out:
+
+ if (bc)
+ bc_put(bc, 0);
+
+ return DM_MAPIO_SUBMITTED;
+}
+
+/*
+ * Checks the cache block state, for deciding cached/uncached read.
+ * Also reserves/allocates the cache block, wherever necessary.
+ *
+ * Return values
+ * 1: cache hit
+ * 0: cache miss
+ */
+static int
+eio_read_peek(struct cache_c *dmc, struct eio_bio *ebio)
+{
+ index_t index;
+ int res;
+ int retval = 0;
+ unsigned long flags;
+ u_int8_t cstate;
+
+
+ spin_lock_irqsave(&dmc->cache_sets[ebio->eb_cacheset].cs_lock, flags);
+
+ res = eio_lookup(dmc, ebio, &index);
+ ebio->eb_index = -1;
+
+ if (res < 0) {
+ atomic64_inc(&dmc->eio_stats.noroom);
+ goto out;
+ }
+
+ cstate = EIO_CACHE_STATE_GET(dmc, index);
+
+ if (cstate & (BLOCK_IO_INPROG | QUEUED)) {
+ /*
+ * We found a valid or invalid block but an io is on, so we can't
+ * proceed. Don't invalidate it. This implies that we'll
+ * have to read from disk.
+ * Read on a DIRTY | INPROG block (block which is going to be DIRTY)
+ * is also redirected to read from disk.
+ */
+ goto out;
+ }
+
+ if (res == VALID) {
+ VERIFY(cstate & VALID);
+ if ((EIO_DBN_GET(dmc, index) ==
+ EIO_ROUND_SECTOR(dmc, ebio->eb_sector))) {
+ /*
+ * Read/write should be done on already DIRTY block
+ * without any inprog flag.
+ * Ensure that a failure of DIRTY block read is propagated to app.
+ * non-DIRTY valid blocks should have inprog flag.
+ */
+ if (cstate == ALREADY_DIRTY) {
+ ebio->eb_iotype = EB_MAIN_IO;
+ /*
+ * Set to uncached read and readfill for now.
+ * It may change to CACHED_READ later, if all
+ * the blocks are found to be cached
+ */
+ ebio->eb_bc->bc_dir = UNCACHED_READ_AND_READFILL;
+ } else {
+ EIO_CACHE_STATE_ON(dmc, index, CACHEREADINPROG);
+ }
+ retval = 1;
+ ebio->eb_index = index;
+ goto out;
+ }
+
+ /* cache is marked readonly. Do not allow READFILL on SSD */
+ if (unlikely(dmc->cache_rdonly))
+ goto out;
+
+ /*
+ * Found a block to be recycled.
+		 * It is guaranteed to be a non-DIRTY block.
+ */
+ VERIFY(!(cstate & DIRTY));
+ if (to_sector(ebio->eb_size) == dmc->block_size) {
+			/* We can recycle and then READFILL only if iosize equals the block size */
+ atomic64_inc(&dmc->eio_stats.rd_replace);
+ EIO_CACHE_STATE_SET(dmc, index, VALID | DISKREADINPROG);
+ EIO_DBN_SET(dmc, index, (sector_t)ebio->eb_sector);
+ ebio->eb_index = index;
+ ebio->eb_bc->bc_dir = UNCACHED_READ_AND_READFILL;
+ }
+ goto out;
+ }
+ VERIFY(res == INVALID);
+
+ /* cache is marked readonly. Do not allow READFILL on SSD */
+ if (unlikely(dmc->cache_rdonly))
+ goto out;
+ /*
+ * Found an invalid block to be used.
+ * Can recycle only if iosize is block size
+ */
+ if (to_sector(ebio->eb_size) == dmc->block_size) {
+ VERIFY(cstate & INVALID);
+ EIO_CACHE_STATE_SET(dmc, index, VALID | DISKREADINPROG);
+ atomic64_inc(&dmc->eio_stats.cached_blocks);
+ EIO_DBN_SET(dmc, index, (sector_t)ebio->eb_sector);
+ ebio->eb_index = index;
+ ebio->eb_bc->bc_dir = UNCACHED_READ_AND_READFILL;
+ }
+
+out:
+
+ spin_unlock_irqrestore(&dmc->cache_sets[ebio->eb_cacheset].cs_lock,
+ flags);
+
+ /*
+	 * Enqueue the set for cleaning if there was no room in the set.
+	 * TODO: ensure a forced clean.
+ */
+ if (res < 0) {
+ eio_comply_dirty_thresholds(dmc, ebio->eb_cacheset);
+ }
+
+ return retval;
+}
+
+/*
+ * Checks the cache block state, for deciding cached/uncached write.
+ * Also reserves/allocates the cache block, wherever necessary.
+ *
+ * Return values
+ * 1: cache block is available or newly allocated
+ * 0: cache block could not be got for the ebio
+ */
+static int
+eio_write_peek(struct cache_c *dmc, struct eio_bio *ebio)
+{
+ index_t index;
+ int res;
+ int retval;
+ u_int8_t cstate;
+ unsigned long flags;
+
+
+ spin_lock_irqsave(&dmc->cache_sets[ebio->eb_cacheset].cs_lock, flags);
+
+ res = eio_lookup(dmc, ebio, &index);
+ ebio->eb_index = -1;
+ retval = 0;
+
+ if (res < 0) {
+ /* cache block not found and new block couldn't be allocated */
+ atomic64_inc(&dmc->eio_stats.noroom);
+ ebio->eb_iotype |= EB_INVAL;
+ goto out;
+ }
+
+ cstate = EIO_CACHE_STATE_GET(dmc, index);
+
+ if (cstate & (BLOCK_IO_INPROG | QUEUED)) {
+ ebio->eb_iotype |= EB_INVAL;
+ /* treat as if cache block is not available */
+ goto out;
+ }
+
+ if ((res == VALID) && (EIO_DBN_GET(dmc, index) ==
+ EIO_ROUND_SECTOR(dmc, ebio->eb_sector))) {
+ /*
+ * Cache hit.
+ * All except an already DIRTY block should have an INPROG flag.
+ * If it is a cached write, a DIRTY flag would be added later.
+ */
+ SECTOR_STATS(dmc->eio_stats.write_hits, ebio->eb_size);
+ if (cstate != ALREADY_DIRTY) {
+ EIO_CACHE_STATE_ON(dmc, index, CACHEWRITEINPROG);
+ } else {
+ atomic64_inc(&dmc->eio_stats.dirty_write_hits);
+ }
+ ebio->eb_index = index;
+ /*
+ * A VALID block should get upgraded to DIRTY, only when we
+ * are updating the entire cache block(not partially).
+ * Otherwise, 2 sequential partial writes can lead to missing
+ * data when one write upgrades the cache block to DIRTY, while
+ * the other just writes to HDD. Subsequent read would be
+ * served from the cache block, which won't have the data from
+ * 2nd write.
+ */
+ if ((cstate == ALREADY_DIRTY) ||
+ (to_sector(ebio->eb_size) == dmc->block_size)) {
+ retval = 1;
+ } else {
+ retval = 0;
+ }
+ goto out;
+
+ }
+
+ /*
+ * cache miss with a new block allocated for recycle.
+ * Set INPROG flag, if the ebio size is equal to cache block size
+ */
+ VERIFY(!(EIO_CACHE_STATE_GET(dmc, index) & DIRTY));
+ if (to_sector(ebio->eb_size) == dmc->block_size) {
+ if (res == VALID) {
+ atomic64_inc(&dmc->eio_stats.wr_replace);
+ } else {
+ atomic64_inc(&dmc->eio_stats.cached_blocks);
+ }
+ EIO_CACHE_STATE_SET(dmc, index, VALID | CACHEWRITEINPROG);
+ EIO_DBN_SET(dmc, index, (sector_t)ebio->eb_sector);
+ ebio->eb_index = index;
+ retval = 1;
+ } else {
+ /*
+		 * An ebio smaller than the cache block size should not
+		 * do a cache write on a cache miss.
+ */
+ retval = 0;
+ ebio->eb_iotype |= EB_INVAL;
+ }
+
+out:
+ if ((retval == 1) && (dmc->mode == CACHE_MODE_WB) &&
+ (cstate != ALREADY_DIRTY)) {
+ ebio->eb_bc->bc_mdwait++;
+ }
+
+ spin_unlock_irqrestore(&dmc->cache_sets[ebio->eb_cacheset].cs_lock,
+ flags);
+
+ /*
+	 * Enqueue the set for cleaning if there was no room in the set.
+	 * TODO: ensure a forced clean.
+ */
+ if (res < 0) {
+ eio_comply_dirty_thresholds(dmc, ebio->eb_cacheset);
+ }
+
+ return retval;
+}
+
+/* Top level read function, called from eio_map */
+static void
+eio_read(struct cache_c *dmc, struct bio_container *bc,
+ struct eio_bio *ebegin)
+{
+ int ucread = 0;
+ struct eio_bio *ebio;
+ struct eio_bio *enext;
+
+ bc->bc_dir = UNCACHED_READ;
+ ebio = ebegin;
+ while (ebio) {
+ enext = ebio->eb_next;
+ if (eio_read_peek(dmc, ebio) == 0) {
+ ucread = 1;
+ }
+ ebio = enext;
+ }
+
+ if (ucread) {
+ /*
+ * Uncached read.
+ * Start HDD I/O. Once that is finished
+ * readfill or dirty block re-read would start
+ */
+ atomic64_inc(&dmc->eio_stats.uncached_reads);
+ eio_disk_io(dmc, bc->bc_bio, ebegin, bc, 0);
+ } else {
+ /* Cached read. Serve the read from SSD */
+
+ /*
+ * Pass all orig bio flags except UNPLUG.
+ * Unplug in the end if flagged.
+ */
+ int rw_flags;
+
+ rw_flags = 0;
+
+ bc->bc_dir = CACHED_READ;
+ ebio = ebegin;
+
+ VERIFY_BIO_FLAGS(ebio);
+
+ VERIFY((rw_flags & 1) == READ);
+ while (ebio) {
+ enext = ebio->eb_next;
+ ebio->eb_iotype = EB_MAIN_IO;
+
+ eio_cached_read(dmc, ebio, rw_flags);
+ ebio = enext;
+ }
+ }
+}
+
+/* Top level write function called from eio_map */
+static void
+eio_write(struct cache_c *dmc, struct bio_container *bc,
+ struct eio_bio *ebegin)
+{
+ int ucwrite = 0;
+ int error = 0;
+ struct eio_bio *ebio;
+ struct eio_bio *enext;
+
+ if ((dmc->mode != CACHE_MODE_WB) ||
+ (dmc->sysctl_active.do_clean & EIO_CLEAN_KEEP)) {
+ ucwrite = 1;
+ }
+
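+	/*
+	 * If any ebio in the request fails to get a cache block below, the
+	 * whole request falls back to the uncached write path (SSD and HDD
+	 * writes).
+	 */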
+ ebio = ebegin;
+ while (ebio) {
+ enext = ebio->eb_next;
+ if (eio_write_peek(dmc, ebio) == 0) {
+ ucwrite = 1;
+ }
+ ebio = enext;
+ }
+
+ if (ucwrite) {
+ /*
+ * Uncached write.
+ * Start both SSD and HDD writes
+ */
+ atomic64_inc(&dmc->eio_stats.uncached_writes);
+ bc->bc_mdwait = 0;
+ bc->bc_dir = UNCACHED_WRITE;
+ ebio = ebegin;
+ while (ebio) {
+ enext = ebio->eb_next;
+ eio_uncached_write(dmc, ebio);
+ ebio = enext;
+ }
+
+ eio_disk_io(dmc, bc->bc_bio, ebegin, bc, 0);
+ } else {
+ /* Cached write. Start writes to SSD blocks */
+
+ int rw_flags;
+ rw_flags = 0;
+
+ bc->bc_dir = CACHED_WRITE;
+ if (bc->bc_mdwait) {
+
+ /*
+ * mdreqs are required only if the write would cause a metadata
+ * update.
+ */
+
+ error = eio_alloc_mdreqs(dmc, bc);
+ }
+
+ /*
+ * Pass all orig bio flags except UNPLUG.
+ * UNPLUG in the end if flagged.
+ */
+ ebio = ebegin;
+ VERIFY_BIO_FLAGS(ebio);
+
+ while (ebio) {
+ enext = ebio->eb_next;
+ ebio->eb_iotype = EB_MAIN_IO;
+
+ if (!error) {
+
+ eio_cached_write(dmc, ebio, WRITE | rw_flags);
+
+ } else {
+ unsigned long flags;
+ u_int8_t cstate;
+
+ pr_err("eio_write: IO submission failed, block %llu",
+ EIO_DBN_GET(dmc, ebio->eb_index));
+ spin_lock_irqsave(&dmc->cache_sets[ebio->eb_cacheset].cs_lock,
+ flags);
+ cstate = EIO_CACHE_STATE_GET(dmc, ebio->eb_index);
+ if (cstate != ALREADY_DIRTY) {
+
+ /*
+ * A DIRTY(inprog) block should be invalidated on error.
+ */
+
+ EIO_CACHE_STATE_SET(dmc, ebio->eb_index, INVALID);
+ atomic64_dec_if_positive(&dmc->eio_stats.cached_blocks);
+ }
+ spin_unlock_irqrestore(&dmc->cache_sets[ebio->eb_cacheset].cs_lock,
+ flags);
+ eb_endio(ebio, error);
+ }
+ ebio = enext;
+ }
+ }
+}
+
+/*
+ * Synchronous clean of all the cache sets. Callers of this function need
+ * to handle the case where the clean operation was aborted midway.
+ */
+
+void
+eio_clean_all(struct cache_c *dmc)
+{
+ unsigned long flags = 0;
+
+ VERIFY(dmc->mode == CACHE_MODE_WB);
+ for (atomic_set(&dmc->clean_index, 0);
+ (atomic_read(&dmc->clean_index) < (s32)(dmc->size >> dmc->consecutive_shift)) &&
+ (dmc->sysctl_active.do_clean & EIO_CLEAN_START) &&
+ (atomic64_read(&dmc->nr_dirty) > 0) &&
+ (!(dmc->cache_flags & CACHE_FLAGS_SHUTDOWN_INPROG) &&
+ !dmc->sysctl_active.fast_remove);
+ atomic_inc(&dmc->clean_index)) {
+
+ if (unlikely(CACHE_FAILED_IS_SET(dmc))) {
+ pr_err("clean_all: CACHE \"%s\" is in FAILED state.",
+ dmc->cache_name);
+ break;
+ }
+
+ eio_clean_set(dmc, (index_t)(atomic_read(&dmc->clean_index)), /* whole */ 1, /* force */1);
+ }
+
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_active.do_clean &= ~EIO_CLEAN_START;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+}
+
+/*
+ * Do unconditional clean of a cache.
+ * Useful for a cold enabled writeback cache.
+ */
+void
+eio_clean_for_reboot(struct cache_c *dmc)
+{
+ index_t i;
+
+ for (i = 0 ; i < (index_t)(dmc->size >> dmc->consecutive_shift) ; i++) {
+ eio_clean_set(dmc, i, /* whole */ 1, /* force */1);
+ }
+}
+
+/*
+ * Used during a partial cache set clean.
+ * Uses reclaim policy (LRU/FIFO) information to
+ * identify the cache blocks that need cleaning.
+ * The number of such cache blocks is determined
+ * by the high and low thresholds that are set.
+ */
+static void
+eio_get_setblks_to_clean(struct cache_c *dmc, index_t set, int *ncleans)
+{
+ int i = 0;
+ int max_clean;
+ index_t start_index;
+ int nr_writes = 0;
+
+ *ncleans = 0;
+
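+	/*
+	 * Clean only enough blocks to bring the set back down to its
+	 * low dirty threshold.
+	 */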
+ max_clean = dmc->cache_sets[set].nr_dirty -
+ ((dmc->sysctl_active.dirty_set_low_threshold * dmc->assoc) / 100);
+ if (max_clean <= 0) {
+ /* Nothing to clean */
+ return;
+ }
+
+ start_index = set * dmc->assoc;
+
+ /*
+ * Spinlock is not required here, as we assume that we have
+ * taken a write lock on the cache set, when we reach here
+ */
+ if (dmc->policy_ops == NULL) {
+ /* Scan sequentially in the set and pick blocks to clean */
+		while ((i < (int)dmc->assoc) && (nr_writes < max_clean)) {
+ if ((EIO_CACHE_STATE_GET(dmc, start_index + i) &
+ (DIRTY | BLOCK_IO_INPROG)) == DIRTY) {
+ EIO_CACHE_STATE_ON(dmc, start_index + i,
+ DISKWRITEINPROG);
+ nr_writes++;
+ }
+ i++;
+ }
+ } else {
+ nr_writes = eio_policy_clean_set(dmc->policy_ops, set, max_clean);
+ }
+
+ *ncleans = nr_writes;
+}
+
+/* Callback function, when synchronous I/O completes */
+static void
+eio_sync_io_callback(int error, void *context)
+{
+ struct sync_io_context *sioc = (struct sync_io_context *)context;
+
+ if (error) {
+ sioc->sio_error = error;
+ }
+ up_read(&sioc->sio_lock);
+}
+
+/*
+ * Setup biovecs for preallocated biovecs per cache set.
+ */
+
+struct bio_vec *setup_bio_vecs(struct bio_vec *bvec, index_t block_index,
+ unsigned block_size, unsigned total,
+ unsigned *num_bvecs)
+{
+ struct bio_vec *data = NULL;
+ index_t iovec_index;
+
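+	/*
+	 * For 2K and 4K cache blocks each data block maps to exactly one
+	 * preallocated bio_vec, so the block index doubles as the biovec
+	 * index; 8K blocks need two bio_vecs per data block.
+	 */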
+	switch (block_size) {
+ case BLKSIZE_2K:
+ *num_bvecs = total;
+ iovec_index = block_index;
+ data = &bvec[iovec_index];
+ break;
+
+ case BLKSIZE_4K:
+ *num_bvecs = total;
+ iovec_index = block_index;
+ data = &bvec[iovec_index];
+ break;
+
+ case BLKSIZE_8K:
+ /*
+ * For 8k data block size, we need 2 bio_vecs
+ * per data block.
+ */
+ *num_bvecs = total * 2;
+ iovec_index = block_index * 2;
+ data = &bvec[iovec_index];
+ break;
+ }
+
+ return data;
+}
+
+/* Cleans a given cache set */
+static void
+eio_clean_set(struct cache_c *dmc, index_t set, int whole, int force)
+{
+ struct eio_io_region where;
+ int error;
+ index_t i;
+ index_t j;
+ index_t start_index;
+ index_t end_index;
+ struct sync_io_context sioc;
+ int ncleans = 0;
+ int alloc_size;
+ struct flash_cacheblock *md_blocks = NULL;
+ unsigned long flags;
+
+ int pindex, k;
+ index_t blkindex;
+ struct bio_vec *bvecs;
+ unsigned nr_bvecs, total;
+ void *pg_virt_addr[2] = {NULL};
+
+ /* Cache is failed mode, do nothing. */
+ if (unlikely(CACHE_FAILED_IS_SET(dmc))) {
+ pr_debug("clean_set: CACHE \"%s\" is in FAILED state.",
+ dmc->cache_name);
+ goto err_out1;
+ }
+
+
+ /* Nothing to clean, if there are no dirty blocks */
+ if (dmc->cache_sets[set].nr_dirty == 0) {
+ goto err_out1;
+ }
+
+	/* If this is not a suitable time to clean, postpone it */
+ if ((!force) && AUTOCLEAN_THRESHOLD_CROSSED(dmc)) {
+ eio_touch_set_lru(dmc, set);
+ goto err_out1;
+ }
+
+ /*
+ * 1. Take exclusive lock on the cache set
+ * 2. Verify that there are dirty blocks to clean
+ * 3. Identify the cache blocks to clean
+ * 4. Read the cache blocks data from ssd
+ * 5. Write the cache blocks data to hdd
+ * 6. Update on-disk cache metadata
+ * 7. Update in-core cache metadata
+ */
+
+ start_index = set * dmc->assoc;
+ end_index = start_index + dmc->assoc;
+
+ /* 1. exclusive lock. Let the ongoing writes to finish. Pause new writes */
+ down_write(&dmc->cache_sets[set].rw_lock);
+
+ /* 2. Return if there are no dirty blocks to clean */
+ if (dmc->cache_sets[set].nr_dirty == 0) {
+ goto err_out2;
+ }
+
+ /* 3. identify and mark cache blocks to clean */
+ if (!whole) {
+ eio_get_setblks_to_clean(dmc, set, &ncleans);
+ } else {
+ for (i = start_index; i < end_index; i++) {
+ if (EIO_CACHE_STATE_GET(dmc, i) == ALREADY_DIRTY) {
+ EIO_CACHE_STATE_SET(dmc, i, CLEAN_INPROG);
+ ncleans++;
+ }
+ }
+ }
+
+ /* If nothing to clean, return */
+ if (!ncleans) {
+ goto err_out2;
+ }
+
+ /*
+ * From this point onwards, make sure to reset
+ * the clean inflag on cache blocks before returning
+ */
+
+ /* 4. read cache set data */
+
+ init_rwsem(&sioc.sio_lock);
+ sioc.sio_error = 0;
+
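+	/*
+	 * Coalesce each run of contiguous CLEAN_INPROG blocks into a single
+	 * SSD read. sio_lock is taken shared once per submitted I/O and
+	 * released by the completion callback; taking it exclusively below
+	 * waits for all outstanding reads to finish.
+	 */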
+ for (i = start_index; i < end_index; i++) {
+ if (EIO_CACHE_STATE_GET(dmc, i) == CLEAN_INPROG) {
+
+ for (j = i; (j < end_index) && (EIO_CACHE_STATE_GET(dmc, j) == CLEAN_INPROG); j++)
+ ;
+
+ blkindex = (i - start_index);
+ total = (j - i);
+
+ /*
+ * Get the correct index and number of bvecs
+ * setup from dmc->clean_dbvecs before issuing i/o.
+ */
+ bvecs = setup_bio_vecs(dmc->clean_dbvecs, blkindex, dmc->block_size,
+ total, &nr_bvecs);
+ VERIFY(bvecs != NULL);
+ VERIFY(nr_bvecs > 0);
+
+ where.bdev = dmc->cache_dev->bdev;
+ where.sector = (i << dmc->block_shift) + dmc->md_sectors;
+ where.count = total * dmc->block_size;
+
+ SECTOR_STATS(dmc->eio_stats.ssd_reads, to_bytes(where.count));
+ down_read(&sioc.sio_lock);
+ error = eio_io_async_bvec(dmc, &where, READ, bvecs, nr_bvecs,
+ eio_sync_io_callback, &sioc, 0);
+ if (error) {
+ sioc.sio_error = error;
+ up_read(&sioc.sio_lock);
+ }
+
+ bvecs = NULL;
+ i = j;
+ }
+ }
+ /*
+	 * The loop above submits all READ I/Os to the SSD;
+	 * unplug the device so they are issued to the
+	 * underlying device driver immediately.
+ */
+ eio_unplug_cache_device(dmc);
+
+ /* wait for all I/Os to complete and release sync lock */
+ down_write(&sioc.sio_lock);
+ up_write(&sioc.sio_lock);
+
+ error = sioc.sio_error;
+ if (error) {
+ goto err_out3;
+ }
+
+ /* 5. write to hdd */
+ /*
+	 * While writing the data to the HDD, explicitly set the
+	 * REQ_SYNC flag to hint at a higher priority for these
+	 * I/Os.
+ */
+ for (i = start_index; i < end_index; i++) {
+ if (EIO_CACHE_STATE_GET(dmc, i) == CLEAN_INPROG) {
+
+ blkindex = (i - start_index);
+ total = 1;
+
+ bvecs = setup_bio_vecs(dmc->clean_dbvecs, blkindex, dmc->block_size,
+ total, &nr_bvecs);
+ VERIFY(bvecs != NULL);
+ VERIFY(nr_bvecs > 0);
+
+ where.bdev = dmc->disk_dev->bdev;
+ where.sector = EIO_DBN_GET(dmc, i);
+ where.count = dmc->block_size;
+
+ SECTOR_STATS(dmc->eio_stats.disk_writes, to_bytes(where.count));
+ down_read(&sioc.sio_lock);
+ error = eio_io_async_bvec(dmc, &where, WRITE | REQ_SYNC,
+ bvecs, nr_bvecs, eio_sync_io_callback,
+ &sioc, 1);
+
+ if (error) {
+ sioc.sio_error = error;
+ up_read(&sioc.sio_lock);
+ }
+ bvecs = NULL;
+ }
+ }
+
+ /* wait for all I/Os to complete and release sync lock */
+ down_write(&sioc.sio_lock);
+ up_write(&sioc.sio_lock);
+
+ error = sioc.sio_error;
+ if (error) {
+ goto err_out3;
+ }
+
+ /* 6. update on-disk cache metadata */
+
+	/* TODO: do we have to consider sector alignment here? */
+
+ /*
+ * md_size = dmc->assoc * sizeof(struct flash_cacheblock);
+ * Currently, md_size is 8192 bytes, mdpage_count is 2 pages maximum.
+ */
+
+ VERIFY(dmc->mdpage_count <= 2);
+ for (k = 0; k < dmc->mdpage_count; k++)
+ pg_virt_addr[k] = kmap(dmc->clean_mdpages[k]);
+
+ alloc_size = dmc->assoc * sizeof(struct flash_cacheblock);
+ pindex = 0;
+ md_blocks = (struct flash_cacheblock *)pg_virt_addr[pindex];
+ k = MD_BLOCKS_PER_PAGE;
+
+ for (i = start_index; i < end_index; i++) {
+
+ md_blocks->dbn = EIO_DBN_GET(dmc, i);
+
+ if (EIO_CACHE_STATE_GET(dmc, i) == CLEAN_INPROG) {
+ md_blocks->cache_state = INVALID;
+ } else if (EIO_CACHE_STATE_GET(dmc, i) == ALREADY_DIRTY) {
+ md_blocks->cache_state = (VALID | DIRTY);
+ } else {
+ md_blocks->cache_state = INVALID;
+ }
+
+		/* Advance to the next on-disk metadata block slot */
+ md_blocks++;
+ k--;
+
+ if (k == 0) {
+ md_blocks = (struct flash_cacheblock *)pg_virt_addr[++pindex];
+ k = MD_BLOCKS_PER_PAGE;
+ }
+ }
+
+ for (k = 0; k < dmc->mdpage_count; k++)
+ kunmap(dmc->clean_mdpages[k]);
+
+ where.bdev = dmc->cache_dev->bdev;
+ where.sector = dmc->md_start_sect + INDEX_TO_MD_SECTOR(start_index);
+ where.count = to_sector(alloc_size);
+ error = eio_io_sync_pages(dmc, &where, WRITE, dmc->clean_mdpages, dmc->mdpage_count);
+
+ if (error) {
+ goto err_out3;
+ }
+
+
+err_out3:
+
+ /*
+ * 7. update in-core cache metadata for clean_inprog blocks.
+ * If there was an error, set them back to ALREADY_DIRTY
+ * If no error, set them to VALID
+ */
+ for (i = start_index; i < end_index; i++) {
+ if (EIO_CACHE_STATE_GET(dmc, i) == CLEAN_INPROG) {
+ if (error) {
+ EIO_CACHE_STATE_SET(dmc, i, ALREADY_DIRTY);
+ } else {
+ EIO_CACHE_STATE_SET(dmc, i, VALID);
+ VERIFY(dmc->cache_sets[set].nr_dirty > 0);
+ dmc->cache_sets[set].nr_dirty--;
+ atomic64_dec(&dmc->nr_dirty);
+ }
+ }
+ }
+
+err_out2:
+
+ up_write(&dmc->cache_sets[set].rw_lock);
+
+err_out1:
+
+ /* Reset clean flags on the set */
+
+ if (!force) {
+ spin_lock_irqsave(&dmc->cache_sets[set].cs_lock, flags);
+ dmc->cache_sets[set].flags &= ~(SETFLAG_CLEAN_INPROG | SETFLAG_CLEAN_WHOLE);
+ spin_unlock_irqrestore(&dmc->cache_sets[set].cs_lock, flags);
+ }
+
+ if (dmc->cache_sets[set].nr_dirty) {
+ /*
+ * Lru touch the set, so that it can be picked
+ * up for whole set clean by clean thread later
+ */
+ eio_touch_set_lru(dmc, set);
+ }
+
+ return;
+}
+
+/*
+ * Enqueues for cleaning those dirty sets that were dirtied a long
+ * time back (aged). User-tunable values determine whether a set has aged.
+ */
+void
+eio_clean_aged_sets(struct work_struct *work)
+{
+ struct cache_c *dmc;
+ unsigned long flags = 0;
+ index_t set_index;
+ u_int64_t set_time;
+ u_int64_t cur_time;
+
+ dmc = container_of(work, struct cache_c, clean_aged_sets_work.work);
+
+ /*
+	 * In FAILED state, don't schedule cleaning of sets.
+ */
+ if (unlikely(CACHE_FAILED_IS_SET(dmc))) {
+ pr_debug("clean_aged_sets: Cache \"%s\" is in failed mode.\n",
+ dmc->cache_name);
+ /*
+ * This is to make sure that this thread is rescheduled
+ * once CACHE is ACTIVE again.
+ */
+ spin_lock_irqsave(&dmc->dirty_set_lru_lock, flags);
+ dmc->is_clean_aged_sets_sched = 0;
+ spin_unlock_irqrestore(&dmc->dirty_set_lru_lock, flags);
+
+ return;
+ }
+
+	cur_time = jiffies;
+
+ /* Use the set LRU list to pick up the most aged sets. */
+ spin_lock_irqsave(&dmc->dirty_set_lru_lock, flags);
+ do {
+ lru_read_head(dmc->dirty_set_lru, &set_index, &set_time);
+ if (set_index == LRU_NULL) {
+ break;
+ }
+
+ if (((cur_time - set_time)/HZ) <
+ (dmc->sysctl_active.time_based_clean_interval * 60)) {
+ break;
+ }
+ lru_rem(dmc->dirty_set_lru, set_index);
+
+ if (dmc->cache_sets[set_index].nr_dirty > 0) {
+ spin_unlock_irqrestore(&dmc->dirty_set_lru_lock, flags);
+ eio_addto_cleanq(dmc, set_index, 1);
+ spin_lock_irqsave(&dmc->dirty_set_lru_lock, flags);
+ }
+ } while (1);
+ spin_unlock_irqrestore(&dmc->dirty_set_lru_lock, flags);
+
+ /* Re-schedule the aged set clean, unless the clean has to stop now */
+
+ if (dmc->sysctl_active.time_based_clean_interval == 0) {
+ goto out;
+ }
+
+ schedule_delayed_work(&dmc->clean_aged_sets_work,
+ dmc->sysctl_active.time_based_clean_interval * 60 * HZ);
+out:
+ return;
+}
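+
+/*
+ * Worked example of the aging test above (illustrative numbers, not taken
+ * from this patch): with time_based_clean_interval set to 30 minutes, any
+ * set whose LRU timestamp satisfies (cur_time - set_time)/HZ >= 30 * 60 =
+ * 1800 seconds is removed from the dirty-set LRU and, if it still has dirty
+ * blocks, queued for cleaning; the scan stops at the first set younger than
+ * that.
+ */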
+
+/* Move the given set to the head of the set LRU list */
+void
+eio_touch_set_lru(struct cache_c *dmc, index_t set)
+{
+ u_int64_t systime;
+ unsigned long flags;
+
+	systime = jiffies;
+ spin_lock_irqsave(&dmc->dirty_set_lru_lock, flags);
+ lru_touch(dmc->dirty_set_lru, set, systime);
+
+ if ((dmc->sysctl_active.time_based_clean_interval > 0) &&
+ (dmc->is_clean_aged_sets_sched == 0)) {
+ schedule_delayed_work(&dmc->clean_aged_sets_work,
+ dmc->sysctl_active.time_based_clean_interval * 60 * HZ);
+ dmc->is_clean_aged_sets_sched = 1;
+ }
+
+ spin_unlock_irqrestore(&dmc->dirty_set_lru_lock, flags);
+}
new file mode 100644
@@ -0,0 +1,252 @@
+/*
+ * eio_mem.c
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ * Made EnhanceIO specific changes.
+ * Saied Kazemi <skazemi@stec-inc.com>
+ * Siddharth Choudhuri <schoudhuri@stec-inc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "eio.h"
+
+#define SECTORS_PER_SET (dmc->assoc * dmc->block_size)
+#define SECTORS_PER_SET_SHIFT (dmc->consecutive_shift + dmc->block_shift)
+#define SECTORS_PER_SET_MASK (SECTORS_PER_SET - 1)
+
+#define EIO_DBN_TO_SET(dmc, dbn, set_number, wrapped) do { \
+ u_int64_t value; \
+ u_int64_t mid_i; \
+ value = (dbn) >> SECTORS_PER_SET_SHIFT; \
+ mid_i = (value) & (dmc)->num_sets_mask; \
+ if (mid_i >= (dmc)->num_sets) { \
+ (wrapped) = 1; \
+ (set_number) = mid_i - (dmc)->num_sets; \
+ } else { \
+ (wrapped) = 0; \
+ (set_number) = mid_i; \
+ } \
+} while (0)
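+
+/*
+ * Worked example of EIO_DBN_TO_SET (illustrative geometry, not values from
+ * this patch): with block_shift = 3 (4 KB blocks) and consecutive_shift = 8
+ * (associativity 256), SECTORS_PER_SET_SHIFT is 11 and SECTORS_PER_SET is
+ * 2048 sectors.  For num_sets = 1000 (num_sets_bits = 10, num_sets_mask =
+ * 0x3FF):
+ *   dbn 3000000: value = 3000000 >> 11 = 1464, mid_i = 1464 & 0x3FF = 440,
+ *   440 < 1000, so wrapped = 0 and set_number = 440.
+ *   dbn 2068480: value = 1010, mid_i = 1010 >= 1000, so wrapped = 1 and
+ *   set_number = 1010 - 1000 = 10.
+ */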
+
+
+/*
+ * eio_mem_init
+ */
+int
+eio_mem_init(struct cache_c *dmc)
+{
+ u_int32_t lsb_bits;
+ u_int32_t msb_bits_24; /* most significant bits in shrunk dbn */
+ u_int64_t max_dbn;
+ u_int64_t num_sets_64;
+
+
+ /*
+ * Sanity check the number of sets.
+ */
+ num_sets_64 = dmc->size / dmc->assoc;
+ if (num_sets_64 > UINT_MAX) {
+ pr_err("Number of cache sets (%lu) greater than maximum allowed (%u)",
+ (long unsigned int)num_sets_64, UINT_MAX);
+ return -1;
+ }
+
+ /*
+ * Find the number of bits required to encode the set number and
+ * its corresponding mask value.
+ */
+ dmc->num_sets = (u_int32_t)num_sets_64;
+ for (dmc->num_sets_bits = 0; (dmc->num_sets >> dmc->num_sets_bits) != 0; dmc->num_sets_bits++)
+ ;
+ dmc->num_sets_mask = ULLONG_MAX >> (64 - dmc->num_sets_bits);
+
+ /*
+ * If we don't have at least 16 bits to save, we can't use small metadata.
+ */
+ if (dmc->num_sets_bits < 16) {
+ dmc->cache_flags |= CACHE_FLAGS_MD8;
+ pr_info("Not enough sets to use small metadata");
+ return 1;
+ }
+
+ /*
+ * Now compute the largest sector number that we can shrink; then see
+ * if the source volume is smaller.
+ */
+ lsb_bits = dmc->consecutive_shift + dmc->block_shift;
+ msb_bits_24 = 24 - 1 - lsb_bits; /* 1 for wrapped bit */
+ max_dbn = ((u_int64_t)1) << (msb_bits_24 + dmc->num_sets_bits + lsb_bits);
+ if (to_sector(eio_get_device_size(dmc->disk_dev)) > max_dbn) {
+ dmc->cache_flags |= CACHE_FLAGS_MD8;
+ pr_info("Source volume too big to use small metadata");
+ return 1;
+ }
+
+ return 0;
+}
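+
+/*
+ * Worked example of the small-metadata check above (illustrative values):
+ * with lsb_bits = 11 (4 KB blocks, associativity 256), msb_bits_24 =
+ * 24 - 1 - 11 = 12.  If num_sets_bits = 16, then max_dbn =
+ * 1 << (12 + 16 + 11) = 2^39 sectors = 256 TiB; a source volume larger than
+ * that falls back to the 8-byte metadata (CACHE_FLAGS_MD8).
+ */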
+
+
+/*
+ * eio_hash_block
+ */
+u_int32_t
+eio_hash_block(struct cache_c *dmc, sector_t dbn)
+{
+ int wrapped;
+ u_int64_t set_number;
+
+
+ EIO_DBN_TO_SET(dmc, dbn, set_number, wrapped);
+ VERIFY(set_number < dmc->num_sets);
+
+ return (u_int32_t)set_number;
+}
+
+
+/*
+ * eio_shrink_dbn
+ *
+ * Shrink a 5-byte "dbn" into a 3-byte "dbn" by eliminating 16 lower bits
+ * of the set number this "dbn" belongs to.
+ */
+unsigned int
+eio_shrink_dbn(struct cache_c *dmc, sector_t dbn)
+{
+ u_int32_t dbn_24;
+ sector_t lsb;
+ sector_t wrapped;
+ sector_t msb;
+ sector_t set_number;
+
+
+ VERIFY(!EIO_MD8(dmc));
+ if (unlikely(dbn == 0)) {
+ return 0;
+ }
+
+ lsb = dbn & SECTORS_PER_SET_MASK;
+ EIO_DBN_TO_SET(dmc, dbn, set_number, wrapped);
+ msb = dbn >> (dmc->num_sets_bits + SECTORS_PER_SET_SHIFT);
+ dbn_24 = (unsigned int)(lsb | (wrapped << SECTORS_PER_SET_SHIFT) | (msb << (SECTORS_PER_SET_SHIFT + 1)));
+
+ return dbn_24;
+}
+
+
+/*
+ * eio_expand_dbn
+ *
+ * Expand a 3-byte "dbn" into a 5-byte "dbn" by adding 16 lower bits
+ * of the set number this "dbn" belongs to.
+ */
+sector_t
+eio_expand_dbn(struct cache_c *dmc, u_int64_t index)
+{
+ u_int32_t dbn_24;
+ u_int64_t set_number;
+ sector_t lsb;
+ sector_t msb;
+ sector_t dbn_40;
+
+
+ VERIFY(!EIO_MD8(dmc));
+ /*
+ * Expanding "dbn" zero?
+ */
+ if (index == dmc->index_zero && dmc->index_zero < (u_int64_t)dmc->assoc) {
+ return 0;
+ }
+
+ dbn_24 = dmc->cache[index].md4_md & EIO_MD4_DBN_MASK;
+ if (dbn_24 == 0 && EIO_CACHE_STATE_GET(dmc, index) == INVALID)
+ return (sector_t)0;
+
+ set_number = index / dmc->assoc;
+ lsb = dbn_24 & SECTORS_PER_SET_MASK;
+ msb = dbn_24 >> (SECTORS_PER_SET_SHIFT + 1); /* 1 for wrapped */
+ /* had we wrapped? */
+ if ((dbn_24 & SECTORS_PER_SET) != 0) {
+ dbn_40 = msb << (dmc->num_sets_bits + SECTORS_PER_SET_SHIFT);
+ dbn_40 |= (set_number + dmc->num_sets) << SECTORS_PER_SET_SHIFT;
+ dbn_40 |= lsb;
+ } else {
+ dbn_40 = msb << (dmc->num_sets_bits + SECTORS_PER_SET_SHIFT);
+ dbn_40 |= set_number << SECTORS_PER_SET_SHIFT;
+ dbn_40 |= lsb;
+ }
+ VERIFY(unlikely(dbn_40 < EIO_MAX_SECTOR));
+
+ return (sector_t)dbn_40;
+}
+EXPORT_SYMBOL(eio_expand_dbn);
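+
+/*
+ * Shrink/expand round trip, worked with illustrative geometry (not values
+ * from this patch): SECTORS_PER_SET_SHIFT = 11, num_sets = 100000
+ * (num_sets_bits = 17).  For dbn = 300000000:
+ *   shrink: lsb = 300000000 & 0x7FF = 768, set_number = 15412, wrapped = 0,
+ *           msb = 300000000 >> 28 = 1, dbn_24 = 768 | (1 << 12) = 4864.
+ *   expand: for an index in set 15412, lsb = 768, msb = 1, wrapped bit
+ *           clear, so dbn_40 = (1 << 28) | (15412 << 11) | 768 = 300000000.
+ */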
+
+
+/*
+ * eio_invalidate_md
+ */
+void
+eio_invalidate_md(struct cache_c *dmc, u_int64_t index)
+{
+
+ if (EIO_MD8(dmc))
+ dmc->cache_md8[index].md8_md = EIO_MD8_INVALID;
+ else
+ dmc->cache[index].md4_md = EIO_MD4_INVALID;
+}
+
+
+/*
+ * eio_md4_dbn_set
+ */
+void
+eio_md4_dbn_set(struct cache_c *dmc, u_int64_t index, u_int32_t dbn_24)
+{
+
+ VERIFY((dbn_24 & ~EIO_MD4_DBN_MASK) == 0);
+
+ /* retain "cache_state" */
+ dmc->cache[index].md4_md &= ~EIO_MD4_DBN_MASK;
+ dmc->cache[index].md4_md |= dbn_24;
+
+ /* XXX excessive debugging */
+ if (dmc->index_zero < (u_int64_t)dmc->assoc && /* cache constructed and sector 0 already cached */
+ index == dmc->index_zero && /* we're accessing sector 0 */
+ dbn_24 != 0) { /* we're replacing sector 0 */
+ dmc->index_zero = dmc->assoc;
+ }
+}
+
+
+/*
+ * eio_md8_dbn_set
+ */
+void
+eio_md8_dbn_set(struct cache_c *dmc, u_int64_t index, sector_t dbn)
+{
+
+ VERIFY((dbn & ~EIO_MD8_DBN_MASK) == 0);
+
+ /* retain "cache_state" */
+ dmc->cache_md8[index].md8_md &= ~EIO_MD8_DBN_MASK;
+ dmc->cache_md8[index].md8_md |= dbn;
+
+ /* XXX excessive debugging */
+ if (dmc->index_zero < (u_int64_t)dmc->assoc && /* cache constructed and sector 0 already cached */
+ index == dmc->index_zero && /* we're accessing sector 0 */
+ dbn != 0) { /* we're replacing sector 0 */
+ dmc->index_zero = dmc->assoc;
+ }
+}
+
new file mode 100644
@@ -0,0 +1,162 @@
+/*
+ * eio_policy.c
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ * Made EnhanceIO specific changes.
+ * Saied Kazemi <skazemi@stec-inc.com>
+ * Siddharth Choudhuri <schoudhuri@stec-inc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "eio.h"
+
+LIST_HEAD(eio_policy_list);
+
+
+int
+eio_register_policy(struct eio_policy_header *new_policy)
+{
+ struct list_head *ptr;
+ struct eio_policy_header *curr;
+
+
+ list_for_each(ptr, &eio_policy_list) {
+ curr = list_entry(ptr, struct eio_policy_header, sph_list);
+ if (curr->sph_name == new_policy->sph_name)
+ return 1;
+ }
+ list_add_tail(&new_policy->sph_list, &eio_policy_list);
+
+ pr_info("register_policy: policy %d added", new_policy->sph_name);
+
+ return 0;
+}
+EXPORT_SYMBOL(eio_register_policy);
+
+
+int
+eio_unregister_policy(struct eio_policy_header *p_ops)
+{
+ struct list_head *ptr;
+ struct eio_policy_header *curr;
+
+
+ list_for_each(ptr, &eio_policy_list) {
+ curr = list_entry(ptr, struct eio_policy_header, sph_list);
+ if (curr->sph_name == p_ops->sph_name) {
+ list_del(&curr->sph_list);
+ pr_info("unregister_policy: policy %d removed", (int)p_ops->sph_name);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(eio_unregister_policy);
+
+
+struct eio_policy *
+eio_get_policy(int policy)
+{
+ struct list_head *ptr;
+ struct eio_policy_header *curr;
+
+ list_for_each(ptr, &eio_policy_list) {
+ curr = list_entry(ptr, struct eio_policy_header, sph_list);
+ if (curr->sph_name == policy) {
+ pr_info("get_policy: policy %d found", policy);
+ return curr->sph_instance_init();
+ }
+ }
+ pr_info("get_policy: cannot find policy %d", policy);
+
+ return NULL;
+}
+
+
+/*
+ * Decrement the reference count of the policy-specific module and
+ * perform any other cleanup required when an instance of a policy
+ * is no longer needed.
+ */
+void
+eio_put_policy(struct eio_policy *p_ops)
+{
+
+ if (p_ops == NULL) {
+ pr_err("put_policy: Cannot decrement reference count of NULL policy");
+ return;
+ }
+ p_ops->sp_repl_exit();
+}
+
+
+/*
+ * Wrappers for policy specific functions. These default to nothing if the
+ * default policy is being used.
+ */
+int
+eio_repl_sets_init(struct eio_policy *p_ops)
+{
+
+ return (p_ops && p_ops->sp_repl_sets_init) ? p_ops->sp_repl_sets_init(p_ops) : 0;
+}
+
+
+int
+eio_repl_blk_init(struct eio_policy *p_ops)
+{
+
+ return (p_ops && p_ops->sp_repl_blk_init) ? p_ops->sp_repl_blk_init(p_ops) : 0;
+}
+
+
+void
+eio_find_reclaim_dbn(struct eio_policy *p_ops,
+ index_t start_index, index_t *index)
+{
+
+ p_ops->sp_find_reclaim_dbn(p_ops, start_index, index);
+}
+
+
+int
+eio_policy_clean_set(struct eio_policy *p_ops, index_t set, int to_clean)
+{
+
+ return p_ops->sp_clean_set(p_ops, set, to_clean);
+}
+
+
+/*
+ * LRU Specific functions
+ */
+void
+eio_policy_lru_pushblks(struct eio_policy *p_ops)
+{
+
+ if (p_ops && p_ops->sp_name == CACHE_REPL_LRU)
+ p_ops->sp_policy.lru->sl_lru_pushblks(p_ops);
+}
+
+
+void
+eio_policy_reclaim_lru_movetail(struct cache_c *dmc, index_t i, struct eio_policy *p_ops)
+{
+
+ if (p_ops && p_ops->sp_name == CACHE_REPL_LRU)
+ p_ops->sp_policy.lru->sl_reclaim_lru_movetail(dmc, i, p_ops);
+}
+
new file mode 100644
@@ -0,0 +1,106 @@
+/*
+ * eio_policy.h
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ * Made EnhanceIO specific changes.
+ * Saied Kazemi <skazemi@stec-inc.com>
+ * Siddharth Choudhuri <schoudhuri@stec-inc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef EIO_POLICY_H
+#define EIO_POLICY_H
+
+#include <linux/module.h>
+#include <linux/list.h>
+
+/*
+ * Defines for policy types (the EIO_REPL_XXX values are in eio.h
+ * so that user-space utilities can use those definitions).
+ */
+
+/*
+ * The LRU pointers are maintained as set-relative offsets, instead of
+ * pointers. This enables us to store the LRU pointers per cacheblock
+ * using 4 bytes instead of 16 bytes. The upshot of this is that we
+ * are required to clamp the associativity at an 8K max.
+ *
+ * XXX - The above comment is from the original code. It looks like an
+ * error: the maximum associativity should be 32K (2^15), not 8K.
+ */
+#define EIO_MAX_ASSOC 8192
+#define EIO_LRU_NULL 0xFFFF
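+
+/*
+ * For example (on a 64-bit kernel): keeping two 16-bit set-relative offsets
+ * (previous/next) costs 4 bytes of LRU state per cache block, versus 16
+ * bytes for two full pointers; EIO_LRU_NULL (0xFFFF) serves as the NULL
+ * link, which is why the offsets cannot address a full 64K entries.
+ */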
+
+/* Declarations to keep the compiler happy */
+struct cache_c;
+struct eio_policy;
+struct eio_lru;
+
+/* LRU specific data structures and functions */
+struct eio_lru {
+ void (*sl_lru_pushblks)(struct eio_policy *);
+ void (*sl_reclaim_lru_movetail)(struct cache_c *, index_t, struct eio_policy *);
+};
+
+/* Function prototypes for LRU wrappers in eio_policy.c */
+void eio_policy_lru_pushblks(struct eio_policy *);
+void eio_policy_reclaim_lru_movetail(struct cache_c *, index_t, struct eio_policy *);
+
+
+/*
+ * Context that captures the cache block replacement policy.
+ * There is one instance of this struct per dmc (cache)
+ */
+struct eio_policy {
+ int sp_name;
+ union {
+ struct eio_lru *lru;
+ } sp_policy;
+ int (*sp_repl_init)(struct cache_c *);
+ void (*sp_repl_exit)(void);
+ int (*sp_repl_sets_init)(struct eio_policy *);
+ int (*sp_repl_blk_init)(struct eio_policy *);
+ void (*sp_find_reclaim_dbn)(struct eio_policy *,
+ index_t start_index, index_t *index);
+ int (*sp_clean_set)(struct eio_policy *, index_t set, int);
+ struct cache_c *sp_dmc;
+};
+
+/*
+ * List of registered policies. There is one instance
+ * of this structure per policy type.
+ */
+struct eio_policy_header {
+ int sph_name;
+ struct eio_policy *(*sph_instance_init)(void);
+ struct list_head sph_list;
+};
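+
+/*
+ * Illustrative sketch only (not part of this patch): how a cache replacement
+ * policy module (such as enhanceio_lru.ko) is expected to plug in through
+ * these structures.  Identifiers prefixed "example_" are hypothetical.
+ */
+#if 0
+static struct eio_policy *example_instance_init(void);	/* hypothetical */
+
+static struct eio_policy_header example_policy_header = {
+	.sph_name		= CACHE_REPL_LRU,
+	.sph_instance_init	= example_instance_init,
+};
+
+static int __init example_policy_init(void)
+{
+	/* eio_register_policy() returns non-zero if the name is taken. */
+	return eio_register_policy(&example_policy_header) ? -EEXIST : 0;
+}
+
+static void __exit example_policy_exit(void)
+{
+	eio_unregister_policy(&example_policy_header);
+}
+
+module_init(example_policy_init);
+module_exit(example_policy_exit);
+#endif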
+
+
+/* Prototypes of generic functions in eio_policy */
+int *eio_repl_init(struct cache_c *);
+int eio_repl_sets_init(struct eio_policy *);
+int eio_repl_blk_init(struct eio_policy *);
+void eio_find_reclaim_dbn(struct eio_policy *, index_t start_index, index_t *index);
+int eio_policy_clean_set(struct eio_policy *, index_t, int);
+
+
+int eio_register_policy(struct eio_policy_header *);
+int eio_unregister_policy(struct eio_policy_header *);
+struct eio_policy *eio_get_policy(int);
+void eio_put_policy(struct eio_policy *);
+
+#endif /* EIO_POLICY_H */
+
new file mode 100644
@@ -0,0 +1,1825 @@
+/*
+ * eio_procfs.c
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ * Made EnhanceIO specific changes.
+ * Saied Kazemi <skazemi@stec-inc.com>
+ * Siddharth Choudhuri <schoudhuri@stec-inc.com>
+ *
+ * Copyright 2010 Facebook, Inc.
+ * Author: Mohan Srinivasan (mohan@facebook.com)
+ *
+ * Based on DM-Cache:
+ * Copyright (C) International Business Machines Corp., 2006
+ * Author: Ming Zhao (mingzhao@ufl.edu)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "eio.h"
+#define EIO_RELEASE "ENHANCEIO"
+
+#ifndef ENHANCEIO_GIT_COMMIT_HASH
+#define ENHANCEIO_GIT_COMMIT_HASH "unknown-git-version"
+#endif /* !ENHANCEIO_GIT_COMMIT_HASH */
+
+int
+eio_version_query(size_t buf_sz, char *bufp)
+{
+ if (unlikely(buf_sz == 0) || unlikely(bufp == NULL))
+ return -EINVAL;
+ snprintf(bufp, buf_sz, "EnhanceIO Version: %s %s (checksum disabled)",
+ EIO_RELEASE, ENHANCEIO_GIT_COMMIT_HASH);
+
+ bufp[buf_sz - 1] = '\0';
+
+ return 0;
+}
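+
+/*
+ * For reference: the string formatted here is the version text reported by
+ * EnhanceIO, e.g. through the module-level /proc/enhanceio/version entry
+ * registered in eio_module_procfs_init() below.
+ */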
+
+static struct sysctl_table_dir *sysctl_handle_dir;
+
+/*
+ * eio_zerostats_sysctl
+ */
+static int
+eio_zerostats_sysctl(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct cache_c *dmc = (struct cache_c *)table->extra1;
+ long long cached_blocks;
+ unsigned long flags = 0;
+
+ /* fetch the new tunable value or post the existing value */
+
+ if (!write) {
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_pending.zerostats = dmc->sysctl_active.zerostats;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ }
+
+ proc_dointvec(table, write, buffer, length, ppos);
+
+ /* do write processing */
+
+ if (write) {
+ /* do sanity check */
+
+ if ((dmc->sysctl_pending.zerostats != 0) &&
+ (dmc->sysctl_pending.zerostats != 1)) {
+ pr_err("0 or 1 are the only valid values for zerostats");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.zerostats == dmc->sysctl_active.zerostats) {
+			/* same value; nothing to do */
+ return 0;
+ }
+
+ /* Copy to active */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_active.zerostats = dmc->sysctl_pending.zerostats;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ /* apply the new tunable value */
+
+ if (dmc->sysctl_active.zerostats) {
+ /*
+			 * The number of cached blocks should not be zeroed,
+			 * since those blocks are still present on the cache
+			 * device. Zeroing it may lead to a negative count
+			 * during block invalidation and would misreport how
+			 * much data is cached.
+			 *
+			 * TODO - this should be protected by a spinlock, but
+			 * the existing spinlocks are inadequate to fully
+			 * protect it.
+ */
+
+ cached_blocks = atomic64_read(&dmc->eio_stats.cached_blocks);
+ memset(&dmc->eio_stats, 0, sizeof (struct eio_stats));
+ atomic64_set(&dmc->eio_stats.cached_blocks, cached_blocks);
+ }
+ }
+
+ return 0;
+}
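+
+/*
+ * For reference, once the common sysctls are registered below, the handler
+ * above is reachable through a path of the form
+ * /proc/sys/dev/enhanceio/<cache_name>/zero_stats; e.g. writing "1" to it
+ * zeroes the statistics while preserving the cached-block count.
+ */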
+
+/*
+ * eio_mem_limit_pct_sysctl
+ * - sets the eio sysctl mem_limit_pct value
+ */
+static int
+eio_mem_limit_pct_sysctl(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct cache_c *dmc = (struct cache_c *)table->extra1;
+ unsigned long flags = 0;
+
+ /* fetch the new tunable value or post the existing value */
+
+ if (!write) {
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_pending.mem_limit_pct = dmc->sysctl_active.mem_limit_pct;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ }
+
+ proc_dointvec(table, write, buffer, length, ppos);
+
+ /* do write processing */
+
+ if (write) {
+ /* do sanity check */
+ if ((dmc->sysctl_pending.mem_limit_pct < 0) ||
+ (dmc->sysctl_pending.mem_limit_pct > 100)) {
+ pr_err("only valid percents are [0 - 100] for mem_limit_pct");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.mem_limit_pct == dmc->sysctl_active.mem_limit_pct) {
+ /* same value. Nothing more to do */
+ return 0;
+ }
+
+ /* Copy to active */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_active.mem_limit_pct = dmc->sysctl_pending.mem_limit_pct;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ }
+
+ return 0;
+}
+
+/*
+ * eio_clean_sysctl
+ */
+static int
+eio_clean_sysctl(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct cache_c *dmc = (struct cache_c *)table->extra1;
+ unsigned long flags = 0;
+
+ /* fetch the new tunable value or post the existing value */
+
+ if (!write) {
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_pending.do_clean = dmc->sysctl_active.do_clean;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ }
+
+ proc_dointvec(table, write, buffer, length, ppos);
+
+ /* do write processing */
+
+ if (write) {
+ /* Do sanity check */
+
+ if (dmc->mode != CACHE_MODE_WB) {
+ /* do_clean is only valid for writeback cache */
+ pr_err("do_clean is only valid for writeback cache");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.do_clean & ~(EIO_CLEAN_START | EIO_CLEAN_KEEP)) {
+ pr_err("do_clean should be either clean start/clean keep");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.do_clean == dmc->sysctl_active.do_clean) {
+ /* New and old values are same. No work required */
+ return 0;
+ }
+
+ /* Copy to active and apply the new tunable value */
+
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+
+ if (dmc->cache_flags & CACHE_FLAGS_MOD_INPROG) {
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ pr_err("do_clean called while cache modification in progress");
+ return -EBUSY;
+ } else {
+ dmc->sysctl_active.do_clean = dmc->sysctl_pending.do_clean;
+
+ if (dmc->sysctl_active.do_clean) {
+ atomic_set(&dmc->clean_index, 0);
+ dmc->sysctl_active.do_clean |= EIO_CLEAN_START;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ /*
+ * Wake up the clean thread.
+				 * The sync thread will do the clean and, once
+				 * complete, will reset the clean_start flag.
+				 * The clean_keep flag will remain set (unless
+				 * reset by the user) and will prevent new I/Os
+				 * from making blocks dirty.
+ */
+
+ spin_lock_irqsave(&dmc->clean_sl, flags);
+ EIO_SET_EVENT_AND_UNLOCK(&dmc->clean_event,
+ &dmc->clean_sl, flags);
+ } else {
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * eio_dirty_high_threshold_sysctl
+ */
+static int
+eio_dirty_high_threshold_sysctl(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct cache_c *dmc = (struct cache_c *)table->extra1;
+ unsigned long flags = 0;
+
+ /* fetch the new tunable value or post the existing value */
+
+ if (!write) {
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_pending.dirty_high_threshold = dmc->sysctl_active.dirty_high_threshold;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ }
+
+ proc_dointvec(table, write, buffer, length, ppos);
+
+ /* do write processing */
+
+ if (write) {
+ int error;
+ uint32_t old_value;
+
+ /* do sanity check */
+
+ if (dmc->mode != CACHE_MODE_WB) {
+ pr_err("dirty_high_threshold is only valid for writeback cache");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.dirty_high_threshold > 100) {
+ pr_err("dirty_high_threshold percentage should be [0 - 100]");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.dirty_high_threshold < dmc->sysctl_active.dirty_low_threshold) {
+ pr_err("dirty high shouldn't be less than dirty low threshold");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.dirty_high_threshold == dmc->sysctl_active.dirty_high_threshold) {
+ /* new is same as old value. No need to take any action */
+ return 0;
+ }
+
+ /* update the active value with the new tunable value */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ old_value = dmc->sysctl_active.dirty_high_threshold;
+ dmc->sysctl_active.dirty_high_threshold = dmc->sysctl_pending.dirty_high_threshold;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ /* apply the new tunable value */
+
+ /* Store the change persistently */
+ error = eio_sb_store(dmc);
+ if (error) {
+ /* restore back the old value and return error */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_active.dirty_high_threshold = old_value;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ return error;
+ }
+
+ /* if we reduced the high threshold, check if we require cache cleaning */
+ if (old_value > dmc->sysctl_active.dirty_high_threshold) {
+ eio_comply_dirty_thresholds(dmc, -1);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * eio_dirty_low_threshold_sysctl
+ */
+static int
+eio_dirty_low_threshold_sysctl(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct cache_c *dmc = (struct cache_c *)table->extra1;
+ unsigned long flags = 0;
+
+ /* fetch the new tunable value or post the existing value */
+
+ if (!write) {
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_pending.dirty_low_threshold = dmc->sysctl_active.dirty_low_threshold;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ }
+
+ proc_dointvec(table, write, buffer, length, ppos);
+
+ /* do write processing */
+
+ if (write) {
+ int error;
+ uint32_t old_value;
+
+ /* do sanity check */
+
+ if (dmc->mode != CACHE_MODE_WB) {
+ pr_err("dirty_low_threshold is valid for only writeback cache");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.dirty_low_threshold > 100) {
+ pr_err("dirty_low_threshold percentage should be [0 - 100]");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.dirty_low_threshold > dmc->sysctl_active.dirty_high_threshold) {
+ pr_err("dirty low shouldn't be more than dirty high threshold");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.dirty_low_threshold == dmc->sysctl_active.dirty_low_threshold) {
+ /* new is same as old value. No need to take any action */
+ return 0;
+ }
+
+ /* update the active value with the new tunable value */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ old_value = dmc->sysctl_active.dirty_low_threshold;
+ dmc->sysctl_active.dirty_low_threshold = dmc->sysctl_pending.dirty_low_threshold;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ /* apply the new tunable value */
+
+ /* Store the change persistently */
+ error = eio_sb_store(dmc);
+ if (error) {
+ /* restore back the old value and return error */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_active.dirty_low_threshold = old_value;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ return error;
+ }
+
+ if (old_value > dmc->sysctl_active.dirty_low_threshold) {
+ /*
+			 * Lowering the low threshold by itself shouldn't trigger
+			 * new cleans. However, because the tunables are set one
+			 * at a time from user space, a clean that the high
+			 * threshold should have triggered may not have happened
+			 * yet, so call the comply function here now that the low
+			 * value has changed.
+ */
+ eio_comply_dirty_thresholds(dmc, -1);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * eio_dirty_set_high_threshold_sysctl
+ */
+static int
+eio_dirty_set_high_threshold_sysctl(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct cache_c *dmc = (struct cache_c *)table->extra1;
+ unsigned long flags = 0;
+
+ /* fetch the new tunable value or post the existing value */
+
+ if (!write) {
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_pending.dirty_set_high_threshold = dmc->sysctl_active.dirty_set_high_threshold;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ }
+
+ proc_dointvec(table, write, buffer, length, ppos);
+
+ /* do write processing */
+
+ if (write) {
+ int error;
+ uint32_t old_value;
+ u_int64_t i;
+
+ /* do sanity check */
+
+ if (dmc->mode != CACHE_MODE_WB) {
+ pr_err("dirty_set_high_threshold is valid only for writeback cache");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.dirty_set_high_threshold > 100) {
+ pr_err("dirty_set_high_threshold percentage should be [0 - 100]");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.dirty_set_high_threshold < dmc->sysctl_active.dirty_set_low_threshold) {
+ pr_err("dirty_set_high_threshold shouldn't be less than dirty low threshold");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.dirty_set_high_threshold == dmc->sysctl_active.dirty_set_high_threshold) {
+ /* new is same as old value. No need to take any action */
+ return 0;
+ }
+
+ /* update the active value with the new tunable value */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ old_value = dmc->sysctl_active.dirty_set_high_threshold;
+ dmc->sysctl_active.dirty_set_high_threshold = dmc->sysctl_pending.dirty_set_high_threshold;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ /* apply the new tunable value */
+
+ /* Store the change persistently */
+ error = eio_sb_store(dmc);
+ if (error) {
+ /* restore back the old value and return error */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_active.dirty_set_high_threshold = old_value;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ return error;
+ }
+
+ if (old_value > dmc->sysctl_active.dirty_set_high_threshold) {
+ /* Check each set for dirty blocks cleaning */
+ for (i = 0 ; i < (dmc->size >> dmc->consecutive_shift); i++) {
+ eio_comply_dirty_thresholds(dmc, i);
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * eio_dirty_set_low_threshold_sysctl
+ */
+static int
+eio_dirty_set_low_threshold_sysctl(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct cache_c *dmc = (struct cache_c *)table->extra1;
+ unsigned long flags = 0;
+
+ /* fetch the new tunable value or post the existing value */
+
+ if (!write) {
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_pending.dirty_set_low_threshold = dmc->sysctl_active.dirty_set_low_threshold;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ }
+
+ proc_dointvec(table, write, buffer, length, ppos);
+
+ /* do write processing */
+
+ if (write) {
+ int error;
+ uint32_t old_value;
+ u_int64_t i;
+
+ /* do sanity check */
+
+ if (dmc->mode != CACHE_MODE_WB) {
+ pr_err("dirty_set_low_threshold is valid only for writeback cache");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.dirty_set_low_threshold > 100) {
+ pr_err("dirty_set_low_threshold percentage should be [0 - 100]");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.dirty_set_low_threshold > dmc->sysctl_active.dirty_set_high_threshold) {
+ pr_err("dirty_set_low_threshold shouldn't be more than dirty_set_high_threshold");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.dirty_set_low_threshold == dmc->sysctl_active.dirty_set_low_threshold) {
+ /* new is same as old value. No need to take any action */
+ return 0;
+ }
+
+ /* update the active value with the new tunable value */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ old_value = dmc->sysctl_active.dirty_set_low_threshold;
+ dmc->sysctl_active.dirty_set_low_threshold = dmc->sysctl_pending.dirty_set_low_threshold;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ /* apply the new tunable value */
+
+ /* Store the change persistently */
+ error = eio_sb_store(dmc);
+ if (error) {
+ /* restore back the old value and return error */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_active.dirty_set_low_threshold = old_value;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ return error;
+ }
+
+ /*
+		 * Lowering the set-level low threshold by itself shouldn't
+		 * trigger new cleans. However, because the tunables are set one
+		 * at a time from user space, a clean that the high threshold
+		 * should have triggered may not have happened yet, so call the
+		 * comply function again now that the low value has changed.
+ */
+ if (old_value > dmc->sysctl_active.dirty_set_low_threshold) {
+ /* Check each set for dirty blocks cleaning */
+ for (i = 0 ; i < (dmc->size >> dmc->consecutive_shift); i++) {
+ eio_comply_dirty_thresholds(dmc, i);
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * eio_autoclean_threshold_sysctl
+ */
+static int
+eio_autoclean_threshold_sysctl(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct cache_c *dmc = (struct cache_c *)table->extra1;
+ unsigned long flags = 0;
+
+ /* fetch the new tunable value or post existing value */
+
+ if (!write) {
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_pending.autoclean_threshold = dmc->sysctl_active.autoclean_threshold;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ }
+
+ proc_dointvec(table, write, buffer, length, ppos);
+
+ /* do write processing */
+
+ if (write) {
+ int error;
+ int old_value;
+
+ /* do sanity check */
+
+ if (dmc->mode != CACHE_MODE_WB) {
+ pr_err("autoclean_threshold is valid only for writeback cache");
+ return -EINVAL;
+ }
+
+ if ((dmc->sysctl_pending.autoclean_threshold < 0) ||
+ (dmc->sysctl_pending.autoclean_threshold > AUTOCLEAN_THRESH_MAX)) {
+ pr_err("autoclean_threshold is valid range is 0 to %d", AUTOCLEAN_THRESH_MAX);
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.autoclean_threshold == dmc->sysctl_active.autoclean_threshold) {
+ /* new is same as old value. No need to take any action */
+ return 0;
+ }
+
+ /* update the active value with the new tunable value */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ old_value = dmc->sysctl_active.autoclean_threshold;
+ dmc->sysctl_active.autoclean_threshold = dmc->sysctl_pending.autoclean_threshold;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ /* apply the new tunable value */
+
+ /* Store the change persistently */
+ error = eio_sb_store(dmc);
+ if (error) {
+ /* restore back the old value and return error */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_active.autoclean_threshold = old_value;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ return error;
+ }
+
+ /* Ensure new thresholds are being complied */
+ eio_comply_dirty_thresholds(dmc, -1);
+ }
+
+ return 0;
+}
+
+/*
+ * eio_time_based_clean_interval_sysctl
+ */
+static int
+eio_time_based_clean_interval_sysctl(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct cache_c *dmc = (struct cache_c *)table->extra1;
+ unsigned long flags = 0;
+
+
+ /* fetch the new tunable value or post existing value */
+
+ if (!write) {
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_pending.time_based_clean_interval = dmc->sysctl_active.time_based_clean_interval;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ }
+
+ proc_dointvec(table, write, buffer, length, ppos);
+
+ /* do write processing */
+
+ if (write) {
+ int error;
+ uint32_t old_value;
+
+ /* do sanity check */
+
+ if (dmc->mode != CACHE_MODE_WB) {
+ pr_err("time_based_clean_interval is valid only for writeback cache");
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.time_based_clean_interval > TIME_BASED_CLEAN_INTERVAL_MAX) {
+ /* valid values are 0 to TIME_BASED_CLEAN_INTERVAL_MAX */
+ pr_err("time_based_clean_interval valid range is 0 to %u", TIME_BASED_CLEAN_INTERVAL_MAX);
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.time_based_clean_interval == dmc->sysctl_active.time_based_clean_interval) {
+ /* new is same as old value */
+ return 0;
+ }
+
+ /* update the active value with the new tunable value */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ old_value = dmc->sysctl_active.time_based_clean_interval;
+ dmc->sysctl_active.time_based_clean_interval = dmc->sysctl_pending.time_based_clean_interval;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ /* apply the new tunable value */
+
+ /* Store the change persistently */
+ error = eio_sb_store(dmc);
+ if (error) {
+ /* restore back the old value and return error */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_active.time_based_clean_interval = old_value;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ return error;
+ }
+
+ /* Reschedule the time based clean, based on new interval */
+ cancel_delayed_work_sync(&dmc->clean_aged_sets_work);
+ spin_lock_irqsave(&dmc->dirty_set_lru_lock, flags);
+ dmc->is_clean_aged_sets_sched = 0;
+ if (dmc->sysctl_active.time_based_clean_interval && atomic64_read(&dmc->nr_dirty)) {
+ schedule_delayed_work(&dmc->clean_aged_sets_work,
+ dmc->sysctl_active.time_based_clean_interval * 60 * HZ);
+ dmc->is_clean_aged_sets_sched = 1;
+ }
+ spin_unlock_irqrestore(&dmc->dirty_set_lru_lock, flags);
+ }
+
+ return 0;
+}
+
+static void eio_sysctl_register_writeback(struct cache_c *dmc);
+static void eio_sysctl_unregister_writeback(struct cache_c *dmc);
+static void eio_sysctl_register_invalidate(struct cache_c *dmc);
+static void eio_sysctl_unregister_invalidate(struct cache_c *dmc);
+
+/*
+ * eio_control_sysctl
+ */
+int
+eio_control_sysctl(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int rv = 0;
+ struct cache_c *dmc = (struct cache_c *)table->extra1;
+ unsigned long flags = 0;
+
+ /* fetch the new tunable value */
+
+ if (!write) {
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_pending.control = dmc->sysctl_active.control;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+ }
+
+ proc_dointvec(table, write, buffer, length, ppos);
+
+ /* do write processing */
+
+ if (write) {
+ /* do sanity check */
+
+ if (dmc->sysctl_pending.control > CACHE_CONTROL_FLAG_MAX ||
+ dmc->sysctl_pending.control < 0) {
+			/* valid values are from 0 to CACHE_CONTROL_FLAG_MAX */
+			pr_err("control valid values are from 0 to %d", CACHE_CONTROL_FLAG_MAX);
+ return -EINVAL;
+ }
+
+ if (dmc->sysctl_pending.control == dmc->sysctl_active.control) {
+ /* new is same as old value. No work required */
+ return 0;
+ }
+
+ /* update the active value with the new tunable value */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_active.control = dmc->sysctl_pending.control;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ /* apply the new tunable value */
+
+ switch (dmc->sysctl_active.control) {
+ case CACHE_VERBOSE_OFF:
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->cache_flags &= ~CACHE_FLAGS_VERBOSE;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ pr_info("Turning off verbose mode");
+ break;
+ case CACHE_VERBOSE_ON:
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->cache_flags |= CACHE_FLAGS_VERBOSE;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ pr_info("Turning on verbose mode");
+ break;
+ case CACHE_WRITEBACK_ON:
+ if (dmc->sysctl_handle_writeback == NULL)
+ eio_sysctl_register_writeback(dmc);
+ break;
+ case CACHE_WRITEBACK_OFF:
+ if (dmc->sysctl_handle_writeback)
+ eio_sysctl_unregister_writeback(dmc);
+ break;
+ case CACHE_INVALIDATE_ON:
+ if (dmc->sysctl_handle_invalidate == NULL) {
+ eio_sysctl_register_invalidate(dmc);
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->cache_flags |= CACHE_FLAGS_INVALIDATE;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ } else
+ pr_info("Invalidate API already registered");
+ break;
+ case CACHE_INVALIDATE_OFF:
+ if (dmc->sysctl_handle_invalidate) {
+ eio_sysctl_unregister_invalidate(dmc);
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->cache_flags &= ~CACHE_FLAGS_INVALIDATE;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ } else
+ pr_info("Invalidate API not registered");
+ break;
+ case CACHE_FAST_REMOVE_ON:
+ if (dmc->mode != CACHE_MODE_WB) {
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->cache_flags |= CACHE_FLAGS_FAST_REMOVE;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ if (CACHE_VERBOSE_IS_SET(dmc))
+ pr_info("Turning on fast remove");
+ } else {
+#ifdef EIO_DEBUG
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->cache_flags |= CACHE_FLAGS_FAST_REMOVE;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ if (CACHE_VERBOSE_IS_SET(dmc))
+ pr_info("Turning on fast remove");
+#else
+ pr_err("Invalid control value: 0x%x", dmc->sysctl_active.control);
+ rv = -1;
+#endif /* EIO_DEBUG */
+ }
+ break;
+ case CACHE_FAST_REMOVE_OFF:
+ if (dmc->mode != CACHE_MODE_WB) {
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->cache_flags &= ~CACHE_FLAGS_FAST_REMOVE;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ if (CACHE_VERBOSE_IS_SET(dmc))
+ pr_info("Turning off fast remove");
+ } else {
+#ifdef EIO_DEBUG
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->cache_flags &= ~CACHE_FLAGS_FAST_REMOVE;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ if (CACHE_VERBOSE_IS_SET(dmc))
+ pr_info("Turning off fast remove");
+#else
+ pr_err("Invalid control value: 0x%x", dmc->sysctl_active.control);
+ rv = -1;
+#endif /* EIO_DEBUG */
+ }
+ break;
+ default:
+ pr_err("Invalid control value: 0x%x", dmc->sysctl_active.control);
+ rv = -1;
+ }
+ }
+
+ return rv;
+}
+
+#define PROC_STR "enhanceio"
+#define PROC_VER_STR "enhanceio/version"
+#define PROC_STATS "stats"
+#define PROC_ERRORS "errors"
+#define PROC_IOSZ_HIST "io_hist"
+#define PROC_CONFIG "config"
+
+static int eio_invalidate_sysctl(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos);
+static void *eio_find_sysctl_data(struct cache_c *dmc, ctl_table *vars);
+static char *eio_cons_sysctl_devname(struct cache_c *dmc);
+static char *eio_cons_procfs_cachename(struct cache_c *dmc, char *path_component);
+static void eio_sysctl_register_common(struct cache_c *dmc);
+static void eio_sysctl_unregister_common(struct cache_c *dmc);
+static void eio_sysctl_register_dir(void);
+static void eio_sysctl_unregister_dir(void);
+static int eio_stats_show(struct seq_file *seq, void *v);
+static int eio_stats_open(struct inode *inode, struct file *file);
+static int eio_errors_show(struct seq_file *seq, void *v);
+static int eio_errors_open(struct inode *inode, struct file *file);
+static int eio_iosize_hist_show(struct seq_file *seq, void *v);
+static int eio_iosize_hist_open(struct inode *inode, struct file *file);
+static int eio_version_show(struct seq_file *seq, void *v);
+static int eio_version_open(struct inode *inode, struct file *file);
+static int eio_config_show(struct seq_file *seq, void *v);
+static int eio_config_open(struct inode *inode, struct file *file);
+
+static struct file_operations eio_version_operations = {
+ .open = eio_version_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct file_operations eio_stats_operations = {
+ .open = eio_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct file_operations eio_errors_operations = {
+ .open = eio_errors_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct file_operations eio_iosize_hist_operations = {
+ .open = eio_iosize_hist_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct file_operations eio_config_operations = {
+ .open = eio_config_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Each ctl_table array needs one more slot than the actual number of
+ * entries, zero-padded at the end. The arrays below are therefore
+ * sized NUM_*_SYSCTLS + 1.
+ */
+
+#define PROC_SYS_ROOT_NAME "dev"
+#define PROC_SYS_DIR_NAME "enhanceio"
+#define PROC_SYS_CACHE_NAME "enhanceio-dev"
+
+/*
+ * The purpose of sysctl_table_dir is to create the "enhanceio"
+ * dir under /proc/sys/dev/. The creation is done during module
+ * load time and the dir is removed when module is removed.
+ *
+ * This was added because otherwise the first cache instance falsely
+ * assumes that /proc/sys/kernel/ is its parent instead of
+ * /proc/sys/dev, leading to an incorrect reference count. With
+ * multiple cache instances, removing the last one then drops the
+ * kernel's reference count to 0, which triggers a kernel warning at
+ * runtime. Hopefully, this will be fixed in the kernel sometime.
+ */
+static struct sysctl_table_dir {
+ struct ctl_table_header *sysctl_header;
+ ctl_table vars[0 + 1];
+ ctl_table dev[0 + 1];
+ ctl_table dir[1 + 1];
+ ctl_table root[1 + 1];
+} sysctl_template_dir = {
+ .vars = { },
+ .dev = { },
+ .dir = {
+ {
+ .procname = PROC_SYS_DIR_NAME,
+ .maxlen = 0,
+ .mode = S_IRUGO|S_IXUGO,
+ .child = sysctl_template_dir.dev,
+ },
+ },
+ .root = {
+ {
+ .procname = PROC_SYS_ROOT_NAME,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = sysctl_template_dir.dir,
+ },
+ },
+};
+
+
+#define NUM_COMMON_SYSCTLS 3
+
+static struct sysctl_table_common {
+ struct ctl_table_header *sysctl_header;
+ ctl_table vars[NUM_COMMON_SYSCTLS + 1];
+ ctl_table dev[1 + 1];
+ ctl_table dir[1 + 1];
+ ctl_table root[1 + 1];
+} sysctl_template_common = {
+ .vars = {
+ { /* 1 */
+ .procname = "zero_stats",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &eio_zerostats_sysctl,
+ },
+ { /* 2 */
+ .procname = "mem_limit_pct",
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &eio_mem_limit_pct_sysctl,
+ },
+ { /* 3 */
+ .procname = "control",
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &eio_control_sysctl,
+ },
+ },
+ .dev = {
+ {
+ .procname = PROC_SYS_CACHE_NAME,
+ .maxlen = 0,
+ .mode = S_IRUGO|S_IXUGO,
+ .child = sysctl_template_common.vars,
+ },
+ },
+ .dir = {
+ {
+ .procname = PROC_SYS_DIR_NAME,
+ .maxlen = 0,
+ .mode = S_IRUGO|S_IXUGO,
+ .child = sysctl_template_common.dev,
+ },
+ },
+ .root = {
+ {
+ .procname = PROC_SYS_ROOT_NAME,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = sysctl_template_common.dir,
+ },
+ },
+};
+
+#define NUM_WRITEBACK_SYSCTLS 7
+
+static struct sysctl_table_writeback {
+ struct ctl_table_header *sysctl_header;
+ ctl_table vars[NUM_WRITEBACK_SYSCTLS + 1];
+ ctl_table dev[1 + 1];
+ ctl_table dir[1 + 1];
+ ctl_table root[1 + 1];
+} sysctl_template_writeback = {
+ .vars = {
+ { /* 1 */
+ .procname = "do_clean",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &eio_clean_sysctl,
+ },
+ { /* 2 */
+ .procname = "time_based_clean_interval",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &eio_time_based_clean_interval_sysctl,
+ },
+ { /* 3 */
+ .procname = "autoclean_threshold",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &eio_autoclean_threshold_sysctl,
+ },
+ { /* 4 */
+ .procname = "dirty_high_threshold",
+ .maxlen = sizeof(uint32_t),
+ .mode = 0644,
+ .proc_handler = &eio_dirty_high_threshold_sysctl,
+ },
+ { /* 5 */
+ .procname = "dirty_low_threshold",
+ .maxlen = sizeof(uint32_t),
+ .mode = 0644,
+ .proc_handler = &eio_dirty_low_threshold_sysctl,
+ },
+ { /* 6 */
+ .procname = "dirty_set_high_threshold",
+ .maxlen = sizeof(uint32_t),
+ .mode = 0644,
+ .proc_handler = &eio_dirty_set_high_threshold_sysctl,
+ },
+ { /* 7 */
+ .procname = "dirty_set_low_threshold",
+ .maxlen = sizeof(uint32_t),
+ .mode = 0644,
+ .proc_handler = &eio_dirty_set_low_threshold_sysctl,
+ },
+ },
+ .dev = {
+ {
+ .procname = PROC_SYS_CACHE_NAME,
+ .maxlen = 0,
+ .mode = S_IRUGO|S_IXUGO,
+ .child = sysctl_template_writeback.vars,
+ },
+ },
+ .dir = {
+ {
+ .procname = PROC_SYS_DIR_NAME,
+ .maxlen = 0,
+ .mode = S_IRUGO|S_IXUGO,
+ .child = sysctl_template_writeback.dev,
+ },
+ },
+ .root = {
+ {
+ .procname = PROC_SYS_ROOT_NAME,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = sysctl_template_writeback.dir,
+ },
+ },
+};
+
+#define NUM_INVALIDATE_SYSCTLS (1)
+static struct sysctl_table_invalidate {
+ struct ctl_table_header *sysctl_header;
+ ctl_table vars[NUM_INVALIDATE_SYSCTLS + 1];
+ ctl_table dev[1 + 1];
+ ctl_table dir[1 + 1];
+ ctl_table root[1 + 1];
+} sysctl_template_invalidate = {
+ .vars = {
+ { /* 1 */
+ .procname = "invalidate",
+ .maxlen = sizeof (u_int64_t),
+ .mode = 0644,
+ .proc_handler = &eio_invalidate_sysctl,
+ },
+ },
+ .dev = {
+ {
+ .procname = PROC_SYS_CACHE_NAME,
+ .maxlen = 0,
+ .mode = S_IRUGO|S_IXUGO,
+ .child = sysctl_template_invalidate.vars,
+ },
+ },
+ .dir = {
+ {
+ .procname = PROC_SYS_DIR_NAME,
+ .maxlen = 0,
+ .mode = S_IRUGO|S_IXUGO,
+ .child = sysctl_template_invalidate.dev,
+ },
+ },
+ .root = {
+ {
+ .procname = PROC_SYS_ROOT_NAME,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = sysctl_template_invalidate.dir,
+ },
+ },
+};
+
+
+/*
+ * eio_module_procfs_init -- called from "eio_init()"
+ */
+void
+eio_module_procfs_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ if (proc_mkdir(PROC_STR, NULL)) {
+ entry = create_proc_entry(PROC_VER_STR, 0, NULL);
+ if (entry)
+ entry->proc_fops = &eio_version_operations;
+ }
+ eio_sysctl_register_dir();
+}
+
+
+/*
+ * eio_module_procfs_exit -- called from "eio_exit()"
+ */
+void
+eio_module_procfs_exit(void)
+{
+ (void)remove_proc_entry(PROC_VER_STR, NULL);
+ (void)remove_proc_entry(PROC_STR, NULL);
+
+ eio_sysctl_unregister_dir();
+}
+
+
+/*
+ * eio_procfs_ctr -- called from "eio_ctr()"
+ */
+void
+eio_procfs_ctr(struct cache_c *dmc)
+{
+ char *s;
+ struct proc_dir_entry *entry;
+
+ s = eio_cons_procfs_cachename(dmc, "");
+ entry = proc_mkdir(s, NULL);
+ kfree(s);
+ if (entry == NULL) {
+ pr_err("Failed to create /proc/%s", s);
+ return;
+ }
+
+ s = eio_cons_procfs_cachename(dmc, PROC_STATS);
+ entry = create_proc_entry(s, 0, NULL);
+ if (entry) {
+ entry->proc_fops = &eio_stats_operations;
+ entry->data = dmc;
+ }
+ kfree(s);
+
+ s = eio_cons_procfs_cachename(dmc, PROC_ERRORS);
+ entry = create_proc_entry(s, 0, NULL);
+ if (entry) {
+ entry->proc_fops = &eio_errors_operations;
+ entry->data = dmc;
+ }
+ kfree(s);
+
+ s = eio_cons_procfs_cachename(dmc, PROC_IOSZ_HIST);
+ entry = create_proc_entry(s, 0, NULL);
+ if (entry) {
+ entry->proc_fops = &eio_iosize_hist_operations;
+ entry->data = dmc;
+ }
+ kfree(s);
+
+
+ s = eio_cons_procfs_cachename(dmc, PROC_CONFIG);
+ entry = create_proc_entry(s, 0, NULL);
+ if (entry) {
+ entry->proc_fops = &eio_config_operations;
+ entry->data = dmc;
+ }
+ kfree(s);
+
+ eio_sysctl_register_common(dmc);
+ if (dmc->mode == CACHE_MODE_WB)
+ eio_sysctl_register_writeback(dmc);
+ if (CACHE_INVALIDATE_IS_SET(dmc))
+ eio_sysctl_register_invalidate(dmc);
+}
+
+
+/*
+ * eio_procfs_dtr -- called from "eio_dtr()"
+ */
+void
+eio_procfs_dtr(struct cache_c *dmc)
+{
+ char *s;
+
+ s = eio_cons_procfs_cachename(dmc, PROC_STATS);
+ remove_proc_entry(s, NULL);
+ kfree(s);
+
+ s = eio_cons_procfs_cachename(dmc, PROC_ERRORS);
+ remove_proc_entry(s, NULL);
+ kfree(s);
+
+ s = eio_cons_procfs_cachename(dmc, PROC_IOSZ_HIST);
+ remove_proc_entry(s, NULL);
+ kfree(s);
+
+ s = eio_cons_procfs_cachename(dmc, PROC_CONFIG);
+ remove_proc_entry(s, NULL);
+ kfree(s);
+
+ s = eio_cons_procfs_cachename(dmc, "");
+ remove_proc_entry(s, NULL);
+ kfree(s);
+
+ if (dmc->sysctl_handle_invalidate)
+ eio_sysctl_unregister_invalidate(dmc);
+ if (dmc->sysctl_handle_writeback)
+ eio_sysctl_unregister_writeback(dmc);
+ eio_sysctl_unregister_common(dmc);
+}
+
+
+static spinlock_t invalidate_spin_lock;
+
+/*
+ * eio_invalidate_sysctl
+ */
+static int
+eio_invalidate_sysctl(ctl_table *table, int write, void __user *buffer,
+ size_t *length, loff_t *ppos)
+{
+ static int have_sector;
+ static u_int64_t sector;
+ static u_int64_t num_sectors;
+ int rv;
+ unsigned long int flags;
+ struct cache_c *dmc;
+
+
+ spin_lock_irqsave(&invalidate_spin_lock, flags);
+
+ dmc = (struct cache_c *)table->extra1;
+ if (dmc == NULL) {
+ pr_err("Cannot invalidate due to unexpected NULL cache pointer");
+ spin_unlock_irqrestore(&invalidate_spin_lock, flags);
+ return -EBUSY;
+ }
+
+ table->extra1 = NULL;
+ proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ table->extra1 = dmc;
+
+ spin_unlock_irqrestore(&invalidate_spin_lock, flags);
+
+ rv = 0;
+
+ if (write) {
+		/* TODO (Harish): add appropriate sanity checks here. */
+
+ /* update the active value with the new tunable value */
+ spin_lock_irqsave(&dmc->cache_spin_lock, flags);
+ dmc->sysctl_active.invalidate = dmc->sysctl_pending.invalidate;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
+
+ /* apply the new tunable value */
+
+ if (have_sector) {
+ num_sectors = dmc->sysctl_active.invalidate;
+
+ rv = eio_invalidate_sanity_check(dmc, sector, &num_sectors);
+
+ /* Invalidate only if sanity passes and reset the return value. */
+ if (rv == 0)
+ eio_inval_range(dmc, sector, (unsigned)to_bytes(num_sectors));
+
+ rv = 0;
+ have_sector = 0;
+
+ } else {
+ sector = dmc->sysctl_active.invalidate;
+ have_sector = 1;
+ num_sectors = 0;
+ }
+ }
+
+ if (CACHE_VERBOSE_IS_SET(dmc) && num_sectors) {
+ pr_info("eio_inval_range: Invalidated sector range from sector=%lu to sector=%lu",
+ (long unsigned int)sector, (long unsigned int)num_sectors);
+ }
+
+ return rv;
+}
+
+/*
+ * eio_find_sysctl_data
+ */
+static void *
+eio_find_sysctl_data(struct cache_c *dmc, ctl_table *vars)
+{
+
+ if (strcmp(vars->procname, "do_clean") == 0) return (void *)&dmc->sysctl_pending.do_clean;
+ if (strcmp(vars->procname, "time_based_clean_interval") == 0) return (void *)&dmc->sysctl_pending.time_based_clean_interval;
+ if (strcmp(vars->procname, "dirty_high_threshold") == 0) return (void *)&dmc->sysctl_pending.dirty_high_threshold;
+ if (strcmp(vars->procname, "dirty_low_threshold") == 0) return (void *)&dmc->sysctl_pending.dirty_low_threshold;
+ if (strcmp(vars->procname, "dirty_set_high_threshold") == 0) return (void *)&dmc->sysctl_pending.dirty_set_high_threshold;
+ if (strcmp(vars->procname, "dirty_set_low_threshold") == 0) return (void *)&dmc->sysctl_pending.dirty_set_low_threshold;
+ if (strcmp(vars->procname, "autoclean_threshold") == 0) return (void *)&dmc->sysctl_pending.autoclean_threshold;
+ if (strcmp(vars->procname, "zero_stats") == 0) return (void *)&dmc->sysctl_pending.zerostats;
+ if (strcmp(vars->procname, "mem_limit_pct") == 0) return (void *)&dmc->sysctl_pending.mem_limit_pct;
+ if (strcmp(vars->procname, "control") == 0) return (void *)&dmc->sysctl_pending.control;
+ if (strcmp(vars->procname, "invalidate") == 0) return (void *)&dmc->sysctl_pending.invalidate;
+
+ pr_err("Cannot find sysctl data for %s", vars->procname);
+ return NULL;
+}
+
+
+/*
+ * eio_cons_sysctl_devname
+ */
+static char *
+eio_cons_sysctl_devname(struct cache_c *dmc)
+{
+ char *pathname;
+
+ if (dmc->cache_name[0]) {
+ pathname = kzalloc(strlen(dmc->cache_name) + 1, GFP_KERNEL);
+ if (pathname)
+ strcpy(pathname, dmc->cache_name);
+ else
+ pr_err("Failed to allocate memory");
+ } else {
+ pr_err("Cache name is NULL");
+ pathname = NULL;
+ }
+
+ return pathname;
+}
+
+
+/*
+ * eio_cons_procfs_cachename
+ */
+static char *
+eio_cons_procfs_cachename(struct cache_c *dmc, char *path_component)
+{
+ char *pathname;
+
+ if (dmc->cache_name[0]) {
+ pathname = kzalloc(strlen(PROC_SYS_DIR_NAME) + 1 + strlen(dmc->cache_name) + 1 +
+ strlen(path_component) + 1, GFP_KERNEL);
+ if (pathname) {
+ strcpy(pathname, PROC_SYS_DIR_NAME);
+ strcat(pathname, "/");
+ strcat(pathname, dmc->cache_name);
+ if (strcmp(path_component, "") != 0) {
+ strcat(pathname, "/");
+ strcat(pathname, path_component);
+ }
+ } else
+ pr_err("Failed to allocate memory");
+ } else {
+ pr_err("Cache name is NULL");
+ pathname = NULL;
+ }
+
+ return pathname;
+}
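+
+/*
+ * Example: for a cache named "mycache", eio_cons_procfs_cachename(dmc,
+ * PROC_STATS) returns "enhanceio/mycache/stats", i.e. the entry appears as
+ * /proc/enhanceio/mycache/stats.
+ */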
+
+
+static void
+eio_sysctl_register_dir(void)
+{
+ struct sysctl_table_dir *dir;
+
+
+ dir = kmemdup(&sysctl_template_dir, sizeof sysctl_template_dir, GFP_KERNEL);
+ if (unlikely(dir == NULL)) {
+ pr_err("Failed to allocate memory for dir sysctl");
+ return;
+ }
+
+ dir->dir[0].child = dir->dev;
+ dir->root[0].child = dir->dir;
+ dir->sysctl_header = register_sysctl_table(dir->root);
+ if (unlikely(dir->sysctl_header == NULL)) {
+ pr_err("Failed to register dir sysctl");
+ goto out;
+ }
+
+ sysctl_handle_dir = dir;
+ return;
+out:
+ kfree(dir);
+}
+
+
+static void
+eio_sysctl_unregister_dir(void)
+{
+ if (sysctl_handle_dir != NULL) {
+ unregister_sysctl_table(sysctl_handle_dir->sysctl_header);
+ kfree(sysctl_handle_dir);
+ sysctl_handle_dir = NULL;
+ }
+}
+
+/*
+ * eio_sysctl_register_common
+ */
+static void
+eio_sysctl_register_common(struct cache_c *dmc)
+{
+ unsigned int i;
+ struct sysctl_table_common *common;
+
+
+ common = kmemdup(&sysctl_template_common, sizeof sysctl_template_common, GFP_KERNEL);
+ if (common == NULL) {
+ pr_err("Failed to allocate memory for common sysctl");
+ return;
+ }
+ for (i = 0 ; i < ARRAY_SIZE(common->vars) - 1 ; i++) {
+ common->vars[i].data = eio_find_sysctl_data(dmc, &common->vars[i]);
+ common->vars[i].extra1 = dmc;
+ }
+
+ common->dev[0].procname = eio_cons_sysctl_devname(dmc);
+ common->dev[0].child = common->vars;
+ common->dir[0].child = common->dev;
+ common->root[0].child = common->dir;
+ common->sysctl_header = register_sysctl_table(common->root);
+ if (common->sysctl_header == NULL) {
+ pr_err("Failed to register common sysctl");
+ goto out;
+ }
+
+ dmc->sysctl_handle_common = common;
+ return;
+out:
+ kfree(common->dev[0].procname);
+ kfree(common);
+}
+
+
+/*
+ * eio_sysctl_unregister_common
+ */
+static void
+eio_sysctl_unregister_common(struct cache_c *dmc)
+{
+ struct sysctl_table_common *common;
+
+ common = dmc->sysctl_handle_common;
+ if (common != NULL) {
+ dmc->sysctl_handle_common = NULL;
+ unregister_sysctl_table(common->sysctl_header);
+ kfree(common->dev[0].procname);
+ kfree(common);
+ }
+}
+
+
+/*
+ * eio_sysctl_register_writeback
+ */
+static void
+eio_sysctl_register_writeback(struct cache_c *dmc)
+{
+ unsigned int i;
+ struct sysctl_table_writeback *writeback;
+
+ writeback = kmemdup(&sysctl_template_writeback, sizeof sysctl_template_writeback, GFP_KERNEL);
+ if (writeback == NULL) {
+ pr_err("Failed to allocate memory for writeback sysctl");
+ return;
+ }
+ for (i = 0 ; i < ARRAY_SIZE(writeback->vars) - 1 ; i++) {
+ writeback->vars[i].data = eio_find_sysctl_data(dmc, &writeback->vars[i]);
+ writeback->vars[i].extra1 = dmc;
+ }
+
+ writeback->dev[0].procname = eio_cons_sysctl_devname(dmc);
+ writeback->dev[0].child = writeback->vars;
+ writeback->dir[0].child = writeback->dev;
+ writeback->root[0].child = writeback->dir;
+ writeback->sysctl_header = register_sysctl_table(writeback->root);
+ if (writeback->sysctl_header == NULL) {
+ pr_err("Failed to register writeback sysctl");
+ goto out;
+ }
+
+ dmc->sysctl_handle_writeback = writeback;
+ return;
+out:
+ kfree(writeback->dev[0].procname);
+ kfree(writeback);
+}
+
+
+/*
+ * eio_sysctl_unregister_writeback
+ */
+static void
+eio_sysctl_unregister_writeback(struct cache_c *dmc)
+{
+ struct sysctl_table_writeback *writeback;
+
+ writeback = dmc->sysctl_handle_writeback;
+ if (writeback != NULL) {
+ dmc->sysctl_handle_writeback = NULL;
+ unregister_sysctl_table(writeback->sysctl_header);
+ kfree(writeback->dev[0].procname);
+ kfree(writeback);
+ }
+}
+
+
+/*
+ * eio_sysctl_register_invalidate
+ */
+static void
+eio_sysctl_register_invalidate(struct cache_c *dmc)
+{
+ unsigned int i;
+ struct sysctl_table_invalidate *invalidate;
+
+ invalidate = kmemdup(&sysctl_template_invalidate, sizeof sysctl_template_invalidate, GFP_KERNEL);
+ if (invalidate == NULL) {
+ pr_err("Failed to allocate memory for invalidate sysctl");
+ return;
+ }
+ for (i = 0 ; i < ARRAY_SIZE(invalidate->vars) - 1 ; i++) {
+ invalidate->vars[i].data = eio_find_sysctl_data(dmc, &invalidate->vars[i]);
+ invalidate->vars[i].extra1 = dmc;
+ }
+
+ invalidate->dev[0].procname = eio_cons_sysctl_devname(dmc);
+ invalidate->dev[0].child = invalidate->vars;
+ invalidate->dir[0].child = invalidate->dev;
+ invalidate->root[0].child = invalidate->dir;
+ invalidate->sysctl_header = register_sysctl_table(invalidate->root);
+ if (invalidate->sysctl_header == NULL) {
+ pr_err("Failed to register invalidate sysctl");
+ goto out;
+ }
+
+ dmc->sysctl_handle_invalidate = invalidate;
+ spin_lock_init(&invalidate_spin_lock);
+ return;
+out:
+ kfree(invalidate->dev[0].procname);
+ kfree(invalidate);
+}
+
+/*
+ * eio_sysctl_unregister_invalidate
+ */
+static void
+eio_sysctl_unregister_invalidate(struct cache_c *dmc)
+{
+ struct sysctl_table_invalidate *invalidate;
+
+ invalidate = dmc->sysctl_handle_invalidate;
+ if (invalidate != NULL) {
+ dmc->sysctl_handle_invalidate = NULL;
+ unregister_sysctl_table(invalidate->sysctl_header);
+ kfree(invalidate->dev[0].procname);
+ kfree(invalidate);
+ }
+}
+
+
+/*
+ * eio_stats_show
+ */
+static int
+eio_stats_show(struct seq_file *seq, void *v)
+{
+ struct cache_c *dmc = seq->private;
+ struct eio_stats *stats = &dmc->eio_stats;
+ int read_hit_pct, write_hit_pct, dirty_write_hit_pct;
+
+ if (atomic64_read(&stats->reads) > 0)
+ read_hit_pct = atomic64_read(&stats->read_hits) * 100LL / atomic64_read(&stats->reads);
+ else
+ read_hit_pct = 0;
+
+ if (atomic64_read(&stats->writes) > 0) {
+ write_hit_pct = atomic64_read(&stats->write_hits) * 100LL / atomic64_read(&stats->writes);
+ dirty_write_hit_pct = atomic64_read(&stats->dirty_write_hits) * 100 / atomic64_read(&stats->writes);
+ } else {
+ write_hit_pct = 0;
+ dirty_write_hit_pct = 0;
+ }
+
+ seq_printf(seq, "%-26s %12lld\n", "reads", (int64_t) atomic64_read(&stats->reads));
+ seq_printf(seq, "%-26s %12lld\n", "writes", (int64_t) atomic64_read(&stats->writes));
+
+ seq_printf(seq, "%-26s %12lld\n", "read_hits", (int64_t) atomic64_read(&stats->read_hits));
+ seq_printf(seq, "%-26s %12d\n", "read_hit_pct", read_hit_pct);
+
+ seq_printf(seq, "%-26s %12lld\n", "write_hits", (int64_t) atomic64_read(&stats->write_hits));
+ seq_printf(seq, "%-26s %12d\n", "write_hit_pct", write_hit_pct);
+
+ seq_printf(seq, "%-26s %12lld\n", "dirty_write_hits", (int64_t) atomic64_read(&stats->dirty_write_hits));
+ seq_printf(seq, "%-26s %12d\n", "dirty_write_hit_pct", dirty_write_hit_pct);
+
+ if ((int64_t)(atomic64_read(&stats->cached_blocks)) < 0)
+ atomic64_set(&stats->cached_blocks, 0);
+ seq_printf(seq, "%-26s %12lld\n", "cached_blocks", (int64_t) atomic64_read(&stats->cached_blocks));
+
+ seq_printf(seq, "%-26s %12lld\n", "rd_replace", (int64_t) atomic64_read(&stats->rd_replace));
+ seq_printf(seq, "%-26s %12lld\n", "wr_replace", (int64_t) atomic64_read(&stats->wr_replace));
+
+ seq_printf(seq, "%-26s %12lld\n", "noroom", (int64_t) atomic64_read(&stats->noroom));
+
+ seq_printf(seq, "%-26s %12lld\n", "cleanings", (int64_t) atomic64_read(&stats->cleanings));
+ seq_printf(seq, "%-26s %12lld\n", "md_write_dirty", (int64_t) atomic64_read(&stats->md_write_dirty));
+ seq_printf(seq, "%-26s %12lld\n", "md_write_clean", (int64_t) atomic64_read(&stats->md_write_clean));
+ seq_printf(seq, "%-26s %12lld\n", "md_ssd_writes", (int64_t) atomic64_read(&stats->md_ssd_writes));
+ seq_printf(seq, "%-26s %12d\n", "do_clean", dmc->sysctl_active.do_clean);
+ seq_printf(seq, "%-26s %12lld\n", "nr_blocks", (int64_t) dmc->size);
+ seq_printf(seq, "%-26s %12lld\n", "nr_dirty", (int64_t) atomic64_read(&dmc->nr_dirty));
+ seq_printf(seq, "%-26s %12u\n", "nr_sets", (uint32_t) dmc->num_sets);
+ seq_printf(seq, "%-26s %12u\n", "clean_index", (uint32_t) atomic_read(&dmc->clean_index));
+
+ seq_printf(seq, "%-26s %12lld\n", "uncached_reads", (int64_t) atomic64_read(&stats->uncached_reads));
+ seq_printf(seq, "%-26s %12lld\n", "uncached_writes", (int64_t) atomic64_read(&stats->uncached_writes));
+ seq_printf(seq, "%-26s %12lld\n", "uncached_map_size", (int64_t) atomic64_read(&stats->uncached_map_size));
+ seq_printf(seq, "%-26s %12lld\n", "uncached_map_uncacheable", (int64_t) atomic64_read(&stats->uncached_map_uncacheable));
+
+ seq_printf(seq, "%-26s %12lld\n", "disk_reads", (int64_t) atomic64_read(&stats->disk_reads));
+ seq_printf(seq, "%-26s %12lld\n", "disk_writes", (int64_t) atomic64_read(&stats->disk_writes));
+ seq_printf(seq, "%-26s %12lld\n", "ssd_reads", (int64_t) atomic64_read(&stats->ssd_reads));
+ seq_printf(seq, "%-26s %12lld\n", "ssd_writes", (int64_t) atomic64_read(&stats->ssd_writes));
+ seq_printf(seq, "%-26s %12lld\n", "ssd_readfills", (int64_t) atomic64_read(&stats->ssd_readfills));
+ seq_printf(seq, "%-26s %12lld\n", "ssd_readfill_unplugs", (int64_t) atomic64_read(&stats->ssd_readfill_unplugs));
+
+ seq_printf(seq, "%-26s %12lld\n", "readdisk", (int64_t) atomic64_read(&stats->readdisk));
+ seq_printf(seq, "%-26s %12lld\n", "writedisk", (int64_t) atomic64_read(&stats->writedisk));
+ seq_printf(seq, "%-26s %12lld\n", "readcache", (int64_t) atomic64_read(&stats->readcache));
+ seq_printf(seq, "%-26s %12lld\n", "readfill", (int64_t) atomic64_read(&stats->readfill));
+ seq_printf(seq, "%-26s %12lld\n", "writecache", (int64_t) atomic64_read(&stats->writecache));
+
+ seq_printf(seq, "%-26s %12lld\n", "readcount", (int64_t) atomic64_read(&stats->readcount));
+ seq_printf(seq, "%-26s %12lld\n", "writecount", (int64_t) atomic64_read(&stats->writecount));
+ seq_printf(seq, "%-26s %12lld\n", "kb_reads", (int64_t) atomic64_read(&stats->reads) / 2);
+ seq_printf(seq, "%-26s %12lld\n", "kb_writes", (int64_t) atomic64_read(&stats->writes) / 2);
+ seq_printf(seq, "%-26s %12lld\n", "rdtime_ms", (int64_t) atomic64_read(&stats->rdtime_ms));
+ seq_printf(seq, "%-26s %12lld\n", "wrtime_ms", (int64_t) atomic64_read(&stats->wrtime_ms));
+ return 0;
+}
+
+
+/*
+ * eio_stats_open
+ */
+static int
+eio_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, &eio_stats_show, PDE(inode)->data);
+}
+
+
+/*
+ * eio_errors_show
+ */
+static int
+eio_errors_show(struct seq_file *seq, void *v)
+{
+ struct cache_c *dmc = seq->private;
+
+ seq_printf(seq, "disk_read_errors %4u\n", dmc->eio_errors.disk_read_errors);
+ seq_printf(seq, "disk_write_errors %4u\n", dmc->eio_errors.disk_write_errors);
+ seq_printf(seq, "ssd_read_errors %4u\n", dmc->eio_errors.ssd_read_errors);
+ seq_printf(seq, "ssd_write_errors %4u\n", dmc->eio_errors.ssd_write_errors);
+ seq_printf(seq, "memory_alloc_errors %4u\n", dmc->eio_errors.memory_alloc_errors);
+ seq_printf(seq, "no_cache_dev %4u\n", dmc->eio_errors.no_cache_dev);
+ seq_printf(seq, "no_source_dev %4u\n", dmc->eio_errors.no_source_dev);
+
+ return 0;
+}
+
+
+/*
+ * eio_errors_open
+ */
+static int
+eio_errors_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, &eio_errors_show, PDE(inode)->data);
+}
+
+
+/*
+ * eio_iosize_hist_show
+ */
+static int
+eio_iosize_hist_show(struct seq_file *seq, void *v)
+{
+ int i;
+ struct cache_c *dmc = seq->private;
+
+
+ for (i = 1 ; i <= SIZE_HIST - 1; i++) {
+ if (atomic64_read(&dmc->size_hist[i]) == 0)
+ continue;
+
+ seq_printf(seq, "%u %12lld\n", i * 512, (int64_t) atomic64_read(&dmc->size_hist[i]));
+ }
+
+ return 0;
+}
+
+
+/*
+ * eio_iosize_hist_open
+ */
+static int
+eio_iosize_hist_open(struct inode *inode, struct file *file)
+{
+
+ return single_open(file, &eio_iosize_hist_show, PDE(inode)->data);
+}
+
+/*
+ * eio_version_show
+ */
+static int
+eio_version_show(struct seq_file *seq, void *v)
+{
+ char buf[128];
+
+
+ memset(buf, 0, sizeof buf);
+ eio_version_query(sizeof buf, buf);
+ seq_printf(seq, "%s\n", buf);
+
+ return 0;
+}
+
+
+/*
+ * eio_version_open
+ */
+static int
+eio_version_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, &eio_version_show, PDE(inode)->data);
+}
+
+
+/*
+ * eio_config_show
+ */
+static int
+eio_config_show(struct seq_file *seq, void *v)
+{
+ struct cache_c *dmc = seq->private;
+
+
+ seq_printf(seq, "src_name %s\n", dmc->disk_devname);
+ seq_printf(seq, "ssd_name %s\n", dmc->cache_devname);
+ seq_printf(seq, "src_size %lu\n", (long unsigned int)dmc->disk_size);
+ seq_printf(seq, "ssd_size %lu\n", (long unsigned int)dmc->size);
+
+ seq_printf(seq, "set_size %10u\n", dmc->assoc);
+ seq_printf(seq, "block_size %10u\n", (dmc->block_size) << SECTOR_SHIFT);
+ seq_printf(seq, "mode %10u\n", dmc->mode);
+ seq_printf(seq, "eviction %10u\n", dmc->req_policy);
+ seq_printf(seq, "num_sets %10u\n", dmc->num_sets);
+ seq_printf(seq, "num_blocks %10lu\n", (long unsigned int)dmc->size);
+ seq_printf(seq, "metadata %s\n", CACHE_MD8_IS_SET(dmc) ? "large" : "small");
+ seq_printf(seq, "state %s\n", CACHE_DEGRADED_IS_SET(dmc) ? "degraded" :
+ (CACHE_FAILED_IS_SET(dmc) ? "failed" : "normal"));
+ seq_printf(seq, "flags 0x%08x\n", dmc->cache_flags);
+
+ return 0;
+}
+
+
+/*
+ * eio_config_open
+ */
+static int
+eio_config_open(struct inode *inode, struct file *file)
+{
+
+ return single_open(file, &eio_config_show, PDE(inode)->data);
+}
new file mode 100644
@@ -0,0 +1,193 @@
+/*
+ * eio_setlru.c
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ * Amit Kale <akale@stec-inc.com>
+ * Harish Pujari <hpujari@stec-inc.com>
+ * Generic lru implementation used mainly for cache sets.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ */
+
+#include "eio.h"
+
+/* Initialize the lru list */
+int
+lru_init(lru_list_t **llist, index_t max)
+{
+ index_t i = 0;
+
+ VERIFY(max > 0);
+ *llist = vmalloc((sizeof(lru_list_t) +
+ (max - 1) * sizeof(lru_elem_t)));
+ if (*llist == NULL) {
+ return -ENOMEM;
+ }
+
+ (*llist)->ll_head = LRU_NULL;
+ (*llist)->ll_tail = LRU_NULL;
+ (*llist)->ll_max = max;
+ (*llist)->ll_size = 0;
+
+ for (i = 0; i < max; i++) {
+ (*llist)->ll_elem[i].le_next = LRU_NULL;
+ (*llist)->ll_elem[i].le_prev = LRU_NULL;
+ (*llist)->ll_elem[i].le_key = 0;
+ }
+
+ return 0;
+}
+
+/* Uninitialize the lru list */
+void
+lru_uninit(lru_list_t *llist)
+{
+ if (llist) {
+ vfree(llist);
+ }
+}
+
+/* Add a new entry to lru list */
+int
+lru_add(lru_list_t *llist, index_t index, u_int64_t key)
+{
+ if (!llist || (index >= llist->ll_max)) {
+ return -EINVAL;
+ }
+
+ llist->ll_elem[index].le_prev = llist->ll_tail;
+ llist->ll_elem[index].le_next = LRU_NULL;
+ llist->ll_elem[index].le_key = key;
+
+ if (llist->ll_tail != LRU_NULL) {
+ llist->ll_elem[llist->ll_tail].le_next = index;
+ } else {
+ VERIFY(llist->ll_head == LRU_NULL);
+ llist->ll_head = index;
+ }
+ llist->ll_tail = index;
+ llist->ll_size++;
+
+ return 0;
+}
+
+/* Remove an entry from the lru list */
+int
+lru_rem(lru_list_t *llist, index_t index)
+{
+ if (!llist || (index >= llist->ll_max) || (index == LRU_NULL)) {
+ return -EINVAL;
+ }
+
+ if (llist->ll_head == LRU_NULL && llist->ll_tail == LRU_NULL) {
+
+ /*
+ * No element in the list.
+ */
+
+ return -EINVAL;
+ }
+
+ if (llist->ll_elem[index].le_prev == LRU_NULL &&
+ llist->ll_elem[index].le_next == LRU_NULL &&
+ llist->ll_head != index && llist->ll_tail != index) {
+
+ /*
+ * Element not in list.
+ */
+
+ return 0;
+ }
+
+ if (llist->ll_elem[index].le_prev != LRU_NULL) {
+ llist->ll_elem[llist->ll_elem[index].le_prev].le_next = llist->ll_elem[index].le_next;
+ }
+
+ if (llist->ll_elem[index].le_next != LRU_NULL) {
+ llist->ll_elem[llist->ll_elem[index].le_next].le_prev = llist->ll_elem[index].le_prev;
+ }
+
+ if (llist->ll_head == index) {
+ llist->ll_head = llist->ll_elem[index].le_next;
+ }
+
+ if (llist->ll_tail == index) {
+ llist->ll_tail = llist->ll_elem[index].le_prev;
+ }
+
+ llist->ll_elem[index].le_prev = LRU_NULL;
+ llist->ll_elem[index].le_next = LRU_NULL;
+ VERIFY(llist->ll_size != 0);
+ llist->ll_size--;
+
+ return 0;
+}
+
+/* Move up the given lru element */
+int
+lru_touch(lru_list_t *llist, index_t index, u_int64_t key)
+{
+ if (!llist || (index >= llist->ll_max)) {
+ return -EINVAL;
+ }
+
+ if (llist->ll_tail == index) {
+ llist->ll_elem[index].le_key = key;
+ } else {
+ lru_rem(llist, index);
+ lru_add(llist, index, key);
+ }
+
+ return 0;
+}
+
+/* Read the element at the head of the lru */
+int
+lru_read_head(lru_list_t *llist, index_t *index, u_int64_t *key)
+{
+ if (!llist || !index || !key) {
+ return -EINVAL;
+ }
+
+ if (llist->ll_head == LRU_NULL) {
+ *index = LRU_NULL;
+ *key = 0;
+ } else {
+ *index = llist->ll_head;
+ *key = llist->ll_elem[*index].le_key;
+ }
+
+ return 0;
+}
+
+/* Remove the element at the head of the lru */
+int
+lru_rem_head(lru_list_t *llist, index_t *index, u_int64_t *key)
+{
+ if (!llist || !index || !key) {
+ return -EINVAL;
+ }
+
+ if (llist->ll_head == LRU_NULL) {
+ *index = LRU_NULL;
+ *key = 0;
+ } else {
+ *index = llist->ll_head;
+ *key = llist->ll_elem[*index].le_key;
+ lru_rem(llist, *index);
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,49 @@
+/*
+ * eio_setlru.h
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ * Amit Kale <akale@stec-inc.com>
+ * Harish Pujari <hpujari@stec-inc.com>
+ * Generic lru implementation used mainly for cache sets
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ */
+
+#ifndef _EIO_SETLRU_H_
+#define _EIO_SETLRU_H_
+
+#define LRU_NULL -1
+
+typedef struct lru_elem {
+ index_t le_next;
+ index_t le_prev;
+ u_int64_t le_key;
+} lru_elem_t;
+
+typedef struct lru_ls {
+ index_t ll_head;
+ index_t ll_tail;
+ index_t ll_max;
+ u_int64_t ll_size;
+ lru_elem_t ll_elem[1];
+} lru_list_t;
+
+int lru_init(lru_list_t **llist, index_t max);
+void lru_uninit(lru_list_t *llist);
+int lru_add(lru_list_t *llist, index_t index, u_int64_t key);
+int lru_rem(lru_list_t *llist, index_t index);
+int lru_touch(lru_list_t *llist, index_t index, u_int64_t key);
+int lru_read_head(lru_list_t *llist, index_t *index, u_int64_t *key);
+int lru_rem_head(lru_list_t *llist, index_t *index, u_int64_t *key);
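+
+/*
+ * Illustrative usage only (hypothetical caller, not part of the driver);
+ * a set-level eviction policy would typically drive this API as follows:
+ *
+ * lru_list_t *ll;
+ * index_t victim;
+ * u_int64_t key;
+ *
+ * if (lru_init(&ll, nr_blocks) == 0) {
+ * lru_add(ll, blk, timestamp); -- insert as most recently used
+ * lru_touch(ll, blk, timestamp); -- promote on a cache hit
+ * lru_rem_head(ll, &victim, &key); -- evict the least recently used
+ * lru_uninit(ll);
+ * }
+ */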
+
+#endif /* _EIO_SETLRU_H_ */
new file mode 100644
@@ -0,0 +1,472 @@
+/*
+ * eio_subr.c
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ * Made EnhanceIO specific changes.
+ * Saied Kazemi <skazemi@stec-inc.com>
+ * Siddharth Choudhuri <schoudhuri@stec-inc.com>
+ *
+ * Copyright 2010 Facebook, Inc.
+ * Author: Mohan Srinivasan (mohan@facebook.com)
+ *
+ * Based on DM-Cache:
+ * Copyright (C) International Business Machines Corp., 2006
+ * Author: Ming Zhao (mingzhao@ufl.edu)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "eio.h"
+#include "eio_ttc.h"
+
+static DEFINE_SPINLOCK(_job_lock);
+static u_int64_t _job_lock_flags;
+
+extern mempool_t *_job_pool;
+
+extern atomic_t nr_cache_jobs;
+
+static LIST_HEAD(_io_jobs);
+static LIST_HEAD(_disk_read_jobs);
+
+int
+eio_io_empty(void)
+{
+
+ return list_empty(&_io_jobs);
+}
+
+struct kcached_job *
+eio_alloc_cache_job(void)
+{
+ struct kcached_job *job;
+
+
+ job = mempool_alloc(_job_pool, GFP_NOIO);
+ if (likely(job))
+ atomic_inc(&nr_cache_jobs);
+ return job;
+}
+
+
+void
+eio_free_cache_job(struct kcached_job *job)
+{
+
+ mempool_free(job, _job_pool);
+ atomic_dec(&nr_cache_jobs);
+}
+
+/*
+ * Functions to push and pop a job onto the head of a given job list.
+ */
+static struct kcached_job *
+eio_pop(struct list_head *jobs)
+{
+ struct kcached_job *job = NULL;
+ unsigned long flags = 0;
+
+
+ spin_lock_irqsave(&_job_lock, flags);
+ if (!list_empty(jobs)) {
+ job = list_entry(jobs->next, struct kcached_job, list);
+ list_del(&job->list);
+ }
+ spin_unlock_irqrestore(&_job_lock, flags);
+ return job;
+}
+
+
+static void
+eio_push(struct list_head *jobs, struct kcached_job *job)
+{
+ unsigned long flags = 0;
+
+
+ spin_lock_irqsave(&_job_lock, flags);
+ list_add_tail(&job->list, jobs);
+ spin_unlock_irqrestore(&_job_lock, flags);
+}
+
+void
+eio_push_ssdread_failures(struct kcached_job *job)
+{
+
+ eio_push(&_disk_read_jobs, job);
+}
+
+static void
+eio_push_io(struct kcached_job *job)
+{
+
+ eio_push(&_io_jobs, job);
+}
+
+static void
+eio_process_jobs(struct list_head *jobs, void (*fn) (struct kcached_job *))
+{
+ struct kcached_job *job;
+
+
+ while ((job = eio_pop(jobs)) != NULL)
+ (void)fn(job);
+}
+
+static void
+eio_process_ssd_rm_list(void)
+{
+ unsigned long int flags = 0;
+ struct ssd_rm_list *ssd_list_ptr;
+ extern int ssd_rm_list_not_empty;
+ extern spinlock_t ssd_rm_list_lock;
+ extern struct list_head ssd_rm_list;
+
+
+ spin_lock_irqsave(&ssd_rm_list_lock, flags);
+ if (likely(list_empty(&ssd_rm_list))) {
+ spin_unlock_irqrestore(&ssd_rm_list_lock, flags);
+ return;
+ }
+
+ while (!list_empty(&ssd_rm_list)) {
+ ssd_list_ptr = list_entry(ssd_rm_list.next, struct ssd_rm_list, list);
+ if (ssd_list_ptr->action == BUS_NOTIFY_DEL_DEVICE)
+ eio_suspend_caching(ssd_list_ptr->dmc, ssd_list_ptr->note);
+ else
+ pr_err("eio_process_ssd_rm_list: Unknown status (0x%x)\n", ssd_list_ptr->action);
+ list_del(&ssd_list_ptr->list);
+ kfree(ssd_list_ptr);
+ }
+ ssd_rm_list_not_empty = 0;
+ spin_unlock_irqrestore(&ssd_rm_list_lock, flags);
+}
+
+/*
+ * Entry point of the "events" kernel thread.
+ */
+void
+eio_do_work(struct work_struct *unused)
+{
+ extern int ssd_rm_list_not_empty;
+
+
+ if (unlikely(ssd_rm_list_not_empty))
+ eio_process_ssd_rm_list();
+ eio_process_jobs(&_disk_read_jobs, eio_ssderror_diskread);
+}
+
+struct kcached_job *
+eio_new_job(struct cache_c *dmc, struct eio_bio *bio, index_t index)
+{
+ struct kcached_job *job;
+
+
+ VERIFY((bio != NULL) || (index != -1));
+
+ job = eio_alloc_cache_job();
+ if (unlikely(job == NULL)) {
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->eio_errors.memory_alloc_errors++;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ return NULL;
+ }
+ job->dmc = dmc;
+ job->index = index;
+ job->error = 0;
+ job->ebio = bio;
+ if (index != -1) {
+ job->job_io_regions.cache.bdev = dmc->cache_dev->bdev;
+ if (bio) {
+ job->job_io_regions.cache.sector = (index << dmc->block_shift) + dmc->md_sectors +
+ (bio->eb_sector - EIO_ROUND_SECTOR(dmc, bio->eb_sector));
+ VERIFY(to_sector(bio->eb_size) <= dmc->block_size);
+ job->job_io_regions.cache.count = to_sector(bio->eb_size);
+ } else {
+ job->job_io_regions.cache.sector = (index << dmc->block_shift) + dmc->md_sectors;
+ job->job_io_regions.cache.count = dmc->block_size;
+ }
+ }
+
+ job->job_io_regions.disk.bdev = dmc->disk_dev->bdev;
+ if (bio) {
+ job->job_io_regions.disk.sector = bio->eb_sector;
+ job->job_io_regions.disk.count = to_sector(bio->eb_size);
+ } else {
+ job->job_io_regions.disk.sector = EIO_DBN_GET(dmc, index);
+ job->job_io_regions.disk.count = dmc->block_size;
+ }
+ job->next = NULL;
+ job->md_sector = NULL;
+
+ return job;
+}
+
+static void
+eio_sync_endio(struct bio *bio, int error)
+{
+ if (error) {
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ pr_err("eio_sync_endio: error: %d\n", error);
+ }
+
+ if (bio->bi_private)
+ complete(bio->bi_private);
+}
+
+int
+eio_io_sync_pages(struct cache_c *dmc, struct eio_io_region *where, int rw,
+ struct page **pages, int num_bvecs)
+{
+ struct eio_io_request req;
+ int error;
+
+ req.mtype = EIO_PAGES;
+ req.dptr.plist = pages;
+ req.num_bvecs = num_bvecs;
+ req.notify = NULL;
+ req.context = NULL;
+ req.hddio = 0;
+
+ if ((unlikely(CACHE_FAILED_IS_SET(dmc)) ||
+ unlikely(CACHE_DEGRADED_IS_SET(dmc))) && (!CACHE_SSD_ADD_INPROG_IS_SET(dmc)))
+ error = -ENODEV;
+ else
+ error = eio_do_io(dmc, where, rw, &req);
+
+ if (error)
+ return error;
+
+ return 0;
+}
+
+int
+eio_io_sync_vm(struct cache_c *dmc, struct eio_io_region *where, int rw,
+ struct bio_vec *pages, int num_bvecs)
+{
+ struct eio_io_request req;
+ int error;
+
+ memset((char *)&req, 0, sizeof req);
+
+ /* Fill up the appropriate fields
+ in eio_io_request */
+ req.mtype = EIO_BVECS;
+ req.dptr.pages = pages;
+ req.num_bvecs = num_bvecs;
+ req.notify = NULL;
+ req.context = NULL;
+ req.hddio = 0;
+
+ if ((unlikely(CACHE_FAILED_IS_SET(dmc)) ||
+ unlikely(CACHE_DEGRADED_IS_SET(dmc))) && (!CACHE_SSD_ADD_INPROG_IS_SET(dmc)))
+ error = -ENODEV;
+ else
+ error = eio_do_io(dmc, where, rw, &req);
+
+ if (error)
+ return error;
+
+ return 0;
+}
+
+void
+eio_unplug_cache_device(struct cache_c *dmc)
+{
+ struct request_queue *q;
+ struct block_device *bdev;
+
+
+ if (unlikely(CACHE_FAILED_IS_SET(dmc)) || unlikely(CACHE_DEGRADED_IS_SET(dmc)))
+ return;
+
+ bdev = dmc->cache_dev->bdev;
+ q = bdev_get_queue(bdev);
+}
+
+void
+eio_unplug_disk_device(struct cache_c *dmc)
+{
+ struct request_queue *q;
+ struct block_device *bdev;
+
+ if (unlikely(CACHE_DEGRADED_IS_SET(dmc)))
+ return;
+
+ bdev = dmc->disk_dev->bdev;
+ q = bdev_get_queue(bdev);
+}
+
+void
+eio_plug_cache_device(struct cache_c *dmc)
+{
+ struct block_device *bdev;
+ struct request_queue *q;
+
+ if (unlikely(CACHE_FAILED_IS_SET(dmc)) || unlikely(CACHE_DEGRADED_IS_SET(dmc)))
+ return;
+
+ bdev = dmc->cache_dev->bdev;
+ q = bdev_get_queue(bdev);
+}
+
+void
+eio_plug_disk_device(struct cache_c *dmc)
+{
+ struct block_device *bdev;
+ struct request_queue *q;
+
+ if (unlikely(CACHE_DEGRADED_IS_SET(dmc)))
+ return;
+
+ bdev = dmc->disk_dev->bdev;
+ q = bdev_get_queue(bdev);
+}
+
+/*
+ * For Linux, we do not do a dm_put_device() when the device underneath
+ * disappears. The logic to handle the IOs to a missing device is handled
+ * by the kernel proper. We will get an IO error if an IO is done on a
+ * device that does not exist.
+ */
+void
+eio_suspend_caching(struct cache_c *dmc, dev_notifier_t note)
+{
+
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ if (dmc->mode != CACHE_MODE_WB && CACHE_FAILED_IS_SET(dmc)) {
+ pr_err("suspend caching: Cache \"%s\" is already in FAILED state, exiting.\n",
+ dmc->cache_name);
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ return;
+ }
+
+ switch (note) {
+
+ case NOTIFY_SRC_REMOVED:
+ if (CACHE_DEGRADED_IS_SET(dmc))
+ dmc->cache_flags &= ~CACHE_FLAGS_DEGRADED;
+ dmc->cache_flags |= CACHE_FLAGS_FAILED;
+ dmc->eio_errors.no_source_dev = 1;
+ atomic64_set(&dmc->eio_stats.cached_blocks, 0);
+ pr_info("suspend_caching: Source Device Removed. Cache \"%s\" is in Failed mode.\n",
+ dmc->cache_name);
+ break;
+
+ case NOTIFY_SSD_REMOVED:
+ if (dmc->mode == CACHE_MODE_WB) {
+ /*
+ * For writeback
+ * - Cache should never be in degraded mode
+ * - ssd removal should result in FAILED state
+ * - the cached block should not be reset.
+ */
+ VERIFY(!CACHE_DEGRADED_IS_SET(dmc));
+ dmc->cache_flags |= CACHE_FLAGS_FAILED;
+ pr_info("suspend caching: SSD Device Removed. Cache \"%s\" is in Failed mode.\n",
+ dmc->cache_name);
+ } else {
+ if (CACHE_DEGRADED_IS_SET(dmc) || CACHE_SSD_ADD_INPROG_IS_SET(dmc)) {
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ pr_err("suspend_caching: Cache \"%s\" is either degraded or device add in progress, exiting.\n",
+ dmc->cache_name);
+ return;
+ }
+ dmc->cache_flags |= CACHE_FLAGS_DEGRADED;
+ atomic64_set(&dmc->eio_stats.cached_blocks, 0);
+ pr_info("suspend caching: Cache \"%s\" is in Degraded mode.\n", dmc->cache_name);
+ }
+ dmc->eio_errors.no_cache_dev = 1;
+ break;
+
+ default:
+ pr_err("suspend_caching: incorrect notify message.\n");
+ break;
+ }
+
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+}
+
+
+void
+eio_put_cache_device(struct cache_c *dmc)
+{
+
+ eio_ttc_put_device(&dmc->cache_dev);
+}
+
+
+void
+eio_resume_caching(struct cache_c *dmc, char *dev)
+{
+ int r;
+
+
+ if (dmc == NULL || dev == NULL) {
+ pr_err("resume_caching: Null device or cache instance when resuming caching.\n");
+ return;
+ }
+ if (strlen(dev) >= DEV_PATHLEN) {
+ pr_err("resume_caching: Device name %s too long.\n", dev);
+ return;
+ }
+
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ if (CACHE_STALE_IS_SET(dmc)) {
+ pr_err("eio_resume_caching: Hard failure detected. Cache \"%s\" cannot be resumed.",
+ dmc->cache_name);
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ return;
+ }
+
+ /* sanity check for writeback */
+ if (dmc->mode == CACHE_MODE_WB) {
+ if (!CACHE_FAILED_IS_SET(dmc) || CACHE_SRC_IS_ABSENT(dmc) || CACHE_SSD_ADD_INPROG_IS_SET(dmc)) {
+ pr_debug("eio_resume_caching: Cache not in Failed state or Source is absent or SSD add already in progress for cache \"%s\".\n",
+ dmc->cache_name);
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ return;
+ }
+ } else {
+ /* sanity check for WT or RO cache. */
+ if (CACHE_FAILED_IS_SET(dmc) || !CACHE_DEGRADED_IS_SET(dmc) || CACHE_SSD_ADD_INPROG_IS_SET(dmc)) {
+ pr_err("resume_caching: Cache \"%s\" is either in failed mode or cache device add in progress, ignoring.\n",
+ dmc->cache_name);
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ return;
+ }
+ }
+
+ dmc->cache_flags |= CACHE_FLAGS_SSD_ADD_INPROG;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+
+ r = eio_ctr_ssd_add(dmc, dev);
+ if (r) {
+ /* error */
+ pr_debug("resume caching: returned error: %d\n", r);
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->cache_flags &= ~CACHE_FLAGS_SSD_ADD_INPROG;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ return;
+ }
+
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->eio_errors.no_cache_dev = 0;
+ if (dmc->mode != CACHE_MODE_WB)
+ dmc->cache_flags &= ~CACHE_FLAGS_DEGRADED;
+ else
+ dmc->cache_flags &= ~CACHE_FLAGS_FAILED;
+ dmc->cache_flags &= ~CACHE_FLAGS_SSD_ADD_INPROG;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ pr_info("resume_caching: cache \"%s\" is restored to ACTIVE mode.\n", dmc->cache_name);
+}
new file mode 100644
@@ -0,0 +1,1708 @@
+/*
+ * True Transparent Caching (TTC) code.
+ * eio_ttc.c
+ *
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved
+ *
+ * Made EIO fully transparent with respect to applications. A cache can be
+ * created or deleted while a filesystem or applications are online.
+ * Amit Kale <akale@stec-inc.com>
+ * Ramprasad Chinthekindi <rchinthekindi@stec-inc.com>
+ * Akhil Bhansali <abhansali@stec-inc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include "eio.h"
+#include "eio_ttc.h"
+static struct rw_semaphore eio_ttc_lock[EIO_HASHTBL_SIZE];
+static struct list_head eio_ttc_list[EIO_HASHTBL_SIZE];
+
+int eio_reboot_notified = 0;
+extern int eio_force_warm_boot;
+
+extern long eio_ioctl(struct file *filp, unsigned cmd, unsigned long arg);
+extern long eio_compact_ioctl(struct file *filp, unsigned cmd, unsigned long arg);
+
+extern mempool_t *_io_pool;
+extern struct eio_control_s *eio_control;
+
+static void eio_make_request_fn(struct request_queue *, struct bio *);
+static void eio_cache_rec_fill(struct cache_c *, cache_rec_short_t *);
+static void eio_bio_end_empty_barrier(struct bio *, int);
+static void eio_issue_empty_barrier_flush(struct block_device *, struct bio *,
+ int, make_request_fn *, int rw_flags);
+static int eio_finish_nrdirty(struct cache_c *);
+static int eio_mode_switch(struct cache_c *, u_int32_t);
+static int eio_policy_switch(struct cache_c *, u_int32_t);
+
+static int eio_overlap_split_bio(struct request_queue *, struct bio *);
+static struct bio * eio_split_new_bio(struct bio *, struct bio_container *,
+ unsigned *, unsigned *, sector_t);
+static void eio_split_endio(struct bio *, int);
+
+static int
+eio_open(struct inode *ip, struct file *filp)
+{
+ __module_get(THIS_MODULE);
+ return 0;
+}
+
+static int
+eio_release(struct inode *ip, struct file *filp)
+{
+ module_put(THIS_MODULE);
+ return 0;
+}
+
+static const struct file_operations eio_fops = {
+ .open = eio_open,
+ .release = eio_release,
+ .unlocked_ioctl = eio_ioctl,
+ .compat_ioctl = eio_compact_ioctl,
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice eio_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = MISC_DEVICE,
+ .fops = &eio_fops,
+};
+
+int
+eio_create_misc_device(void)
+{
+ return misc_register(&eio_misc);
+}
+
+int
+eio_delete_misc_device(void)
+{
+ return misc_deregister(&eio_misc);
+}
+
+int
+eio_ttc_get_device(const char *path, fmode_t mode, struct eio_bdev **result)
+{
+ struct block_device *bdev;
+ struct eio_bdev *eio_bdev;
+ unsigned int major, minor;
+ dev_t uninitialized_var(dev);
+ static char *eio_holder = "ENHANCE IO";
+
+ if (sscanf(path, "%u:%u", &major, &minor) == 2) {
+ /* Extract the major/minor numbers */
+ dev = MKDEV(major, minor);
+ if (MAJOR(dev) != major || MINOR(dev) != minor)
+ return -EOVERFLOW;
+ } else {
+ /* convert the path to a device */
+ struct block_device *bdev = lookup_bdev(path);
+
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+
+ dev = bdev->bd_dev;
+ bdput(bdev);
+ }
+
+ bdev = blkdev_get_by_dev(dev, mode, eio_holder);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+
+ /*
+ * Do we need to claim the devices ??
+ * bd_claim_by_disk(bdev, charptr, gendisk)
+ */
+
+ eio_bdev = kzalloc(sizeof(*eio_bdev), GFP_KERNEL);
+ if (eio_bdev == NULL) {
+ blkdev_put(bdev, mode);
+ return -ENOMEM;
+ }
+
+ eio_bdev->bdev = bdev;
+ eio_bdev->mode = mode;
+ *result = eio_bdev;
+ return 0;
+}
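+
+/*
+ * Note (illustration only): the lookup above accepts either a
+ * "major:minor" pair or a device path, e.g. "8:16" or "/dev/sdb" as
+ * hypothetical examples; anything that does not parse as two decimal
+ * numbers is resolved through lookup_bdev().
+ */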
+
+void
+eio_ttc_put_device(struct eio_bdev **d)
+{
+ struct eio_bdev *eio_bdev;
+
+ eio_bdev = *d;
+ blkdev_put(eio_bdev->bdev, eio_bdev->mode);
+ kfree(eio_bdev);
+ *d = NULL;
+ return;
+}
+
+struct cache_c *
+eio_cache_lookup(char *name)
+{
+ struct cache_c *dmc = NULL;
+ int i;
+
+ for (i = 0; i < EIO_HASHTBL_SIZE; i++) {
+ down_read(&eio_ttc_lock[i]);
+ list_for_each_entry(dmc, &eio_ttc_list[i], cachelist) {
+ if (!strcmp(name, dmc->cache_name)) {
+ up_read(&eio_ttc_lock[i]);
+ return dmc;
+ }
+ }
+ up_read(&eio_ttc_lock[i]);
+ }
+ return NULL;
+}
+
+int
+eio_ttc_activate(struct cache_c *dmc)
+{
+ struct block_device *bdev;
+ struct request_queue *rq;
+ make_request_fn *origmfn;
+ struct cache_c *dmc1;
+ int wholedisk;
+ int error;
+ int index;
+ int rw_flags = 0;
+
+ bdev = dmc->disk_dev->bdev;
+ if (bdev == NULL) {
+ pr_err("cache_create: Source device not found\n");
+ return (-ENODEV);
+ }
+ rq = bdev->bd_disk->queue;
+
+ wholedisk = 0;
+ if (bdev == bdev->bd_contains) {
+ wholedisk = 1;
+ }
+
+ dmc->dev_start_sect = bdev->bd_part->start_sect;
+ dmc->dev_end_sect =
+ bdev->bd_part->start_sect + bdev->bd_part->nr_sects - 1;
+
+ pr_debug("eio_ttc_activate: Device/Partition"
+ " sector_start: %llu, end: %llu\n",
+ (uint64_t)dmc->dev_start_sect, (uint64_t)dmc->dev_end_sect);
+
+ error = 0;
+ origmfn = NULL;
+ index = EIO_HASH_BDEV(bdev->bd_contains->bd_dev);
+
+ down_write(&eio_ttc_lock[index]);
+ list_for_each_entry(dmc1, &eio_ttc_list[index], cachelist) {
+ if (dmc1->disk_dev->bdev->bd_contains != bdev->bd_contains)
+ continue;
+
+ if ((wholedisk) || (dmc1->dev_info == EIO_DEV_WHOLE_DISK) ||
+ (dmc1->disk_dev->bdev == bdev)) {
+ error = -EINVAL;
+ up_write(&eio_ttc_lock[index]);
+ goto out;
+ }
+
+ /* some partition of same device already cached */
+ VERIFY(dmc1->dev_info == EIO_DEV_PARTITION);
+ origmfn = dmc1->origmfn;
+ break;
+ }
+
+ /*
+ * Save original make_request_fn. Switch make_request_fn only once.
+ */
+
+ if (origmfn) {
+ dmc->origmfn = origmfn;
+ dmc->dev_info = EIO_DEV_PARTITION;
+ VERIFY(wholedisk == 0);
+ } else {
+ dmc->origmfn = rq->make_request_fn;
+ rq->make_request_fn = eio_make_request_fn;
+ dmc->dev_info = (wholedisk) ? EIO_DEV_WHOLE_DISK : EIO_DEV_PARTITION;
+ }
+
+ list_add_tail(&dmc->cachelist, &eio_ttc_list[index]);
+
+ /*
+ * Sleep for some time to allow previously issued I/Os to drain,
+ * then issue a barrier I/O on the source device.
+ */
+
+ msleep(1);
+ SET_BARRIER_FLAGS(rw_flags);
+ eio_issue_empty_barrier_flush(dmc->disk_dev->bdev, NULL,
+ EIO_HDD_DEVICE, dmc->origmfn, rw_flags);
+ up_write(&eio_ttc_lock[index]);
+
+out:
+ if (error == -EINVAL) {
+ if (wholedisk)
+ pr_err("cache_create: A partition of this device is already cached.\n");
+ else
+ pr_err("cache_create: Device is already cached.\n");
+ }
+ return error;
+}
+
+int
+eio_ttc_deactivate(struct cache_c *dmc, int force)
+{
+ struct block_device *bdev;
+ struct request_queue *rq;
+ struct cache_c *dmc1;
+ int found_partitions;
+ int index;
+ int ret;
+
+ ret = 0;
+ bdev = dmc->disk_dev->bdev;
+ rq = bdev->bd_disk->queue;
+
+ if (force)
+ goto deactivate;
+
+ /* Process and wait for nr_dirty to drop to zero */
+ if (dmc->mode == CACHE_MODE_WB) {
+ if (!CACHE_FAILED_IS_SET(dmc)) {
+ ret = eio_finish_nrdirty(dmc);
+ if (ret) {
+ pr_err("ttc_deactivate: nrdirty failed to finish for cache \"%s\".",
+ dmc->cache_name);
+ return ret;
+ }
+ } else {
+ pr_debug("ttc_deactivate: Cache \"%s\" failed is already set. Continue with cache delete.",
+ dmc->cache_name);
+ }
+ }
+
+ /*
+ * Traverse the list and see if other partitions of this device are
+ * cached. Switch mfn if this is the only partition of the device
+ * in the list.
+ */
+deactivate:
+ index = EIO_HASH_BDEV(bdev->bd_contains->bd_dev);
+ found_partitions = 0;
+
+ /* check if barrier QUEUE is empty or not */
+ down_write(&eio_ttc_lock[index]);
+
+ if (dmc->dev_info != EIO_DEV_WHOLE_DISK) {
+ list_for_each_entry(dmc1, &eio_ttc_list[index], cachelist) {
+ if (dmc == dmc1)
+ continue;
+
+ if (dmc1->disk_dev->bdev->bd_contains != bdev->bd_contains)
+ continue;
+
+ VERIFY(dmc1->dev_info == EIO_DEV_PARTITION);
+
+ /*
+ * There are still other partitions which are cached.
+ * Do not switch the make_request_fn.
+ */
+
+ found_partitions = 1;
+ break;
+ }
+ }
+
+ if ((dmc->dev_info == EIO_DEV_WHOLE_DISK) || (found_partitions == 0))
+ rq->make_request_fn = dmc->origmfn;
+
+ list_del_init(&dmc->cachelist);
+ up_write(&eio_ttc_lock[index]);
+
+ /* wait for nr_ios to drain-out */
+ while (atomic64_read(&dmc->nr_ios) != 0)
+ msleep(100);
+
+ return ret;
+}
+
+void
+eio_ttc_init(void)
+{
+ int i;
+
+ for (i = 0; i < EIO_HASHTBL_SIZE; i++) {
+ init_rwsem(&eio_ttc_lock[i]);
+ INIT_LIST_HEAD(&eio_ttc_list[i]);
+ }
+}
+
+/*
+ * Cases:-
+ * 1. Full device cached.
+ * if (ENQUEUE || barrier(bio))
+ * enqueue (dmc, bio) and return
+ * else
+ * call eio_map(dmc, bio)
+ * 2. Some partitions of the device cached.
+ * if (ENQUEUE || barrier(bio))
+ * All I/Os (both on cached and uncached partitions) are enqueued.
+ * else
+ * if (I/O on cached partition)
+ * call eio_map(dmc, bio)
+ * else
+ * origmfn(bio); // uncached partition
+ * 3. q->mfn got switched back to original
+ * call origmfn(q, bio)
+ * 4. Race condition: q->make_request_fn may change while we are looking
+ * up; restart the lookup (see the re_lookup handling below).
+ */
+
+static void
+eio_make_request_fn(struct request_queue *q, struct bio *bio)
+{
+ int ret;
+ int overlap;
+ int index;
+ make_request_fn *origmfn;
+ struct cache_c *dmc, *dmc1;
+ struct block_device *bdev;
+
+ bdev = bio->bi_bdev;
+
+
+re_lookup:
+ dmc = NULL;
+ origmfn = NULL;
+ overlap = ret = 0;
+
+ index = EIO_HASH_BDEV(bdev->bd_contains->bd_dev);
+
+ down_read(&eio_ttc_lock[index]);
+
+ list_for_each_entry(dmc1, &eio_ttc_list[index], cachelist) {
+ if (dmc1->disk_dev->bdev->bd_contains != bdev->bd_contains) {
+ continue;
+ }
+
+ if (dmc1->dev_info == EIO_DEV_WHOLE_DISK) {
+ dmc = dmc1; /* found cached device */
+ break;
+ }
+
+ /* Handle partitions */
+ if (!origmfn)
+ origmfn = dmc1->origmfn;
+
+ /* I/O perfectly fit within cached partition */
+ if ((bio->bi_sector >= dmc1->dev_start_sect) &&
+ ((bio->bi_sector + to_sector(bio->bi_size) - 1) <=
+ dmc1->dev_end_sect)) {
+ VERIFY(overlap == 0);
+ dmc = dmc1; /* found cached partition */
+ break;
+ }
+
+ /* Check if I/O is overlapping with cached partitions */
+ if (((bio->bi_sector >= dmc1->dev_start_sect) &&
+ (bio->bi_sector <= dmc1->dev_end_sect)) ||
+ ((bio->bi_sector + to_sector(bio->bi_size) - 1 >=
+ dmc1->dev_start_sect) &&
+ (bio->bi_sector + to_sector(bio->bi_size) - 1 <=
+ dmc1->dev_end_sect))) {
+ overlap = 1;
+ pr_err("Overlapping I/O detected on %s cache at sector: %llu, size: %u\n",
+ dmc1->cache_name, (uint64_t)bio->bi_sector, bio->bi_size);
+ break;
+ }
+ }
+
+ if (unlikely(overlap)) {
+ up_read(&eio_ttc_lock[index]);
+
+ if (bio_rw_flagged(bio, REQ_DISCARD)) {
+ pr_err("eio_mfn: Overlap I/O with Discard flag received."
+ " Discard flag is not supported.\n");
+ bio_endio(bio, -EOPNOTSUPP);
+ } else {
+ ret = eio_overlap_split_bio(q, bio);
+ }
+ } else if (dmc) { /* found cached partition or device */
+
+ /*
+ * Start sector of cached partition may or may not be
+ * aligned with cache blocksize.
+ * Map start of the partition to zero reference.
+ */
+
+ if (bio->bi_sector) {
+ VERIFY(bio->bi_sector >= dmc->dev_start_sect);
+ bio->bi_sector -= dmc->dev_start_sect;
+ }
+ ret = eio_map(dmc, q, bio);
+ if (ret) {
+ /* Error case: restore the start sector of bio */
+ bio->bi_sector += dmc->dev_start_sect;
+ }
+ }
+
+ if (!overlap) {
+ up_read(&eio_ttc_lock[index]);
+ }
+
+ if (overlap || dmc)
+ return;
+
+ /*
+ * Race condition:-
+ * origmfn can be NULL if all partitions or whole disk got uncached.
+ * We set origmfn = q->mfn if origmfn is NULL.
+ * The origmfn may now again be eio_make_request_fn because
+ * someone else switched the q->mfn because of a new
+ * partition or whole disk being cached.
+ * Since, we cannot protect q->make_request_fn() by any lock,
+ * this situation may occur. However, this is a very rare event.
+ * In this case restart the lookup.
+ */
+
+ if (origmfn == NULL)
+ origmfn = q->make_request_fn;
+ if (origmfn == eio_make_request_fn)
+ goto re_lookup;
+
+ origmfn(q, bio);
+ return;
+}
+
+uint64_t
+eio_get_cache_count(void)
+{
+ struct cache_c *dmc;
+ uint64_t cnt = 0;
+ int i;
+
+ for (i = 0; i < EIO_HASHTBL_SIZE; i++) {
+ down_read(&eio_ttc_lock[i]);
+ list_for_each_entry(dmc, &eio_ttc_list[i], cachelist) {
+ cnt++;
+ }
+ up_read(&eio_ttc_lock[i]);
+ }
+ return cnt;
+}
+
+int
+eio_get_cache_list(unsigned long *arg)
+{
+ int error = 0;
+ unsigned int size, i, j;
+ cache_list_t reclist;
+ cache_rec_short_t *cache_recs;
+ struct cache_c *dmc;
+
+ if (copy_from_user(&reclist, (cache_list_t __user *)arg,
+ sizeof (cache_list_t))) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ size = reclist.ncaches * sizeof (cache_rec_short_t);
+ cache_recs = vmalloc(size);
+ if (!cache_recs) {
+ error = -ENOMEM;
+ goto out;
+ }
+ memset(cache_recs, 0, size);
+
+ i = 0;
+ for (j = 0; j < EIO_HASHTBL_SIZE; j++) {
+ down_read(&eio_ttc_lock[j]);
+ list_for_each_entry(dmc, &eio_ttc_list[j], cachelist) {
+ eio_cache_rec_fill(dmc, &cache_recs[i]);
+ i++;
+
+ if (i == reclist.ncaches)
+ break;
+ }
+ up_read(&eio_ttc_lock[j]);
+
+ if (i == reclist.ncaches)
+ break;
+ }
+
+ if (copy_to_user((char __user *)reclist.cachelist,
+ (char *)cache_recs, size)) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ if (copy_to_user((cache_list_t __user *)arg, &reclist,
+ sizeof (cache_list_t))) {
+ error = -EFAULT;
+ goto out;
+ }
+
+out:
+ return error;
+}
+
+static void
+eio_cache_rec_fill(struct cache_c *dmc, cache_rec_short_t *rec)
+{
+ strncpy(rec->cr_name, dmc->cache_name,
+ sizeof (rec->cr_name));
+ strncpy(rec->cr_src_devname, dmc->disk_devname,
+ sizeof (rec->cr_src_devname));
+ strncpy(rec->cr_ssd_devname, dmc->cache_devname,
+ sizeof (rec->cr_ssd_devname));
+ rec->cr_src_dev_size = eio_get_device_size(dmc->disk_dev);
+ rec->cr_ssd_dev_size = eio_get_device_size(dmc->cache_dev);
+ rec->cr_src_sector_size = 0; /* unused in userspace */
+ rec->cr_ssd_sector_size = 0; /* unused in userspace */
+ rec->cr_flags = dmc->cache_flags;
+ rec->cr_policy = dmc->req_policy;
+ rec->cr_mode = dmc->mode;
+ rec->cr_persistence = dmc->persistence;
+ rec->cr_blksize = dmc->block_size; /* In sectors */
+ rec->cr_assoc = dmc->assoc;
+ return;
+}
+
+/*
+ * Few sanity checks before cache creation.
+ */
+
+int
+eio_do_preliminary_checks(struct cache_c *dmc)
+{
+ struct block_device *bdev, *ssd_bdev;
+ struct cache_c *dmc1;
+ int error;
+ int wholedisk;
+ int index;
+
+ error = wholedisk = 0;
+ bdev = dmc->disk_dev->bdev;
+ ssd_bdev = dmc->cache_dev->bdev;
+
+ /*
+ * Disallow cache creation if source and cache device
+ * belong to same device.
+ */
+
+ if (bdev->bd_contains == ssd_bdev->bd_contains)
+ return -EINVAL;
+
+ /*
+ * Check if cache with same name exists.
+ */
+
+ if (eio_cache_lookup(dmc->cache_name))
+ return -EEXIST;
+
+ if (bdev == bdev->bd_contains) {
+ wholedisk = 1;
+ }
+
+ index = EIO_HASH_BDEV(bdev->bd_contains->bd_dev);
+
+ down_read(&eio_ttc_lock[index]);
+ list_for_each_entry(dmc1, &eio_ttc_list[index], cachelist) {
+ if (dmc1->disk_dev->bdev->bd_contains != bdev->bd_contains)
+ continue;
+
+ if ((wholedisk) || (dmc1->dev_info == EIO_DEV_WHOLE_DISK) ||
+ (dmc1->disk_dev->bdev == bdev)) {
+ error = -EINVAL;
+ break;
+ }
+ }
+ up_read(&eio_ttc_lock[index]);
+ return error;
+}
+
+/* Use mempool_alloc and free for io in sync_io as well */
+static void eio_dec_count(struct eio_context *io, int error)
+{
+
+ if (error)
+ io->error = error;
+
+ if (atomic_dec_and_test(&io->count)) {
+ if (io->event) {
+ complete(io->event);
+ } else {
+ int err = io->error;
+ eio_notify_fn fn = io->callback;
+ void *context = io->context;
+
+ mempool_free(io, _io_pool);
+ io = NULL;
+ fn(err, context);
+ }
+ }
+}
+
+static void eio_endio(struct bio *bio, int error)
+{
+ struct eio_context *io;
+
+ io = bio->bi_private;
+ VERIFY(io != NULL);
+
+ bio_put(bio);
+
+ eio_dec_count(io, error);
+}
+
+static int eio_dispatch_io_pages(struct cache_c *dmc, struct eio_io_region *where, int rw, struct page **pagelist,
+ struct eio_context *io, int hddio, int num_vecs, int sync)
+{
+ struct bio *bio;
+ struct page *page;
+ unsigned long len;
+ unsigned offset;
+ int num_bvecs;
+ int remaining_bvecs = num_vecs;
+ int ret = 0;
+ int pindex = 0;
+
+ sector_t remaining = where->count;
+
+ do {
+ /* Verify that num_vecs does not cross the threshold */
+ /* Check the maximum number of bvecs the bdev supports */
+ num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), remaining_bvecs);
+ bio = bio_alloc(GFP_NOIO, num_bvecs);
+ bio->bi_bdev = where->bdev;
+ bio->bi_sector = where->sector + (where->count - remaining);
+
+ /* Remap the start sector of partition */
+ if (hddio)
+ bio->bi_sector += dmc->dev_start_sect;
+ bio->bi_rw |= rw;
+ bio->bi_end_io = eio_endio;
+ bio->bi_private = io;
+
+ while (remaining) {
+ page = pagelist[pindex];
+ len = min_t(unsigned long, PAGE_SIZE, to_bytes(remaining));
+ offset = 0;
+
+ if (!bio_add_page(bio, page, len, offset))
+ break;
+
+ remaining -= to_sector(len);
+ pindex++;
+ remaining_bvecs--;
+ }
+
+ atomic_inc(&io->count);
+ if (hddio) {
+ dmc->origmfn(bdev_get_queue(bio->bi_bdev), bio);
+
+ } else {
+ submit_bio(rw, bio);
+ }
+
+ } while (remaining);
+
+ VERIFY(remaining_bvecs == 0);
+ return ret;
+}
+
+/*
+ * This function dispatches the I/O. It also takes care of
+ * splitting large I/O requests that do not fit into a single
+ * bio into smaller I/Os.
+ */
+
+static int eio_dispatch_io(struct cache_c *dmc, struct eio_io_region *where, int rw, struct bio_vec *bvec,
+ struct eio_context *io, int hddio, int num_vecs, int sync)
+{
+ struct bio *bio;
+ struct page *page;
+ unsigned long len;
+ unsigned offset;
+ int num_bvecs;
+ int remaining_bvecs = num_vecs;
+ int ret = 0;
+
+ sector_t remaining = where->count;
+
+ do {
+ /* Verify that num_vecs does not cross the threshold */
+ /* Check the maximum number of bvecs the bdev supports */
+ num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), remaining_bvecs);
+ bio = bio_alloc(GFP_NOIO, num_bvecs);
+ bio->bi_bdev = where->bdev;
+ bio->bi_sector = where->sector + (where->count - remaining);
+
+ /* Remap the start sector of partition */
+ if (hddio)
+ bio->bi_sector += dmc->dev_start_sect;
+ bio->bi_rw |= rw;
+ bio->bi_end_io = eio_endio;
+ bio->bi_private = io;
+
+ while (remaining) {
+ page = bvec->bv_page;
+ len = min_t(unsigned long, bvec->bv_len, to_bytes(remaining));
+ offset = bvec->bv_offset;
+
+ if (!bio_add_page(bio, page, len, offset))
+ break;
+
+ offset = 0;
+ remaining -= to_sector(len);
+ bvec = bvec + 1;
+ remaining_bvecs--;
+ }
+
+ atomic_inc(&io->count);
+ if (hddio) {
+ dmc->origmfn(bdev_get_queue(bio->bi_bdev), bio);
+ } else {
+ submit_bio(rw, bio);
+ }
+
+
+ } while (remaining);
+
+ VERIFY(remaining_bvecs == 0);
+ return ret;
+}
+
+
+static int eio_async_io(struct cache_c *dmc, struct eio_io_region *where, int rw, struct eio_io_request *req)
+{
+ struct eio_context *io;
+ int err = 0;
+
+ io = mempool_alloc(_io_pool, GFP_NOIO);
+ if (unlikely(io == NULL)) {
+ pr_err("eio_async_io: failed to allocate eio_context.\n");
+ return -ENOMEM;
+ }
+ memset((char *)io, 0, sizeof (struct eio_context));
+
+ atomic_set(&io->count, 1);
+ io->callback = req->notify;
+ io->context = req->context;
+ io->event = NULL;
+
+ switch (req->mtype) {
+ case EIO_BVECS:
+ err = eio_dispatch_io(dmc, where, rw, req->dptr.pages, io, req->hddio, req->num_bvecs, 0);
+ break;
+
+ case EIO_PAGES:
+ err = eio_dispatch_io_pages(dmc, where, rw, req->dptr.plist, io, req->hddio, req->num_bvecs, 0);
+ break;
+ }
+
+ /* Check if i/o submission has returned any error */
+ if (unlikely(err)) {
+ /* Wait for any i/os which are submitted, to end. */
+retry:
+ if (atomic_read(&io->count) != 1) {
+ msleep(1);
+ goto retry;
+ }
+
+ VERIFY(io != NULL);
+ mempool_free(io, _io_pool);
+ io = NULL;
+ return err;
+ }
+
+ /* Drop the extra reference count here */
+ eio_dec_count(io, err);
+ return err;
+}
+
+static int eio_sync_io(struct cache_c *dmc, struct eio_io_region *where,
+ int rw, struct eio_io_request *req)
+{
+ int ret = 0;
+ struct eio_context io;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ memset((char *)&io, 0, sizeof io);
+
+ atomic_set(&io.count, 1);
+ io.event = &wait;
+ io.callback = NULL;
+ io.context = NULL;
+
+ /* For synchronous I/Os pass SYNC */
+ rw |= REQ_SYNC;
+
+ switch (req->mtype) {
+ case EIO_BVECS:
+ ret = eio_dispatch_io(dmc, where, rw, req->dptr.pages,
+ &io, req->hddio, req->num_bvecs, 1);
+ break;
+ case EIO_PAGES:
+ ret = eio_dispatch_io_pages(dmc, where, rw, req->dptr.plist,
+ &io, req->hddio, req->num_bvecs, 1);
+ break;
+ }
+
+ /* Check if i/o submission has returned any error */
+ if (unlikely(ret)) {
+ /* Wait for any i/os which are submitted, to end. */
+retry:
+ if (atomic_read(&(io.count)) != 1) {
+ msleep(1);
+ goto retry;
+ }
+
+ return ret;
+ }
+
+ /* Drop extra reference count here */
+ eio_dec_count(&io, ret);
+ wait_for_completion(&wait);
+
+ if (io.error)
+ ret = io.error;
+
+ return ret;
+}
+
+int eio_do_io(struct cache_c *dmc, struct eio_io_region *where, int rw,
+ struct eio_io_request *io_req)
+{
+ if (!io_req->notify)
+ return eio_sync_io(dmc, where, rw, io_req);
+
+ return eio_async_io(dmc, where, rw, io_req);
+}
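+
+/*
+ * Illustrative caller sketch (hypothetical names, mirrors the request
+ * set-up in eio_io_sync_pages()): a request with ->notify == NULL takes
+ * the synchronous eio_sync_io() path above, a non-NULL callback takes
+ * the asynchronous path:
+ *
+ * struct eio_io_request req;
+ *
+ * memset(&req, 0, sizeof req);
+ * req.mtype = EIO_PAGES; -- or EIO_BVECS with req.dptr.pages
+ * req.dptr.plist = pages;
+ * req.num_bvecs = nr_pages;
+ * req.notify = NULL; -- non-NULL callback selects eio_async_io()
+ * req.hddio = 0;
+ * error = eio_do_io(dmc, where, READ, &req);
+ */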
+
+void
+eio_process_zero_size_bio(struct cache_c *dmc, struct bio *origbio)
+{
+ unsigned long rw_flags = 0;
+
+ /* Extract bio flags from original bio */
+ rw_flags = origbio->bi_rw;
+
+ VERIFY(origbio->bi_size == 0);
+ VERIFY(rw_flags != 0);
+
+ eio_issue_empty_barrier_flush(dmc->cache_dev->bdev, NULL,
+ EIO_SSD_DEVICE, NULL, rw_flags);
+ eio_issue_empty_barrier_flush(dmc->disk_dev->bdev, origbio,
+ EIO_HDD_DEVICE, dmc->origmfn, rw_flags);
+}
+
+static void
+eio_bio_end_empty_barrier(struct bio *bio, int err)
+{
+ if (bio->bi_private)
+ bio_endio(bio->bi_private, err);
+ bio_put(bio);
+ return;
+}
+
+static void
+eio_issue_empty_barrier_flush(struct block_device *bdev, struct bio *orig_bio,
+ int device, make_request_fn *origmfn, int rw_flags)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_KERNEL, 0);
+ if (!bio) {
+ if (orig_bio)
+ bio_endio(orig_bio, -ENOMEM);
+ return;
+ }
+ bio->bi_end_io = eio_bio_end_empty_barrier;
+ bio->bi_private = orig_bio;
+ bio->bi_bdev = bdev;
+ bio->bi_rw |= rw_flags;
+
+ bio_get(bio);
+ if (device == EIO_HDD_DEVICE) {
+ origmfn(bdev_get_queue(bio->bi_bdev), bio);
+
+ } else {
+ submit_bio(0, bio);
+ }
+ bio_put(bio);
+ return;
+}
+
+static int
+eio_finish_nrdirty(struct cache_c *dmc)
+{
+ int index;
+ int ret = 0;
+ int retry_count;
+
+ /*
+ * Due to transient errors, nr_dirty may not drop to zero. Retry
+ * the clean operation up to FINISH_NRDIRTY_RETRY_COUNT times.
+ */
+ retry_count = FINISH_NRDIRTY_RETRY_COUNT;
+
+ index = EIO_HASH_BDEV(dmc->disk_dev->bdev->bd_contains->bd_dev);
+ down_write(&eio_ttc_lock[index]);
+
+ /* Wait for the in-flight I/Os to drain out */
+ while (atomic64_read(&dmc->nr_ios) != 0) {
+ pr_debug("finish_nrdirty: Draining I/O inflight\n");
+ msleep(1);
+ }
+ VERIFY(!(dmc->sysctl_active.do_clean & EIO_CLEAN_START));
+
+ dmc->sysctl_active.do_clean |= EIO_CLEAN_KEEP | EIO_CLEAN_START;
+ up_write(&eio_ttc_lock[index]);
+
+ /*
+ * In the process of cleaning CACHE if CACHE turns to FAILED state,
+ * its a severe error.
+ */
+ do {
+ if (unlikely(CACHE_FAILED_IS_SET(dmc))) {
+ pr_err("finish_nrdirty: CACHE \"%s\" is in FAILED state.",
+ dmc->cache_name);
+ ret = -ENODEV;
+ break;
+ }
+
+ if (!dmc->sysctl_active.fast_remove) {
+ eio_clean_all(dmc);
+ }
+ } while ((--retry_count > 0) && !dmc->sysctl_active.fast_remove &&
+ (atomic64_read(&dmc->nr_dirty) > 0) &&
+ (!(dmc->cache_flags & CACHE_FLAGS_SHUTDOWN_INPROG)));
+ dmc->sysctl_active.do_clean &= ~EIO_CLEAN_START;
+
+ /*
+ * If all retries are exhausted and nr_dirty is still not zero,
+ * return an error.
+ */
+ if (((dmc->cache_flags & CACHE_FLAGS_SHUTDOWN_INPROG) ||
+ (retry_count == 0)) &&
+ (atomic64_read(&dmc->nr_dirty) > 0)) {
+ ret = -EINVAL;
+ }
+ if (ret)
+ pr_err("finish_nrdirty: Failed to finish %lu dirty blocks for cache \"%s\".",
+ atomic64_read(&dmc->nr_dirty), dmc->cache_name);
+
+ return ret;
+}
+
+int
+eio_cache_edit(char *cache_name, u_int32_t mode, u_int32_t policy)
+{
+ int error = 0;
+ int index;
+ struct cache_c *dmc;
+ uint32_t old_time_thresh = 0;
+ int restart_async_task = 0;
+ int ret;
+
+ VERIFY((mode != 0) || (policy != 0));
+
+ dmc = eio_cache_lookup(cache_name);
+ if (NULL == dmc) {
+ pr_err("cache_edit: cache %s does not exist", cache_name);
+ return -EINVAL;
+ }
+
+ if ((dmc->mode == mode) && (dmc->req_policy == policy))
+ return 0;
+
+ if (unlikely(CACHE_FAILED_IS_SET(dmc)) || unlikely(CACHE_DEGRADED_IS_SET(dmc))) {
+ pr_err("cache_edit: Cannot proceed with edit for cache \"%s\"."
+ " Cache is in failed or degraded state.",
+ dmc->cache_name);
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ if (dmc->cache_flags & CACHE_FLAGS_SHUTDOWN_INPROG) {
+ pr_err("cache_edit: system shutdown in progress, cannot edit"
+ " cache %s", cache_name);
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ return -EINVAL;
+ }
+ if (dmc->cache_flags & CACHE_FLAGS_MOD_INPROG) {
+ pr_err("cache_edit: simultaneous edit/delete operation on cache"
+ " %s is not permitted", cache_name);
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ return -EINVAL;
+ }
+ dmc->cache_flags |= CACHE_FLAGS_MOD_INPROG;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ old_time_thresh = dmc->sysctl_active.time_based_clean_interval;
+
+ if (dmc->mode == CACHE_MODE_WB) {
+ if (CACHE_FAILED_IS_SET(dmc)) {
+ pr_err("cache_edit: Cannot proceed with edit for Failed cache \"%s\".",
+ dmc->cache_name);
+ error = -EINVAL;
+ goto out;
+ }
+ eio_stop_async_tasks(dmc);
+ restart_async_task = 1;
+ }
+
+ /* Wait for nr_dirty to drop to zero */
+ if (dmc->mode == CACHE_MODE_WB && mode != CACHE_MODE_WB) {
+ if (CACHE_FAILED_IS_SET(dmc)) {
+ pr_err("cache_edit: Cannot proceed with edit for Failed cache \"%s\".",
+ dmc->cache_name);
+ error = -EINVAL;
+ goto out;
+ }
+
+ error = eio_finish_nrdirty(dmc);
+ /* This error can mostly occur due to Device removal */
+ if (unlikely(error)) {
+ pr_err("cache_edit: nr_dirty FAILED to finish for cache \"%s\".",
+ dmc->cache_name);
+ goto out;
+ }
+ VERIFY((dmc->sysctl_active.do_clean & EIO_CLEAN_KEEP) &&
+ !(dmc->sysctl_active.do_clean & EIO_CLEAN_START));
+ VERIFY(dmc->sysctl_active.fast_remove || (atomic64_read(&dmc->nr_dirty) == 0));
+ }
+
+ index = EIO_HASH_BDEV(dmc->disk_dev->bdev->bd_contains->bd_dev);
+ down_write(&eio_ttc_lock[index]);
+
+ /* Wait for the in-flight I/Os to drain out */
+ while (atomic64_read(&dmc->nr_ios) != 0) {
+ pr_debug("cache_edit: Draining I/O inflight\n");
+ msleep(1);
+ }
+
+ pr_debug("cache_edit: Blocking application I/O\n");
+
+ VERIFY(atomic64_read(&dmc->nr_ios) == 0);
+
+ /* policy change */
+ if ((policy != 0) && (policy != dmc->req_policy)) {
+ error = eio_policy_switch(dmc, policy);
+ if (error) {
+
+ up_write(&eio_ttc_lock[index]);
+ goto out;
+ }
+ }
+
+ /* mode change */
+ if ((mode != 0) && (mode != dmc->mode)) {
+ error = eio_mode_switch(dmc, mode);
+ if (error) {
+ up_write(&eio_ttc_lock[index]);
+ goto out;
+ }
+ }
+
+ dmc->sysctl_active.time_based_clean_interval = old_time_thresh;
+ /* write updated superblock */
+ error = eio_sb_store(dmc);
+ if (error) {
+ /* XXX: In case of error put the cache in degraded mode. */
+ pr_err("eio_cache_edit: superblock update failed(error %d)",
+ error);
+ goto out;
+ }
+
+ eio_procfs_dtr(dmc);
+ eio_procfs_ctr(dmc);
+
+ up_write(&eio_ttc_lock[index]);
+
+out:
+ dmc->sysctl_active.time_based_clean_interval = old_time_thresh;
+
+ /*
+ * Reset the EIO_CLEAN_START and EIO_CLEAN_KEEP flags. The
+ * EIO_CLEAN_START flag should be restored if eio_stop_async_tasks()
+ * is not called in the future.
+ */
+
+ dmc->sysctl_active.do_clean &= ~(EIO_CLEAN_START | EIO_CLEAN_KEEP);
+
+ /* Restart async-task for "WB" cache. */
+ if ((dmc->mode == CACHE_MODE_WB) && (restart_async_task == 1)) {
+ pr_debug("cache_edit: Restarting the clean_thread.\n");
+ VERIFY(dmc->clean_thread == NULL);
+ ret = eio_start_clean_thread(dmc);
+ if (ret) {
+ error = ret;
+ pr_err("cache_edit: Failed to restart async tasks. error=%d.\n", ret);
+ }
+ if (dmc->sysctl_active.time_based_clean_interval &&
+ atomic64_read(&dmc->nr_dirty)) {
+ schedule_delayed_work(&dmc->clean_aged_sets_work,
+ dmc->sysctl_active.time_based_clean_interval * 60 * HZ);
+ dmc->is_clean_aged_sets_sched = 1;
+ }
+ }
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->cache_flags &= ~CACHE_FLAGS_MOD_INPROG;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ pr_debug("eio_cache_edit: Allowing application I/O\n");
+ return error;
+}
+
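+/*
+ * eio_mode_switch
+ * Switching into write-back mode allocates the write-back resources
+ * first and rolls the mode back on failure; switching out of
+ * write-back mode frees those resources. A read-only <-> write-through
+ * switch only updates the mode flag.
+ */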
+static int
+eio_mode_switch(struct cache_c *dmc, u_int32_t mode)
+{
+ int error = 0;
+ u_int32_t orig_mode;
+
+ VERIFY(dmc->mode != mode);
+ pr_debug("eio_mode_switch: mode switch from %u to %u\n",
+ dmc->mode, mode);
+
+ if (mode == CACHE_MODE_WB) {
+ orig_mode = dmc->mode;
+ dmc->mode = mode;
+
+ error = eio_allocate_wb_resources(dmc);
+ if (error) {
+ dmc->mode = orig_mode;
+ goto out;
+ }
+ } else if (dmc->mode == CACHE_MODE_WB) {
+ eio_free_wb_resources(dmc);
+ dmc->mode = mode;
+ } else { /* (RO -> WT) or (WT -> RO) */
+ VERIFY(((dmc->mode == CACHE_MODE_RO) && (mode == CACHE_MODE_WT)) ||
+ ((dmc->mode == CACHE_MODE_WT) && (mode == CACHE_MODE_RO)));
+ dmc->mode = mode;
+ }
+
+out:
+ if (error) {
+ pr_err("mode_switch: Failed to switch mode, error: %d\n", error);
+ }
+ return error;
+}
+
+/*
+ * XXX: Error handling.
+ * In case of error put the cache in degraded mode.
+ */
+
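+/*
+ * eio_policy_switch
+ * Free the current replacement policy, initialize the new one and
+ * allocate its per-block and per-set data. On failure, fall back to
+ * CACHE_REPL_RANDOM so the cache is left with a usable policy.
+ */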
+static int
+eio_policy_switch(struct cache_c *dmc, u_int32_t policy)
+{
+ int error;
+
+ VERIFY(dmc->req_policy != policy);
+
+ eio_policy_free(dmc);
+
+ dmc->req_policy = policy;
+ error = eio_policy_init(dmc);
+ if (error) {
+ goto out;
+ }
+
+ error = eio_repl_blk_init(dmc->policy_ops);
+ if (error) {
+ pr_err("eio_policy_swtich: Unable to allocate memory for policy cache block");
+ goto out;
+ }
+
+ error = eio_repl_sets_init(dmc->policy_ops);
+ if (error) {
+ pr_err("eio_policy_switch: Failed to allocate memory for cache policy");
+ goto out;
+ }
+
+ eio_policy_lru_pushblks(dmc->policy_ops);
+ return 0;
+
+out:
+ eio_policy_free(dmc);
+ dmc->req_policy = CACHE_REPL_RANDOM;
+ (void)eio_policy_init(dmc);
+ return error;
+}
+
+void
+eio_free_wb_pages(struct page **pages, int allocated)
+{
+ /* Nothing to do if no pages were allocated. */
+ if (allocated <= 0)
+ return;
+
+ do {
+ put_page(pages[--allocated]);
+ } while (allocated);
+
+ *pages = NULL;
+}
+
+void
+eio_free_wb_bvecs(struct bio_vec *bvec, int allocated, int blksize)
+{
+ int i;
+
+ if (allocated <= 0)
+ return;
+
+ for (i = 0; i < allocated; i++) {
+
+ switch (blksize) {
+ case BLKSIZE_2K:
+ /*
+ * For 2k blocksize, each page is shared between two
+ * bio_vecs. Hence make sure to put_page only for even
+ * indexes.
+ */
+ if (((i % 2) == 0) && bvec[i].bv_page) {
+ put_page(bvec[i].bv_page);
+ bvec[i].bv_page = NULL;
+ continue;
+ }
+
+ /* For an odd index the shared page was already put above; clear the pointer. */
+ if ((i % 2))
+ bvec[i].bv_page = NULL;
+
+ continue;
+
+ case BLKSIZE_4K:
+ case BLKSIZE_8K:
+ if (bvec[i].bv_page) {
+ put_page(bvec[i].bv_page);
+ bvec[i].bv_page = NULL;
+ }
+
+ continue;
+ }
+ }
+}
+
+/*
+ * This function allocates pages for an array of bvecs allocated by the
+ * caller. It has special handling for a 2k block size, where a single
+ * page is shared between two bio_vecs.
+ */
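+/*
+ * Example (illustrative, assuming 4K pages): with a 2k block size,
+ * bvec[0] and bvec[1] share page P0 at offsets 0 and 2048, bvec[2]
+ * and bvec[3] share page P1, and so on. This is why eio_free_wb_bvecs()
+ * calls put_page() only for even indexes.
+ */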
+
+int
+eio_alloc_wb_bvecs(struct bio_vec *bvec, int max, int blksize)
+{
+ int i, ret;
+ struct bio_vec *iovec;
+ struct page *page;
+
+ ret = 0;
+ iovec = bvec;
+ page = NULL;
+
+ for (i = 0; i < max; i++) {
+
+ switch (blksize) {
+
+ case BLKSIZE_2K:
+ /*
+ * For a 2k block size, two bio_vecs share the same page.
+ * This is handled below.
+ */
+
+ if ((i % 2) == 0) {
+ /* Allocate page only for even bio vector */
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (unlikely(!page)) {
+ pr_err("eio_alloc_wb_bvecs: System memory too low.\n");
+ goto err;
+ }
+ iovec[i].bv_page = page;
+ iovec[i].bv_len = to_bytes(blksize);
+ iovec[i].bv_offset = 0;
+ } else {
+ /* Let the odd biovec share page allocated earlier. */
+ VERIFY(page != NULL);
+ iovec[i].bv_page = page;
+ iovec[i].bv_len = to_bytes(blksize);
+ iovec[i].bv_offset = PAGE_SIZE - to_bytes(blksize);
+
+ /* Mark page NULL here as it is not required anymore. */
+ page = NULL;
+ }
+
+ continue;
+
+ case BLKSIZE_4K:
+ case BLKSIZE_8K:
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (unlikely(!page)) {
+ pr_err("eio_alloc_wb_bvecs: System memory too low.\n");
+ goto err;
+ }
+ iovec[i].bv_page = page;
+ iovec[i].bv_offset = 0;
+ iovec[i].bv_len = PAGE_SIZE;
+
+ page = NULL;
+ continue;
+ }
+
+ }
+
+ goto out;
+
+err:
+ if (i != max) {
+ if (i > 0)
+ eio_free_wb_bvecs(bvec, i, blksize);
+ ret = -ENOMEM;
+ }
+
+out:
+ return ret;
+}
+
+
+int
+eio_alloc_wb_pages(struct page **pages, int max)
+{
+ int i, ret = 0;
+ struct page *page;
+
+ for (i = 0; i < max; i++) {
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (unlikely(!page)) {
+ pr_err("alloc_wb_pages: System memory too low.\n");
+ break;
+ }
+ pages[i] = page;
+ }
+
+ if (i != max) {
+ if (i > 0)
+ eio_free_wb_pages(pages, i);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ ****************************************************************************
+ * struct bio_vec *eio_alloc_pages(u_int32_t max_pages, int *page_count)
+ * max_pages : maximum number of pages supported by the block device.
+ * page_count : returns the number of pages actually allocated.
+ ****************************************************************************
+ *
+ * This function allocates an array of bio_vecs for synchronous I/O,
+ * capped to the minimum of max_pages and MD_MAX_NR_PAGES so that the
+ * allocated pages fit into a single bio request.
+ */
+
+struct bio_vec *
+eio_alloc_pages(u_int32_t max_pages, int *page_count)
+{
+ int pcount, i;
+ struct bio_vec *pages;
+ int nr_pages;
+
+ /*
+ * Cap the number of pages supported by the block device to
+ * MD_MAX_NR_PAGES.
+ */
+ nr_pages = min_t(u_int32_t, max_pages, MD_MAX_NR_PAGES);
+
+ pages = kzalloc(nr_pages * sizeof(struct bio_vec), GFP_NOIO);
+ if (unlikely(!pages)) {
+ pr_err("eio_alloc_pages: System memory too low.\n");
+ return NULL;
+ }
+
+ pcount = 0;
+ for (i = 0; i < nr_pages; i++) {
+ pages[i].bv_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (unlikely(!pages[i].bv_page)) {
+ pr_err("eio_alloc_pages: System memory too low.\n");
+ break;
+ } else {
+ pages[i].bv_len = PAGE_SIZE;
+ pages[i].bv_offset = 0;
+ pcount++;
+ }
+ }
+
+ if (pcount == 0) {
+ pr_err("Single page allocation failed. System memory too low.");
+ kfree(pages);
+ return NULL;
+ }
+
+ /*
+ * The following check can be relaxed later, since fewer pages than
+ * requested may have been allocated.
+ */
+ VERIFY(pcount == nr_pages);
+
+ /* Set the return values here */
+ *page_count = pcount;
+ return pages;
+}
+
+/*
+ * As part of reboot handling, stop all activities and mark the devices as
+ * read only.
+ */
+
+int
+eio_reboot_handling(void)
+{
+ struct cache_c *dmc, *tempdmc = NULL;
+ int i, error;
+ uint32_t old_time_thresh;
+
+ if (eio_reboot_notified == EIO_REBOOT_HANDLING_DONE) {
+ return 0;
+ }
+
+ (void)wait_on_bit_lock((void *)&eio_control->synch_flags, EIO_HANDLE_REBOOT,
+ eio_wait_schedule, TASK_UNINTERRUPTIBLE);
+ if (eio_reboot_notified == EIO_REBOOT_HANDLING_DONE) {
+ clear_bit(EIO_HANDLE_REBOOT, (void *)&eio_control->synch_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit((void *)&eio_control->synch_flags, EIO_HANDLE_REBOOT);
+ return 0;
+ }
+ VERIFY(eio_reboot_notified == 0);
+ eio_reboot_notified = EIO_REBOOT_HANDLING_INPROG;
+
+ for (i = 0; i < EIO_HASHTBL_SIZE; i++) {
+ down_write(&eio_ttc_lock[i]);
+ list_for_each_entry(dmc, &eio_ttc_list[i], cachelist) {
+ kfree(tempdmc);
+ tempdmc = NULL;
+ if (unlikely(CACHE_FAILED_IS_SET(dmc)) ||
+ unlikely(CACHE_DEGRADED_IS_SET(dmc))) {
+ pr_err("Cache \"%s\" is in failed/degraded mode."
+ " Cannot mark cache read only.\n",
+ dmc->cache_name);
+ continue;
+ }
+
+ while (atomic64_read(&dmc->nr_ios) != 0) {
+ pr_debug("rdonly: Draining I/O inflight\n");
+ schedule_timeout(msecs_to_jiffies(10));
+ }
+
+ VERIFY(atomic64_read(&dmc->nr_ios) == 0);
+ VERIFY(dmc->cache_rdonly == 0);
+
+ /*
+ * Shutdown processing has the highest priority.
+ * Stop all ongoing activities.
+ */
+
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ VERIFY(!(dmc->cache_flags & CACHE_FLAGS_SHUTDOWN_INPROG));
+ dmc->cache_flags |= CACHE_FLAGS_SHUTDOWN_INPROG;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+
+ /*
+ * Wait for ongoing edit/delete to complete.
+ */
+
+ while (dmc->cache_flags & CACHE_FLAGS_MOD_INPROG) {
+ up_write(&eio_ttc_lock[i]);
+ schedule_timeout(msecs_to_jiffies(1));
+ down_write(&eio_ttc_lock[i]);
+ }
+ if (dmc->cache_flags & CACHE_FLAGS_DELETED) {
+
+ /*
+ * Cache got deleted. Free the dmc.
+ */
+
+ tempdmc = dmc;
+ continue;
+ }
+ old_time_thresh = dmc->sysctl_active.time_based_clean_interval;
+ eio_stop_async_tasks(dmc);
+ dmc->sysctl_active.time_based_clean_interval = old_time_thresh;
+
+ dmc->cache_rdonly = 1;
+ pr_info("Cache \"%s\" marked read only\n", dmc->cache_name);
+ up_write(&eio_ttc_lock[i]);
+
+ if (dmc->cold_boot && atomic64_read(&dmc->nr_dirty) && !eio_force_warm_boot) {
+ pr_info("Cold boot set for cache %s: Draining dirty blocks: %ld",
+ dmc->cache_name, atomic64_read(&dmc->nr_dirty));
+ eio_clean_for_reboot(dmc);
+ }
+
+ error = eio_md_store(dmc);
+ if (error) {
+ pr_err("Cannot mark cache \"%s\" read only\n",
+ dmc->cache_name);
+ }
+
+ spin_lock_irqsave(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+ dmc->cache_flags &= ~CACHE_FLAGS_SHUTDOWN_INPROG;
+ spin_unlock_irqrestore(&dmc->cache_spin_lock, dmc->cache_spin_lock_flags);
+
+ down_write(&eio_ttc_lock[i]);
+ }
+ kfree(tempdmc);
+ tempdmc = NULL;
+ up_write(&eio_ttc_lock[i]);
+ }
+
+ eio_reboot_notified = EIO_REBOOT_HANDLING_DONE;
+ clear_bit(EIO_HANDLE_REBOOT, (void *)&eio_control->synch_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit((void *)&eio_control->synch_flags, EIO_HANDLE_REBOOT);
+ return 0;
+}
+
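+/*
+ * eio_overlap_split_bio
+ * Split a bio into single-sector child bios and resubmit them through
+ * eio_make_request_fn(). A bio_container tracks the children so that
+ * the original bio is completed only after all of them have finished.
+ */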
+static int
+eio_overlap_split_bio(struct request_queue *q, struct bio *bio)
+{
+ int i, nbios;
+ void **bioptr;
+ sector_t snum;
+ struct bio_container *bc;
+ unsigned bvec_idx;
+ unsigned bvec_consumed;
+
+ nbios = bio->bi_size >> SECTOR_SHIFT;
+ snum = bio->bi_sector;
+
+ bioptr = kmalloc(nbios * (sizeof (void *)), GFP_KERNEL);
+ if (!bioptr) {
+ bio_endio(bio, -ENOMEM);
+ return 0;
+ }
+ bc = kmalloc(sizeof (struct bio_container), GFP_NOWAIT);
+ if (!bc) {
+ bio_endio(bio, -ENOMEM);
+ kfree(bioptr);
+ return 0;
+ }
+
+ atomic_set(&bc->bc_holdcount, nbios);
+ bc->bc_bio = bio;
+ bc->bc_error = 0;
+
+ bvec_idx = bio->bi_idx;
+ bvec_consumed = 0;
+ for (i = 0; i < nbios; i++) {
+ bioptr[i] = eio_split_new_bio(bio, bc, &bvec_idx, &bvec_consumed, snum);
+ if (!bioptr[i]) {
+ break;
+ }
+ snum++;
+ }
+
+ /* Error: cleanup */
+ if (i < nbios) {
+ for (i--; i >= 0; i--)
+ bio_put(bioptr[i]);
+ bio_endio(bio, -ENOMEM);
+ kfree(bc);
+ goto out;
+ }
+
+ for (i = 0; i < nbios; i++) {
+ eio_make_request_fn(q, bioptr[i]);
+ }
+
+out:
+ kfree(bioptr);
+ return 0;
+}
+
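+/*
+ * eio_split_new_bio
+ * Allocate a one-vector child bio for the sector at snum, pointing into
+ * the parent's bio_vec pages. bvec_idx and bvec_consumed track how much
+ * of the parent's current bio_vec has been handed out so far.
+ */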
+static struct bio *
+eio_split_new_bio(struct bio *bio, struct bio_container *bc,
+ unsigned *bvec_idx, unsigned *bvec_consumed, sector_t snum)
+{
+ struct bio *cbio;
+ unsigned iosize = 1 << SECTOR_SHIFT;
+
+ cbio = bio_alloc(GFP_NOIO, 1);
+ if (!cbio)
+ return NULL;
+
+ VERIFY(bio->bi_io_vec[*bvec_idx].bv_len >= iosize);
+
+ if (bio->bi_io_vec[*bvec_idx].bv_len <= *bvec_consumed) {
+ VERIFY(bio->bi_io_vec[*bvec_idx].bv_len == *bvec_consumed);
+ (*bvec_idx)++;
+ VERIFY(bio->bi_vcnt > *bvec_idx);
+ *bvec_consumed = 0;
+ }
+
+ cbio->bi_io_vec[0].bv_page = bio->bi_io_vec[*bvec_idx].bv_page;
+ cbio->bi_io_vec[0].bv_offset = bio->bi_io_vec[*bvec_idx].bv_offset + *bvec_consumed;
+ cbio->bi_io_vec[0].bv_len = iosize;
+ *bvec_consumed += iosize;
+
+ cbio->bi_sector = snum;
+ cbio->bi_size = iosize;
+ cbio->bi_bdev = bio->bi_bdev;
+ cbio->bi_rw = bio->bi_rw;
+ cbio->bi_vcnt = 1;
+ cbio->bi_idx = 0;
+ cbio->bi_end_io = eio_split_endio;
+ cbio->bi_private = bc;
+ return cbio;
+}
+
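+/*
+ * eio_split_endio
+ * Completion handler for the split child bios: record any error and
+ * end the parent bio once the last child completes.
+ */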
+static void
+eio_split_endio(struct bio *bio, int error)
+{
+ struct bio_container *bc = bio->bi_private;
+ if (error)
+ bc->bc_error = error;
+ bio_put(bio);
+ if (atomic_dec_and_test(&bc->bc_holdcount)) {
+ bio_endio(bc->bc_bio, bc->bc_error);
+ kfree(bc);
+ }
+ return;
+}
+
new file mode 100644
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+ * under a license included herein are reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ */
+
+#ifndef EIO_TTC_H
+#define EIO_TTC_H
+
+#ifdef __KERNEL__
+#include <linux/device-mapper.h>
+#define curthread get_current()
+#else
+#include <stdint.h>
+#endif /* __KERNEL__ */
+
+static inline bool bio_rw_flagged(struct bio *bio, int flag)
+{
+ return (bio->bi_rw & flag) != 0;
+}
+
+/*
+ * Memory type backing an eio_io_request.
+ */
+enum eio_io_mem_type {
+ EIO_BVECS, /* bio vectors */
+ EIO_PAGES, /* array of pages */
+};
+
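+/*
+ * Describes the memory behind an EnhanceIO I/O request: either an
+ * array of bio_vecs or a list of pages, plus the vector count and an
+ * optional completion callback (notify/context).
+ */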
+struct eio_io_request {
+ enum eio_io_mem_type mtype;
+
+ union {
+ struct bio_vec *pages;
+ struct page **plist;
+ } dptr;
+
+ unsigned num_bvecs;
+ eio_notify_fn notify;
+ void *context;
+ unsigned hddio;
+};
+
+struct eio_context {
+ atomic_t count;
+ int error;
+ struct completion *event;
+ eio_notify_fn callback;
+ void *context;
+};
+
+int eio_do_io(struct cache_c *dmc, struct eio_io_region *where, int rw,
+ struct eio_io_request *io_req);
+
+typedef enum eio_device {
+ EIO_HDD_DEVICE = 1,
+ EIO_SSD_DEVICE,
+} eio_device_t;
+
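+/*
+ * Whether the cached (source) device is a partition or a whole device.
+ * dmc->dev_info stores this info.
+ */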
+typedef enum eio_dev_info {
+ EIO_DEV_PARTITION = 1,
+ EIO_DEV_WHOLE_DISK
+} eio_dev_info_t;
+
+typedef enum eio_cache_state {
+ DMC_TTC_INITIALIZING = 1,
+ DMC_TTC_READY,
+ DMC_TTC_IO_FREEZE,
+ DMC_TTC_UNINITIALIZING,
+ DMC_TTC_UNINITIALIZED
+} eio_cache_state_t;
+
+#ifdef __KERNEL__
+
+#define EIO_HASHTBL_SIZE 1024
+
+/*
+ * In case of i/o errors while eio_clean_all, retry for
+ * finish_nrdirty_retry count.
+ */
+#define FINISH_NRDIRTY_RETRY_COUNT 2
+
+
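+/*
+ * Hash a backing device's major/minor numbers into one of the
+ * EIO_HASHTBL_SIZE buckets used to index eio_ttc_list[] and
+ * eio_ttc_lock[].
+ */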
+#define EIO_HASH_BDEV(dev) \
+ ((MAJOR(dev) * EIO_MAGIC + MINOR(dev)) % EIO_HASHTBL_SIZE)
+
+
+/*
+ * Reboot status flags.
+ */
+
+#define EIO_REBOOT_HANDLING_INPROG 0x01
+#define EIO_REBOOT_HANDLING_DONE 0x02
+
+/*
+ * kernel function prototypes.
+ */
+
+extern int eio_create_misc_device(void);
+extern int eio_delete_misc_device(void);
+
+extern int eio_ttc_get_device(const char *, fmode_t, struct eio_bdev **);
+extern void eio_ttc_put_device(struct eio_bdev **);
+
+extern struct cache_c *eio_cache_lookup(char *);
+extern int eio_ttc_activate(struct cache_c *);
+extern int eio_ttc_deactivate(struct cache_c *, int);
+extern void eio_ttc_init(void);
+
+extern int eio_cache_create(cache_rec_short_t *);
+extern int eio_cache_delete(char *, int);
+extern uint64_t eio_get_cache_count(void);
+extern int eio_get_cache_list(unsigned long *);
+
+extern int eio_handle_ssd_message(char *cache_name, char *ssd_name,
+ dev_notifier_t note);
+
+int eio_do_preliminary_checks(struct cache_c *);
+
+extern int eio_allocate_wb_resources(struct cache_c *);
+extern void eio_free_wb_resources(struct cache_c *);
+
+extern int eio_cache_edit(char *, u_int32_t, u_int32_t);
+
+extern void eio_stop_async_tasks(struct cache_c *dmc);
+extern int eio_start_clean_thread(struct cache_c *dmc);
+
+extern int eio_policy_init(struct cache_c *);
+extern void eio_policy_free(struct cache_c *);
+extern int eio_alloc_wb_pages(struct page **pages, int max);
+extern void eio_free_wb_pages(struct page **pages, int allocated);
+extern int eio_alloc_wb_bvecs(struct bio_vec *bvec, int max, int blksize);
+extern void eio_free_wb_bvecs(struct bio_vec *bvec, int allocated, int blksize);
+extern struct bio_vec * eio_alloc_pages(u_int32_t max_pages, int *page_count);
+extern int eio_md_store(struct cache_c *);
+extern int eio_reboot_handling(void);
+extern void eio_process_zero_size_bio(struct cache_c *dmc, struct bio *origbio);
+
+#endif /* __KERNEL__ */
+
+#endif /* EIO_TTC_H */
+
new file mode 100755
@@ -0,0 +1,283 @@
+#!/usr/bin/python
+#
+# Copyright (C) 2012 STEC, Inc. All rights not specifically granted
+# under a license included herein are reserved
+# Wrote a Python-based CLI for administration of the EnhanceIO driver
+# Sanoj Unnikrishnan <sunnikrishnan@stec-inc.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; under version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+from ctypes import *
+from fcntl import *
+from argparse import ArgumentParser
+import sys,struct
+import subprocess
+import os
+
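+# Typical invocations (illustrative; device names are placeholders and
+# the installed script name may differ):
+#   eio_cli create -d /dev/<hdd> -s /dev/<ssd> -c <cache_name> -m wt -p lru -b 4096
+#   eio_cli info
+#   eio_cli edit -c <cache_name> -m wb -p fifo
+#   eio_cli clean -c <cache_name>
+#   eio_cli delete -c <cache_name>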
+#TBD : Change ioctl numbers to comply with linux kernel convention
+EIODEV = '/dev/eiodev'
+EIO_IOC_CREATE = 0x4500
+EIO_IOC_DELETE = 0x4501
+EIO_IOC_ENABLE = 0x4502
+EIO_IOC_EDIT = 0x4504
+EIO_IOC_NCACHES = 0x4505
+EIO_IOC_CACHE_LIST = 0x4506
+EIO_IOC_SSD_ADD = 0x4507
+EIO_IOC_SRC_ADD = 0x4509
+IOC_BLKGETSIZE64 = 0x80081272
+IOC_SECTSIZE = 0x1268
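+# IOC_BLKGETSIZE64 and IOC_SECTSIZE correspond to the Linux block device
+# ioctls BLKGETSIZE64 (device size in bytes) and BLKSSZGET (logical
+# sector size).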
+
+def run_cmd(cmd):
+ #Utility function that runs a command
+ process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True)
+ output = process.stdout.read()
+ ret = process.wait()
+ status = Status(output,ret)
+ return status
+
+
+def get_caches_list():
+
+ # Utility function that obtains the cache list, skipping the "version" entry
+ cache_list = [f for f in os.listdir('/proc/enhanceio/') if f != "version"]
+ return cache_list
+
+# Class that represents a cache. Also used to pass ioctls to the driver.
+class Cache_rec(Structure):
+ _fields_ = [
+ ("name", c_char * 32),
+ ("src_name", c_char * 128),
+ ("ssd_name", c_char * 128),
+ ("ssd_uuid", c_char * 128),
+ ("src_size", c_ulonglong),
+ ("ssd_size", c_ulonglong),
+ ("src_sector_size", c_uint),
+ ("ssd_sector_size", c_uint),
+ ("flags", c_uint),
+ ("policy", c_byte),
+ ("mode", c_byte),
+ ("persistence", c_byte),
+ ("cold_boot", c_byte),
+ ("blksize", c_ulonglong),
+ ("assoc", c_ulonglong)
+ ]
+ def __init__(self, name, src_name="", ssd_name="", src_size=0, ssd_size=0, src_sector_size=0, ssd_sector_size=0, flags=0, policy="", mode="", persistence=0, cold_boot="", blksize="", assoc=""):
+
+ modes = {"wt":3,"wb":1,"ro":2,"":0}
+ policies = {"rand":3,"fifo":1, "lru":2,"":0}
+ blksizes = {"4096":4096, "2048":2048, "8192":8192,"":0}
+ associativity = {2048:128, 4096:256, 8192:512,0:0}
+
+ self.name = name
+ self.src_name =src_name
+ self.ssd_name = ssd_name
+ self.src_size = src_size
+ self.src_sector_size = src_sector_size
+ self.ssd_size = ssd_size
+ self.ssd_sector_size = ssd_sector_size
+ self.flags = flags
+ self.policy = policies[policy]
+ self.mode = modes[mode]
+ self.persistence = persistence
+ self.blksize = blksizes[blksize]
+ self.assoc = associativity[self.blksize]
+
+ def print_info(self):
+
+ # Display Cache info
+ modes = {3:"Write Through", 1:"Write Back", 2:"Read Only",0:"N/A"}
+ policies = {3:"rand", 1:"fifo", 2:"lru", 0:"N/A"}
+
+
+ print "Cache Name : " + self.name
+ print "Source Device : " + self.src_name
+ print "SSD Device : " + self.ssd_name
+ print "Policy : " + policies[self.policy]
+ print "Mode : " + modes[self.mode]
+ print "Block Size : " + str(self.blksize)
+ print "Associativity : " + str(self.assoc)
+
+ pass
+
+ def do_eio_ioctl(self,IOC_TYPE):
+ #send ioctl to driver
+ fd = open(EIODEV, "r")
+ fmt = ''
+
+ try:
+ if ioctl(fd, IOC_TYPE, addressof(self)):
+ print "ioctl failed"
+ except Exception as e:
+ print e
+
+ def clean(self):
+ #do sysctl corresponding to clean
+ cmd = "/sbin/sysctl dev.enhanceio." + self.name + ".do_clean=1"
+ print cmd
+ run_cmd(cmd)
+ pass
+
+ def get_cache_info(self):
+ #function to extract information from /proc/enhanceio
+ status = Status()
+
+ if os.path.exists("/proc/enhanceio/" + self.name):
+
+ associativity = {2048:128, 4096:256, 8192:512,0:0}
+
+ cmd = "cat /proc/enhanceio/" + self.name + "/config" + " | grep src_name"
+ status = run_cmd(cmd)
+ self.src_name = status.output.split()[1]
+
+ cmd = "cat /proc/enhanceio/" + self.name + "/config" + " | grep ssd_name"
+ status = run_cmd(cmd)
+ self.ssd_name = status.output.split()[1]
+
+ cmd = "cat /proc/enhanceio/" + self.name + "/config" + " | grep mode"
+ status = run_cmd(cmd)
+ self.mode = int(status.output.split()[1])
+
+ cmd = "cat /proc/enhanceio/" + self.name + "/config" + " | grep eviction"
+ status = run_cmd(cmd)
+ self.policy = int(status.output.split()[1])
+
+ cmd = "cat /proc/enhanceio/" + self.name + "/config" + " | grep block_size"
+ status = run_cmd(cmd)
+ self.blksize = int(status.output.split()[1])
+
+ self.assoc = associativity[self.blksize]
+
+
+
+
+class Status:
+ output = ""
+ ret = 0
+
+ def __init__(self, outstr="", outret=0):
+ self.output = outstr
+ self.ret = outret
+ pass
+
+#Block Device class
+class Dev_info:
+
+ dev_size = 0
+ dev_sect_size = 0
+
+ def get_device_size_info(self,name):
+ fd = open(name,"r")
+
+ # BLKGETSIZE64 fills in the device size in bytes as a 64-bit value
+ buf = ioctl(fd, IOC_BLKGETSIZE64, ' ' * 8)
+ self.dev_size = struct.unpack('Q', buf)[0]
+
+ # BLKSSZGET fills in the logical sector size as a 32-bit int
+ buf = ioctl(fd, IOC_SECTSIZE, ' ' * 4)
+ self.dev_sect_size = struct.unpack('I', buf)[0]
+
+ pass
+
+def main():
+
+
+ mainparser = ArgumentParser()
+ parser = mainparser.add_subparsers()
+
+ parser_delete = parser.add_parser('delete', help='used to delete cache')
+ parser_delete.add_argument("-c", action="store", dest= "cache",required=True)
+
+ parser_edit = parser.add_parser('edit', help='used to edit cache policy or mode or both')
+ parser_edit.add_argument("-c", action="store", dest="cache",required=True)
+ parser_edit.add_argument("-m", action="store", dest="mode", choices=["wb","wt","ro"], help="cache mode",default="wt")
+ parser_edit.add_argument("-p", action="store", dest="policy", choices=["rand","fifo","lru"], help="cache replacement policy",default="lru")
+
+ parser_info = parser.add_parser('info', help='displays information about currently created caches')
+
+ parser_clean = parser.add_parser('clean', help='clean the dirty blocks in the cache (applicable only to write-back caches)')
+ parser_clean.add_argument("-c", action="store", dest="cache",required=True)
+
+ parser_create = parser.add_parser('create', help="create a new cache")
+ parser_create.add_argument("-d", action="store", dest="hdd", required=True, help="name of the source device")
+ parser_create.add_argument("-s", action="store", dest="ssd", required=True, help="name of the ssd device")
+ parser_create.add_argument("-p", action="store", dest="policy", choices=["rand","fifo","lru"], help="cache replacement policy",default="lru")
+ parser_create.add_argument("-m", action="store", dest="mode", choices=["wb","wt","ro"], help="cache mode",default="wt")
+ parser_create.add_argument("-b", action="store", dest="blksize", choices=["2048","4096","8192"], default="4096" ,help="block size for cache")
+ parser_create.add_argument("-c", action="store", dest="cache", required=True)
+
+ args = mainparser.parse_args()
+
+ if sys.argv[1] == "create":
+ cache = Cache_rec(name = args.cache, src_name = args.hdd, ssd_name = args.ssd, policy = args.policy, mode = args.mode, blksize = args.blksize)
+
+ src_sz = Dev_info()
+ src_sz.get_device_size_info(cache.src_name)
+ cache.src_size = src_sz.dev_size
+ cache.src_sector_size = src_sz.dev_sect_size
+
+ ssd_sz = Dev_info()
+ ssd_sz.get_device_size_info(cache.ssd_name)
+ cache.ssd_size = ssd_sz.dev_size
+ cache.ssd_sector_size = ssd_sz.dev_sect_size
+
+ cache.print_info()
+
+ cache.do_eio_ioctl(EIO_IOC_CREATE)
+ pass
+ elif sys.argv[1] == "info":
+ cache_list = get_caches_list()
+
+ if not cache_list:
+ print "No caches Found"
+ else:
+ for cache_name in cache_list:
+ cache = Cache_rec(name = cache_name)
+ cache.get_cache_info()
+ cache.print_info()
+
+ print "\nFor more information look at /proc/enhanceio/<cache_name>/config"
+
+ pass
+ elif sys.argv[1] == "edit":
+ cache = Cache_rec(name = args.cache, policy = args.policy, mode = args.mode)
+ cache.do_eio_ioctl(EIO_IOC_EDIT)
+ pass
+ elif sys.argv[1] == "delete":
+ cache = Cache_rec(name = args.cache)
+ cache.do_eio_ioctl(EIO_IOC_DELETE)
+ pass
+ elif sys.argv[1] == "clean":
+ cache = Cache_rec(name = args.cache)
+ cache.clean()
+ pass
+ elif sys.argv[1] == "enable":
+ # This command will be fired by udev rule on SSD/Source addition
+ cache = Cache_rec(name = args.cache, persistence = 1)
+ cache.do_eio_ioctl(EIO_IOC_ENABLE)
+ elif sys.argv[1] == "notify":
+ # This command will be fired by udev rule on SSD/Source addition
+ cache = Cache_rec(name = args.cache)
+ cache.do_eio_ioctl(EIO_IOC_ENABLE)
+ cache = Cache_rec(name = args.cache)
+ pass
+
+
+if __name__ == '__main__':
+ main()
+