@@ -740,7 +740,7 @@ static int __init fcntl_init(void)
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
- BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+ BUILD_BUG_ON(22 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
O_RDONLY | O_WRONLY | O_RDWR |
O_CREAT | O_EXCL | O_NOCTTY |
O_TRUNC | O_APPEND | /* O_NONBLOCK | */
@@ -748,6 +748,7 @@ static int __init fcntl_init(void)
O_DIRECT | O_LARGEFILE | O_DIRECTORY |
O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
__FMODE_EXEC | O_PATH | __O_TMPFILE |
+ O_ATOMIC |
__FMODE_NONOTIFY
));
@@ -4681,14 +4681,14 @@ xfs_bmap_del_extent(
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
+ int whichfork, /* data or attr fork */
+ bool free_blocks) /* free extent at end of routine */
{
xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
xfs_fsblock_t del_endblock=0; /* first block past del */
xfs_fileoff_t del_endoff; /* first offset past del */
int delay; /* current block is delayed allocated */
- int do_fx; /* free extent at end of routine */
xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */
int error; /* error return value */
int flags; /* inode logging flags */
@@ -4712,8 +4712,8 @@ xfs_bmap_del_extent(
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t)));
+ ASSERT(*idx >= 0);
+ ASSERT(*idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
ASSERT(del->br_blockcount > 0);
ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_get_all(ep, &got);
@@ -4746,10 +4746,13 @@ xfs_bmap_del_extent(
len = del->br_blockcount;
do_div(bno, mp->m_sb.sb_rextsize);
do_div(len, mp->m_sb.sb_rextsize);
- error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
- if (error)
- goto done;
- do_fx = 0;
+ if (free_blocks) {
+ error = xfs_rtfree_extent(tp, bno,
+ (xfs_extlen_t)len);
+ if (error)
+ goto done;
+ free_blocks = false;
+ }
nblks = len * mp->m_sb.sb_rextsize;
qfield = XFS_TRANS_DQ_RTBCOUNT;
}
@@ -4757,7 +4760,6 @@ xfs_bmap_del_extent(
* Ordinary allocation.
*/
else {
- do_fx = 1;
nblks = del->br_blockcount;
qfield = XFS_TRANS_DQ_BCOUNT;
}
@@ -4777,7 +4779,7 @@ xfs_bmap_del_extent(
da_old = startblockval(got.br_startblock);
da_new = 0;
nblks = 0;
- do_fx = 0;
+ free_blocks = false;
}
/*
* Set flag value to use in switch statement.
@@ -4963,7 +4965,7 @@ xfs_bmap_del_extent(
/*
* If we need to, add to list of extents to delete.
*/
- if (do_fx)
+ if (free_blocks)
xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
mp);
/*
@@ -5291,7 +5293,7 @@ xfs_bunmapi(
goto error0;
}
error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
- &tmp_logflags, whichfork);
+ &tmp_logflags, whichfork, true);
logflags |= tmp_logflags;
if (error)
goto error0;
@@ -5936,3 +5938,291 @@ out:
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
return error;
}
+
+/*
+ * Insert an extent record pointing to an existing allocation into the extent map.
+ * This is a small subset of the functionality in xfs_bmap_add_extent_hole_real.
+ *
+ * Note: we don't bother merging with neighbours.
+ */
+STATIC int
+xfs_bmap_insert_extent_real(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *new,
+ struct xfs_btree_cur *cur,
+ xfs_extnum_t idx,
+ xfs_fsblock_t *firstblock,
+ struct xfs_bmap_free *flist,
+ int *logflags)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ int error = 0, rval = 0, i;
+
+ ASSERT(idx >= 0);
+ ASSERT(idx <= ip->i_df.if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(!isnullstartblock(new->br_startblock));
+ ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+ XFS_STATS_INC(xs_add_exlist);
+
+ xfs_iext_insert(ip, idx, 1, new, 0);
+ ip->i_d.di_nextents++;
+ ip->i_d.di_nblocks += new->br_blockcount;
+
+ if (cur == NULL) {
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ } else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(cur,
+ new->br_startoff,
+ new->br_startblock,
+ new->br_blockcount, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ cur->bc_rec.b.br_state = new->br_state;
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ }
+
+ /* convert to a btree if necessary */
+ if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
+ &cur, 0, &tmp_logflags, XFS_DATA_FORK);
+ *logflags |= tmp_logflags;
+ if (error)
+ goto done;
+ }
+
+ /* clear out the allocated field, done with it now in any case. */
+ if (cur)
+ cur->bc_private.b.allocated = 0;
+
+ xfs_bmap_check_leaf_extents(cur, ip, XFS_DATA_FORK);
+done:
+ *logflags |= rval;
+ return error;
+}
+
+int
+xfs_bmapi_insert(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *new,
+ xfs_fsblock_t *firstblock,
+ struct xfs_bmap_free *flist)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ int whichfork = XFS_DATA_FORK;
+ int eof;
+ int error;
+ char inhole;
+ char wasdelay;
+ struct xfs_bmbt_irec got;
+ struct xfs_bmbt_irec prev;
+ struct xfs_btree_cur *cur = NULL;
+ xfs_extnum_t idx;
+ int logflags = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ XFS_STATS_INC(xs_blk_mapw);
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ goto error0;
+ }
+
+ xfs_bmap_search_extents(ip, new->br_startoff, whichfork,
+ &eof, &idx, &got, &prev);
+
+ inhole = eof || got.br_startoff > new->br_startoff;
+ wasdelay = !inhole && isnullstartblock(got.br_startblock);
+ ASSERT(!wasdelay);
+ ASSERT(inhole);
+
+ if (ifp->if_flags & XFS_IFBROOT) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_private.b.flist = flist;
+ cur->bc_private.b.firstblock = *firstblock;
+ cur->bc_private.b.flags = 0;
+ }
+
+ error = xfs_bmap_insert_extent_real(tp, ip, new, cur, idx, firstblock,
+ flist, &logflags);
+ if (error)
+ goto error0;
+
+ /*
+ * Transform from btree to extents, give it cur.
+ */
+ if (xfs_bmap_wants_extents(ip, whichfork)) {
+ int tmp_logflags = 0;
+
+ ASSERT(cur);
+ error = xfs_bmap_btree_to_extents(tp, ip, cur,
+ &tmp_logflags, whichfork);
+ logflags |= tmp_logflags;
+ if (error)
+ goto error0;
+ }
+
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
+ XFS_IFORK_NEXTENTS(ip, whichfork) >
+ XFS_IFORK_MAXEXT(ip, whichfork));
+ error = 0;
+error0:
+ /*
+ * Log everything. Do this after conversion, there's no point in
+ * logging the extent records if we've converted to btree format.
+ */
+ if ((logflags & xfs_ilog_fext(whichfork)) &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ logflags &= ~xfs_ilog_fbroot(whichfork);
+ /*
+ * Log whatever the flags say, even if error. Otherwise we might miss
+ * detecting a case where the data is changed, there's an error,
+ * and it's not logged so we don't shutdown when we should.
+ */
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+
+ if (cur) {
+ if (!error) {
+ ASSERT(*firstblock == NULLFSBLOCK ||
+ XFS_FSB_TO_AGNO(mp, *firstblock) ==
+ XFS_FSB_TO_AGNO(mp,
+ cur->bc_private.b.firstblock) ||
+ (flist->xbf_low &&
+ XFS_FSB_TO_AGNO(mp, *firstblock) <
+ XFS_FSB_TO_AGNO(mp,
+ cur->bc_private.b.firstblock)));
+ *firstblock = cur->bc_private.b.firstblock;
+ }
+ xfs_btree_del_cursor(cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ }
+ return error;
+}
+
+/*
+ * Remove the extent pointed to by del from the extent map, but do not free
+ * the blocks for it.
+ */
+int
+xfs_bmapi_unmap(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_extnum_t idx, /* extent number to update/delete */
+ struct xfs_bmbt_irec *del, /* extent being deleted */
+ xfs_fsblock_t *firstblock, /* first allocated block
+ controls a.g. for allocs */
+ struct xfs_bmap_free *flist) /* i/o: list extents to free */
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = &ip->i_df;
+ int whichfork = XFS_DATA_FORK;
+ struct xfs_btree_cur *cur;
+ int error;
+ int logflags = 0;
+
+ if (unlikely(
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+ XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ return -EFSCORRUPTED;
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+ }
+
+ XFS_STATS_INC(xs_blk_unmap);
+
+ if (ifp->if_flags & XFS_IFBROOT) {
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_private.b.firstblock = *firstblock;
+ cur->bc_private.b.flist = flist;
+ cur->bc_private.b.flags = 0;
+ } else
+ cur = NULL;
+
+ ASSERT(!isnullstartblock(del->br_startblock));
+ error = xfs_bmap_del_extent(ip, tp, &idx, flist, cur, del,
+ &logflags, whichfork, false);
+ if (error)
+ goto error0;
+
+ /*
+ * transform from btree to extents, give it cur
+ */
+ if (xfs_bmap_wants_extents(ip, whichfork)) {
+ int tmp_logflags = 0;
+
+ ASSERT(cur != NULL);
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+ whichfork);
+ logflags |= tmp_logflags;
+ if (error)
+ goto error0;
+ }
+
+error0:
+ /*
+ * Log everything. Do this after conversion, there's no point in
+ * logging the extent records if we've converted to btree format.
+ */
+ if ((logflags & xfs_ilog_fext(whichfork)) &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ logflags &= ~xfs_ilog_fbroot(whichfork);
+ /*
+ * Log inode even in the error case, if the transaction
+ * is dirty we'll need to shut down the filesystem.
+ */
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (cur) {
+ if (!error) {
+ *firstblock = cur->bc_private.b.firstblock;
+ cur->bc_private.b.allocated = 0;
+ }
+ xfs_btree_del_cursor(cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ }
+ return error;
+}
+
@@ -221,5 +221,11 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_bmap_free *flist, enum shift_direction direction,
int num_exts);
int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
+int xfs_bmapi_insert(struct xfs_trans *tp, struct xfs_inode *ip,
+ struct xfs_bmbt_irec *new, xfs_fsblock_t *firstblock,
+ struct xfs_bmap_free *flist);
+int xfs_bmapi_unmap(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_extnum_t idx, struct xfs_bmbt_irec *del,
+ xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist);
#endif /* __XFS_BMAP_H__ */
@@ -1365,6 +1365,9 @@ __xfs_get_blocks(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
+ if (ip->i_cow && !ip->i_df.if_bytes && !create)
+ ip = ip->i_cow;
+
offset = (xfs_off_t)iblock << inode->i_blkbits;
ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
size = bh_result->b_size;
@@ -1372,6 +1375,7 @@ __xfs_get_blocks(
if (!create && direct && offset >= i_size_read(inode))
return 0;
+retry:
/*
* Direct I/O is usually done on preallocated files, so try getting
* a block mapping without an exclusive lock first. For buffered
@@ -1397,6 +1401,13 @@ __xfs_get_blocks(
if (error)
goto out_unlock;
+ if (!create && ip->i_cow &&
+ (!nimaps || imap.br_startblock == HOLESTARTBLOCK)) {
+ xfs_iunlock(ip, lockmode);
+ ip = ip->i_cow;
+ goto retry;
+ }
+
if (create &&
(!nimaps ||
(imap.br_startblock == HOLESTARTBLOCK ||
@@ -1918,3 +1918,262 @@ out_trans_cancel:
xfs_trans_cancel(tp, 0);
goto out;
}
+
+static int
+xfs_remove_extent(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *del,
+ bool *done)
+{
+ struct xfs_trans *tp = *tpp, *ntp;
+ struct xfs_ifork *ifp = &ip->i_df;
+ struct xfs_bmap_free free_list;
+ xfs_fsblock_t firstblock;
+ int error, committed;
+ xfs_extnum_t nextents, idx;
+
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /*
+ * Always delete the last extent; this avoids shifting around the
+ * extent list every time.
+ *
+ * XXX: find a way to avoid the transaction allocation without extents?
+ */
+ nextents = ifp->if_bytes / sizeof(struct xfs_bmbt_rec);
+ if (!nextents) {
+ *done = true;
+ return 0;
+ }
+ idx = nextents - 1;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), del);
+
+ xfs_bmap_init(&free_list, &firstblock);
+ error = xfs_bmapi_unmap(tp, ip, idx, del, &firstblock, &free_list);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto out_bmap_cancel;
+
+ if (committed) {
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ }
+
+ ntp = xfs_trans_dup(tp);
+ error = xfs_trans_commit(tp, 0);
+ tp = ntp;
+ xfs_trans_ijoin(tp, ip, 0);
+
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto out_error;
+ }
+
+ xfs_log_ticket_put(tp->t_ticket);
+ error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_write, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto out_error;
+ }
+
+ *tpp = tp;
+ return 0;
+
+out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+out_error:
+ *tpp = NULL;
+ return error;
+}
+
+static int
+xfs_free_range(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *del)
+{
+ struct xfs_trans *tp = *tpp, *ntp;
+ struct xfs_bmap_free free_list;
+ int committed;
+ int done = 0;
+ int error = 0;
+ xfs_fsblock_t firstfsb;
+
+ while (!error && !done) {
+ xfs_trans_ijoin(tp, ip, 0);
+
+ xfs_bmap_init(&free_list, &firstfsb);
+ error = xfs_bunmapi(tp, ip, del->br_startoff,
+ del->br_blockcount, 0, 2,
+ &firstfsb, &free_list, &done);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto out_bmap_cancel;
+
+ if (committed) {
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ }
+
+ ntp = xfs_trans_dup(tp);
+ error = xfs_trans_commit(tp, 0);
+ tp = ntp;
+ xfs_trans_ijoin(tp, ip, 0);
+
+ if (error)
+ goto out_error;
+
+ xfs_log_ticket_put(tp->t_ticket);
+ error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_write, 0, 0);
+ if (error)
+ goto out_error;
+ }
+
+ *tpp = tp;
+ return 0;
+
+out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+out_error:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ *tpp = NULL;
+ return error;
+}
+
+static int
+xfs_insert_extent(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *r)
+{
+ struct xfs_trans *tp = *tpp, *ntp;
+ struct xfs_bmap_free free_list;
+ xfs_fsblock_t firstblock;
+ int error, committed;
+
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_bmap_init(&free_list, &firstblock);
+ error = xfs_bmapi_insert(tp, ip, r, &firstblock, &free_list);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto out_bmap_cancel;
+
+ ntp = xfs_trans_dup(tp);
+ error = xfs_trans_commit(tp, 0);
+ tp = ntp;
+ xfs_trans_ijoin(tp, ip, 0);
+
+ if (error)
+ goto out_error;
+
+ xfs_log_ticket_put(tp->t_ticket);
+ error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_write, 0, 0);
+ if (error)
+ goto out_error;
+
+ *tpp = tp;
+ return 0;
+
+out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+out_error:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ *tpp = NULL;
+ return error;
+}
+
+int
+xfs_commit_clone(
+ struct file *file,
+ loff_t start,
+ loff_t end)
+{
+ struct xfs_inode *dest = XFS_I(file_inode(file));
+ struct xfs_inode *clone = XFS_I(file->f_mapping->host);
+ struct xfs_mount *mp = clone->i_mount;
+ struct xfs_trans *tp;
+ uint lock_flags;
+ bool done = false;
+ int error = 0;
+
+ error = xfs_qm_dqattach(clone, 0);
+ if (error)
+ return error;
+
+ error = xfs_qm_dqattach(dest, 0);
+ if (error)
+ return error;
+
+ /*
+ * Lock the inodes against other IO, page faults and truncate to
+ * begin with.
+ */
+ lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ xfs_lock_two_inodes(dest, clone, XFS_IOLOCK_EXCL);
+ xfs_lock_two_inodes(dest, clone, XFS_MMAPLOCK_EXCL);
+
+ inode_dio_wait(VFS_I(clone));
+ error = filemap_write_and_wait(VFS_I(clone)->i_mapping);
+ if (error)
+ goto out_unlock;
+
+ inode_dio_wait(VFS_I(dest));
+ error = filemap_write_and_wait(VFS_I(dest)->i_mapping);
+ if (error)
+ goto out_unlock;
+ truncate_pagecache_range(VFS_I(dest), 0, -1);
+ WARN_ON(VFS_I(dest)->i_mapping->nrpages);
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto out_unlock;
+ }
+
+ xfs_lock_two_inodes(dest, clone, XFS_ILOCK_EXCL);
+ lock_flags |= XFS_ILOCK_EXCL;
+
+ for (;;) {
+ struct xfs_bmbt_irec del;
+
+ error = xfs_remove_extent(&tp, clone, &del, &done);
+ if (error)
+ goto out_unlock;
+ if (done)
+ break;
+
+ error = xfs_free_range(&tp, dest, &del);
+ if (error)
+ goto out_unlock;
+
+ error = xfs_insert_extent(&tp, dest, &del);
+ if (error)
+ goto out_unlock;
+ }
+
+ xfs_trans_ijoin(tp, dest, 0);
+ xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
+
+ i_size_write(VFS_I(dest), VFS_I(clone)->i_size);
+ dest->i_d.di_size = VFS_I(clone)->i_size;
+ xfs_trans_ichgtime(tp, dest, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+out_unlock:
+ xfs_iunlock(dest, lock_flags);
+ xfs_iunlock(clone, lock_flags);
+ return error;
+}
@@ -65,6 +65,7 @@ int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
+int xfs_commit_clone(struct file *file, loff_t start, loff_t end);
/* EOF block manipulation functions */
bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
@@ -199,7 +199,7 @@ xfs_file_fsync(
loff_t end,
int datasync)
{
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
int error = 0;
@@ -208,13 +208,20 @@ xfs_file_fsync(
trace_xfs_file_fsync(ip);
- error = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (error)
- return error;
-
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
+ if (file->f_mapping->host != inode) {
+ error = xfs_commit_clone(file, start, end);
+ if (error)
+ return error;
+ } else {
+ error = filemap_write_and_wait_range(inode->i_mapping,
+ start, end);
+ if (error)
+ return error;
+ }
+
xfs_iflags_clear(ip, XFS_ITRUNCATED);
if (mp->m_flags & XFS_MOUNT_BARRIER) {
@@ -1002,6 +1009,36 @@ xfs_file_open(
return -EFBIG;
if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
return -EIO;
+
+ if (file->f_flags & O_ATOMIC) {
+ struct dentry *parent;
+ struct xfs_inode *clone;
+ int error;
+
+ if (XFS_IS_REALTIME_INODE(XFS_I(inode)))
+ return -EINVAL;
+
+ /* XXX: also need to prevent setting O_DIRECT using fcntl. */
+ if (file->f_flags & O_DIRECT)
+ return -EINVAL;
+
+ error = filemap_write_and_wait(inode->i_mapping);
+ if (error)
+ return error;
+
+ parent = dget_parent(file->f_path.dentry);
+ error = xfs_create_tmpfile(XFS_I(parent->d_inode), NULL,
+ inode->i_mode, &clone);
+ dput(parent);
+
+ if (error)
+ return error;
+
+ VFS_I(clone)->i_size = inode->i_size;
+ clone->i_cow = XFS_I(inode);
+ file->f_mapping = VFS_I(clone)->i_mapping;
+ xfs_finish_inode_setup(clone);
+ }
return 0;
}
@@ -1032,8 +1069,14 @@ xfs_dir_open(
STATIC int
xfs_file_release(
struct inode *inode,
- struct file *filp)
+ struct file *file)
{
+ if (file->f_mapping->host != inode) {
+ XFS_I(file->f_mapping->host)->i_cow = NULL;
+ IRELE(XFS_I(file->f_mapping->host));
+ return 0;
+ }
+
return xfs_release(XFS_I(inode));
}
@@ -80,6 +80,7 @@ xfs_inode_alloc(
ip->i_flags = 0;
ip->i_delayed_blks = 0;
memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+ ip->i_cow = NULL;
return ip;
}
@@ -52,6 +52,8 @@ typedef struct xfs_inode {
/* operations vectors */
const struct xfs_dir_ops *d_ops; /* directory ops vector */
+ struct xfs_inode *i_cow;
+
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
@@ -268,6 +268,13 @@ xfs_iomap_eof_want_preallocate(
return 0;
/*
+ * Don't preallocate if this is a clone for an O_ATOMIC open, as we'd
+ * overwrite space in the original file with garbage on a commit.
+ */
+ if (ip->i_cow)
+ return 0;
+
+ /*
* If the file is smaller than the minimum prealloc and we are using
* dynamic preallocation, don't do any preallocation at all as it is
* likely this is the only write to the file that is going to be done.
@@ -92,6 +92,8 @@
#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT)
+#define O_ATOMIC 040000000
+
#ifndef O_NDELAY
#define O_NDELAY O_NONBLOCK
#endif
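
For reference, a rough sketch of how an application might drive the new flag,
based on the open/fsync/release hooks above. This is illustrative only and not
part of the patch; the helper name and error handling are made up:

	#include <fcntl.h>
	#include <unistd.h>

	#ifndef O_ATOMIC
	#define O_ATOMIC 040000000	/* matches the definition added above */
	#endif

	/* Update a region of "path" so either all or none of the update is visible. */
	int update_atomically(const char *path, const void *buf, size_t len, off_t off)
	{
		/* Writes are staged in a hidden O_TMPFILE clone of the file. */
		int fd = open(path, O_RDWR | O_ATOMIC);
		if (fd < 0)
			return -1;

		if (pwrite(fd, buf, len, off) != (ssize_t)len) {
			close(fd);	/* closing without fsync discards the staged writes */
			return -1;
		}

		/* fsync commits the clone's extents back into the original file. */
		if (fsync(fd) < 0) {
			close(fd);
			return -1;
		}
		return close(fd);
	}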