diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index 746ea60eed3f..acdd4b65964c 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -550,6 +550,10 @@ For zoned file systems, the following attributes are exposed in: is limited by the capabilities of the backing zoned device, file system size and the max_open_zones mount option. + nr_open_zones (Min: 0 Default: Varies Max: UINTMAX) + This read-only attribute exposes the current number of open zones + used by the file system. + zonegc_low_space (Min: 0 Default: 0 Max: 100) Define a percentage for how much of the unused space that GC should keep available for writing. A high value will reclaim more of the space diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index e4b6886e5c3c..d7b648421a70 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1647,16 +1647,12 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, while ((ret = iomap_iter(&iter, ops)) > 0) { const struct iomap *srcmap = iomap_iter_srcmap(&iter); - if (WARN_ON_ONCE((iter.iomap.flags & IOMAP_F_FOLIO_BATCH) && - srcmap->type != IOMAP_UNWRITTEN)) - return -EIO; - if (!(iter.iomap.flags & IOMAP_F_FOLIO_BATCH) && (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)) { s64 status; - if (range_dirty) { + if (range_dirty && srcmap->type == IOMAP_UNWRITTEN) { range_dirty = false; status = iomap_zero_iter_flush_and_stale(&iter); } else { diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index bd8fbb40b49e..dcd2f93b6a6c 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -110,10 +110,7 @@ xfs_perag_uninit( struct xfs_group *xg) { #ifdef __KERNEL__ - struct xfs_perag *pag = to_perag(xg); - - cancel_delayed_work_sync(&pag->pag_blockgc_work); - xfs_buf_cache_destroy(&pag->pag_bcache); + cancel_delayed_work_sync(&to_perag(xg)->pag_blockgc_work); #endif } @@ -235,10 +232,6 @@ xfs_perag_alloc( 
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); #endif /* __KERNEL__ */ - error = xfs_buf_cache_init(&pag->pag_bcache); - if (error) - goto out_free_perag; - /* * Pre-calculated geometry */ @@ -250,12 +243,10 @@ xfs_perag_alloc( error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG); if (error) - goto out_buf_cache_destroy; + goto out_free_perag; return 0; -out_buf_cache_destroy: - xfs_buf_cache_destroy(&pag->pag_bcache); out_free_perag: kfree(pag); return error; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 3cd4790768ff..16a9b43a3c27 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -85,8 +85,6 @@ struct xfs_perag { int pag_ici_reclaimable; /* reclaimable inodes */ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ - struct xfs_buf_cache pag_bcache; - /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; #endif /* __KERNEL__ */ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index d165de607d17..185f09f327c0 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -995,7 +995,8 @@ struct xfs_rtgroup_geometry { __u32 rg_sick; /* o: sick things in ag */ __u32 rg_checked; /* o: checked metadata in ag */ __u32 rg_flags; /* i/o: flags for this ag */ - __u32 rg_reserved[27]; /* o: zero */ + __u32 rg_writepointer; /* o: write pointer block offset for zoned */ + __u32 rg_reserved[26]; /* o: zero */ }; #define XFS_RTGROUP_GEOM_SICK_SUPER (1U << 0) /* superblock */ #define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */ @@ -1003,6 +1004,8 @@ struct xfs_rtgroup_geometry { #define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */ #define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */ +#define XFS_RTGROUP_GEOM_WRITEPOINTER (1U << 0) /* write pointer */ + /* Health monitor event domains */ /* affects the whole fs */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d2f3c50d80e7..580d40a5ee57 100644 --- a/fs/xfs/xfs_buf.c +++ 
b/fs/xfs/xfs_buf.c @@ -31,20 +31,20 @@ struct kmem_cache *xfs_buf_cache; * * xfs_buf_stale: * b_sema (caller holds) - * b_lock + * b_lockref.lock * lru_lock * * xfs_buf_rele: - * b_lock + * b_lockref.lock * lru_lock * * xfs_buftarg_drain_rele * lru_lock - * b_lock (trylock due to inversion) + * b_lockref.lock (trylock due to inversion) * * xfs_buftarg_isolate * lru_lock - * b_lock (trylock due to inversion) + * b_lockref.lock (trylock due to inversion) */ static void xfs_buf_submit(struct xfs_buf *bp); @@ -78,14 +78,11 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; - spin_lock(&bp->b_lock); + spin_lock(&bp->b_lockref.lock); atomic_set(&bp->b_lru_ref, 0); - if (!(bp->b_state & XFS_BSTATE_DISPOSE) && - (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) - bp->b_hold--; - - ASSERT(bp->b_hold >= 1); - spin_unlock(&bp->b_lock); + if (!__lockref_is_dead(&bp->b_lockref)) + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); + spin_unlock(&bp->b_lockref.lock); } static void @@ -277,10 +274,8 @@ xfs_buf_alloc( * inserting into the hash table are safe (and will have to wait for * the unlock to do anything non-trivial). 
*/ - bp->b_hold = 1; + lockref_init(&bp->b_lockref); sema_init(&bp->b_sema, 0); /* held, no waiters */ - - spin_lock_init(&bp->b_lock); atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); @@ -368,20 +363,6 @@ static const struct rhashtable_params xfs_buf_hash_params = { .obj_cmpfn = _xfs_buf_obj_cmp, }; -int -xfs_buf_cache_init( - struct xfs_buf_cache *bch) -{ - return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params); -} - -void -xfs_buf_cache_destroy( - struct xfs_buf_cache *bch) -{ - rhashtable_destroy(&bch->bc_hash); -} - static int xfs_buf_map_verify( struct xfs_buftarg *btp, @@ -437,23 +418,9 @@ xfs_buf_find_lock( return 0; } -static bool -xfs_buf_try_hold( - struct xfs_buf *bp) -{ - spin_lock(&bp->b_lock); - if (bp->b_hold == 0) { - spin_unlock(&bp->b_lock); - return false; - } - bp->b_hold++; - spin_unlock(&bp->b_lock); - return true; -} - static inline int xfs_buf_lookup( - struct xfs_buf_cache *bch, + struct xfs_buftarg *btp, struct xfs_buf_map *map, xfs_buf_flags_t flags, struct xfs_buf **bpp) @@ -462,8 +429,8 @@ xfs_buf_lookup( int error; rcu_read_lock(); - bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params); - if (!bp || !xfs_buf_try_hold(bp)) { + bp = rhashtable_lookup(&btp->bt_hash, map, xfs_buf_hash_params); + if (!bp || !lockref_get_not_dead(&bp->b_lockref)) { rcu_read_unlock(); return -ENOENT; } @@ -487,7 +454,6 @@ xfs_buf_lookup( static int xfs_buf_find_insert( struct xfs_buftarg *btp, - struct xfs_buf_cache *bch, struct xfs_perag *pag, struct xfs_buf_map *cmap, struct xfs_buf_map *map, @@ -507,14 +473,14 @@ xfs_buf_find_insert( new_bp->b_pag = pag; rcu_read_lock(); - bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, + bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash, &new_bp->b_rhash_head, xfs_buf_hash_params); if (IS_ERR(bp)) { rcu_read_unlock(); error = PTR_ERR(bp); goto out_free_buf; } - if (bp && xfs_buf_try_hold(bp)) { + if (bp && lockref_get_not_dead(&bp->b_lockref)) { /* found 
an existing buffer */ rcu_read_unlock(); error = xfs_buf_find_lock(bp, flags); @@ -549,16 +515,6 @@ xfs_buftarg_get_pag( return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn)); } -static inline struct xfs_buf_cache * -xfs_buftarg_buf_cache( - struct xfs_buftarg *btp, - struct xfs_perag *pag) -{ - if (pag) - return &pag->pag_bcache; - return btp->bt_cache; -} - /* * Assembles a buffer covering the specified range. The code is optimised for * cache hits, as metadata intensive workloads will see 3 orders of magnitude @@ -572,7 +528,6 @@ xfs_buf_get_map( xfs_buf_flags_t flags, struct xfs_buf **bpp) { - struct xfs_buf_cache *bch; struct xfs_perag *pag; struct xfs_buf *bp = NULL; struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; @@ -589,9 +544,8 @@ xfs_buf_get_map( return error; pag = xfs_buftarg_get_pag(btp, &cmap); - bch = xfs_buftarg_buf_cache(btp, pag); - error = xfs_buf_lookup(bch, &cmap, flags, &bp); + error = xfs_buf_lookup(btp, &cmap, flags, &bp); if (error && error != -ENOENT) goto out_put_perag; @@ -603,7 +557,7 @@ xfs_buf_get_map( goto out_put_perag; /* xfs_buf_find_insert() consumes the perag reference. 
*/ - error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps, + error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps, flags, &bp); if (error) return error; @@ -856,84 +810,27 @@ xfs_buf_hold( { trace_xfs_buf_hold(bp, _RET_IP_); - spin_lock(&bp->b_lock); - bp->b_hold++; - spin_unlock(&bp->b_lock); + lockref_get(&bp->b_lockref); } static void -xfs_buf_rele_uncached( +xfs_buf_destroy( struct xfs_buf *bp) { - ASSERT(list_empty(&bp->b_lru)); + ASSERT(__lockref_is_dead(&bp->b_lockref)); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - spin_lock(&bp->b_lock); - if (--bp->b_hold) { - spin_unlock(&bp->b_lock); - return; + if (!xfs_buf_is_uncached(bp)) { + rhashtable_remove_fast(&bp->b_target->bt_hash, + &bp->b_rhash_head, xfs_buf_hash_params); + + if (bp->b_pag) + xfs_perag_put(bp->b_pag); } - spin_unlock(&bp->b_lock); + xfs_buf_free(bp); } -static void -xfs_buf_rele_cached( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - struct xfs_perag *pag = bp->b_pag; - struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag); - bool freebuf = false; - - trace_xfs_buf_rele(bp, _RET_IP_); - - spin_lock(&bp->b_lock); - ASSERT(bp->b_hold >= 1); - if (bp->b_hold > 1) { - bp->b_hold--; - goto out_unlock; - } - - /* we are asked to drop the last reference */ - if (atomic_read(&bp->b_lru_ref)) { - /* - * If the buffer is added to the LRU, keep the reference to the - * buffer for the LRU and clear the (now stale) dispose list - * state flag, else drop the reference. 
- */ - if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) - bp->b_state &= ~XFS_BSTATE_DISPOSE; - else - bp->b_hold--; - } else { - bp->b_hold--; - /* - * most of the time buffers will already be removed from the - * LRU, so optimise that case by checking for the - * XFS_BSTATE_DISPOSE flag indicating the last list the buffer - * was on was the disposal list - */ - if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del_obj(&btp->bt_lru, &bp->b_lru); - } else { - ASSERT(list_empty(&bp->b_lru)); - } - - ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head, - xfs_buf_hash_params); - if (pag) - xfs_perag_put(pag); - freebuf = true; - } - -out_unlock: - spin_unlock(&bp->b_lock); - - if (freebuf) - xfs_buf_free(bp); -} - /* * Release a hold on the specified buffer. */ @@ -942,10 +839,23 @@ xfs_buf_rele( struct xfs_buf *bp) { trace_xfs_buf_rele(bp, _RET_IP_); - if (xfs_buf_is_uncached(bp)) - xfs_buf_rele_uncached(bp); - else - xfs_buf_rele_cached(bp); + + if (lockref_put_or_lock(&bp->b_lockref)) + return; + if (!--bp->b_lockref.count) { + if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref)) + goto kill; + list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru); + } + spin_unlock(&bp->b_lockref.lock); + return; + +kill: + lockref_mark_dead(&bp->b_lockref); + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); + spin_unlock(&bp->b_lockref.lock); + + xfs_buf_destroy(bp); } /* @@ -1254,9 +1164,11 @@ xfs_buf_ioerror_alert( /* * To simulate an I/O failure, the buffer must be locked and held with at least - * three references. The LRU reference is dropped by the stale call. The buf - * item reference is dropped via ioend processing. The third reference is owned - * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC. + * two references. + * + * The buf item reference is dropped via ioend processing. 
The second reference + * is owned by the caller and is dropped on I/O completion if the buffer is + * XBF_ASYNC. */ void xfs_buf_ioend_fail( @@ -1512,23 +1424,18 @@ xfs_buftarg_drain_rele( struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); struct list_head *dispose = arg; - if (!spin_trylock(&bp->b_lock)) + if (!spin_trylock(&bp->b_lockref.lock)) return LRU_SKIP; - if (bp->b_hold > 1) { + if (bp->b_lockref.count > 0) { /* need to wait, so skip it this pass */ - spin_unlock(&bp->b_lock); + spin_unlock(&bp->b_lockref.lock); trace_xfs_buf_drain_buftarg(bp, _RET_IP_); return LRU_SKIP; } - /* - * clear the LRU reference count so the buffer doesn't get - * ignored in xfs_buf_rele(). - */ - atomic_set(&bp->b_lru_ref, 0); - bp->b_state |= XFS_BSTATE_DISPOSE; + lockref_mark_dead(&bp->b_lockref); list_lru_isolate_move(lru, item, dispose); - spin_unlock(&bp->b_lock); + spin_unlock(&bp->b_lockref.lock); return LRU_REMOVED; } @@ -1581,7 +1488,7 @@ xfs_buftarg_drain( "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", (long long)xfs_buf_daddr(bp)); } - xfs_buf_rele(bp); + xfs_buf_destroy(bp); } if (loop++ != 0) delay(100); @@ -1610,24 +1517,37 @@ xfs_buftarg_isolate( struct list_head *dispose = arg; /* - * we are inverting the lru lock/bp->b_lock here, so use a trylock. - * If we fail to get the lock, just skip it. + * We are inverting the lru lock vs bp->b_lockref.lock order here, so + * use a trylock. If we fail to get the lock, just skip the buffer. */ - if (!spin_trylock(&bp->b_lock)) + if (!spin_trylock(&bp->b_lockref.lock)) return LRU_SKIP; + + /* + * If the buffer is in use, remove it from the LRU for now. We can't + * free it while someone is using it, and we should also not count + * eviction passes for it, just as if it hadn't been added to the LRU + * yet. 
+ */ + if (bp->b_lockref.count > 0) { + list_lru_isolate(lru, &bp->b_lru); + spin_unlock(&bp->b_lockref.lock); + return LRU_REMOVED; + } + /* * Decrement the b_lru_ref count unless the value is already * zero. If the value is already zero, we need to reclaim the * buffer, otherwise it gets another trip through the LRU. */ if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) { - spin_unlock(&bp->b_lock); + spin_unlock(&bp->b_lockref.lock); return LRU_ROTATE; } - bp->b_state |= XFS_BSTATE_DISPOSE; + lockref_mark_dead(&bp->b_lockref); list_lru_isolate_move(lru, item, dispose); - spin_unlock(&bp->b_lock); + spin_unlock(&bp->b_lockref.lock); return LRU_REMOVED; } @@ -1647,7 +1567,7 @@ xfs_buftarg_shrink_scan( struct xfs_buf *bp; bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); - xfs_buf_rele(bp); + xfs_buf_destroy(bp); } return freed; @@ -1670,6 +1590,7 @@ xfs_destroy_buftarg( ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); percpu_counter_destroy(&btp->bt_readahead_count); list_lru_destroy(&btp->bt_lru); + rhashtable_destroy(&btp->bt_hash); } void @@ -1764,8 +1685,10 @@ xfs_init_buftarg( ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, DEFAULT_RATELIMIT_BURST); - if (list_lru_init(&btp->bt_lru)) + if (rhashtable_init(&btp->bt_hash, &xfs_buf_hash_params)) return -ENOMEM; + if (list_lru_init(&btp->bt_lru)) + goto out_destroy_hash; if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL)) goto out_destroy_lru; @@ -1783,6 +1706,8 @@ out_destroy_io_count: percpu_counter_destroy(&btp->bt_readahead_count); out_destroy_lru: list_lru_destroy(&btp->bt_lru); +out_destroy_hash: + rhashtable_destroy(&btp->bt_hash); return -ENOMEM; } @@ -1831,6 +1756,7 @@ xfs_alloc_buftarg( return btp; error_free: + fs_put_dax(btp->bt_daxdev, mp); kfree(btp); return ERR_PTR(error); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index e25cd2a160f3..bf39d89f0f6d 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -14,6 +14,7 @@ #include #include 
#include +#include extern struct kmem_cache *xfs_buf_cache; @@ -68,18 +69,6 @@ typedef unsigned int xfs_buf_flags_t; { XBF_INCORE, "INCORE" }, \ { XBF_TRYLOCK, "TRYLOCK" } -/* - * Internal state flags. - */ -#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ - -struct xfs_buf_cache { - struct rhashtable bc_hash; -}; - -int xfs_buf_cache_init(struct xfs_buf_cache *bch); -void xfs_buf_cache_destroy(struct xfs_buf_cache *bch); - /* * The xfs_buftarg contains 2 notions of "sector size" - * @@ -117,8 +106,7 @@ struct xfs_buftarg { unsigned int bt_awu_min; unsigned int bt_awu_max; - /* built-in cache, if we're not using the perag one */ - struct xfs_buf_cache bt_cache[]; + struct rhashtable bt_hash; }; struct xfs_buf_map { @@ -159,7 +147,7 @@ struct xfs_buf { xfs_daddr_t b_rhash_key; /* buffer cache index */ int b_length; /* size of buffer in BBs */ - unsigned int b_hold; /* reference count */ + struct lockref b_lockref; /* refcount + lock */ atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ @@ -169,8 +157,6 @@ struct xfs_buf { * bt_lru_lock and not by b_sema */ struct list_head b_lru; /* lru list */ - spinlock_t b_lock; /* internal state lock */ - unsigned int b_state; /* internal state flags */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index b0b3696bf599..b2fd7276b131 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -58,7 +58,7 @@ xmbuf_alloc( struct xfs_buftarg *btp; int error; - btp = kzalloc_flex(*btp, bt_cache, 1); + btp = kzalloc_obj(*btp); if (!btp) return -ENOMEM; @@ -81,10 +81,6 @@ xmbuf_alloc( /* ensure all writes are below EOF to avoid pagecache zeroing */ i_size_write(inode, inode->i_sb->s_maxbytes); - error = xfs_buf_cache_init(btp->bt_cache); - if (error) - goto out_file; - /* Initialize buffer target */ btp->bt_mount = 
mp; btp->bt_dev = (dev_t)-1U; @@ -95,15 +91,13 @@ xmbuf_alloc( error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr); if (error) - goto out_bcache; + goto out_file; trace_xmbuf_create(btp); *btpp = btp; return 0; -out_bcache: - xfs_buf_cache_destroy(btp->bt_cache); out_file: fput(file); out_free_btp: @@ -122,7 +116,6 @@ xmbuf_free( trace_xmbuf_free(btp); xfs_destroy_buftarg(btp); - xfs_buf_cache_destroy(btp->bt_cache); fput(btp->bt_file); kfree(btp); } diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 3efdca3d675b..41cf0605ec22 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -690,9 +690,9 @@ xfs_extent_busy_ag_cmp( container_of(l2, struct xfs_extent_busy, list); s32 diff; - diff = b1->group->xg_gno - b2->group->xg_gno; + diff = cmp_int(b1->group->xg_gno, b2->group->xg_gno); if (!diff) - diff = b1->bno - b2->bno; + diff = cmp_int(b1->bno, b2->bno); return diff; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 749a4eb9793c..2266d56e37dc 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -387,7 +387,7 @@ xfs_extent_free_diff_items( struct xfs_extent_free_item *ra = xefi_entry(a); struct xfs_extent_free_item *rb = xefi_entry(b); - return ra->xefi_group->xg_gno - rb->xefi_group->xg_gno; + return cmp_int(ra->xefi_group->xg_gno, rb->xefi_group->xg_gno); } /* Log a free extent to the intent item. */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 6246f34df9fd..845a97c9b063 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -560,6 +560,72 @@ xfs_zoned_write_space_reserve( flags, ac); } +/* + * We need to lock the test/set EOF update as we can be racing with + * other IO completions here to update the EOF. Failing to serialise + * here can result in EOF moving backwards and Bad Things Happen when + * that occurs. + * + * As IO completion only ever extends EOF, we can do an unlocked check + * here to avoid taking the spinlock. 
If we land within the current EOF, + * then we do not need to do an extending update at all, and we don't + * need to take the lock to check this. If we race with an update moving + * EOF, then we'll either still be beyond EOF and need to take the lock, + * or we'll be within EOF and we don't need to take it at all. + */ +static int +xfs_dio_endio_set_isize( + struct inode *inode, + loff_t offset, + ssize_t size) +{ + struct xfs_inode *ip = XFS_I(inode); + + if (offset + size <= i_size_read(inode)) + return 0; + + spin_lock(&ip->i_flags_lock); + if (offset + size <= i_size_read(inode)) { + spin_unlock(&ip->i_flags_lock); + return 0; + } + + i_size_write(inode, offset + size); + spin_unlock(&ip->i_flags_lock); + + return xfs_setfilesize(ip, offset, size); +} + +static int +xfs_zoned_dio_write_end_io( + struct kiocb *iocb, + ssize_t size, + int error, + unsigned flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + unsigned int nofs_flag; + + ASSERT(!(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); + + trace_xfs_end_io_direct_write(ip, iocb->ki_pos, size); + + if (xfs_is_shutdown(ip->i_mount)) + return -EIO; + + if (error || !size) + return error; + + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size); + + nofs_flag = memalloc_nofs_save(); + error = xfs_dio_endio_set_isize(inode, iocb->ki_pos, size); + memalloc_nofs_restore(nofs_flag); + + return error; +} + static int xfs_dio_write_end_io( struct kiocb *iocb, @@ -572,8 +638,7 @@ xfs_dio_write_end_io( loff_t offset = iocb->ki_pos; unsigned int nofs_flag; - ASSERT(!xfs_is_zoned_inode(ip) || - !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); + ASSERT(!xfs_is_zoned_inode(ip)); trace_xfs_end_io_direct_write(ip, offset, size); @@ -623,30 +688,8 @@ xfs_dio_write_end_io( * with the on-disk inode size being outside the in-core inode size. We * have no other method of updating EOF for AIO, so always do it here * if necessary. 
- * - * We need to lock the test/set EOF update as we can be racing with - * other IO completions here to update the EOF. Failing to serialise - * here can result in EOF moving backwards and Bad Things Happen when - * that occurs. - * - * As IO completion only ever extends EOF, we can do an unlocked check - * here to avoid taking the spinlock. If we land within the current EOF, - * then we do not need to do an extending update at all, and we don't - * need to take the lock to check this. If we race with an update moving - * EOF, then we'll either still be beyond EOF and need to take the lock, - * or we'll be within EOF and we don't need to take it at all. */ - if (offset + size <= i_size_read(inode)) - goto out; - - spin_lock(&ip->i_flags_lock); - if (offset + size > i_size_read(inode)) { - i_size_write(inode, offset + size); - spin_unlock(&ip->i_flags_lock); - error = xfs_setfilesize(ip, offset, size); - } else { - spin_unlock(&ip->i_flags_lock); - } + error = xfs_dio_endio_set_isize(inode, offset, size); out: memalloc_nofs_restore(nofs_flag); @@ -688,7 +731,7 @@ xfs_dio_zoned_submit_io( static const struct iomap_dio_ops xfs_dio_zoned_write_ops = { .bio_set = &iomap_ioend_bioset, .submit_io = xfs_dio_zoned_submit_io, - .end_io = xfs_dio_write_end_io, + .end_io = xfs_zoned_dio_write_end_io, }; /* @@ -1263,6 +1306,23 @@ xfs_falloc_insert_range( if (offset >= isize) return -EINVAL; + /* + * Let writeback clean up EOF folio state before we bump i_size. The + * insert flushes before it starts shifting and under certain + * circumstances we can write back blocks that should technically be + * considered post-eof (and thus should not be submitted for writeback). + * + * For example, a large, dirty folio that spans EOF and is backed by + * post-eof COW fork preallocation can cause block remap into the data + * fork. This shifts back out beyond EOF, but creates an unexpectedly + * written post-eof block. 
The insert is going to flush, unmap and + * cancel prealloc across this whole range, so flush EOF now before we + * bump i_size to provide consistent behavior. + */ + error = filemap_write_and_wait_range(inode->i_mapping, isize, isize); + if (error) + return error; + error = xfs_falloc_setsize(file, isize + len); if (error) return error; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index facffdc8dca8..46e234863644 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -37,12 +37,15 @@ #include "xfs_ioctl.h" #include "xfs_xattr.h" #include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" #include "xfs_file.h" #include "xfs_exchrange.h" #include "xfs_handle.h" #include "xfs_rtgroup.h" #include "xfs_healthmon.h" #include "xfs_verify_media.h" +#include "xfs_zone_priv.h" +#include "xfs_zone_alloc.h" #include #include @@ -413,6 +416,7 @@ xfs_ioc_rtgroup_geometry( { struct xfs_rtgroup *rtg; struct xfs_rtgroup_geometry rgeo; + xfs_rgblock_t highest_rgbno; int error; if (copy_from_user(&rgeo, arg, sizeof(rgeo))) @@ -433,6 +437,21 @@ xfs_ioc_rtgroup_geometry( if (error) return error; + if (xfs_has_zoned(mp)) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + if (rtg->rtg_open_zone) { + rgeo.rg_writepointer = rtg->rtg_open_zone->oz_allocated; + } else { + highest_rgbno = xfs_rtrmap_highest_rgbno(rtg); + if (highest_rgbno == NULLRGBLOCK) + rgeo.rg_writepointer = 0; + else + rgeo.rg_writepointer = highest_rgbno + 1; + } + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + rgeo.rg_flags |= XFS_RTGROUP_GEOM_WRITEPOINTER; + } + if (copy_to_user(arg, &rgeo, sizeof(rgeo))) return -EFAULT; return 0; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 9c2f12d5fec9..f20a02f49ed9 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1593,6 +1593,7 @@ xfs_zoned_buffered_write_iomap_begin( { struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); + struct address_space *mapping = inode->i_mapping; struct xfs_zone_alloc_ctx *ac = iter->private; struct 
xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1617,6 +1618,7 @@ xfs_zoned_buffered_write_iomap_begin( if (error) return error; +restart: error = xfs_ilock_for_iomap(ip, flags, &lockmode); if (error) return error; @@ -1654,14 +1656,6 @@ xfs_zoned_buffered_write_iomap_begin( &smap)) smap.br_startoff = end_fsb; /* fake hole until EOF */ if (smap.br_startoff > offset_fsb) { - /* - * We never need to allocate blocks for zeroing a hole. - */ - if (flags & IOMAP_ZERO) { - xfs_hole_to_iomap(ip, iomap, offset_fsb, - smap.br_startoff); - goto out_unlock; - } end_fsb = min(end_fsb, smap.br_startoff); } else { end_fsb = min(end_fsb, @@ -1693,6 +1687,33 @@ xfs_zoned_buffered_write_iomap_begin( count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN, XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE)); + /* + * When zeroing, don't allocate blocks for holes as they are already + * zeroes, but we need to ensure that no extents exist in both the data + * and COW fork to ensure this really is a hole. + * + * A window exists where we might observe a hole in both forks with + * valid data in cache. Writeback removes the COW fork blocks on + * submission but doesn't remap into the data fork until completion. If + * the data fork was previously a hole, we'll fail to zero. Until we + * find a way to avoid this transient state, check for dirty pagecache + * and flush to wait on blocks to land in the data fork. 
+ */ + if ((flags & IOMAP_ZERO) && srcmap->type == IOMAP_HOLE) { + if (filemap_range_needs_writeback(mapping, offset, + offset + count - 1)) { + xfs_iunlock(ip, lockmode); + error = filemap_write_and_wait_range(mapping, offset, + offset + count - 1); + if (error) + return error; + goto restart; + } + + xfs_hole_to_iomap(ip, iomap, offset_fsb, end_fsb); + goto out_unlock; + } + /* * The block reservation is supposed to cover all blocks that the * operation could possible write, but there is a nasty corner case @@ -1767,6 +1788,8 @@ xfs_buffered_write_iomap_begin( struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); + xfs_fileoff_t cow_fsb = NULLFILEOFF; + xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); struct xfs_bmbt_irec imap, cmap; struct xfs_iext_cursor icur, ccur; xfs_fsblock_t prealloc_blocks = 0; @@ -1811,30 +1834,96 @@ xfs_buffered_write_iomap_begin( goto out_unlock; /* - * Search the data fork first to look up our source mapping. We - * always need the data fork map, as we have to return it to the - * iomap code so that the higher level write code can read data in to - * perform read-modify-write cycles for unaligned writes. + * Search the data fork first to look up our source mapping. We always + * need the data fork map, as we have to return it to the iomap code so + * that the higher level write code can read data in to perform + * read-modify-write cycles for unaligned writes. + * + * Then search the COW fork extent list even if we did not find a data + * fork extent. This serves two purposes: first this implements the + * speculative preallocation using cowextsize, so that we also unshare + * block adjacent to shared blocks instead of just the shared blocks + * themselves. Second the lookup in the extent list is generally faster + * than going out to the shared extent tree. 
*/ eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); if (eof) imap.br_startoff = end_fsb; /* fake hole until the end */ + if (xfs_is_cow_inode(ip)) { + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, + &ccur, &cmap); + if (!cow_eof) + cow_fsb = cmap.br_startoff; + } - /* We never need to allocate blocks for zeroing or unsharing a hole. */ - if ((flags & (IOMAP_UNSHARE | IOMAP_ZERO)) && - imap.br_startoff > offset_fsb) { + /* We never need to allocate blocks for unsharing a hole. */ + if ((flags & IOMAP_UNSHARE) && imap.br_startoff > offset_fsb) { xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); goto out_unlock; } + /* + * We may need to zero over a hole in the data fork if it's fronted by + * COW blocks and dirty pagecache. Scan such file ranges for dirty + * cache and fill the iomap batch with folios that need zeroing. + */ + if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) { + loff_t start, end; + unsigned int fbatch_count; + + imap.br_blockcount = imap.br_startoff - offset_fsb; + imap.br_startoff = offset_fsb; + imap.br_startblock = HOLESTARTBLOCK; + imap.br_state = XFS_EXT_NORM; + + if (cow_fsb == NULLFILEOFF) + goto found_imap; + if (cow_fsb > offset_fsb) { + xfs_trim_extent(&imap, offset_fsb, + cow_fsb - offset_fsb); + goto found_imap; + } + + /* no zeroing beyond eof, so split at the boundary */ + if (offset_fsb >= eof_fsb) + goto found_imap; + if (offset_fsb < eof_fsb && end_fsb > eof_fsb) + xfs_trim_extent(&imap, offset_fsb, + eof_fsb - offset_fsb); + + /* COW fork blocks overlap the hole */ + xfs_trim_extent(&imap, offset_fsb, + cmap.br_startoff + cmap.br_blockcount - offset_fsb); + start = XFS_FSB_TO_B(mp, imap.br_startoff); + end = XFS_FSB_TO_B(mp, imap.br_startoff + imap.br_blockcount); + fbatch_count = iomap_fill_dirty_folios(iter, &start, end, + &iomap_flags); + xfs_trim_extent(&imap, offset_fsb, + 
XFS_B_TO_FSB(mp, start) - offset_fsb); + + /* + * Report the COW mapping if we have folios to zero. Otherwise + * ignore the COW blocks as preallocation and report a hole. + */ + if (fbatch_count) { + xfs_trim_extent(&cmap, imap.br_startoff, + imap.br_blockcount); + imap.br_startoff = end_fsb; /* fake hole */ + goto found_cow; + } + goto found_imap; + } + /* * For zeroing, trim extents that extend beyond the EOF block. If a * delalloc extent starts beyond the EOF block, convert it to an * unwritten extent. */ if (flags & IOMAP_ZERO) { - xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); - if (isnullstartblock(imap.br_startblock) && offset_fsb >= eof_fsb) goto convert_delay; @@ -1867,24 +1956,13 @@ xfs_buffered_write_iomap_begin( } /* - * Search the COW fork extent list even if we did not find a data fork - * extent. This serves two purposes: first this implements the - * speculative preallocation using cowextsize, so that we also unshare - * block adjacent to shared blocks instead of just the shared blocks - * themselves. Second the lookup in the extent list is generally faster - * than going out to the shared extent tree. + * Now that we've handled any operation specific special cases, at this + * point we can report a COW mapping if found. */ - if (xfs_is_cow_inode(ip)) { - if (!ip->i_cowfp) { - ASSERT(!xfs_is_reflink_inode(ip)); - xfs_ifork_init_cow(ip); - } - cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, - &ccur, &cmap); - if (!cow_eof && cmap.br_startoff <= offset_fsb) { - trace_xfs_reflink_cow_found(ip, &cmap); - goto found_cow; - } + if (xfs_is_cow_inode(ip) && + !cow_eof && cmap.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &cmap); + goto found_cow; } if (imap.br_startoff <= offset_fsb) { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 208543e57eda..325c2200c501 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -901,20 +901,18 @@ out_dqrele: /* * Truncate file. 
Must have write permission and not be a directory. - * - * Caution: The caller of this function is responsible for calling - * setattr_prepare() or otherwise verifying the change is fine. */ -STATIC int -xfs_setattr_size( +int +xfs_vn_setattr_size( struct mnt_idmap *idmap, struct dentry *dentry, - struct xfs_inode *ip, struct iattr *iattr) { + struct inode *inode = d_inode(dentry); + struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - struct inode *inode = VFS_I(ip); - xfs_off_t oldsize, newsize; + xfs_off_t oldsize = inode->i_size; + xfs_off_t newsize = iattr->ia_size; struct xfs_trans *tp; int error; uint lock_flags = 0; @@ -927,8 +925,11 @@ xfs_setattr_size( ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0); - oldsize = inode->i_size; - newsize = iattr->ia_size; + trace_xfs_setattr(ip); + + error = xfs_vn_change_ok(idmap, dentry, iattr); + if (error) + return error; /* * Short circuit the truncate case for zero length files. 
@@ -1109,7 +1110,6 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } - ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); setattr_copy(idmap, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1129,23 +1129,6 @@ out_trans_cancel: goto out_unlock; } -int -xfs_vn_setattr_size( - struct mnt_idmap *idmap, - struct dentry *dentry, - struct iattr *iattr) -{ - struct xfs_inode *ip = XFS_I(d_inode(dentry)); - int error; - - trace_xfs_setattr(ip); - - error = xfs_vn_change_ok(idmap, dentry, iattr); - if (error) - return error; - return xfs_setattr_size(idmap, dentry, ip, iattr); -} - STATIC int xfs_vn_setattr( struct mnt_idmap *idmap, diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index ef1ea8a1238c..b24195f570cd 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -44,17 +44,36 @@ #include "xfs_healthmon.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); -static int xfs_uuid_table_size; -static uuid_t *xfs_uuid_table; +static DEFINE_XARRAY_ALLOC(xfs_uuid_table); + +static uuid_t * +xfs_uuid_search( + uuid_t *new_uuid) +{ + unsigned long index = 0; + uuid_t *uuid; + + xa_for_each(&xfs_uuid_table, index, uuid) { + if (uuid_equal(uuid, new_uuid)) + return uuid; + } + return NULL; +} + +static void +xfs_uuid_delete( + uuid_t *uuid, + unsigned int index) +{ + ASSERT(uuid_equal(xa_load(&xfs_uuid_table, index), uuid)); + xa_erase(&xfs_uuid_table, index); +} void xfs_uuid_table_free(void) { - if (xfs_uuid_table_size == 0) - return; - kfree(xfs_uuid_table); - xfs_uuid_table = NULL; - xfs_uuid_table_size = 0; + ASSERT(xa_empty(&xfs_uuid_table)); + xa_destroy(&xfs_uuid_table); } /* @@ -66,7 +85,7 @@ xfs_uuid_mount( struct xfs_mount *mp) { uuid_t *uuid = &mp->m_sb.sb_uuid; - int hole, i; + int ret; /* Publish UUID in struct super_block */ super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid)); @@ -80,30 +99,17 @@ xfs_uuid_mount( } mutex_lock(&xfs_uuid_table_mutex); - for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) { - if 
(uuid_is_null(&xfs_uuid_table[i])) { - hole = i; - continue; - } - if (uuid_equal(uuid, &xfs_uuid_table[i])) - goto out_duplicate; + if (unlikely(xfs_uuid_search(uuid))) { + xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", + uuid); + mutex_unlock(&xfs_uuid_table_mutex); + return -EINVAL; } - if (hole < 0) { - xfs_uuid_table = krealloc(xfs_uuid_table, - (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), - GFP_KERNEL | __GFP_NOFAIL); - hole = xfs_uuid_table_size++; - } - xfs_uuid_table[hole] = *uuid; + ret = xa_alloc(&xfs_uuid_table, &mp->m_uuid_table_index, uuid, + xa_limit_32b, GFP_KERNEL); mutex_unlock(&xfs_uuid_table_mutex); - - return 0; - - out_duplicate: - mutex_unlock(&xfs_uuid_table_mutex); - xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid); - return -EINVAL; + return ret; } STATIC void @@ -111,21 +117,12 @@ xfs_uuid_unmount( struct xfs_mount *mp) { uuid_t *uuid = &mp->m_sb.sb_uuid; - int i; if (xfs_has_nouuid(mp)) return; mutex_lock(&xfs_uuid_table_mutex); - for (i = 0; i < xfs_uuid_table_size; i++) { - if (uuid_is_null(&xfs_uuid_table[i])) - continue; - if (!uuid_equal(uuid, &xfs_uuid_table[i])) - continue; - memset(&xfs_uuid_table[i], 0, sizeof(uuid_t)); - break; - } - ASSERT(i < xfs_uuid_table_size); + xfs_uuid_delete(uuid, mp->m_uuid_table_index); mutex_unlock(&xfs_uuid_table_mutex); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index ddd4028be8d6..d964bae096ef 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -346,6 +346,9 @@ typedef struct xfs_mount { /* Private data referring to a health monitor object. */ struct xfs_healthmon __rcu *m_healthmon; + + /* Index of uuid record in the uuid xarray. 
*/ + unsigned int m_uuid_table_index; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index d50b7318cb5c..21a784986828 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -391,6 +391,38 @@ out_rele: return error; } +/* + * Fill out the default quota limits for an ID that has no dquot on disk. + * Returns 0 if default limits are configured + * and were filled in, -ENOENT otherwise. + */ +static int +xfs_qm_scall_getquota_fill_defaults( + struct xfs_mount *mp, + xfs_dqtype_t type, + struct qc_dqblk *dst) +{ + struct xfs_def_quota *defq; + + defq = xfs_get_defquota(mp->m_quotainfo, type); + + if (!defq->blk.soft && !defq->blk.hard && + !defq->ino.soft && !defq->ino.hard && + !defq->rtb.soft && !defq->rtb.hard) { + return -ENOENT; + } + + memset(dst, 0, sizeof(*dst)); + dst->d_spc_softlimit = XFS_FSB_TO_B(mp, defq->blk.soft); + dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, defq->blk.hard); + dst->d_ino_softlimit = defq->ino.soft; + dst->d_ino_hardlimit = defq->ino.hard; + dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, defq->rtb.soft); + dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, defq->rtb.hard); + + return 0; +} + /* Fill out the quota context. */ static void xfs_qm_scall_getquota_fill_qc( @@ -451,8 +483,17 @@ xfs_qm_scall_getquota( * set doalloc. If it doesn't exist, we'll get ENOENT back. */ error = xfs_qm_dqget(mp, id, type, false, &dqp); - if (error) + if (error) { + /* + * If there is no dquot on disk and default limits are + * configured, return them with zero usage so that + * unprivileged users can see what limits apply to them. 
+ */ + if (error == -ENOENT && id != 0 && + !xfs_qm_scall_getquota_fill_defaults(mp, type, dst)) + return 0; return error; + } /* * If everything's NULL, this dquot doesn't quite exist as far as diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 881c3f3a6a24..8bccf89a7766 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -266,7 +266,7 @@ xfs_refcount_update_diff_items( struct xfs_refcount_intent *ra = ci_entry(a); struct xfs_refcount_intent *rb = ci_entry(b); - return ra->ri_group->xg_gno - rb->ri_group->xg_gno; + return cmp_int(ra->ri_group->xg_gno, rb->ri_group->xg_gno); } /* Log refcount updates in the intent item. */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index a39fe08dcd8f..2a3a73a8566d 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -267,7 +267,7 @@ xfs_rmap_update_diff_items( struct xfs_rmap_intent *ra = ri_entry(a); struct xfs_rmap_intent *rb = ri_entry(b); - return ra->ri_group->xg_gno - rb->ri_group->xg_gno; + return cmp_int(ra->ri_group->xg_gno, rb->ri_group->xg_gno); } /* Log rmap updates in the intent item. 
*/ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 6c7909838234..676777064c2d 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -13,7 +13,9 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_mount.h" +#include "xfs_zone_priv.h" #include "xfs_zones.h" +#include "xfs_zone_alloc.h" struct xfs_sysfs_attr { struct attribute attr; @@ -718,12 +720,24 @@ max_open_zones_show( } XFS_SYSFS_ATTR_RO(max_open_zones); +static ssize_t +nr_open_zones_show( + struct kobject *kobj, + char *buf) +{ + struct xfs_zone_info *zi = zoned_to_mp(kobj)->m_zone_info; + + return sysfs_emit(buf, "%u\n", READ_ONCE(zi->zi_nr_open_zones)); +} +XFS_SYSFS_ATTR_RO(nr_open_zones); + static ssize_t zonegc_low_space_store( struct kobject *kobj, const char *buf, size_t count) { + struct xfs_mount *mp = zoned_to_mp(kobj); int ret; unsigned int val; @@ -734,7 +748,10 @@ zonegc_low_space_store( if (val > 100) return -EINVAL; - zoned_to_mp(kobj)->m_zonegc_low_space = val; + if (mp->m_zonegc_low_space != val) { + mp->m_zonegc_low_space = val; + xfs_zone_gc_wakeup(mp); + } return count; } @@ -751,6 +768,7 @@ XFS_SYSFS_ATTR_RW(zonegc_low_space); static struct attribute *xfs_zoned_attrs[] = { ATTR_LIST(max_open_zones), + ATTR_LIST(nr_open_zones), ATTR_LIST(zonegc_low_space), NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 5e8190fe2be9..1c098cfc5c00 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -394,6 +394,7 @@ DEFINE_ZONE_EVENT(xfs_zone_full); DEFINE_ZONE_EVENT(xfs_zone_opened); DEFINE_ZONE_EVENT(xfs_zone_reset); DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened); +DEFINE_ZONE_EVENT(xfs_zone_gc_target_stolen); TRACE_EVENT(xfs_zone_free_blocks, TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, @@ -461,6 +462,7 @@ DEFINE_EVENT(xfs_zone_alloc_class, name, \ DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); DEFINE_ZONE_ALLOC_EVENT(xfs_zone_skip_blocks); DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_spurious_open); 
TRACE_EVENT(xfs_zone_gc_select_victim, TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket), @@ -740,7 +742,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->dev = bp->b_target->bt_dev; __entry->bno = xfs_buf_daddr(bp); __entry->nblks = bp->b_length; - __entry->hold = bp->b_hold; + __entry->hold = bp->b_lockref.count; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; @@ -814,7 +816,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, __entry->bno = xfs_buf_daddr(bp); __entry->length = bp->b_length; __entry->flags = flags; - __entry->hold = bp->b_hold; + __entry->hold = bp->b_lockref.count; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->caller_ip = caller_ip; @@ -858,7 +860,7 @@ TRACE_EVENT(xfs_buf_ioerror, __entry->dev = bp->b_target->bt_dev; __entry->bno = xfs_buf_daddr(bp); __entry->length = bp->b_length; - __entry->hold = bp->b_hold; + __entry->hold = bp->b_lockref.count; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->error = error; @@ -902,7 +904,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->buf_bno = xfs_buf_daddr(bip->bli_buf); __entry->buf_len = bip->bli_buf->b_length; __entry->buf_flags = bip->bli_buf->b_flags; - __entry->buf_hold = bip->bli_buf->b_hold; + __entry->buf_hold = bip->bli_buf->b_lockref.count; __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); __entry->buf_lockval = bip->bli_buf->b_sema.count; __entry->li_flags = bip->bli_item.li_flags; @@ -5206,7 +5208,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class, __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; __entry->bno = xfs_buf_daddr(bp); __entry->nblks = bp->b_length; - __entry->hold = bp->b_hold; + __entry->hold = bp->b_lockref.count; __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c 
index e3d19b6dc64a..a851b98143c0 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -174,42 +174,33 @@ xfs_open_zone_mark_full( WRITE_ONCE(rtg->rtg_open_zone, NULL); spin_lock(&zi->zi_open_zones_lock); - if (oz->oz_is_gc) { - ASSERT(current == zi->zi_gc_thread); - zi->zi_open_gc_zone = NULL; - } else { + if (oz->oz_is_gc) + zi->zi_nr_open_gc_zones--; + else zi->zi_nr_open_zones--; - list_del_init(&oz->oz_entry); - } + list_del_init(&oz->oz_entry); spin_unlock(&zi->zi_open_zones_lock); - xfs_open_zone_put(oz); - wake_up_all(&zi->zi_zone_wait); + if (oz->oz_is_gc) + wake_up_process(zi->zi_gc_thread); + else + wake_up_all(&zi->zi_zone_wait); + if (used < rtg_blocks(rtg)) xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); + xfs_open_zone_put(oz); } -static void -xfs_zone_record_blocks( - struct xfs_trans *tp, +static inline void +xfs_zone_inc_written( struct xfs_open_zone *oz, - xfs_fsblock_t fsbno, xfs_filblks_t len) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_rtgroup *rtg = oz->oz_rtg; - struct xfs_inode *rmapip = rtg_rmap(rtg); + xfs_assert_ilocked(rtg_rmap(oz->oz_rtg), XFS_ILOCK_EXCL); - trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len); - - xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); - xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); - rmapip->i_used_blocks += len; - ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); oz->oz_written += len; - if (oz->oz_written == rtg_blocks(rtg)) + if (oz->oz_written == rtg_blocks(oz->oz_rtg)) xfs_open_zone_mark_full(oz); - xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); } /* @@ -227,9 +218,7 @@ xfs_zone_skip_blocks( trace_xfs_zone_skip_blocks(oz, 0, len); xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); - oz->oz_written += len; - if (oz->oz_written == rtg_blocks(rtg)) - xfs_open_zone_mark_full(oz); + xfs_zone_inc_written(oz, len); xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); xfs_add_frextents(rtg_mount(rtg), len); @@ -244,6 +233,8 @@ xfs_zoned_map_extent( xfs_fsblock_t old_startblock) { 
struct xfs_bmbt_irec data; + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_inode *rmapip = rtg_rmap(rtg); int nmaps = 1; int error; @@ -302,7 +293,15 @@ xfs_zoned_map_extent( } } - xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount); + trace_xfs_zone_record_blocks(oz, + xfs_rtb_to_rgbno(tp->t_mountp, new->br_startblock), + new->br_blockcount); + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + rmapip->i_used_blocks += new->br_blockcount; + ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); + xfs_zone_inc_written(oz, new->br_blockcount); + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); /* Map the new blocks into the data fork. */ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new); @@ -560,6 +559,9 @@ xfs_try_use_zone( struct xfs_open_zone *oz, unsigned int goodness) { + if (oz->oz_is_gc) + return false; + if (oz->oz_allocated == rtg_blocks(oz->oz_rtg)) return false; @@ -681,10 +683,11 @@ xfs_select_zone_nowait( if (oz) goto out_unlock; - if (pack_tight) + if (pack_tight) { oz = xfs_select_open_zone_mru(zi, write_hint); - if (oz) - goto out_unlock; + if (oz) + goto out_unlock; + } /* * See if we can open a new zone and use that so that data for different @@ -695,7 +698,7 @@ xfs_select_zone_nowait( goto out_unlock; /* - * Try to find an zone that is an ok match to colocate data with. + * Try to find a zone that is an ok match to colocate data with. 
 */ oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK); if (oz) @@ -1232,6 +1235,100 @@ xfs_free_zone_info( kfree(zi); } +static int +xfs_report_zones( + struct xfs_mount *mp, + struct xfs_init_zones *iz) +{ + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + xfs_rgblock_t write_pointer; + int error; + + error = xfs_query_write_pointer(iz, rtg, &write_pointer); + if (!error) + error = xfs_init_zone(iz, rtg, write_pointer); + if (error) { + xfs_rtgroup_rele(rtg); + return error; + } + } + + return 0; +} + +static inline bool +xfs_zone_is_conv( + struct xfs_rtgroup *rtg) +{ + return !bdev_zone_is_seq(rtg_mount(rtg)->m_rtdev_targp->bt_bdev, + xfs_gbno_to_daddr(rtg_group(rtg), 0)); +} + +static struct xfs_open_zone * +xfs_find_fullest_conventional_open_zone( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *found = NULL, *oz; + + spin_lock(&zi->zi_open_zones_lock); + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { + if (!xfs_zone_is_conv(oz->oz_rtg)) + continue; + if (!found || oz->oz_allocated > found->oz_allocated) + found = oz; + } + spin_unlock(&zi->zi_open_zones_lock); + + return found; +} + +/* + * Find the fullest conventional zones and remove them from the open zone pool + * until we are at the open zone limit. + * + * We can end up with spurious "open" zones when the last blocks in a fully + * written zone were invalidated as there is no write pointer for conventional + * zones. + * + * If we are still over the limit when there is no conventional open zone left, + * the user overrode the max open zones limit using the max_open_zones mount + * option and we should fail. 
+ */ +static int +xfs_finish_spurious_open_zones( + struct xfs_mount *mp, + struct xfs_init_zones *iz) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + + while (zi->zi_nr_open_zones > mp->m_max_open_zones) { + struct xfs_open_zone *oz; + xfs_filblks_t adjust; + + oz = xfs_find_fullest_conventional_open_zone(mp); + if (!oz) { + xfs_err(mp, +"too many open zones for max_open_zones limit (%u/%u)", + zi->zi_nr_open_zones, mp->m_max_open_zones); + return -EINVAL; + } + + xfs_rtgroup_lock(oz->oz_rtg, XFS_RTGLOCK_RMAP); + adjust = rtg_blocks(oz->oz_rtg) - oz->oz_written; + trace_xfs_zone_spurious_open(oz, oz->oz_written, adjust); + oz->oz_written = rtg_blocks(oz->oz_rtg); + xfs_open_zone_mark_full(oz); + xfs_rtgroup_unlock(oz->oz_rtg, XFS_RTGLOCK_RMAP); + iz->available -= adjust; + iz->reclaimable += adjust; + } + + return 0; +} + int xfs_mount_zones( struct xfs_mount *mp) @@ -1240,7 +1337,6 @@ xfs_mount_zones( .zone_capacity = mp->m_groups[XG_TYPE_RTG].blocks, .zone_size = xfs_rtgroup_raw_size(mp), }; - struct xfs_rtgroup *rtg = NULL; int error; if (!mp->m_rtdev_targp) { @@ -1270,9 +1366,17 @@ xfs_mount_zones( if (!mp->m_zone_info) return -ENOMEM; - xfs_info(mp, "%u zones of %u blocks (%u max open zones)", - mp->m_sb.sb_rgcount, iz.zone_capacity, mp->m_max_open_zones); - trace_xfs_zones_mount(mp); + error = xfs_report_zones(mp, &iz); + if (error) + goto out_free_zone_info; + + error = xfs_finish_spurious_open_zones(mp, &iz); + if (error) + goto out_free_zone_info; + + xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available); + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + iz.available + iz.reclaimable); /* * The writeback code switches between inodes regularly to provide @@ -1298,22 +1402,6 @@ xfs_mount_zones( XFS_FSB_TO_B(mp, min(iz.zone_capacity, XFS_MAX_BMBT_EXTLEN)) >> PAGE_SHIFT; - while ((rtg = xfs_rtgroup_next(mp, rtg))) { - xfs_rgblock_t write_pointer; - - error = xfs_query_write_pointer(&iz, rtg, &write_pointer); - if (!error) - error = xfs_init_zone(&iz, 
rtg, write_pointer); - if (error) { - xfs_rtgroup_rele(rtg); - goto out_free_zone_info; - } - } - - xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available); - xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, - iz.available + iz.reclaimable); - /* * The user may configure GC to free up a percentage of unused blocks. * By default this is 0. GC will always trigger at the minimum level @@ -1324,6 +1412,10 @@ xfs_mount_zones( error = xfs_zone_gc_mount(mp); if (error) goto out_free_zone_info; + + xfs_info(mp, "%u zones of %u blocks (%u max open zones)", + mp->m_sb.sb_rgcount, iz.zone_capacity, mp->m_max_open_zones); + trace_xfs_zones_mount(mp); return 0; out_free_zone_info: diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h index 4db02816d0fd..8b2ef98c81ef 100644 --- a/fs/xfs/xfs_zone_alloc.h +++ b/fs/xfs/xfs_zone_alloc.h @@ -51,6 +51,7 @@ int xfs_mount_zones(struct xfs_mount *mp); void xfs_unmount_zones(struct xfs_mount *mp); void xfs_zone_gc_start(struct xfs_mount *mp); void xfs_zone_gc_stop(struct xfs_mount *mp); +void xfs_zone_gc_wakeup(struct xfs_mount *mp); #else static inline int xfs_mount_zones(struct xfs_mount *mp) { @@ -65,6 +66,9 @@ static inline void xfs_zone_gc_start(struct xfs_mount *mp) static inline void xfs_zone_gc_stop(struct xfs_mount *mp) { } +static inline void xfs_zone_gc_wakeup(struct xfs_mount *mp) +{ +} #endif /* CONFIG_XFS_RT */ #endif /* _XFS_ZONE_ALLOC_H */ diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index b2626a482563..fedcc47048af 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -125,6 +125,7 @@ struct xfs_zone_gc_iter { */ struct xfs_zone_gc_data { struct xfs_mount *mp; + struct xfs_open_zone *oz; /* bioset used to allocate the gc_bios */ struct bio_set bio_set; @@ -170,25 +171,37 @@ xfs_zoned_need_gc( s64 available, free, threshold; s32 remainder; + /* If we have no reclaimable blocks, running GC is useless. 
*/ if (!xfs_zoned_have_reclaimable(mp->m_zone_info)) return false; + /* + * In order to avoid file fragmentation as much as possible, we should + * make sure that we can open enough zones. So trigger GC if the number + * of blocks immediately available for writes is lower than the total + * number of blocks from all possible open zones. + */ available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); - if (available < xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) return true; - free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); + /* + * For cases where the user wants to be more aggressive with GC, + * the sysfs attribute zonegc_low_space may be set to a non zero value, + * to indicate that GC should try to maintain at least zonegc_low_space + * percent of the free space to be directly available for writing. Check + * this here. + */ + if (!mp->m_zonegc_low_space) + return false; + free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); threshold = div_s64_rem(free, 100, &remainder); threshold = threshold * mp->m_zonegc_low_space + remainder * div_s64(mp->m_zonegc_low_space, 100); - if (available < threshold) - return true; - - return false; + return available < threshold; } static struct xfs_zone_gc_data * @@ -362,7 +375,7 @@ done: } static bool -xfs_zone_gc_iter_next( +xfs_zone_gc_iter_irec( struct xfs_mount *mp, struct xfs_zone_gc_iter *iter, struct xfs_rmap_irec *chunk_rec, @@ -371,9 +384,6 @@ xfs_zone_gc_iter_next( struct xfs_rmap_irec *irec; int error; - if (!iter->victim_rtg) - return false; - retry: if (iter->rec_idx == iter->rec_count) { error = xfs_zone_gc_query(mp, iter); @@ -515,10 +525,11 @@ xfs_zone_gc_select_victim( return true; } -static struct xfs_open_zone * -xfs_zone_gc_steal_open( - struct xfs_zone_info *zi) +static int +xfs_zone_gc_steal_open_zone( + struct xfs_zone_gc_data *data) { + struct xfs_zone_info *zi = data->mp->m_zone_info; struct xfs_open_zone *oz, *found = NULL; spin_lock(&zi->zi_open_zones_lock); @@ 
-526,56 +537,64 @@ xfs_zone_gc_steal_open( if (!found || oz->oz_allocated < found->oz_allocated) found = oz; } - - if (found) { - found->oz_is_gc = true; - list_del_init(&found->oz_entry); - zi->zi_nr_open_zones--; + if (!found) { + spin_unlock(&zi->zi_open_zones_lock); + return -EIO; } + trace_xfs_zone_gc_target_stolen(found->oz_rtg); + found->oz_is_gc = true; + zi->zi_nr_open_zones--; + zi->zi_nr_open_gc_zones++; spin_unlock(&zi->zi_open_zones_lock); - return found; -} -static struct xfs_open_zone * -xfs_zone_gc_select_target( - struct xfs_mount *mp) -{ - struct xfs_zone_info *zi = mp->m_zone_info; - struct xfs_open_zone *oz = zi->zi_open_gc_zone; - - /* - * We need to wait for pending writes to finish. - */ - if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg)) - return NULL; - - ASSERT(zi->zi_nr_open_zones <= - mp->m_max_open_zones - XFS_OPEN_GC_ZONES); - oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); - if (oz) - trace_xfs_zone_gc_target_opened(oz->oz_rtg); - spin_lock(&zi->zi_open_zones_lock); - zi->zi_open_gc_zone = oz; - spin_unlock(&zi->zi_open_zones_lock); - return oz; + atomic_inc(&found->oz_ref); + data->oz = found; + return 0; } /* - * Ensure we have a valid open zone to write the GC data to. - * - * If the current target zone has space keep writing to it, else first wait for - * all pending writes and then pick a new one. + * Ensure we have a valid open zone to write to. */ -static struct xfs_open_zone * -xfs_zone_gc_ensure_target( - struct xfs_mount *mp) +static bool +xfs_zone_gc_select_target( + struct xfs_zone_gc_data *data) { - struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone; + struct xfs_zone_info *zi = data->mp->m_zone_info; - if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg)) - return xfs_zone_gc_select_target(mp); - return oz; + if (data->oz) { + /* + * If we have space available, just keep using the existing + * zone. 
+ */ + if (data->oz->oz_allocated < rtg_blocks(data->oz->oz_rtg)) + return true; + + /* + * Wait for all writes to the current zone to finish before + * picking a new one. + */ + if (data->oz->oz_written < rtg_blocks(data->oz->oz_rtg)) + return false; + + xfs_open_zone_put(data->oz); + } + + /* + * Open a new zone when there is none currently in use. + */ + ASSERT(zi->zi_nr_open_zones <= + data->mp->m_max_open_zones - XFS_OPEN_GC_ZONES); + data->oz = xfs_open_zone(data->mp, WRITE_LIFE_NOT_SET, true); + if (!data->oz) + return false; + trace_xfs_zone_gc_target_opened(data->oz->oz_rtg); + atomic_inc(&data->oz->oz_ref); + spin_lock(&zi->zi_open_zones_lock); + zi->zi_nr_open_gc_zones++; + list_add_tail(&data->oz->oz_entry, &zi->zi_open_zones); + spin_unlock(&zi->zi_open_zones_lock); + return true; } static void @@ -590,7 +609,7 @@ xfs_zone_gc_end_io( wake_up_process(data->mp->m_zone_info->zi_gc_thread); } -static struct xfs_open_zone * +static bool xfs_zone_gc_alloc_blocks( struct xfs_zone_gc_data *data, xfs_extlen_t *count_fsb, @@ -598,11 +617,7 @@ xfs_zone_gc_alloc_blocks( bool *is_seq) { struct xfs_mount *mp = data->mp; - struct xfs_open_zone *oz; - - oz = xfs_zone_gc_ensure_target(mp); - if (!oz) - return NULL; + struct xfs_open_zone *oz = data->oz; *count_fsb = min(*count_fsb, XFS_B_TO_FSB(mp, data->scratch_available)); @@ -624,7 +639,7 @@ xfs_zone_gc_alloc_blocks( spin_unlock(&mp->m_sb_lock); if (!*count_fsb) - return NULL; + return false; *daddr = xfs_gbno_to_daddr(rtg_group(oz->oz_rtg), 0); *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); @@ -632,7 +647,7 @@ xfs_zone_gc_alloc_blocks( *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated); oz->oz_allocated += *count_fsb; atomic_inc(&oz->oz_ref); - return oz; + return true; } static void @@ -657,6 +672,28 @@ xfs_zone_gc_add_data( } while (len); } +static bool +xfs_zone_gc_can_start_chunk( + struct xfs_zone_gc_data *data) +{ + + if (xfs_is_shutdown(data->mp)) + return false; + if (!data->scratch_available) + 
return false; + + if (!data->iter.victim_rtg) { + if (kthread_should_stop() || kthread_should_park()) + return false; + if (!xfs_zoned_need_gc(data->mp)) + return false; + if (!xfs_zone_gc_select_victim(data)) + return false; + } + + return xfs_zone_gc_select_target(data); +} + static bool xfs_zone_gc_start_chunk( struct xfs_zone_gc_data *data) @@ -664,7 +701,6 @@ xfs_zone_gc_start_chunk( struct xfs_zone_gc_iter *iter = &data->iter; struct xfs_mount *mp = data->mp; struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; - struct xfs_open_zone *oz; struct xfs_rmap_irec irec; struct xfs_gc_bio *chunk; struct xfs_inode *ip; @@ -672,14 +708,15 @@ xfs_zone_gc_start_chunk( xfs_daddr_t daddr; bool is_seq; - if (xfs_is_shutdown(mp)) + if (!xfs_zone_gc_can_start_chunk(data)) return false; - if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip)) + set_current_state(TASK_RUNNING); + if (!xfs_zone_gc_iter_irec(mp, iter, &irec, &ip)) return false; - oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, - &is_seq); - if (!oz) { + + if (!xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, + &is_seq)) { xfs_irele(ip); return false; } @@ -699,7 +736,7 @@ xfs_zone_gc_start_chunk( chunk->new_daddr = daddr; chunk->is_seq = is_seq; chunk->data = data; - chunk->oz = oz; + chunk->oz = data->oz; chunk->victim_rtg = iter->victim_rtg; atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref); atomic_inc(&chunk->victim_rtg->rtg_gccount); @@ -985,33 +1022,6 @@ xfs_zone_gc_reset_zones( } while (next); } -static bool -xfs_zone_gc_should_start_new_work( - struct xfs_zone_gc_data *data) -{ - struct xfs_open_zone *oz; - - if (xfs_is_shutdown(data->mp)) - return false; - if (!data->scratch_available) - return false; - - oz = xfs_zone_gc_ensure_target(data->mp); - if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg)) - return false; - - if (!data->iter.victim_rtg) { - if (kthread_should_stop() || kthread_should_park()) - return false; - if (!xfs_zoned_need_gc(data->mp)) - return false; 
- if (!xfs_zone_gc_select_victim(data)) - return false; - } - - return true; -} - /* * Handle the work to read and write data for GC and to reset the zones, * including handling all completions. @@ -1061,13 +1071,10 @@ xfs_zone_gc_handle_work( } blk_finish_plug(&plug); - if (xfs_zone_gc_should_start_new_work(data)) { - set_current_state(TASK_RUNNING); - blk_start_plug(&plug); - while (xfs_zone_gc_start_chunk(data)) - ; - blk_finish_plug(&plug); - } + blk_start_plug(&plug); + while (xfs_zone_gc_start_chunk(data)) + ; + blk_finish_plug(&plug); } /* @@ -1127,6 +1134,8 @@ xfs_zoned_gcd( } xfs_clear_zonegc_running(mp); + if (data->oz) + xfs_open_zone_put(data->oz); if (data->iter.victim_rtg) xfs_rtgroup_rele(data->iter.victim_rtg); @@ -1151,41 +1160,49 @@ xfs_zone_gc_stop( kthread_park(mp->m_zone_info->zi_gc_thread); } +void +xfs_zone_gc_wakeup( + struct xfs_mount *mp) +{ + struct super_block *sb = mp->m_super; + + /* + * If we are unmounting the file system we must not try to + * wake gc as m_zone_info might have been freed already. + */ + if (down_read_trylock(&sb->s_umount)) { + if (!xfs_is_readonly(mp)) + wake_up_process(mp->m_zone_info->zi_gc_thread); + up_read(&sb->s_umount); + } +} + int xfs_zone_gc_mount( struct xfs_mount *mp) { struct xfs_zone_info *zi = mp->m_zone_info; struct xfs_zone_gc_data *data; - struct xfs_open_zone *oz; int error; + data = xfs_zone_gc_data_alloc(mp); + if (!data) + return -ENOMEM; + /* - * If there are no free zones available for GC, pick the open zone with + * If there are no free zones available for GC, or the number of open + * zones has reached the open zone limit, pick the open zone with * the least used space to GC into. This should only happen after an - * unclean shutdown near ENOSPC while GC was ongoing. - * - * We also need to do this for the first gc zone allocation if we - * unmounted while at the open limit. + * unclean shutdown while GC was ongoing. 
Otherwise a GC zone will + * be selected from the free zone pool on demand. */ if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) || - zi->zi_nr_open_zones == mp->m_max_open_zones) - oz = xfs_zone_gc_steal_open(zi); - else - oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); - if (!oz) { - xfs_warn(mp, "unable to allocate a zone for gc"); - error = -EIO; - goto out; - } - - trace_xfs_zone_gc_target_opened(oz->oz_rtg); - zi->zi_open_gc_zone = oz; - - data = xfs_zone_gc_data_alloc(mp); - if (!data) { - error = -ENOMEM; - goto out_put_gc_zone; + zi->zi_nr_open_zones >= mp->m_max_open_zones) { + error = xfs_zone_gc_steal_open_zone(data); + if (error) { + xfs_warn(mp, "unable to steal an open zone for gc"); + goto out_free_gc_data; + } } zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, @@ -1193,18 +1210,18 @@ xfs_zone_gc_mount( if (IS_ERR(zi->zi_gc_thread)) { xfs_warn(mp, "unable to create zone gc thread"); error = PTR_ERR(zi->zi_gc_thread); - goto out_free_gc_data; + goto out_put_oz; } /* xfs_zone_gc_start will unpark for rw mounts */ kthread_park(zi->zi_gc_thread); return 0; +out_put_oz: + if (data->oz) + xfs_open_zone_put(data->oz); out_free_gc_data: kfree(data); -out_put_gc_zone: - xfs_open_zone_put(zi->zi_open_gc_zone); -out: return error; } @@ -1215,6 +1232,4 @@ xfs_zone_gc_unmount( struct xfs_zone_info *zi = mp->m_zone_info; kthread_stop(zi->zi_gc_thread); - if (zi->zi_open_gc_zone) - xfs_open_zone_put(zi->zi_open_gc_zone); } diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c index 53eabbc3334c..47b475e21af8 100644 --- a/fs/xfs/xfs_zone_info.c +++ b/fs/xfs/xfs_zone_info.c @@ -30,11 +30,12 @@ xfs_show_open_zone( struct seq_file *m, struct xfs_open_zone *oz) { - seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n", + seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s %s\n", rtg_rgno(oz->oz_rtg), oz->oz_allocated, oz->oz_written, rtg_rmap(oz->oz_rtg)->i_used_blocks, - xfs_write_hint_to_str(oz->oz_write_hint)); + 
xfs_write_hint_to_str(oz->oz_write_hint), + oz->oz_is_gc ? "(GC)" : ""); } static void @@ -58,9 +59,8 @@ xfs_show_full_zone_used_distribution( spin_unlock(&zi->zi_used_buckets_lock); full = mp->m_sb.sb_rgcount; - if (zi->zi_open_gc_zone) - full--; full -= zi->zi_nr_open_zones; + full -= zi->zi_nr_open_gc_zones; full -= atomic_read(&zi->zi_nr_free_zones); full -= reclaimable; @@ -90,15 +90,20 @@ xfs_zoned_show_stats( seq_printf(m, "\tRT GC required: %d\n", xfs_zoned_need_gc(mp)); + seq_printf(m, "\ttotal number of zones: %u\n", + mp->m_sb.sb_rgcount); seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones)); - seq_puts(m, "\topen zones:\n"); + spin_lock(&zi->zi_open_zones_lock); + seq_printf(m, "\tmax open zones: %u\n", + mp->m_max_open_zones); + seq_printf(m, "\tnr open zones: %u\n", + zi->zi_nr_open_zones); + seq_printf(m, "\tnr open GC zones: %u\n", + zi->zi_nr_open_gc_zones); + seq_puts(m, "\topen zones:\n"); list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) xfs_show_open_zone(m, oz); - if (zi->zi_open_gc_zone) { - seq_puts(m, "\topen gc zone:\n"); - xfs_show_open_zone(m, zi->zi_open_gc_zone); - } spin_unlock(&zi->zi_open_zones_lock); seq_puts(m, "\tused blocks distribution (fully written zones):\n"); xfs_show_full_zone_used_distribution(m, mp); diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h index 8fbf9a52964e..fcb57506d8e6 100644 --- a/fs/xfs/xfs_zone_priv.h +++ b/fs/xfs/xfs_zone_priv.h @@ -32,11 +32,7 @@ struct xfs_open_zone { */ enum rw_hint oz_write_hint; - /* - * Is this open zone used for garbage collection? There can only be a - * single open GC zone, which is pointed to by zi_open_gc_zone in - * struct xfs_zone_info. Constant over the life time of an open zone. - */ + /* Is this open zone used for garbage collection? 
*/ bool oz_is_gc; /* @@ -68,6 +64,7 @@ struct xfs_zone_info { spinlock_t zi_open_zones_lock; struct list_head zi_open_zones; unsigned int zi_nr_open_zones; + unsigned int zi_nr_open_gc_zones; /* * Free zone search cursor and number of free zones: @@ -81,15 +78,9 @@ struct xfs_zone_info { wait_queue_head_t zi_zone_wait; /* - * Pointer to the GC thread, and the current open zone used by GC - * (if any). - * - * zi_open_gc_zone is mostly private to the GC thread, but can be read - * for debugging from other threads, in which case zi_open_zones_lock - * must be taken to access it. + * Pointer to the GC thread. */ struct task_struct *zi_gc_thread; - struct xfs_open_zone *zi_open_gc_zone; /* * List of zones that need a reset: