Merge tag 'xfs-merge-7.1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs updates from Carlos Maiolino:
 "There aren't any new features.

  The whole series is just a collection of bug fixes and code
  refactoring. There is some new information added (a couple of new
  tracepoints, and new data added to mountstats), but no big changes"

* tag 'xfs-merge-7.1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (41 commits)
  xfs: fix number of GC bvecs
  xfs: untangle the open zones reporting in mountinfo
  xfs: expose the number of open zones in sysfs
  xfs: reduce special casing for the open GC zone
  xfs: streamline GC zone selection
  xfs: refactor GC zone selection helpers
  xfs: rename xfs_zone_gc_iter_next to xfs_zone_gc_iter_irec
  xfs: put the open zone later xfs_open_zone_put
  xfs: add a separate tracepoint for stealing an open zone for GC
  xfs: delay initial open of the GC zone
  xfs: fix a resource leak in xfs_alloc_buftarg()
  xfs: handle too many open zones when mounting
  xfs: refactor xfs_mount_zones
  xfs: fix integer overflow in busy extent sort comparator
  xfs: fix integer overflow in deferred intent sort comparators
  xfs: fold xfs_setattr_size into xfs_vn_setattr_size
  xfs: remove a duplicate assert in xfs_setattr_size
  xfs: return default quota limits for IDs without a dquot
  xfs: start gc on zonegc_low_space attribute updates
  xfs: don't decrement the buffer LRU count for in-use buffers
  ...
This commit is contained in:
Linus Torvalds
2026-04-13 17:03:48 -07:00
26 changed files with 748 additions and 543 deletions

View File

@@ -550,6 +550,10 @@ For zoned file systems, the following attributes are exposed in:
is limited by the capabilities of the backing zoned device, file system
size and the max_open_zones mount option.
nr_open_zones (Min: 0 Default: Varies Max: UINTMAX)
This read-only attribute exposes the current number of open zones
used by the file system.
zonegc_low_space (Min: 0 Default: 0 Max: 100)
Define a percentage for how much of the unused space that GC should keep
available for writing. A high value will reclaim more of the space

View File

@@ -1647,16 +1647,12 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
while ((ret = iomap_iter(&iter, ops)) > 0) {
const struct iomap *srcmap = iomap_iter_srcmap(&iter);
if (WARN_ON_ONCE((iter.iomap.flags & IOMAP_F_FOLIO_BATCH) &&
srcmap->type != IOMAP_UNWRITTEN))
return -EIO;
if (!(iter.iomap.flags & IOMAP_F_FOLIO_BATCH) &&
(srcmap->type == IOMAP_HOLE ||
srcmap->type == IOMAP_UNWRITTEN)) {
s64 status;
if (range_dirty) {
if (range_dirty && srcmap->type == IOMAP_UNWRITTEN) {
range_dirty = false;
status = iomap_zero_iter_flush_and_stale(&iter);
} else {

View File

@@ -110,10 +110,7 @@ xfs_perag_uninit(
struct xfs_group *xg)
{
#ifdef __KERNEL__
struct xfs_perag *pag = to_perag(xg);
cancel_delayed_work_sync(&pag->pag_blockgc_work);
xfs_buf_cache_destroy(&pag->pag_bcache);
cancel_delayed_work_sync(&to_perag(xg)->pag_blockgc_work);
#endif
}
@@ -235,10 +232,6 @@ xfs_perag_alloc(
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
#endif /* __KERNEL__ */
error = xfs_buf_cache_init(&pag->pag_bcache);
if (error)
goto out_free_perag;
/*
* Pre-calculated geometry
*/
@@ -250,12 +243,10 @@ xfs_perag_alloc(
error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG);
if (error)
goto out_buf_cache_destroy;
goto out_free_perag;
return 0;
out_buf_cache_destroy:
xfs_buf_cache_destroy(&pag->pag_bcache);
out_free_perag:
kfree(pag);
return error;

View File

@@ -85,8 +85,6 @@ struct xfs_perag {
int pag_ici_reclaimable; /* reclaimable inodes */
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
struct xfs_buf_cache pag_bcache;
/* background prealloc block trimming */
struct delayed_work pag_blockgc_work;
#endif /* __KERNEL__ */

View File

@@ -995,7 +995,8 @@ struct xfs_rtgroup_geometry {
__u32 rg_sick; /* o: sick things in ag */
__u32 rg_checked; /* o: checked metadata in ag */
__u32 rg_flags; /* i/o: flags for this ag */
__u32 rg_reserved[27]; /* o: zero */
__u32 rg_writepointer; /* o: write pointer block offset for zoned */
__u32 rg_reserved[26]; /* o: zero */
};
#define XFS_RTGROUP_GEOM_SICK_SUPER (1U << 0) /* superblock */
#define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */
@@ -1003,6 +1004,8 @@ struct xfs_rtgroup_geometry {
#define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */
#define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */
#define XFS_RTGROUP_GEOM_WRITEPOINTER (1U << 0) /* write pointer */
/* Health monitor event domains */
/* affects the whole fs */

View File

@@ -31,20 +31,20 @@ struct kmem_cache *xfs_buf_cache;
*
* xfs_buf_stale:
* b_sema (caller holds)
* b_lock
* b_lockref.lock
* lru_lock
*
* xfs_buf_rele:
* b_lock
* b_lockref.lock
* lru_lock
*
* xfs_buftarg_drain_rele
* lru_lock
* b_lock (trylock due to inversion)
* b_lockref.lock (trylock due to inversion)
*
* xfs_buftarg_isolate
* lru_lock
* b_lock (trylock due to inversion)
* b_lockref.lock (trylock due to inversion)
*/
static void xfs_buf_submit(struct xfs_buf *bp);
@@ -78,14 +78,11 @@ xfs_buf_stale(
*/
bp->b_flags &= ~_XBF_DELWRI_Q;
spin_lock(&bp->b_lock);
spin_lock(&bp->b_lockref.lock);
atomic_set(&bp->b_lru_ref, 0);
if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
(list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
bp->b_hold--;
ASSERT(bp->b_hold >= 1);
spin_unlock(&bp->b_lock);
if (!__lockref_is_dead(&bp->b_lockref))
list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
spin_unlock(&bp->b_lockref.lock);
}
static void
@@ -277,10 +274,8 @@ xfs_buf_alloc(
* inserting into the hash table are safe (and will have to wait for
* the unlock to do anything non-trivial).
*/
bp->b_hold = 1;
lockref_init(&bp->b_lockref);
sema_init(&bp->b_sema, 0); /* held, no waiters */
spin_lock_init(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 1);
init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_lru);
@@ -368,20 +363,6 @@ static const struct rhashtable_params xfs_buf_hash_params = {
.obj_cmpfn = _xfs_buf_obj_cmp,
};
int
xfs_buf_cache_init(
struct xfs_buf_cache *bch)
{
return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
}
void
xfs_buf_cache_destroy(
struct xfs_buf_cache *bch)
{
rhashtable_destroy(&bch->bc_hash);
}
static int
xfs_buf_map_verify(
struct xfs_buftarg *btp,
@@ -437,23 +418,9 @@ xfs_buf_find_lock(
return 0;
}
static bool
xfs_buf_try_hold(
struct xfs_buf *bp)
{
spin_lock(&bp->b_lock);
if (bp->b_hold == 0) {
spin_unlock(&bp->b_lock);
return false;
}
bp->b_hold++;
spin_unlock(&bp->b_lock);
return true;
}
static inline int
xfs_buf_lookup(
struct xfs_buf_cache *bch,
struct xfs_buftarg *btp,
struct xfs_buf_map *map,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
@@ -462,8 +429,8 @@ xfs_buf_lookup(
int error;
rcu_read_lock();
bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
if (!bp || !xfs_buf_try_hold(bp)) {
bp = rhashtable_lookup(&btp->bt_hash, map, xfs_buf_hash_params);
if (!bp || !lockref_get_not_dead(&bp->b_lockref)) {
rcu_read_unlock();
return -ENOENT;
}
@@ -487,7 +454,6 @@ xfs_buf_lookup(
static int
xfs_buf_find_insert(
struct xfs_buftarg *btp,
struct xfs_buf_cache *bch,
struct xfs_perag *pag,
struct xfs_buf_map *cmap,
struct xfs_buf_map *map,
@@ -507,14 +473,14 @@ xfs_buf_find_insert(
new_bp->b_pag = pag;
rcu_read_lock();
bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash,
&new_bp->b_rhash_head, xfs_buf_hash_params);
if (IS_ERR(bp)) {
rcu_read_unlock();
error = PTR_ERR(bp);
goto out_free_buf;
}
if (bp && xfs_buf_try_hold(bp)) {
if (bp && lockref_get_not_dead(&bp->b_lockref)) {
/* found an existing buffer */
rcu_read_unlock();
error = xfs_buf_find_lock(bp, flags);
@@ -549,16 +515,6 @@ xfs_buftarg_get_pag(
return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
}
static inline struct xfs_buf_cache *
xfs_buftarg_buf_cache(
struct xfs_buftarg *btp,
struct xfs_perag *pag)
{
if (pag)
return &pag->pag_bcache;
return btp->bt_cache;
}
/*
* Assembles a buffer covering the specified range. The code is optimised for
* cache hits, as metadata intensive workloads will see 3 orders of magnitude
@@ -572,7 +528,6 @@ xfs_buf_get_map(
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
struct xfs_buf_cache *bch;
struct xfs_perag *pag;
struct xfs_buf *bp = NULL;
struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
@@ -589,9 +544,8 @@ xfs_buf_get_map(
return error;
pag = xfs_buftarg_get_pag(btp, &cmap);
bch = xfs_buftarg_buf_cache(btp, pag);
error = xfs_buf_lookup(bch, &cmap, flags, &bp);
error = xfs_buf_lookup(btp, &cmap, flags, &bp);
if (error && error != -ENOENT)
goto out_put_perag;
@@ -603,7 +557,7 @@ xfs_buf_get_map(
goto out_put_perag;
/* xfs_buf_find_insert() consumes the perag reference. */
error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
flags, &bp);
if (error)
return error;
@@ -856,84 +810,27 @@ xfs_buf_hold(
{
trace_xfs_buf_hold(bp, _RET_IP_);
spin_lock(&bp->b_lock);
bp->b_hold++;
spin_unlock(&bp->b_lock);
lockref_get(&bp->b_lockref);
}
static void
xfs_buf_rele_uncached(
xfs_buf_destroy(
struct xfs_buf *bp)
{
ASSERT(list_empty(&bp->b_lru));
ASSERT(__lockref_is_dead(&bp->b_lockref));
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
spin_lock(&bp->b_lock);
if (--bp->b_hold) {
spin_unlock(&bp->b_lock);
return;
if (!xfs_buf_is_uncached(bp)) {
rhashtable_remove_fast(&bp->b_target->bt_hash,
&bp->b_rhash_head, xfs_buf_hash_params);
if (bp->b_pag)
xfs_perag_put(bp->b_pag);
}
spin_unlock(&bp->b_lock);
xfs_buf_free(bp);
}
static void
xfs_buf_rele_cached(
struct xfs_buf *bp)
{
struct xfs_buftarg *btp = bp->b_target;
struct xfs_perag *pag = bp->b_pag;
struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
bool freebuf = false;
trace_xfs_buf_rele(bp, _RET_IP_);
spin_lock(&bp->b_lock);
ASSERT(bp->b_hold >= 1);
if (bp->b_hold > 1) {
bp->b_hold--;
goto out_unlock;
}
/* we are asked to drop the last reference */
if (atomic_read(&bp->b_lru_ref)) {
/*
* If the buffer is added to the LRU, keep the reference to the
* buffer for the LRU and clear the (now stale) dispose list
* state flag, else drop the reference.
*/
if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
bp->b_state &= ~XFS_BSTATE_DISPOSE;
else
bp->b_hold--;
} else {
bp->b_hold--;
/*
* most of the time buffers will already be removed from the
* LRU, so optimise that case by checking for the
* XFS_BSTATE_DISPOSE flag indicating the last list the buffer
* was on was the disposal list
*/
if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
} else {
ASSERT(list_empty(&bp->b_lru));
}
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
xfs_buf_hash_params);
if (pag)
xfs_perag_put(pag);
freebuf = true;
}
out_unlock:
spin_unlock(&bp->b_lock);
if (freebuf)
xfs_buf_free(bp);
}
/*
* Release a hold on the specified buffer.
*/
@@ -942,10 +839,23 @@ xfs_buf_rele(
struct xfs_buf *bp)
{
trace_xfs_buf_rele(bp, _RET_IP_);
if (xfs_buf_is_uncached(bp))
xfs_buf_rele_uncached(bp);
else
xfs_buf_rele_cached(bp);
if (lockref_put_or_lock(&bp->b_lockref))
return;
if (!--bp->b_lockref.count) {
if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
goto kill;
list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
}
spin_unlock(&bp->b_lockref.lock);
return;
kill:
lockref_mark_dead(&bp->b_lockref);
list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
spin_unlock(&bp->b_lockref.lock);
xfs_buf_destroy(bp);
}
/*
@@ -1254,9 +1164,11 @@ xfs_buf_ioerror_alert(
/*
* To simulate an I/O failure, the buffer must be locked and held with at least
* three references. The LRU reference is dropped by the stale call. The buf
* item reference is dropped via ioend processing. The third reference is owned
* by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
* two references.
*
* The buf item reference is dropped via ioend processing. The second reference
* is owned by the caller and is dropped on I/O completion if the buffer is
* XBF_ASYNC.
*/
void
xfs_buf_ioend_fail(
@@ -1512,23 +1424,18 @@ xfs_buftarg_drain_rele(
struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
struct list_head *dispose = arg;
if (!spin_trylock(&bp->b_lock))
if (!spin_trylock(&bp->b_lockref.lock))
return LRU_SKIP;
if (bp->b_hold > 1) {
if (bp->b_lockref.count > 0) {
/* need to wait, so skip it this pass */
spin_unlock(&bp->b_lock);
spin_unlock(&bp->b_lockref.lock);
trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
return LRU_SKIP;
}
/*
* clear the LRU reference count so the buffer doesn't get
* ignored in xfs_buf_rele().
*/
atomic_set(&bp->b_lru_ref, 0);
bp->b_state |= XFS_BSTATE_DISPOSE;
lockref_mark_dead(&bp->b_lockref);
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
spin_unlock(&bp->b_lockref.lock);
return LRU_REMOVED;
}
@@ -1581,7 +1488,7 @@ xfs_buftarg_drain(
"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
(long long)xfs_buf_daddr(bp));
}
xfs_buf_rele(bp);
xfs_buf_destroy(bp);
}
if (loop++ != 0)
delay(100);
@@ -1610,24 +1517,37 @@ xfs_buftarg_isolate(
struct list_head *dispose = arg;
/*
* we are inverting the lru lock/bp->b_lock here, so use a trylock.
* If we fail to get the lock, just skip it.
* We are inverting the lru lock vs bp->b_lockref.lock order here, so
* use a trylock. If we fail to get the lock, just skip the buffer.
*/
if (!spin_trylock(&bp->b_lock))
if (!spin_trylock(&bp->b_lockref.lock))
return LRU_SKIP;
/*
* If the buffer is in use, remove it from the LRU for now. We can't
* free it while someone is using it, and we should also not count
* eviction passed for it, just as if it hadn't been added to the LRU
* yet.
*/
if (bp->b_lockref.count > 0) {
list_lru_isolate(lru, &bp->b_lru);
spin_unlock(&bp->b_lockref.lock);
return LRU_REMOVED;
}
/*
* Decrement the b_lru_ref count unless the value is already
* zero. If the value is already zero, we need to reclaim the
* buffer, otherwise it gets another trip through the LRU.
*/
if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
spin_unlock(&bp->b_lock);
spin_unlock(&bp->b_lockref.lock);
return LRU_ROTATE;
}
bp->b_state |= XFS_BSTATE_DISPOSE;
lockref_mark_dead(&bp->b_lockref);
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
spin_unlock(&bp->b_lockref.lock);
return LRU_REMOVED;
}
@@ -1647,7 +1567,7 @@ xfs_buftarg_shrink_scan(
struct xfs_buf *bp;
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
xfs_buf_rele(bp);
xfs_buf_destroy(bp);
}
return freed;
@@ -1670,6 +1590,7 @@ xfs_destroy_buftarg(
ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
percpu_counter_destroy(&btp->bt_readahead_count);
list_lru_destroy(&btp->bt_lru);
rhashtable_destroy(&btp->bt_hash);
}
void
@@ -1764,8 +1685,10 @@ xfs_init_buftarg(
ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
DEFAULT_RATELIMIT_BURST);
if (list_lru_init(&btp->bt_lru))
if (rhashtable_init(&btp->bt_hash, &xfs_buf_hash_params))
return -ENOMEM;
if (list_lru_init(&btp->bt_lru))
goto out_destroy_hash;
if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
goto out_destroy_lru;
@@ -1783,6 +1706,8 @@ out_destroy_io_count:
percpu_counter_destroy(&btp->bt_readahead_count);
out_destroy_lru:
list_lru_destroy(&btp->bt_lru);
out_destroy_hash:
rhashtable_destroy(&btp->bt_hash);
return -ENOMEM;
}
@@ -1831,6 +1756,7 @@ xfs_alloc_buftarg(
return btp;
error_free:
fs_put_dax(btp->bt_daxdev, mp);
kfree(btp);
return ERR_PTR(error);
}

View File

@@ -14,6 +14,7 @@
#include <linux/dax.h>
#include <linux/uio.h>
#include <linux/list_lru.h>
#include <linux/lockref.h>
extern struct kmem_cache *xfs_buf_cache;
@@ -68,18 +69,6 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }
/*
* Internal state flags.
*/
#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
struct xfs_buf_cache {
struct rhashtable bc_hash;
};
int xfs_buf_cache_init(struct xfs_buf_cache *bch);
void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
/*
* The xfs_buftarg contains 2 notions of "sector size" -
*
@@ -117,8 +106,7 @@ struct xfs_buftarg {
unsigned int bt_awu_min;
unsigned int bt_awu_max;
/* built-in cache, if we're not using the perag one */
struct xfs_buf_cache bt_cache[];
struct rhashtable bt_hash;
};
struct xfs_buf_map {
@@ -159,7 +147,7 @@ struct xfs_buf {
xfs_daddr_t b_rhash_key; /* buffer cache index */
int b_length; /* size of buffer in BBs */
unsigned int b_hold; /* reference count */
struct lockref b_lockref; /* refcount + lock */
atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */
struct semaphore b_sema; /* semaphore for lockables */
@@ -169,8 +157,6 @@ struct xfs_buf {
* bt_lru_lock and not by b_sema
*/
struct list_head b_lru; /* lru list */
spinlock_t b_lock; /* internal state lock */
unsigned int b_state; /* internal state flags */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag;

View File

@@ -58,7 +58,7 @@ xmbuf_alloc(
struct xfs_buftarg *btp;
int error;
btp = kzalloc_flex(*btp, bt_cache, 1);
btp = kzalloc_obj(*btp);
if (!btp)
return -ENOMEM;
@@ -81,10 +81,6 @@ xmbuf_alloc(
/* ensure all writes are below EOF to avoid pagecache zeroing */
i_size_write(inode, inode->i_sb->s_maxbytes);
error = xfs_buf_cache_init(btp->bt_cache);
if (error)
goto out_file;
/* Initialize buffer target */
btp->bt_mount = mp;
btp->bt_dev = (dev_t)-1U;
@@ -95,15 +91,13 @@ xmbuf_alloc(
error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
if (error)
goto out_bcache;
goto out_file;
trace_xmbuf_create(btp);
*btpp = btp;
return 0;
out_bcache:
xfs_buf_cache_destroy(btp->bt_cache);
out_file:
fput(file);
out_free_btp:
@@ -122,7 +116,6 @@ xmbuf_free(
trace_xmbuf_free(btp);
xfs_destroy_buftarg(btp);
xfs_buf_cache_destroy(btp->bt_cache);
fput(btp->bt_file);
kfree(btp);
}

View File

@@ -690,9 +690,9 @@ xfs_extent_busy_ag_cmp(
container_of(l2, struct xfs_extent_busy, list);
s32 diff;
diff = b1->group->xg_gno - b2->group->xg_gno;
diff = cmp_int(b1->group->xg_gno, b2->group->xg_gno);
if (!diff)
diff = b1->bno - b2->bno;
diff = cmp_int(b1->bno, b2->bno);
return diff;
}

View File

@@ -387,7 +387,7 @@ xfs_extent_free_diff_items(
struct xfs_extent_free_item *ra = xefi_entry(a);
struct xfs_extent_free_item *rb = xefi_entry(b);
return ra->xefi_group->xg_gno - rb->xefi_group->xg_gno;
return cmp_int(ra->xefi_group->xg_gno, rb->xefi_group->xg_gno);
}
/* Log a free extent to the intent item. */

View File

@@ -560,6 +560,72 @@ xfs_zoned_write_space_reserve(
flags, ac);
}
/*
* We need to lock the test/set EOF update as we can be racing with
* other IO completions here to update the EOF. Failing to serialise
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
*
* As IO completion only ever extends EOF, we can do an unlocked check
* here to avoid taking the spinlock. If we land within the current EOF,
* then we do not need to do an extending update at all, and we don't
* need to take the lock to check this. If we race with an update moving
* EOF, then we'll either still be beyond EOF and need to take the lock,
* or we'll be within EOF and we don't need to take it at all.
*/
/*
 * Extend the in-core inode size after a direct write completion, then update
 * the on-disk size via xfs_setfilesize().  Returns 0 if no extension was
 * needed, otherwise the return value of xfs_setfilesize().
 *
 * See the locking discussion in the comment above: completions only ever move
 * EOF forward, so an unlocked check is safe before taking i_flags_lock.
 */
static int
xfs_dio_endio_set_isize(
struct inode *inode,
loff_t offset,
ssize_t size)
{
struct xfs_inode *ip = XFS_I(inode);
/* Unlocked fast path: the write landed entirely within current EOF. */
if (offset + size <= i_size_read(inode))
return 0;
spin_lock(&ip->i_flags_lock);
/* Re-check under the lock: a racing completion may have extended EOF. */
if (offset + size <= i_size_read(inode)) {
spin_unlock(&ip->i_flags_lock);
return 0;
}
i_size_write(inode, offset + size);
spin_unlock(&ip->i_flags_lock);
/* Persist the new size to the on-disk inode. */
return xfs_setfilesize(ip, offset, size);
}
/*
 * Direct write I/O completion handler for zoned file systems.
 *
 * Zoned writes never use unwritten extent conversion or COW remapping, so
 * the only completion work is accounting and a possible EOF size update.
 * Returns 0 on success, -EIO after a filesystem shutdown, or the error
 * passed in / returned by the size update.
 */
static int
xfs_zoned_dio_write_end_io(
struct kiocb *iocb,
ssize_t size,
int error,
unsigned flags)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
unsigned int nofs_flag;
/* Zoned I/O must not carry unwritten-conversion or COW completion work. */
ASSERT(!(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
trace_xfs_end_io_direct_write(ip, iocb->ki_pos, size);
if (xfs_is_shutdown(ip->i_mount))
return -EIO;
/* Nothing to account for failed or zero-length completions. */
if (error || !size)
return error;
XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
/*
 * Completion can run in memory-reclaim context; forbid fs-recursing
 * allocations while the transaction-backed size update runs.
 */
nofs_flag = memalloc_nofs_save();
error = xfs_dio_endio_set_isize(inode, iocb->ki_pos, size);
memalloc_nofs_restore(nofs_flag);
return error;
}
static int
xfs_dio_write_end_io(
struct kiocb *iocb,
@@ -572,8 +638,7 @@ xfs_dio_write_end_io(
loff_t offset = iocb->ki_pos;
unsigned int nofs_flag;
ASSERT(!xfs_is_zoned_inode(ip) ||
!(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
ASSERT(!xfs_is_zoned_inode(ip));
trace_xfs_end_io_direct_write(ip, offset, size);
@@ -623,30 +688,8 @@ xfs_dio_write_end_io(
* with the on-disk inode size being outside the in-core inode size. We
* have no other method of updating EOF for AIO, so always do it here
* if necessary.
*
* We need to lock the test/set EOF update as we can be racing with
* other IO completions here to update the EOF. Failing to serialise
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
*
* As IO completion only ever extends EOF, we can do an unlocked check
* here to avoid taking the spinlock. If we land within the current EOF,
* then we do not need to do an extending update at all, and we don't
* need to take the lock to check this. If we race with an update moving
* EOF, then we'll either still be beyond EOF and need to take the lock,
* or we'll be within EOF and we don't need to take it at all.
*/
if (offset + size <= i_size_read(inode))
goto out;
spin_lock(&ip->i_flags_lock);
if (offset + size > i_size_read(inode)) {
i_size_write(inode, offset + size);
spin_unlock(&ip->i_flags_lock);
error = xfs_setfilesize(ip, offset, size);
} else {
spin_unlock(&ip->i_flags_lock);
}
error = xfs_dio_endio_set_isize(inode, offset, size);
out:
memalloc_nofs_restore(nofs_flag);
@@ -688,7 +731,7 @@ xfs_dio_zoned_submit_io(
static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
.bio_set = &iomap_ioend_bioset,
.submit_io = xfs_dio_zoned_submit_io,
.end_io = xfs_dio_write_end_io,
.end_io = xfs_zoned_dio_write_end_io,
};
/*
@@ -1263,6 +1306,23 @@ xfs_falloc_insert_range(
if (offset >= isize)
return -EINVAL;
/*
* Let writeback clean up EOF folio state before we bump i_size. The
* insert flushes before it starts shifting and under certain
* circumstances we can write back blocks that should technically be
* considered post-eof (and thus should not be submitted for writeback).
*
* For example, a large, dirty folio that spans EOF and is backed by
* post-eof COW fork preallocation can cause block remap into the data
* fork. This shifts back out beyond EOF, but creates an expectedly
* written post-eof block. The insert is going to flush, unmap and
* cancel prealloc across this whole range, so flush EOF now before we
* bump i_size to provide consistent behavior.
*/
error = filemap_write_and_wait_range(inode->i_mapping, isize, isize);
if (error)
return error;
error = xfs_falloc_setsize(file, isize + len);
if (error)
return error;

View File

@@ -37,12 +37,15 @@
#include "xfs_ioctl.h"
#include "xfs_xattr.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_file.h"
#include "xfs_exchrange.h"
#include "xfs_handle.h"
#include "xfs_rtgroup.h"
#include "xfs_healthmon.h"
#include "xfs_verify_media.h"
#include "xfs_zone_priv.h"
#include "xfs_zone_alloc.h"
#include <linux/mount.h>
#include <linux/fileattr.h>
@@ -413,6 +416,7 @@ xfs_ioc_rtgroup_geometry(
{
struct xfs_rtgroup *rtg;
struct xfs_rtgroup_geometry rgeo;
xfs_rgblock_t highest_rgbno;
int error;
if (copy_from_user(&rgeo, arg, sizeof(rgeo)))
@@ -433,6 +437,21 @@ xfs_ioc_rtgroup_geometry(
if (error)
return error;
if (xfs_has_zoned(mp)) {
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
if (rtg->rtg_open_zone) {
rgeo.rg_writepointer = rtg->rtg_open_zone->oz_allocated;
} else {
highest_rgbno = xfs_rtrmap_highest_rgbno(rtg);
if (highest_rgbno == NULLRGBLOCK)
rgeo.rg_writepointer = 0;
else
rgeo.rg_writepointer = highest_rgbno + 1;
}
xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
rgeo.rg_flags |= XFS_RTGROUP_GEOM_WRITEPOINTER;
}
if (copy_to_user(arg, &rgeo, sizeof(rgeo)))
return -EFAULT;
return 0;

View File

@@ -1593,6 +1593,7 @@ xfs_zoned_buffered_write_iomap_begin(
{
struct iomap_iter *iter =
container_of(iomap, struct iomap_iter, iomap);
struct address_space *mapping = inode->i_mapping;
struct xfs_zone_alloc_ctx *ac = iter->private;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1617,6 +1618,7 @@ xfs_zoned_buffered_write_iomap_begin(
if (error)
return error;
restart:
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
@@ -1654,14 +1656,6 @@ xfs_zoned_buffered_write_iomap_begin(
&smap))
smap.br_startoff = end_fsb; /* fake hole until EOF */
if (smap.br_startoff > offset_fsb) {
/*
* We never need to allocate blocks for zeroing a hole.
*/
if (flags & IOMAP_ZERO) {
xfs_hole_to_iomap(ip, iomap, offset_fsb,
smap.br_startoff);
goto out_unlock;
}
end_fsb = min(end_fsb, smap.br_startoff);
} else {
end_fsb = min(end_fsb,
@@ -1693,6 +1687,33 @@ xfs_zoned_buffered_write_iomap_begin(
count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN,
XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE));
/*
* When zeroing, don't allocate blocks for holes as they are already
* zeroes, but we need to ensure that no extents exist in both the data
* and COW fork to ensure this really is a hole.
*
* A window exists where we might observe a hole in both forks with
* valid data in cache. Writeback removes the COW fork blocks on
* submission but doesn't remap into the data fork until completion. If
* the data fork was previously a hole, we'll fail to zero. Until we
* find a way to avoid this transient state, check for dirty pagecache
* and flush to wait on blocks to land in the data fork.
*/
if ((flags & IOMAP_ZERO) && srcmap->type == IOMAP_HOLE) {
if (filemap_range_needs_writeback(mapping, offset,
offset + count - 1)) {
xfs_iunlock(ip, lockmode);
error = filemap_write_and_wait_range(mapping, offset,
offset + count - 1);
if (error)
return error;
goto restart;
}
xfs_hole_to_iomap(ip, iomap, offset_fsb, end_fsb);
goto out_unlock;
}
/*
* The block reservation is supposed to cover all blocks that the
* operation could possible write, but there is a nasty corner case
@@ -1767,6 +1788,8 @@ xfs_buffered_write_iomap_begin(
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
xfs_fileoff_t cow_fsb = NULLFILEOFF;
xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
struct xfs_bmbt_irec imap, cmap;
struct xfs_iext_cursor icur, ccur;
xfs_fsblock_t prealloc_blocks = 0;
@@ -1811,30 +1834,96 @@ xfs_buffered_write_iomap_begin(
goto out_unlock;
/*
* Search the data fork first to look up our source mapping. We
* always need the data fork map, as we have to return it to the
* iomap code so that the higher level write code can read data in to
* perform read-modify-write cycles for unaligned writes.
* Search the data fork first to look up our source mapping. We always
* need the data fork map, as we have to return it to the iomap code so
* that the higher level write code can read data in to perform
* read-modify-write cycles for unaligned writes.
*
* Then search the COW fork extent list even if we did not find a data
* fork extent. This serves two purposes: first this implements the
* speculative preallocation using cowextsize, so that we also unshare
* block adjacent to shared blocks instead of just the shared blocks
* themselves. Second the lookup in the extent list is generally faster
* than going out to the shared extent tree.
*/
eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
if (eof)
imap.br_startoff = end_fsb; /* fake hole until the end */
if (xfs_is_cow_inode(ip)) {
if (!ip->i_cowfp) {
ASSERT(!xfs_is_reflink_inode(ip));
xfs_ifork_init_cow(ip);
}
cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
&ccur, &cmap);
if (!cow_eof)
cow_fsb = cmap.br_startoff;
}
/* We never need to allocate blocks for zeroing or unsharing a hole. */
if ((flags & (IOMAP_UNSHARE | IOMAP_ZERO)) &&
imap.br_startoff > offset_fsb) {
/* We never need to allocate blocks for unsharing a hole. */
if ((flags & IOMAP_UNSHARE) && imap.br_startoff > offset_fsb) {
xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
goto out_unlock;
}
/*
* We may need to zero over a hole in the data fork if it's fronted by
* COW blocks and dirty pagecache. Scan such file ranges for dirty
* cache and fill the iomap batch with folios that need zeroing.
*/
if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
loff_t start, end;
unsigned int fbatch_count;
imap.br_blockcount = imap.br_startoff - offset_fsb;
imap.br_startoff = offset_fsb;
imap.br_startblock = HOLESTARTBLOCK;
imap.br_state = XFS_EXT_NORM;
if (cow_fsb == NULLFILEOFF)
goto found_imap;
if (cow_fsb > offset_fsb) {
xfs_trim_extent(&imap, offset_fsb,
cow_fsb - offset_fsb);
goto found_imap;
}
/* no zeroing beyond eof, so split at the boundary */
if (offset_fsb >= eof_fsb)
goto found_imap;
if (offset_fsb < eof_fsb && end_fsb > eof_fsb)
xfs_trim_extent(&imap, offset_fsb,
eof_fsb - offset_fsb);
/* COW fork blocks overlap the hole */
xfs_trim_extent(&imap, offset_fsb,
cmap.br_startoff + cmap.br_blockcount - offset_fsb);
start = XFS_FSB_TO_B(mp, imap.br_startoff);
end = XFS_FSB_TO_B(mp, imap.br_startoff + imap.br_blockcount);
fbatch_count = iomap_fill_dirty_folios(iter, &start, end,
&iomap_flags);
xfs_trim_extent(&imap, offset_fsb,
XFS_B_TO_FSB(mp, start) - offset_fsb);
/*
* Report the COW mapping if we have folios to zero. Otherwise
* ignore the COW blocks as preallocation and report a hole.
*/
if (fbatch_count) {
xfs_trim_extent(&cmap, imap.br_startoff,
imap.br_blockcount);
imap.br_startoff = end_fsb; /* fake hole */
goto found_cow;
}
goto found_imap;
}
/*
* For zeroing, trim extents that extend beyond the EOF block. If a
* delalloc extent starts beyond the EOF block, convert it to an
* unwritten extent.
*/
if (flags & IOMAP_ZERO) {
xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
if (isnullstartblock(imap.br_startblock) &&
offset_fsb >= eof_fsb)
goto convert_delay;
@@ -1867,24 +1956,13 @@ xfs_buffered_write_iomap_begin(
}
/*
* Search the COW fork extent list even if we did not find a data fork
* extent. This serves two purposes: first this implements the
* speculative preallocation using cowextsize, so that we also unshare
* block adjacent to shared blocks instead of just the shared blocks
* themselves. Second the lookup in the extent list is generally faster
* than going out to the shared extent tree.
* Now that we've handled any operation specific special cases, at this
* point we can report a COW mapping if found.
*/
if (xfs_is_cow_inode(ip)) {
if (!ip->i_cowfp) {
ASSERT(!xfs_is_reflink_inode(ip));
xfs_ifork_init_cow(ip);
}
cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
&ccur, &cmap);
if (!cow_eof && cmap.br_startoff <= offset_fsb) {
trace_xfs_reflink_cow_found(ip, &cmap);
goto found_cow;
}
if (xfs_is_cow_inode(ip) &&
!cow_eof && cmap.br_startoff <= offset_fsb) {
trace_xfs_reflink_cow_found(ip, &cmap);
goto found_cow;
}
if (imap.br_startoff <= offset_fsb) {

View File

@@ -901,20 +901,18 @@ out_dqrele:
/*
* Truncate file. Must have write permission and not be a directory.
*
* Caution: The caller of this function is responsible for calling
* setattr_prepare() or otherwise verifying the change is fine.
*/
STATIC int
xfs_setattr_size(
int
xfs_vn_setattr_size(
struct mnt_idmap *idmap,
struct dentry *dentry,
struct xfs_inode *ip,
struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
xfs_off_t oldsize, newsize;
xfs_off_t oldsize = inode->i_size;
xfs_off_t newsize = iattr->ia_size;
struct xfs_trans *tp;
int error;
uint lock_flags = 0;
@@ -927,8 +925,11 @@ xfs_setattr_size(
ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
oldsize = inode->i_size;
newsize = iattr->ia_size;
trace_xfs_setattr(ip);
error = xfs_vn_change_ok(idmap, dentry, iattr);
if (error)
return error;
/*
* Short circuit the truncate case for zero length files.
@@ -1109,7 +1110,6 @@ xfs_setattr_size(
xfs_inode_clear_eofblocks_tag(ip);
}
ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
setattr_copy(idmap, inode, iattr);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -1129,23 +1129,6 @@ out_trans_cancel:
goto out_unlock;
}
int
xfs_vn_setattr_size(
struct mnt_idmap *idmap,
struct dentry *dentry,
struct iattr *iattr)
{
struct xfs_inode *ip = XFS_I(d_inode(dentry));
int error;
trace_xfs_setattr(ip);
error = xfs_vn_change_ok(idmap, dentry, iattr);
if (error)
return error;
return xfs_setattr_size(idmap, dentry, ip, iattr);
}
STATIC int
xfs_vn_setattr(
struct mnt_idmap *idmap,

View File

@@ -44,17 +44,36 @@
#include "xfs_healthmon.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
static uuid_t *xfs_uuid_table;
static DEFINE_XARRAY_ALLOC(xfs_uuid_table);
static uuid_t *
xfs_uuid_search(
uuid_t *new_uuid)
{
unsigned long index = 0;
uuid_t *uuid;
xa_for_each(&xfs_uuid_table, index, uuid) {
if (uuid_equal(uuid, new_uuid))
return uuid;
}
return NULL;
}
static void
xfs_uuid_delete(
uuid_t *uuid,
unsigned int index)
{
ASSERT(uuid_equal(xa_load(&xfs_uuid_table, index), uuid));
xa_erase(&xfs_uuid_table, index);
}
void
xfs_uuid_table_free(void)
{
if (xfs_uuid_table_size == 0)
return;
kfree(xfs_uuid_table);
xfs_uuid_table = NULL;
xfs_uuid_table_size = 0;
ASSERT(xa_empty(&xfs_uuid_table));
xa_destroy(&xfs_uuid_table);
}
/*
@@ -66,7 +85,7 @@ xfs_uuid_mount(
struct xfs_mount *mp)
{
uuid_t *uuid = &mp->m_sb.sb_uuid;
int hole, i;
int ret;
/* Publish UUID in struct super_block */
super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid));
@@ -80,30 +99,17 @@ xfs_uuid_mount(
}
mutex_lock(&xfs_uuid_table_mutex);
for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
if (uuid_is_null(&xfs_uuid_table[i])) {
hole = i;
continue;
}
if (uuid_equal(uuid, &xfs_uuid_table[i]))
goto out_duplicate;
if (unlikely(xfs_uuid_search(uuid))) {
xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount",
uuid);
mutex_unlock(&xfs_uuid_table_mutex);
return -EINVAL;
}
if (hole < 0) {
xfs_uuid_table = krealloc(xfs_uuid_table,
(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
GFP_KERNEL | __GFP_NOFAIL);
hole = xfs_uuid_table_size++;
}
xfs_uuid_table[hole] = *uuid;
ret = xa_alloc(&xfs_uuid_table, &mp->m_uuid_table_index, uuid,
xa_limit_32b, GFP_KERNEL);
mutex_unlock(&xfs_uuid_table_mutex);
return 0;
out_duplicate:
mutex_unlock(&xfs_uuid_table_mutex);
xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
return -EINVAL;
return ret;
}
STATIC void
@@ -111,21 +117,12 @@ xfs_uuid_unmount(
struct xfs_mount *mp)
{
uuid_t *uuid = &mp->m_sb.sb_uuid;
int i;
if (xfs_has_nouuid(mp))
return;
mutex_lock(&xfs_uuid_table_mutex);
for (i = 0; i < xfs_uuid_table_size; i++) {
if (uuid_is_null(&xfs_uuid_table[i]))
continue;
if (!uuid_equal(uuid, &xfs_uuid_table[i]))
continue;
memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
break;
}
ASSERT(i < xfs_uuid_table_size);
xfs_uuid_delete(uuid, mp->m_uuid_table_index);
mutex_unlock(&xfs_uuid_table_mutex);
}

View File

@@ -346,6 +346,9 @@ typedef struct xfs_mount {
/* Private data referring to a health monitor object. */
struct xfs_healthmon __rcu *m_healthmon;
/* Index of uuid record in the uuid xarray. */
unsigned int m_uuid_table_index;
} xfs_mount_t;
#define M_IGEO(mp) (&(mp)->m_ino_geo)

View File

@@ -391,6 +391,38 @@ out_rele:
return error;
}
/*
* Fill out the default quota limits for an ID that has no dquot on disk.
* Returns 0 if default limits are configured
* and were filled in, -ENOENT otherwise.
*/
static int
xfs_qm_scall_getquota_fill_defaults(
struct xfs_mount *mp,
xfs_dqtype_t type,
struct qc_dqblk *dst)
{
struct xfs_def_quota *defq;
defq = xfs_get_defquota(mp->m_quotainfo, type);
if (!defq->blk.soft && !defq->blk.hard &&
!defq->ino.soft && !defq->ino.hard &&
!defq->rtb.soft && !defq->rtb.hard) {
return -ENOENT;
}
memset(dst, 0, sizeof(*dst));
dst->d_spc_softlimit = XFS_FSB_TO_B(mp, defq->blk.soft);
dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, defq->blk.hard);
dst->d_ino_softlimit = defq->ino.soft;
dst->d_ino_hardlimit = defq->ino.hard;
dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, defq->rtb.soft);
dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, defq->rtb.hard);
return 0;
}
/* Fill out the quota context. */
static void
xfs_qm_scall_getquota_fill_qc(
@@ -451,8 +483,17 @@ xfs_qm_scall_getquota(
* set doalloc. If it doesn't exist, we'll get ENOENT back.
*/
error = xfs_qm_dqget(mp, id, type, false, &dqp);
if (error)
if (error) {
/*
* If there is no dquot on disk and default limits are
* configured, return them with zero usage so that
* unprivileged users can see what limits apply to them.
*/
if (error == -ENOENT && id != 0 &&
!xfs_qm_scall_getquota_fill_defaults(mp, type, dst))
return 0;
return error;
}
/*
* If everything's NULL, this dquot doesn't quite exist as far as

View File

@@ -266,7 +266,7 @@ xfs_refcount_update_diff_items(
struct xfs_refcount_intent *ra = ci_entry(a);
struct xfs_refcount_intent *rb = ci_entry(b);
return ra->ri_group->xg_gno - rb->ri_group->xg_gno;
return cmp_int(ra->ri_group->xg_gno, rb->ri_group->xg_gno);
}
/* Log refcount updates in the intent item. */

View File

@@ -267,7 +267,7 @@ xfs_rmap_update_diff_items(
struct xfs_rmap_intent *ra = ri_entry(a);
struct xfs_rmap_intent *rb = ri_entry(b);
return ra->ri_group->xg_gno - rb->ri_group->xg_gno;
return cmp_int(ra->ri_group->xg_gno, rb->ri_group->xg_gno);
}
/* Log rmap updates in the intent item. */

View File

@@ -13,7 +13,9 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_mount.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_zone_alloc.h"
struct xfs_sysfs_attr {
struct attribute attr;
@@ -718,12 +720,24 @@ max_open_zones_show(
}
XFS_SYSFS_ATTR_RO(max_open_zones);
static ssize_t
nr_open_zones_show(
struct kobject *kobj,
char *buf)
{
struct xfs_zone_info *zi = zoned_to_mp(kobj)->m_zone_info;
return sysfs_emit(buf, "%u\n", READ_ONCE(zi->zi_nr_open_zones));
}
XFS_SYSFS_ATTR_RO(nr_open_zones);
static ssize_t
zonegc_low_space_store(
struct kobject *kobj,
const char *buf,
size_t count)
{
struct xfs_mount *mp = zoned_to_mp(kobj);
int ret;
unsigned int val;
@@ -734,7 +748,10 @@ zonegc_low_space_store(
if (val > 100)
return -EINVAL;
zoned_to_mp(kobj)->m_zonegc_low_space = val;
if (mp->m_zonegc_low_space != val) {
mp->m_zonegc_low_space = val;
xfs_zone_gc_wakeup(mp);
}
return count;
}
@@ -751,6 +768,7 @@ XFS_SYSFS_ATTR_RW(zonegc_low_space);
static struct attribute *xfs_zoned_attrs[] = {
ATTR_LIST(max_open_zones),
ATTR_LIST(nr_open_zones),
ATTR_LIST(zonegc_low_space),
NULL,
};

View File

@@ -394,6 +394,7 @@ DEFINE_ZONE_EVENT(xfs_zone_full);
DEFINE_ZONE_EVENT(xfs_zone_opened);
DEFINE_ZONE_EVENT(xfs_zone_reset);
DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened);
DEFINE_ZONE_EVENT(xfs_zone_gc_target_stolen);
TRACE_EVENT(xfs_zone_free_blocks,
TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
@@ -461,6 +462,7 @@ DEFINE_EVENT(xfs_zone_alloc_class, name, \
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_skip_blocks);
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_spurious_open);
TRACE_EVENT(xfs_zone_gc_select_victim,
TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket),
@@ -740,7 +742,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->dev = bp->b_target->bt_dev;
__entry->bno = xfs_buf_daddr(bp);
__entry->nblks = bp->b_length;
__entry->hold = bp->b_hold;
__entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
@@ -814,7 +816,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
__entry->bno = xfs_buf_daddr(bp);
__entry->length = bp->b_length;
__entry->flags = flags;
__entry->hold = bp->b_hold;
__entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->caller_ip = caller_ip;
@@ -858,7 +860,7 @@ TRACE_EVENT(xfs_buf_ioerror,
__entry->dev = bp->b_target->bt_dev;
__entry->bno = xfs_buf_daddr(bp);
__entry->length = bp->b_length;
__entry->hold = bp->b_hold;
__entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->error = error;
@@ -902,7 +904,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
__entry->buf_len = bip->bli_buf->b_length;
__entry->buf_flags = bip->bli_buf->b_flags;
__entry->buf_hold = bip->bli_buf->b_hold;
__entry->buf_hold = bip->bli_buf->b_lockref.count;
__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
__entry->buf_lockval = bip->bli_buf->b_sema.count;
__entry->li_flags = bip->bli_item.li_flags;
@@ -5206,7 +5208,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class,
__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
__entry->bno = xfs_buf_daddr(bp);
__entry->nblks = bp->b_length;
__entry->hold = bp->b_hold;
__entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;

View File

@@ -174,42 +174,33 @@ xfs_open_zone_mark_full(
WRITE_ONCE(rtg->rtg_open_zone, NULL);
spin_lock(&zi->zi_open_zones_lock);
if (oz->oz_is_gc) {
ASSERT(current == zi->zi_gc_thread);
zi->zi_open_gc_zone = NULL;
} else {
if (oz->oz_is_gc)
zi->zi_nr_open_gc_zones--;
else
zi->zi_nr_open_zones--;
list_del_init(&oz->oz_entry);
}
list_del_init(&oz->oz_entry);
spin_unlock(&zi->zi_open_zones_lock);
xfs_open_zone_put(oz);
wake_up_all(&zi->zi_zone_wait);
if (oz->oz_is_gc)
wake_up_process(zi->zi_gc_thread);
else
wake_up_all(&zi->zi_zone_wait);
if (used < rtg_blocks(rtg))
xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
xfs_open_zone_put(oz);
}
static void
xfs_zone_record_blocks(
struct xfs_trans *tp,
static inline void
xfs_zone_inc_written(
struct xfs_open_zone *oz,
xfs_fsblock_t fsbno,
xfs_filblks_t len)
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = oz->oz_rtg;
struct xfs_inode *rmapip = rtg_rmap(rtg);
xfs_assert_ilocked(rtg_rmap(oz->oz_rtg), XFS_ILOCK_EXCL);
trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len);
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
rmapip->i_used_blocks += len;
ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
oz->oz_written += len;
if (oz->oz_written == rtg_blocks(rtg))
if (oz->oz_written == rtg_blocks(oz->oz_rtg))
xfs_open_zone_mark_full(oz);
xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
}
/*
@@ -227,9 +218,7 @@ xfs_zone_skip_blocks(
trace_xfs_zone_skip_blocks(oz, 0, len);
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
oz->oz_written += len;
if (oz->oz_written == rtg_blocks(rtg))
xfs_open_zone_mark_full(oz);
xfs_zone_inc_written(oz, len);
xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
xfs_add_frextents(rtg_mount(rtg), len);
@@ -244,6 +233,8 @@ xfs_zoned_map_extent(
xfs_fsblock_t old_startblock)
{
struct xfs_bmbt_irec data;
struct xfs_rtgroup *rtg = oz->oz_rtg;
struct xfs_inode *rmapip = rtg_rmap(rtg);
int nmaps = 1;
int error;
@@ -302,7 +293,15 @@ xfs_zoned_map_extent(
}
}
xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount);
trace_xfs_zone_record_blocks(oz,
xfs_rtb_to_rgbno(tp->t_mountp, new->br_startblock),
new->br_blockcount);
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
rmapip->i_used_blocks += new->br_blockcount;
ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
xfs_zone_inc_written(oz, new->br_blockcount);
xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
/* Map the new blocks into the data fork. */
xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
@@ -560,6 +559,9 @@ xfs_try_use_zone(
struct xfs_open_zone *oz,
unsigned int goodness)
{
if (oz->oz_is_gc)
return false;
if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
return false;
@@ -681,10 +683,11 @@ xfs_select_zone_nowait(
if (oz)
goto out_unlock;
if (pack_tight)
if (pack_tight) {
oz = xfs_select_open_zone_mru(zi, write_hint);
if (oz)
goto out_unlock;
if (oz)
goto out_unlock;
}
/*
* See if we can open a new zone and use that so that data for different
@@ -695,7 +698,7 @@ xfs_select_zone_nowait(
goto out_unlock;
/*
* Try to find an zone that is an ok match to colocate data with.
* Try to find a zone that is an ok match to colocate data with.
*/
oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK);
if (oz)
@@ -1232,6 +1235,100 @@ xfs_free_zone_info(
kfree(zi);
}
static int
xfs_report_zones(
struct xfs_mount *mp,
struct xfs_init_zones *iz)
{
struct xfs_rtgroup *rtg = NULL;
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
xfs_rgblock_t write_pointer;
int error;
error = xfs_query_write_pointer(iz, rtg, &write_pointer);
if (!error)
error = xfs_init_zone(iz, rtg, write_pointer);
if (error) {
xfs_rtgroup_rele(rtg);
return error;
}
}
return 0;
}
static inline bool
xfs_zone_is_conv(
struct xfs_rtgroup *rtg)
{
return !bdev_zone_is_seq(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
xfs_gbno_to_daddr(rtg_group(rtg), 0));
}
static struct xfs_open_zone *
xfs_find_fullest_conventional_open_zone(
struct xfs_mount *mp)
{
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_open_zone *found = NULL, *oz;
spin_lock(&zi->zi_open_zones_lock);
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
if (!xfs_zone_is_conv(oz->oz_rtg))
continue;
if (!found || oz->oz_allocated > found->oz_allocated)
found = oz;
}
spin_unlock(&zi->zi_open_zones_lock);
return found;
}
/*
* Find the fullest conventional zones and remove them from the open zone pool
* until we are at the open zone limit.
*
* We can end up with spurious "open" zones when the last blocks in a fully
 * written zone were invalidated, as there is no write pointer for conventional
* zones.
*
 * If we are still over the limit when there is no conventional open zone left,
 * the user overrode the max open zones limit using the max_open_zones mount
 * option, and we should fail.
*/
static int
xfs_finish_spurious_open_zones(
struct xfs_mount *mp,
struct xfs_init_zones *iz)
{
struct xfs_zone_info *zi = mp->m_zone_info;
while (zi->zi_nr_open_zones > mp->m_max_open_zones) {
struct xfs_open_zone *oz;
xfs_filblks_t adjust;
oz = xfs_find_fullest_conventional_open_zone(mp);
if (!oz) {
xfs_err(mp,
"too many open zones for max_open_zones limit (%u/%u)",
zi->zi_nr_open_zones, mp->m_max_open_zones);
return -EINVAL;
}
xfs_rtgroup_lock(oz->oz_rtg, XFS_RTGLOCK_RMAP);
adjust = rtg_blocks(oz->oz_rtg) - oz->oz_written;
trace_xfs_zone_spurious_open(oz, oz->oz_written, adjust);
oz->oz_written = rtg_blocks(oz->oz_rtg);
xfs_open_zone_mark_full(oz);
xfs_rtgroup_unlock(oz->oz_rtg, XFS_RTGLOCK_RMAP);
iz->available -= adjust;
iz->reclaimable += adjust;
}
return 0;
}
int
xfs_mount_zones(
struct xfs_mount *mp)
@@ -1240,7 +1337,6 @@ xfs_mount_zones(
.zone_capacity = mp->m_groups[XG_TYPE_RTG].blocks,
.zone_size = xfs_rtgroup_raw_size(mp),
};
struct xfs_rtgroup *rtg = NULL;
int error;
if (!mp->m_rtdev_targp) {
@@ -1270,9 +1366,17 @@ xfs_mount_zones(
if (!mp->m_zone_info)
return -ENOMEM;
xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
mp->m_sb.sb_rgcount, iz.zone_capacity, mp->m_max_open_zones);
trace_xfs_zones_mount(mp);
error = xfs_report_zones(mp, &iz);
if (error)
goto out_free_zone_info;
error = xfs_finish_spurious_open_zones(mp, &iz);
if (error)
goto out_free_zone_info;
xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
iz.available + iz.reclaimable);
/*
* The writeback code switches between inodes regularly to provide
@@ -1298,22 +1402,6 @@ xfs_mount_zones(
XFS_FSB_TO_B(mp, min(iz.zone_capacity, XFS_MAX_BMBT_EXTLEN)) >>
PAGE_SHIFT;
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
xfs_rgblock_t write_pointer;
error = xfs_query_write_pointer(&iz, rtg, &write_pointer);
if (!error)
error = xfs_init_zone(&iz, rtg, write_pointer);
if (error) {
xfs_rtgroup_rele(rtg);
goto out_free_zone_info;
}
}
xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
iz.available + iz.reclaimable);
/*
* The user may configure GC to free up a percentage of unused blocks.
* By default this is 0. GC will always trigger at the minimum level
@@ -1324,6 +1412,10 @@ xfs_mount_zones(
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;
xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
mp->m_sb.sb_rgcount, iz.zone_capacity, mp->m_max_open_zones);
trace_xfs_zones_mount(mp);
return 0;
out_free_zone_info:

View File

@@ -51,6 +51,7 @@ int xfs_mount_zones(struct xfs_mount *mp);
void xfs_unmount_zones(struct xfs_mount *mp);
void xfs_zone_gc_start(struct xfs_mount *mp);
void xfs_zone_gc_stop(struct xfs_mount *mp);
void xfs_zone_gc_wakeup(struct xfs_mount *mp);
#else
static inline int xfs_mount_zones(struct xfs_mount *mp)
{
@@ -65,6 +66,9 @@ static inline void xfs_zone_gc_start(struct xfs_mount *mp)
static inline void xfs_zone_gc_stop(struct xfs_mount *mp)
{
}
static inline void xfs_zone_gc_wakeup(struct xfs_mount *mp)
{
}
#endif /* CONFIG_XFS_RT */
#endif /* _XFS_ZONE_ALLOC_H */

View File

@@ -125,6 +125,7 @@ struct xfs_zone_gc_iter {
*/
struct xfs_zone_gc_data {
struct xfs_mount *mp;
struct xfs_open_zone *oz;
/* bioset used to allocate the gc_bios */
struct bio_set bio_set;
@@ -170,25 +171,37 @@ xfs_zoned_need_gc(
s64 available, free, threshold;
s32 remainder;
/* If we have no reclaimable blocks, running GC is useless. */
if (!xfs_zoned_have_reclaimable(mp->m_zone_info))
return false;
/*
* In order to avoid file fragmentation as much as possible, we should
* make sure that we can open enough zones. So trigger GC if the number
* of blocks immediately available for writes is lower than the total
* number of blocks from all possible open zones.
*/
available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
if (available <
xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
return true;
free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
/*
* For cases where the user wants to be more aggressive with GC,
* the sysfs attribute zonegc_low_space may be set to a non zero value,
* to indicate that GC should try to maintain at least zonegc_low_space
* percent of the free space to be directly available for writing. Check
* this here.
*/
if (!mp->m_zonegc_low_space)
return false;
free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
threshold = div_s64_rem(free, 100, &remainder);
threshold = threshold * mp->m_zonegc_low_space +
remainder * div_s64(mp->m_zonegc_low_space, 100);
if (available < threshold)
return true;
return false;
return available < threshold;
}
static struct xfs_zone_gc_data *
@@ -362,7 +375,7 @@ done:
}
static bool
xfs_zone_gc_iter_next(
xfs_zone_gc_iter_irec(
struct xfs_mount *mp,
struct xfs_zone_gc_iter *iter,
struct xfs_rmap_irec *chunk_rec,
@@ -371,9 +384,6 @@ xfs_zone_gc_iter_next(
struct xfs_rmap_irec *irec;
int error;
if (!iter->victim_rtg)
return false;
retry:
if (iter->rec_idx == iter->rec_count) {
error = xfs_zone_gc_query(mp, iter);
@@ -515,10 +525,11 @@ xfs_zone_gc_select_victim(
return true;
}
static struct xfs_open_zone *
xfs_zone_gc_steal_open(
struct xfs_zone_info *zi)
static int
xfs_zone_gc_steal_open_zone(
struct xfs_zone_gc_data *data)
{
struct xfs_zone_info *zi = data->mp->m_zone_info;
struct xfs_open_zone *oz, *found = NULL;
spin_lock(&zi->zi_open_zones_lock);
@@ -526,56 +537,64 @@ xfs_zone_gc_steal_open(
if (!found || oz->oz_allocated < found->oz_allocated)
found = oz;
}
if (found) {
found->oz_is_gc = true;
list_del_init(&found->oz_entry);
zi->zi_nr_open_zones--;
if (!found) {
spin_unlock(&zi->zi_open_zones_lock);
return -EIO;
}
trace_xfs_zone_gc_target_stolen(found->oz_rtg);
found->oz_is_gc = true;
zi->zi_nr_open_zones--;
zi->zi_nr_open_gc_zones++;
spin_unlock(&zi->zi_open_zones_lock);
return found;
}
static struct xfs_open_zone *
xfs_zone_gc_select_target(
struct xfs_mount *mp)
{
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_open_zone *oz = zi->zi_open_gc_zone;
/*
* We need to wait for pending writes to finish.
*/
if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
return NULL;
ASSERT(zi->zi_nr_open_zones <=
mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
if (oz)
trace_xfs_zone_gc_target_opened(oz->oz_rtg);
spin_lock(&zi->zi_open_zones_lock);
zi->zi_open_gc_zone = oz;
spin_unlock(&zi->zi_open_zones_lock);
return oz;
atomic_inc(&found->oz_ref);
data->oz = found;
return 0;
}
/*
* Ensure we have a valid open zone to write the GC data to.
*
* If the current target zone has space keep writing to it, else first wait for
* all pending writes and then pick a new one.
* Ensure we have a valid open zone to write to.
*/
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
struct xfs_mount *mp)
static bool
xfs_zone_gc_select_target(
struct xfs_zone_gc_data *data)
{
struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone;
struct xfs_zone_info *zi = data->mp->m_zone_info;
if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
return xfs_zone_gc_select_target(mp);
return oz;
if (data->oz) {
/*
* If we have space available, just keep using the existing
* zone.
*/
if (data->oz->oz_allocated < rtg_blocks(data->oz->oz_rtg))
return true;
/*
* Wait for all writes to the current zone to finish before
* picking a new one.
*/
if (data->oz->oz_written < rtg_blocks(data->oz->oz_rtg))
return false;
xfs_open_zone_put(data->oz);
}
/*
* Open a new zone when there is none currently in use.
*/
ASSERT(zi->zi_nr_open_zones <=
data->mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
data->oz = xfs_open_zone(data->mp, WRITE_LIFE_NOT_SET, true);
if (!data->oz)
return false;
trace_xfs_zone_gc_target_opened(data->oz->oz_rtg);
atomic_inc(&data->oz->oz_ref);
spin_lock(&zi->zi_open_zones_lock);
zi->zi_nr_open_gc_zones++;
list_add_tail(&data->oz->oz_entry, &zi->zi_open_zones);
spin_unlock(&zi->zi_open_zones_lock);
return true;
}
static void
@@ -590,7 +609,7 @@ xfs_zone_gc_end_io(
wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}
static struct xfs_open_zone *
static bool
xfs_zone_gc_alloc_blocks(
struct xfs_zone_gc_data *data,
xfs_extlen_t *count_fsb,
@@ -598,11 +617,7 @@ xfs_zone_gc_alloc_blocks(
bool *is_seq)
{
struct xfs_mount *mp = data->mp;
struct xfs_open_zone *oz;
oz = xfs_zone_gc_ensure_target(mp);
if (!oz)
return NULL;
struct xfs_open_zone *oz = data->oz;
*count_fsb = min(*count_fsb, XFS_B_TO_FSB(mp, data->scratch_available));
@@ -624,7 +639,7 @@ xfs_zone_gc_alloc_blocks(
spin_unlock(&mp->m_sb_lock);
if (!*count_fsb)
return NULL;
return false;
*daddr = xfs_gbno_to_daddr(rtg_group(oz->oz_rtg), 0);
*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
@@ -632,7 +647,7 @@ xfs_zone_gc_alloc_blocks(
*daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
oz->oz_allocated += *count_fsb;
atomic_inc(&oz->oz_ref);
return oz;
return true;
}
static void
@@ -657,6 +672,28 @@ xfs_zone_gc_add_data(
} while (len);
}
static bool
xfs_zone_gc_can_start_chunk(
struct xfs_zone_gc_data *data)
{
if (xfs_is_shutdown(data->mp))
return false;
if (!data->scratch_available)
return false;
if (!data->iter.victim_rtg) {
if (kthread_should_stop() || kthread_should_park())
return false;
if (!xfs_zoned_need_gc(data->mp))
return false;
if (!xfs_zone_gc_select_victim(data))
return false;
}
return xfs_zone_gc_select_target(data);
}
static bool
xfs_zone_gc_start_chunk(
struct xfs_zone_gc_data *data)
@@ -664,7 +701,6 @@ xfs_zone_gc_start_chunk(
struct xfs_zone_gc_iter *iter = &data->iter;
struct xfs_mount *mp = data->mp;
struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
struct xfs_open_zone *oz;
struct xfs_rmap_irec irec;
struct xfs_gc_bio *chunk;
struct xfs_inode *ip;
@@ -672,14 +708,15 @@ xfs_zone_gc_start_chunk(
xfs_daddr_t daddr;
bool is_seq;
if (xfs_is_shutdown(mp))
if (!xfs_zone_gc_can_start_chunk(data))
return false;
if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
set_current_state(TASK_RUNNING);
if (!xfs_zone_gc_iter_irec(mp, iter, &irec, &ip))
return false;
oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
&is_seq);
if (!oz) {
if (!xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
&is_seq)) {
xfs_irele(ip);
return false;
}
@@ -699,7 +736,7 @@ xfs_zone_gc_start_chunk(
chunk->new_daddr = daddr;
chunk->is_seq = is_seq;
chunk->data = data;
chunk->oz = oz;
chunk->oz = data->oz;
chunk->victim_rtg = iter->victim_rtg;
atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref);
atomic_inc(&chunk->victim_rtg->rtg_gccount);
@@ -985,33 +1022,6 @@ xfs_zone_gc_reset_zones(
} while (next);
}
static bool
xfs_zone_gc_should_start_new_work(
struct xfs_zone_gc_data *data)
{
struct xfs_open_zone *oz;
if (xfs_is_shutdown(data->mp))
return false;
if (!data->scratch_available)
return false;
oz = xfs_zone_gc_ensure_target(data->mp);
if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
return false;
if (!data->iter.victim_rtg) {
if (kthread_should_stop() || kthread_should_park())
return false;
if (!xfs_zoned_need_gc(data->mp))
return false;
if (!xfs_zone_gc_select_victim(data))
return false;
}
return true;
}
/*
* Handle the work to read and write data for GC and to reset the zones,
* including handling all completions.
@@ -1061,13 +1071,10 @@ xfs_zone_gc_handle_work(
}
blk_finish_plug(&plug);
if (xfs_zone_gc_should_start_new_work(data)) {
set_current_state(TASK_RUNNING);
blk_start_plug(&plug);
while (xfs_zone_gc_start_chunk(data))
;
blk_finish_plug(&plug);
}
blk_start_plug(&plug);
while (xfs_zone_gc_start_chunk(data))
;
blk_finish_plug(&plug);
}
/*
@@ -1127,6 +1134,8 @@ xfs_zoned_gcd(
}
xfs_clear_zonegc_running(mp);
if (data->oz)
xfs_open_zone_put(data->oz);
if (data->iter.victim_rtg)
xfs_rtgroup_rele(data->iter.victim_rtg);
@@ -1151,41 +1160,49 @@ xfs_zone_gc_stop(
kthread_park(mp->m_zone_info->zi_gc_thread);
}
void
xfs_zone_gc_wakeup(
struct xfs_mount *mp)
{
struct super_block *sb = mp->m_super;
	/*
	 * If we are unmounting the file system, we must not try to wake
	 * GC, as m_zone_info might have been freed already.
	 */
if (down_read_trylock(&sb->s_umount)) {
if (!xfs_is_readonly(mp))
wake_up_process(mp->m_zone_info->zi_gc_thread);
up_read(&sb->s_umount);
}
}
int
xfs_zone_gc_mount(
struct xfs_mount *mp)
{
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_zone_gc_data *data;
struct xfs_open_zone *oz;
int error;
data = xfs_zone_gc_data_alloc(mp);
if (!data)
return -ENOMEM;
/*
* If there are no free zones available for GC, pick the open zone with
* If there are no free zones available for GC, or the number of open
* zones has reached the open zone limit, pick the open zone with
* the least used space to GC into. This should only happen after an
* unclean shutdown near ENOSPC while GC was ongoing.
*
* We also need to do this for the first gc zone allocation if we
* unmounted while at the open limit.
* unclean shutdown while GC was ongoing. Otherwise a GC zone will
* be selected from the free zone pool on demand.
*/
if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
zi->zi_nr_open_zones == mp->m_max_open_zones)
oz = xfs_zone_gc_steal_open(zi);
else
oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
if (!oz) {
xfs_warn(mp, "unable to allocate a zone for gc");
error = -EIO;
goto out;
}
trace_xfs_zone_gc_target_opened(oz->oz_rtg);
zi->zi_open_gc_zone = oz;
data = xfs_zone_gc_data_alloc(mp);
if (!data) {
error = -ENOMEM;
goto out_put_gc_zone;
zi->zi_nr_open_zones >= mp->m_max_open_zones) {
error = xfs_zone_gc_steal_open_zone(data);
if (error) {
xfs_warn(mp, "unable to steal an open zone for gc");
goto out_free_gc_data;
}
}
zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
@@ -1193,18 +1210,18 @@ xfs_zone_gc_mount(
if (IS_ERR(zi->zi_gc_thread)) {
xfs_warn(mp, "unable to create zone gc thread");
error = PTR_ERR(zi->zi_gc_thread);
goto out_free_gc_data;
goto out_put_oz;
}
/* xfs_zone_gc_start will unpark for rw mounts */
kthread_park(zi->zi_gc_thread);
return 0;
out_put_oz:
if (data->oz)
xfs_open_zone_put(data->oz);
out_free_gc_data:
kfree(data);
out_put_gc_zone:
xfs_open_zone_put(zi->zi_open_gc_zone);
out:
return error;
}
@@ -1215,6 +1232,4 @@ xfs_zone_gc_unmount(
struct xfs_zone_info *zi = mp->m_zone_info;
kthread_stop(zi->zi_gc_thread);
if (zi->zi_open_gc_zone)
xfs_open_zone_put(zi->zi_open_gc_zone);
}

View File

@@ -30,11 +30,12 @@ xfs_show_open_zone(
struct seq_file *m,
struct xfs_open_zone *oz)
{
seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n",
seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s %s\n",
rtg_rgno(oz->oz_rtg),
oz->oz_allocated, oz->oz_written,
rtg_rmap(oz->oz_rtg)->i_used_blocks,
xfs_write_hint_to_str(oz->oz_write_hint));
xfs_write_hint_to_str(oz->oz_write_hint),
oz->oz_is_gc ? "(GC)" : "");
}
static void
@@ -58,9 +59,8 @@ xfs_show_full_zone_used_distribution(
spin_unlock(&zi->zi_used_buckets_lock);
full = mp->m_sb.sb_rgcount;
if (zi->zi_open_gc_zone)
full--;
full -= zi->zi_nr_open_zones;
full -= zi->zi_nr_open_gc_zones;
full -= atomic_read(&zi->zi_nr_free_zones);
full -= reclaimable;
@@ -90,15 +90,20 @@ xfs_zoned_show_stats(
seq_printf(m, "\tRT GC required: %d\n",
xfs_zoned_need_gc(mp));
seq_printf(m, "\ttotal number of zones: %u\n",
mp->m_sb.sb_rgcount);
seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones));
seq_puts(m, "\topen zones:\n");
spin_lock(&zi->zi_open_zones_lock);
seq_printf(m, "\tmax open zones: %u\n",
mp->m_max_open_zones);
seq_printf(m, "\tnr open zones: %u\n",
zi->zi_nr_open_zones);
seq_printf(m, "\tnr open GC zones: %u\n",
zi->zi_nr_open_gc_zones);
seq_puts(m, "\topen zones:\n");
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
xfs_show_open_zone(m, oz);
if (zi->zi_open_gc_zone) {
seq_puts(m, "\topen gc zone:\n");
xfs_show_open_zone(m, zi->zi_open_gc_zone);
}
spin_unlock(&zi->zi_open_zones_lock);
seq_puts(m, "\tused blocks distribution (fully written zones):\n");
xfs_show_full_zone_used_distribution(m, mp);

View File

@@ -32,11 +32,7 @@ struct xfs_open_zone {
*/
enum rw_hint oz_write_hint;
/*
* Is this open zone used for garbage collection? There can only be a
* single open GC zone, which is pointed to by zi_open_gc_zone in
* struct xfs_zone_info. Constant over the life time of an open zone.
*/
/* Is this open zone used for garbage collection? */
bool oz_is_gc;
/*
@@ -68,6 +64,7 @@ struct xfs_zone_info {
spinlock_t zi_open_zones_lock;
struct list_head zi_open_zones;
unsigned int zi_nr_open_zones;
unsigned int zi_nr_open_gc_zones;
/*
* Free zone search cursor and number of free zones:
@@ -81,15 +78,9 @@ struct xfs_zone_info {
wait_queue_head_t zi_zone_wait;
/*
* Pointer to the GC thread, and the current open zone used by GC
* (if any).
*
* zi_open_gc_zone is mostly private to the GC thread, but can be read
* for debugging from other threads, in which case zi_open_zones_lock
* must be taken to access it.
* Pointer to the GC thread.
*/
struct task_struct *zi_gc_thread;
struct xfs_open_zone *zi_open_gc_zone;
/*
* List of zones that need a reset: