From 9b25f381de6b8942645f43735cb0a4fb0ab3a6d1 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Fri, 5 Dec 2025 11:29:14 +0530 Subject: [PATCH 01/37] ext4: unmap invalidated folios from page tables in mpage_release_unused_pages() When delayed block allocation fails (e.g., due to filesystem corruption detected in ext4_map_blocks()), the writeback error handler calls mpage_release_unused_pages(invalidate=true) which invalidates affected folios by clearing their uptodate flag via folio_clear_uptodate(). However, these folios may still be mapped in process page tables. If a subsequent operation (such as ftruncate calling ext4_block_truncate_page) triggers a write fault, the existing page table entry allows access to the now-invalidated folio. This leads to ext4_page_mkwrite() being called with a non-uptodate folio, which then gets marked dirty, triggering: WARNING: CPU: 0 PID: 5 at mm/page-writeback.c:2960 __folio_mark_dirty+0x578/0x880 Call Trace: fault_dirty_shared_page+0x16e/0x2d0 do_wp_page+0x38b/0xd20 handle_pte_fault+0x1da/0x450 The sequence leading to this warning is: 1. Process writes to mmap'd file, folio becomes uptodate and dirty 2. Writeback begins, but delayed allocation fails due to corruption 3. mpage_release_unused_pages(invalidate=true) is called: - block_invalidate_folio() clears dirty flag - folio_clear_uptodate() clears uptodate flag - But folio remains mapped in page tables 4. Later, ftruncate triggers ext4_block_truncate_page() 5. This causes a write fault on the still-mapped folio 6. ext4_page_mkwrite() is called with folio that is !uptodate 7. block_page_mkwrite() marks buffers dirty 8. fault_dirty_shared_page() tries to mark folio dirty 9. block_dirty_folio() calls __folio_mark_dirty(warn=1) 10. WARNING triggers: WARN_ON_ONCE(warn && !uptodate && !dirty) Fix this by unmapping folios from page tables before invalidating them using unmap_mapping_pages(). 
This ensures that subsequent accesses trigger new page faults rather than reusing invalidated folios through stale page table entries. Note that this results in data loss for any writes to the mmap'd region that couldn't be written back, but this is expected behavior when writeback fails due to filesystem corruption. The existing error message already states "This should not happen!! Data will be lost". Reported-by: syzbot+b0a0670332b6b3230a0a@syzkaller.appspotmail.com Tested-by: syzbot+b0a0670332b6b3230a0a@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b0a0670332b6b3230a0a Suggested-by: Matthew Wilcox Signed-off-by: Deepanshu Kartikey Link: https://patch.msgid.link/20251205055914.1393799-1-kartikey406@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1123d995494b..025ea8f0c41b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1757,8 +1757,22 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (invalidate) { - if (folio_mapped(folio)) + if (folio_mapped(folio)) { folio_clear_dirty_for_io(folio); + /* + * Unmap folio from page + * tables to prevent + * subsequent accesses through + * stale PTEs. This ensures + * future accesses trigger new + * page faults rather than + * reusing the invalidated + * folio. + */ + unmap_mapping_pages(folio->mapping, + folio->index, + folio_nr_pages(folio), false); + } block_invalidate_folio(folio, 0, folio_size(folio)); folio_clear_uptodate(folio); From eb10607628acd1408a02e49b545e6421bb7a6ea2 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Tue, 20 Jan 2026 20:19:41 +0800 Subject: [PATCH 02/37] ext4: remove unused i_fc_wait i_fc_wait is only initialized in ext4_fc_init_inode() and never used for waiting or wakeups. Drop it. 
Signed-off-by: Li Chen Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20260120121941.144192-1-me@linux.beauty Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ---- fs/ext4/fast_commit.c | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7617e2d454ea..7d2564f64226 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -1082,9 +1081,6 @@ struct ext4_inode_info { spinlock_t i_raw_lock; /* protects updates to the raw inode */ - /* Fast commit wait queue for this inode */ - wait_queue_head_t i_fc_wait; - /* * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len * and inode's EXT4_FC_STATE_COMMITTING state bit. diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 2f0057e04934..7ebbfb137ef8 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -13,6 +13,7 @@ #include "mballoc.h" #include +#include /* * Ext4 Fast Commits * ----------------- @@ -215,7 +216,6 @@ void ext4_fc_init_inode(struct inode *inode) ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); - init_waitqueue_head(&ei->i_fc_wait); } static bool ext4_fc_disabled(struct super_block *sb) From 2f17d1993b01960579761284e9a0da533a7a82fa Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 21 Jan 2026 14:38:05 +0800 Subject: [PATCH 03/37] ext4: remove tl argument from ext4_fc_replay_{add,del}_range Since commit a7ba36bc94f2 ("ext4: fix fast commit alignment issues"), both ext4_fc_replay_add_range and ext4_fc_replay_del_range get ex based on 'val' instead of 'tl'. 
Signed-off-by: Guoqing Jiang Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20260121063805.19863-1-guoqing.jiang@linux.dev Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 7ebbfb137ef8..6dc406bfe8a5 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1759,8 +1759,7 @@ int ext4_fc_record_regions(struct super_block *sb, int ino, } /* Replay add range tag */ -static int ext4_fc_replay_add_range(struct super_block *sb, - struct ext4_fc_tl_mem *tl, u8 *val) +static int ext4_fc_replay_add_range(struct super_block *sb, u8 *val) { struct ext4_fc_add_range fc_add_ex; struct ext4_extent newex, *ex; @@ -1880,8 +1879,7 @@ out: /* Replay DEL_RANGE tag */ static int -ext4_fc_replay_del_range(struct super_block *sb, - struct ext4_fc_tl_mem *tl, u8 *val) +ext4_fc_replay_del_range(struct super_block *sb, u8 *val) { struct inode *inode; struct ext4_fc_del_range lrange; @@ -2251,13 +2249,13 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, ret = ext4_fc_replay_unlink(sb, &tl, val); break; case EXT4_FC_TAG_ADD_RANGE: - ret = ext4_fc_replay_add_range(sb, &tl, val); + ret = ext4_fc_replay_add_range(sb, val); break; case EXT4_FC_TAG_CREAT: ret = ext4_fc_replay_create(sb, &tl, val); break; case EXT4_FC_TAG_DEL_RANGE: - ret = ext4_fc_replay_del_range(sb, &tl, val); + ret = ext4_fc_replay_del_range(sb, val); break; case EXT4_FC_TAG_INODE: ret = ext4_fc_replay_inode(sb, &tl, val); From a804ecc399d91a529726fa1b10ff699bb531253d Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 22 Feb 2026 13:50:49 +0100 Subject: [PATCH 04/37] ext4/move_extent: use folio_next_pos() A series of patches such as commit 60a70e61430b ("mm: Use folio_next_pos()") replace folio_pos() + folio_size() by folio_next_pos(). The former performs x << z + y << z while the latter performs (x + y) << z, which is slightly more efficient. 
This case was not taken into account, perhaps because the argument is not named folio. The change was performed using the following Coccinelle semantic patch: @@ expression folio; @@ - folio_pos(folio) + folio_size(folio) + folio_next_pos(folio) Signed-off-by: Julia Lawall Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20260222125049.1309075-1-Julia.Lawall@inria.fr Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index ce1f738dff93..78569ed91b97 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -224,8 +224,8 @@ static int mext_move_begin(struct mext_data *mext, struct folio *folio[2], } /* Adjust the moving length according to the length of shorter folio. */ - move_len = umin(folio_pos(folio[0]) + folio_size(folio[0]) - orig_pos, - folio_pos(folio[1]) + folio_size(folio[1]) - donor_pos); + move_len = umin(folio_next_pos(folio[0]) - orig_pos, + folio_next_pos(folio[1]) - donor_pos); move_len >>= blkbits; if (move_len < mext->orig_map.m_len) mext->orig_map.m_len = move_len; From af1502f98e2cdd43504596cd438f3aa6d0be8712 Mon Sep 17 00:00:00 2001 From: Weixie Cui Date: Wed, 25 Feb 2026 13:02:31 +0800 Subject: [PATCH 05/37] ext4: simplify mballoc preallocation size rounding for small files The if-else ladder in ext4_mb_normalize_request() manually rounds up the preallocation size to the next power of two for files up to 1MB, enumerating each step from 16KB to 1MB individually. Replace this with a single roundup_pow_of_two() call clamped to a 16KB minimum, which is functionally equivalent but much more concise. Also replace raw byte constants with SZ_1M and SZ_16K from for clarity, and remove the stale "XXX: should this table be tunable?" comment that has been there since the original mballoc code. No functional change. 
Reviewed-by: Andreas Dilger Signed-off-by: Weixie Cui Link: https://patch.msgid.link/tencent_E9C5F1B2E9939B3037501FD04A7E9CF0C407@qq.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bb58eafb87bc..3d73f64fc49a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4561,22 +4561,16 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, (req <= (size) || max <= (chunk_size)) /* first, try to predict filesize */ - /* XXX: should this table be tunable? */ start_off = 0; - if (size <= 16 * 1024) { - size = 16 * 1024; - } else if (size <= 32 * 1024) { - size = 32 * 1024; - } else if (size <= 64 * 1024) { - size = 64 * 1024; - } else if (size <= 128 * 1024) { - size = 128 * 1024; - } else if (size <= 256 * 1024) { - size = 256 * 1024; - } else if (size <= 512 * 1024) { - size = 512 * 1024; - } else if (size <= 1024 * 1024) { - size = 1024 * 1024; + if (size <= SZ_1M) { + /* + * For files up to 1MB, round up the preallocation size to + * the next power of two, with a minimum of 16KB. + */ + if (size <= (unsigned long)SZ_16K) + size = SZ_16K; + else + size = roundup_pow_of_two(size); } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (21 - bsbits)) << 21; From 64924362f833fd15d75d2b8fc771eff9646c0933 Mon Sep 17 00:00:00 2001 From: Milos Nikic Date: Wed, 4 Mar 2026 09:20:15 -0800 Subject: [PATCH 06/37] jbd2: gracefully abort instead of panicking on unlocked buffer In jbd2_journal_get_create_access(), if the caller passes an unlocked buffer, the code currently triggers a fatal J_ASSERT. While an unlocked buffer here is a clear API violation and a bug in the caller, crashing the entire system is an overly severe response. It brings down the whole machine for a localized filesystem inconsistency. 
Replace the J_ASSERT with a WARN_ON_ONCE to capture the offending caller's stack trace, and return an error (-EINVAL). This allows the journal to gracefully abort the transaction, protecting data integrity without causing a kernel panic. Signed-off-by: Milos Nikic Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://patch.msgid.link/20260304172016.23525-2-nikic.milos@gmail.com Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index dca4b5d8aaaa..04d17a5f2a82 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1302,7 +1302,12 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) goto out; } - J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); + if (WARN_ON_ONCE(!buffer_locked(jh2bh(jh)))) { + err = -EINVAL; + spin_unlock(&jh->b_state_lock); + jbd2_journal_abort(journal, err); + goto out; + } if (jh->b_transaction == NULL) { /* From f7fc28b014ebb00796f99f12f0583caab23276e3 Mon Sep 17 00:00:00 2001 From: Milos Nikic Date: Wed, 4 Mar 2026 09:20:16 -0800 Subject: [PATCH 07/37] jbd2: gracefully abort on transaction state corruptions Auditing the jbd2 codebase reveals several legacy J_ASSERT calls that enforce internal state machine invariants (e.g., verifying jh->b_transaction or jh->b_next_transaction pointers). When these invariants are broken, the journal is in a corrupted state. However, triggering a fatal panic brings down the entire system for a localized filesystem error. This patch targets a specific class of these asserts: those residing inside functions that natively return integer error codes, booleans, or error pointers. It replaces the hard J_ASSERTs with WARN_ON_ONCE to capture the offending stack trace, safely drops any held locks, gracefully aborts the journal, and returns -EINVAL. 
This prevents a catastrophic kernel panic while ensuring the corrupted journal state is safely contained and upstream callers (like ext4 or ocfs2) can gracefully handle the aborted handle. Functions modified in fs/jbd2/transaction.c: - jbd2__journal_start() - do_get_write_access() - jbd2_journal_dirty_metadata() - jbd2_journal_forget() - jbd2_journal_try_to_free_buffers() - jbd2_journal_file_inode() Signed-off-by: Milos Nikic Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://patch.msgid.link/20260304172016.23525-3-nikic.milos@gmail.com Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 114 +++++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 28 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 04d17a5f2a82..02cb87dc6fa8 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -474,7 +474,8 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, return ERR_PTR(-EROFS); if (handle) { - J_ASSERT(handle->h_transaction->t_journal == journal); + if (WARN_ON_ONCE(handle->h_transaction->t_journal != journal)) + return ERR_PTR(-EINVAL); handle->h_ref++; return handle; } @@ -1036,7 +1037,13 @@ repeat: */ if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); - J_ASSERT_JH(jh, !jh->b_next_transaction); + if (WARN_ON_ONCE(jh->b_next_transaction)) { + spin_unlock(&jh->b_state_lock); + unlock_buffer(bh); + error = -EINVAL; + jbd2_journal_abort(journal, error); + goto out; + } JBUFFER_TRACE(jh, "file as BJ_Reserved"); /* * Make sure all stores to jh (b_modified, b_frozen_data) are @@ -1069,13 +1076,27 @@ repeat: */ if (jh->b_frozen_data) { JBUFFER_TRACE(jh, "has frozen data"); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + if (WARN_ON_ONCE(jh->b_next_transaction)) { + spin_unlock(&jh->b_state_lock); + error = -EINVAL; + jbd2_journal_abort(journal, error); + goto out; + } goto attach_next; } JBUFFER_TRACE(jh, "owned by older transaction"); 
- J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction); + if (WARN_ON_ONCE(jh->b_next_transaction || + jh->b_transaction != + journal->j_committing_transaction)) { + pr_err("JBD2: %s: assertion failure: b_next_transaction=%p b_transaction=%p j_committing_transaction=%p\n", + journal->j_devname, jh->b_next_transaction, + jh->b_transaction, journal->j_committing_transaction); + spin_unlock(&jh->b_state_lock); + error = -EINVAL; + jbd2_journal_abort(journal, error); + goto out; + } /* * There is one case we have to be very careful about. If the @@ -1496,7 +1517,7 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; - journal_t *journal; + journal_t *journal = transaction->t_journal; struct journal_head *jh; int ret = 0; @@ -1520,8 +1541,14 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (data_race(jh->b_transaction != transaction && jh->b_next_transaction != transaction)) { spin_lock(&jh->b_state_lock); - J_ASSERT_JH(jh, jh->b_transaction == transaction || - jh->b_next_transaction == transaction); + if (WARN_ON_ONCE(jh->b_transaction != transaction && + jh->b_next_transaction != transaction)) { + pr_err("JBD2: %s: assertion failure: b_transaction=%p transaction=%p b_next_transaction=%p\n", + journal->j_devname, jh->b_transaction, + transaction, jh->b_next_transaction); + ret = -EINVAL; + goto out_unlock_bh; + } spin_unlock(&jh->b_state_lock); } if (data_race(jh->b_modified == 1)) { @@ -1529,15 +1556,15 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (data_race(jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata)) { spin_lock(&jh->b_state_lock); - if (jh->b_transaction == transaction && - jh->b_jlist != BJ_Metadata) - pr_err("JBD2: assertion failure: h_type=%u " - "h_line_no=%u block_no=%llu 
jlist=%u\n", + if (WARN_ON_ONCE(jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata)) { + pr_err("JBD2: assertion failure: h_type=%u h_line_no=%u block_no=%llu jlist=%u\n", handle->h_type, handle->h_line_no, (unsigned long long) bh->b_blocknr, jh->b_jlist); - J_ASSERT_JH(jh, jh->b_transaction != transaction || - jh->b_jlist == BJ_Metadata); + ret = -EINVAL; + goto out_unlock_bh; + } spin_unlock(&jh->b_state_lock); } goto out; @@ -1557,8 +1584,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) goto out_unlock_bh; } - journal = transaction->t_journal; - if (jh->b_modified == 0) { /* * This buffer's got modified and becoming part @@ -1636,7 +1661,10 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) } /* That test should have eliminated the following case: */ - J_ASSERT_JH(jh, jh->b_frozen_data == NULL); + if (WARN_ON_ONCE(jh->b_frozen_data)) { + ret = -EINVAL; + goto out_unlock_bh; + } JBUFFER_TRACE(jh, "file as BJ_Metadata"); spin_lock(&journal->j_list_lock); @@ -1675,6 +1703,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) int err = 0; int was_modified = 0; int wait_for_writeback = 0; + int abort_journal = 0; if (is_handle_aborted(handle)) return -EROFS; @@ -1708,7 +1737,11 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) jh->b_modified = 0; if (jh->b_transaction == transaction) { - J_ASSERT_JH(jh, !jh->b_frozen_data); + if (WARN_ON_ONCE(jh->b_frozen_data)) { + err = -EINVAL; + abort_journal = 1; + goto drop; + } /* If we are forgetting a buffer which is already part * of this transaction, then we can just drop it from @@ -1747,8 +1780,11 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) } spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction) { - J_ASSERT_JH(jh, (jh->b_transaction == - journal->j_committing_transaction)); + if (WARN_ON_ONCE(jh->b_transaction != journal->j_committing_transaction)) { + err = -EINVAL; + 
abort_journal = 1; + goto drop; + } /* However, if the buffer is still owned by a prior * (committing) transaction, we can't drop it yet... */ JBUFFER_TRACE(jh, "belongs to older transaction"); @@ -1766,7 +1802,11 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); } else { - J_ASSERT(jh->b_next_transaction == transaction); + if (WARN_ON_ONCE(jh->b_next_transaction != transaction)) { + err = -EINVAL; + abort_journal = 1; + goto drop; + } /* * only drop a reference if this transaction modified @@ -1812,6 +1852,8 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) drop: __brelse(bh); spin_unlock(&jh->b_state_lock); + if (abort_journal) + jbd2_journal_abort(journal, err); if (wait_for_writeback) wait_on_buffer(bh); jbd2_journal_put_journal_head(jh); @@ -2136,7 +2178,8 @@ bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio) struct buffer_head *bh; bool ret = false; - J_ASSERT(folio_test_locked(folio)); + if (WARN_ON_ONCE(!folio_test_locked(folio))) + return false; head = folio_buffers(folio); bh = head; @@ -2651,6 +2694,8 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, { transaction_t *transaction = handle->h_transaction; journal_t *journal; + int err = 0; + int abort_transaction = 0; if (is_handle_aborted(handle)) return -EROFS; @@ -2685,20 +2730,33 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, /* On some different transaction's list - should be * the committing one */ if (jinode->i_transaction) { - J_ASSERT(jinode->i_next_transaction == NULL); - J_ASSERT(jinode->i_transaction == - journal->j_committing_transaction); + if (WARN_ON_ONCE(jinode->i_next_transaction || + jinode->i_transaction != + journal->j_committing_transaction)) { + pr_err("JBD2: %s: assertion failure: i_next_transaction=%p i_transaction=%p j_committing_transaction=%p\n", + journal->j_devname, 
jinode->i_next_transaction, + jinode->i_transaction, + journal->j_committing_transaction); + err = -EINVAL; + abort_transaction = 1; + goto done; + } jinode->i_next_transaction = transaction; goto done; } /* Not on any transaction list... */ - J_ASSERT(!jinode->i_next_transaction); + if (WARN_ON_ONCE(jinode->i_next_transaction)) { + err = -EINVAL; + abort_transaction = 1; + goto done; + } jinode->i_transaction = transaction; list_add(&jinode->i_list, &transaction->t_inode_list); done: spin_unlock(&journal->j_list_lock); - - return 0; + if (abort_transaction) + jbd2_journal_abort(journal, err); + return err; } int jbd2_journal_inode_ranged_write(handle_t *handle, From 5267f6ef49cb5fba426f2d286817b1355fde31da Mon Sep 17 00:00:00 2001 From: Li Chen Date: Fri, 6 Mar 2026 16:56:39 +0800 Subject: [PATCH 08/37] jbd2: add jinode dirty range accessors Provide a helper to fetch jinode dirty ranges in bytes. This lets filesystem callbacks avoid depending on the internal representation, preparing for a later conversion to page units. 
Suggested-by: Andreas Dilger Reviewed-by: Jan Kara Signed-off-by: Li Chen Link: https://patch.msgid.link/20260306085643.465275-2-me@linux.beauty Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a53a00d36228..64392baf5f4b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -445,6 +445,20 @@ struct jbd2_inode { loff_t i_dirty_end; }; +static inline bool jbd2_jinode_get_dirty_range(const struct jbd2_inode *jinode, + loff_t *start, loff_t *end) +{ + loff_t start_byte = jinode->i_dirty_start; + loff_t end_byte = jinode->i_dirty_end; + + if (!end_byte) + return false; + + *start = start_byte; + *end = end_byte; + return true; +} + struct jbd2_revoke_table_s; /** From 660d23669982202c99798658e2a15ccdd001f82b Mon Sep 17 00:00:00 2001 From: Li Chen Date: Fri, 6 Mar 2026 16:56:40 +0800 Subject: [PATCH 09/37] ext4: use jbd2 jinode dirty range accessor ext4 journal commit callbacks access jbd2_inode dirty range fields without holding journal->j_list_lock. Use jbd2_jinode_get_dirty_range() to get the range in bytes, and read i_transaction with READ_ONCE() in the redirty check. 
Suggested-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Li Chen Link: https://patch.msgid.link/20260306085643.465275-3-me@linux.beauty Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 10 ++++++++-- fs/ext4/super.c | 16 +++++++++++----- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 025ea8f0c41b..13cd564f89e1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3055,17 +3055,23 @@ static int ext4_writepages(struct address_space *mapping, int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) { + loff_t range_start, range_end; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, - .range_start = jinode->i_dirty_start, - .range_end = jinode->i_dirty_end, }; struct mpage_da_data mpd = { .inode = jinode->i_vfs_inode, .wbc = &wbc, .can_map = 0, }; + + if (!jbd2_jinode_get_dirty_range(jinode, &range_start, &range_end)) + return 0; + + wbc.range_start = range_start; + wbc.range_end = range_end; + return ext4_do_writepages(&mpd); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a34efb44e73d..638d859f4fca 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -521,6 +521,7 @@ static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, { struct buffer_head *bh, *head; struct journal_head *jh; + transaction_t *trans = READ_ONCE(jinode->i_transaction); bh = head = folio_buffers(folio); do { @@ -539,7 +540,7 @@ static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, */ jh = bh2jh(bh); if (buffer_dirty(bh) || - (jh && (jh->b_transaction != jinode->i_transaction || + (jh && (jh->b_transaction != trans || jh->b_next_transaction))) return true; } while ((bh = bh->b_this_page) != head); @@ -550,15 +551,20 @@ static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + 
loff_t range_start, range_end; struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, + .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, - .range_start = jinode->i_dirty_start, - .range_end = jinode->i_dirty_end, - }; + }; struct folio *folio = NULL; int error; + if (!jbd2_jinode_get_dirty_range(jinode, &range_start, &range_end)) + return 0; + + wbc.range_start = range_start; + wbc.range_end = range_end; + /* * writeback_iter() already checks for dirty pages and calls * folio_clear_dirty_for_io(), which we want to write protect the From be81084e032c2d74f51173e30f687ce13476cb73 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Fri, 6 Mar 2026 16:56:41 +0800 Subject: [PATCH 10/37] ocfs2: use jbd2 jinode dirty range accessor ocfs2 journal commit callback reads jbd2_inode dirty range fields without holding journal->j_list_lock. Use jbd2_jinode_get_dirty_range() to get the range in bytes. Suggested-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Li Chen Link: https://patch.msgid.link/20260306085643.465275-4-me@linux.beauty Signed-off-by: Theodore Ts'o --- fs/ocfs2/journal.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 4c86a9d46870..f9bf3bac085d 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -899,8 +899,13 @@ bail: static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { - return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, - jinode->i_dirty_start, jinode->i_dirty_end); + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + loff_t range_start, range_end; + + if (!jbd2_jinode_get_dirty_range(jinode, &range_start, &range_end)) + return 0; + + return filemap_fdatawrite_range(mapping, range_start, range_end); } int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) From 4edafa81a1d6020272d0c6eb68faeb810dd083c1 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Fri, 6 Mar 2026 16:56:42 +0800 Subject: [PATCH 11/37] jbd2: store jinode 
dirty range in PAGE_SIZE units jbd2_inode fields are updated under journal->j_list_lock, but some paths read them without holding the lock (e.g. fast commit helpers and ordered truncate helpers). READ_ONCE() alone is not sufficient for the dirty range fields when they are stored as loff_t because 32-bit platforms can observe torn loads. Store the dirty range in PAGE_SIZE units as pgoff_t instead. Represent the dirty range end as an exclusive end page. This avoids a special sentinel value and keeps MAX_LFS_FILESIZE on 32-bit representable. Publish a new dirty range by updating end_page before start_page, and treat start_page >= end_page as empty in the accessor for robustness. Use READ_ONCE() on the read side and WRITE_ONCE() on the write side for the dirty range and i_flags to match the existing lockless access pattern. Suggested-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Li Chen Link: https://patch.msgid.link/20260306085643.465275-5-me@linux.beauty Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 55 +++++++++++++++++++++++++++++++++---------- fs/jbd2/journal.c | 5 ++-- fs/jbd2/transaction.c | 23 +++++++++++------- include/linux/jbd2.h | 34 ++++++++++++++++---------- 4 files changed, 81 insertions(+), 36 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7203d2d2624d..8cf61e7185c4 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -180,7 +180,13 @@ static int journal_wait_on_commit_record(journal_t *journal, /* Send all the data buffers related to an inode */ int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) { - if (!jinode || !(jinode->i_flags & JI_WRITE_DATA)) + unsigned long flags; + + if (!jinode) + return 0; + + flags = READ_ONCE(jinode->i_flags); + if (!(flags & JI_WRITE_DATA)) return 0; trace_jbd2_submit_inode_data(jinode->i_vfs_inode); @@ -191,12 +197,30 @@ EXPORT_SYMBOL(jbd2_submit_inode_data); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode) { - if (!jinode || !(jinode->i_flags 
& JI_WAIT_DATA) || - !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping) + struct address_space *mapping; + struct inode *inode; + unsigned long flags; + loff_t start_byte, end_byte; + + if (!jinode) + return 0; + + flags = READ_ONCE(jinode->i_flags); + if (!(flags & JI_WAIT_DATA)) + return 0; + + inode = jinode->i_vfs_inode; + if (!inode) + return 0; + + mapping = inode->i_mapping; + if (!mapping) + return 0; + + if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte)) return 0; return filemap_fdatawait_range_keep_errors( - jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start, - jinode->i_dirty_end); + mapping, start_byte, end_byte); } EXPORT_SYMBOL(jbd2_wait_inode_data); @@ -218,7 +242,8 @@ static int journal_submit_data_buffers(journal_t *journal, list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { if (!(jinode->i_flags & JI_WRITE_DATA)) continue; - jinode->i_flags |= JI_COMMIT_RUNNING; + WRITE_ONCE(jinode->i_flags, + jinode->i_flags | JI_COMMIT_RUNNING); spin_unlock(&journal->j_list_lock); /* submit the inode data buffers. 
*/ trace_jbd2_submit_inode_data(jinode->i_vfs_inode); @@ -229,7 +254,8 @@ static int journal_submit_data_buffers(journal_t *journal, } spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); - jinode->i_flags &= ~JI_COMMIT_RUNNING; + WRITE_ONCE(jinode->i_flags, + jinode->i_flags & ~JI_COMMIT_RUNNING); smp_mb(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } @@ -240,10 +266,13 @@ static int journal_submit_data_buffers(journal_t *journal, int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) { struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + loff_t start_byte, end_byte; + + if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte)) + return 0; return filemap_fdatawait_range_keep_errors(mapping, - jinode->i_dirty_start, - jinode->i_dirty_end); + start_byte, end_byte); } /* @@ -262,7 +291,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { if (!(jinode->i_flags & JI_WAIT_DATA)) continue; - jinode->i_flags |= JI_COMMIT_RUNNING; + WRITE_ONCE(jinode->i_flags, jinode->i_flags | JI_COMMIT_RUNNING); spin_unlock(&journal->j_list_lock); /* wait for the inode data buffers writeout. 
*/ if (journal->j_finish_inode_data_buffers) { @@ -272,7 +301,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, } cond_resched(); spin_lock(&journal->j_list_lock); - jinode->i_flags &= ~JI_COMMIT_RUNNING; + WRITE_ONCE(jinode->i_flags, jinode->i_flags & ~JI_COMMIT_RUNNING); smp_mb(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } @@ -288,8 +317,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal, &jinode->i_transaction->t_inode_list); } else { jinode->i_transaction = NULL; - jinode->i_dirty_start = 0; - jinode->i_dirty_end = 0; + WRITE_ONCE(jinode->i_dirty_start_page, 0); + WRITE_ONCE(jinode->i_dirty_end_page, 0); } } spin_unlock(&journal->j_list_lock); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index cb2c529a8f1b..609c8d965f12 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -3018,8 +3018,8 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) jinode->i_next_transaction = NULL; jinode->i_vfs_inode = inode; jinode->i_flags = 0; - jinode->i_dirty_start = 0; - jinode->i_dirty_end = 0; + jinode->i_dirty_start_page = 0; + jinode->i_dirty_end_page = 0; INIT_LIST_HEAD(&jinode->i_list); } @@ -3176,4 +3176,3 @@ MODULE_DESCRIPTION("Generic filesystem journal-writing module"); MODULE_LICENSE("GPL"); module_init(journal_init); module_exit(journal_exit); - diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 02cb87dc6fa8..495f00129844 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2694,6 +2694,7 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, { transaction_t *transaction = handle->h_transaction; journal_t *journal; + pgoff_t start_page, end_page; int err = 0; int abort_transaction = 0; @@ -2704,15 +2705,21 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, transaction->t_tid); - spin_lock(&journal->j_list_lock); - 
jinode->i_flags |= flags; + start_page = (pgoff_t)(start_byte >> PAGE_SHIFT); + end_page = (pgoff_t)(end_byte >> PAGE_SHIFT) + 1; - if (jinode->i_dirty_end) { - jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte); - jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte); + spin_lock(&journal->j_list_lock); + WRITE_ONCE(jinode->i_flags, jinode->i_flags | flags); + + if (jinode->i_dirty_start_page != jinode->i_dirty_end_page) { + WRITE_ONCE(jinode->i_dirty_start_page, + min(jinode->i_dirty_start_page, start_page)); + WRITE_ONCE(jinode->i_dirty_end_page, + max(jinode->i_dirty_end_page, end_page)); } else { - jinode->i_dirty_start = start_byte; - jinode->i_dirty_end = end_byte; + /* Publish a new non-empty range by making end visible first. */ + WRITE_ONCE(jinode->i_dirty_end_page, end_page); + WRITE_ONCE(jinode->i_dirty_start_page, start_page); } /* Is inode already attached where we need it? */ @@ -2802,7 +2809,7 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal, int ret = 0; /* This is a quick check to avoid locking if not necessary */ - if (!jinode->i_transaction) + if (!READ_ONCE(jinode->i_transaction)) goto out; /* Locks are here just to force reading of recent values, it is * enough that the transaction was not committing before we started diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 64392baf5f4b..7e785aa6d35d 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -429,33 +429,43 @@ struct jbd2_inode { unsigned long i_flags; /** - * @i_dirty_start: + * @i_dirty_start_page: + * + * Dirty range start in PAGE_SIZE units. + * + * The dirty range is empty if @i_dirty_start_page is greater than or + * equal to @i_dirty_end_page. * - * Offset in bytes where the dirty range for this inode starts. * [j_list_lock] */ - loff_t i_dirty_start; + pgoff_t i_dirty_start_page; /** - * @i_dirty_end: + * @i_dirty_end_page: * - * Inclusive offset in bytes where the dirty range for this inode - * ends. 
[j_list_lock] + * Dirty range end in PAGE_SIZE units (exclusive). + * + * [j_list_lock] */ - loff_t i_dirty_end; + pgoff_t i_dirty_end_page; }; +/* + * Lockless readers treat start_page >= end_page as an empty range. + * Writers publish a new non-empty range by storing i_dirty_end_page before + * i_dirty_start_page. + */ static inline bool jbd2_jinode_get_dirty_range(const struct jbd2_inode *jinode, loff_t *start, loff_t *end) { - loff_t start_byte = jinode->i_dirty_start; - loff_t end_byte = jinode->i_dirty_end; + pgoff_t start_page = READ_ONCE(jinode->i_dirty_start_page); + pgoff_t end_page = READ_ONCE(jinode->i_dirty_end_page); - if (!end_byte) + if (start_page >= end_page) return false; - *start = start_byte; - *end = end_byte; + *start = (loff_t)start_page << PAGE_SHIFT; + *end = ((loff_t)end_page << PAGE_SHIFT) - 1; return true; } From 1d749e110277ce4103f27bd60d6181e52c0cc1e3 Mon Sep 17 00:00:00 2001 From: Philipp Hahn Date: Tue, 10 Mar 2026 12:48:30 +0100 Subject: [PATCH 12/37] ext4: prefer IS_ERR_OR_NULL over manual NULL check Prefer using IS_ERR_OR_NULL() over using IS_ERR() and a manual NULL check. Change generated with coccinelle. 
To: "Theodore Ts'o" To: Andreas Dilger Cc: linux-ext4@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Philipp Hahn Link: https://patch.msgid.link/20260310-b4-is_err_or_null-v1-4-bd63b656022d@avm.de Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 2 +- fs/ext4/mballoc.c | 2 +- fs/ext4/namei.c | 2 +- fs/ext4/symlink.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 6dc406bfe8a5..e0ce49f99ca4 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -320,7 +320,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl if (ext4_fc_disabled(sb)) return; - if (handle && !IS_ERR(handle)) + if (!IS_ERR_OR_NULL(handle)) tid = handle->h_transaction->t_tid; else { read_lock(&sbi->s_journal->j_state_lock); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3d73f64fc49a..25e3d9204233 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2876,7 +2876,7 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0 ) { bh = ext4_read_block_bitmap_nowait(sb, group, true); - if (bh && !IS_ERR(bh)) { + if (!IS_ERR_OR_NULL(bh)) { if (!buffer_uptodate(bh) && cnt) (*cnt)++; brelse(bh); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c4b5e252af0e..4fdfc81f7902 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -723,7 +723,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, struct stats stats; printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); bh = ext4_bread(NULL,dir, block, 0); - if (!bh || IS_ERR(bh)) + if (IS_ERR_OR_NULL(bh)) continue; stats = levels? 
dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 645240cc0229..b612262719ed 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -92,7 +92,7 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode, if (!dentry) { bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); - if (IS_ERR(bh) || !bh) + if (IS_ERR_OR_NULL(bh)) return ERR_PTR(-ECHILD); if (!ext4_buffer_uptodate(bh)) { brelse(bh); From 2879374604b72bd43b346777fa05d3ac6dea9c45 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 Mar 2026 11:03:16 +1100 Subject: [PATCH 13/37] ext4: split __ext4_add_entry() out of ext4_add_entry() __ext4_add_entry() is not given a dentry - just inodes and name. This will help the next patch which simplifies __ext4_link(). Reviewed-by: Andreas Dilger Reviewed-by: Jan Kara Signed-off-by: NeilBrown Link: https://patch.msgid.link/20260320000838.3797494-2-neilb@ownmail.net Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 4fdfc81f7902..ba3d85a9c120 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2353,10 +2353,10 @@ out_frames: * may not sleep between calling this and putting something into * the entry, as someone else might have used it while you slept.
*/ -static int ext4_add_entry(handle_t *handle, struct dentry *dentry, +static int __ext4_add_entry(handle_t *handle, struct inode *dir, + const struct qstr *d_name, struct inode *inode) { - struct inode *dir = d_inode(dentry->d_parent); struct buffer_head *bh = NULL; struct ext4_dir_entry_2 *de; struct super_block *sb; @@ -2373,13 +2373,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, sb = dir->i_sb; blocksize = sb->s_blocksize; - if (fscrypt_is_nokey_name(dentry)) - return -ENOKEY; - - if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) + if (!generic_ci_validate_strict_name(dir, d_name)) return -EINVAL; - retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); + retval = ext4_fname_setup_filename(dir, d_name, 0, &fname); if (retval) return retval; @@ -2460,6 +2457,16 @@ out: return retval; } +static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct inode *dir = d_inode(dentry->d_parent); + + if (fscrypt_is_nokey_name(dentry)) + return -ENOKEY; + return __ext4_add_entry(handle, dir, &dentry->d_name, inode); +} + /* * Returns 0 for success, or a negative error value */ From 0f5f14f334c85efd80503489f8c7cba1dd64bd51 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 Mar 2026 11:03:17 +1100 Subject: [PATCH 14/37] ext4: add ext4_fc_eligible() Testing EXT4_MF_FC_INELIGIBLE is almost always combined with testing ext4_fc_disabled(). The code can be simplified by combining these two in a new ext4_fc_eligible(). In ext4_fc_track_inode() this moves the ext4_fc_disabled() test after ext4_fc_mark_ineligible(), but as that is a no-op when ext4_fc_disabled() is true, this is of no consequence. Note that it is important to still call ext4_fc_mark_ineligible() in ext4_fc_track_inode() even when ext4_fc_eligible() would return true.
ext4_fc_mark_ineligible() does not ONLY set the "INELIGIBLE" flag but also updates ->s_fc_ineligible_tid to make sure that the flag remains set until all ineligible transactions have been committed. Reviewed-by: Andreas Dilger Reviewed-by: Jan Kara Signed-off-by: NeilBrown Link: https://patch.msgid.link/20260320000838.3797494-3-neilb@ownmail.net Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 43 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index e0ce49f99ca4..e58484d69d8e 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -224,6 +224,12 @@ static bool ext4_fc_disabled(struct super_block *sb) (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); } +static bool ext4_fc_eligible(struct super_block *sb) +{ + return !ext4_fc_disabled(sb) && + !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)); +} + /* * Remove inode from fast commit list. If the inode is being committed * we wait until inode commit is done. 
@@ -473,13 +479,8 @@ void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) - return; - - __ext4_fc_track_unlink(handle, inode, dentry); + if (ext4_fc_eligible(inode->i_sb)) + __ext4_fc_track_unlink(handle, inode, dentry); } void __ext4_fc_track_link(handle_t *handle, @@ -500,13 +501,8 @@ void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) - return; - - __ext4_fc_track_link(handle, inode, dentry); + if (ext4_fc_eligible(inode->i_sb)) + __ext4_fc_track_link(handle, inode, dentry); } void __ext4_fc_track_create(handle_t *handle, struct inode *inode, @@ -527,13 +523,8 @@ void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) - return; - - __ext4_fc_track_create(handle, inode, dentry); + if (ext4_fc_eligible(inode->i_sb)) + __ext4_fc_track_create(handle, inode, dentry); } /* __track_fn for inode tracking */ @@ -557,16 +548,13 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (S_ISDIR(inode->i_mode)) return; - if (ext4_fc_disabled(inode->i_sb)) - return; - if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) + if (!ext4_fc_eligible(inode->i_sb)) return; /* @@ -644,10 +632,7 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star if (S_ISDIR(inode->i_mode)) return; - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) + if 
(!ext4_fc_eligible(inode->i_sb)) return; if (ext4_has_inline_data(inode)) { From 52b4fea162dd384792d0dec7f817e4ba5d8d4c9b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 Mar 2026 11:03:18 +1100 Subject: [PATCH 15/37] ext4: move dcache manipulation out of __ext4_link() __ext4_link() has two callers. - ext4_link() calls it during normal handling of the link() system call or similar - ext4_fc_replay_link_internal() calls it when replaying the journal at mount time. The former needs changes to dcache - instantiating the dentry to the inode on success. The latter doesn't need or want any dcache manipulation. So move the manipulation out of __ext4_link() and do it in ext4_link() only. This requires: - passing the qname from the dentry explicitly to __ext4_link. The parent dir is already passed. The dentry is still passed in the ext4_link() case purely for use by ext4_fc_track_link(). - passing the inode separately to ext4_fc_track_link() as the dentry will not be instantiated yet. - using __ext4_add_entry() in ext4_link, which doesn't need a dentry. - moving ihold(), d_instantiate(), drop_nlink() and iput() calls out of __ext4_link() into ext4_link(). Note that ext4_inc_count() and drop_nlink() remain in __ext4_link() as both callers need them and they are not related to the dentry. This substantially simplifies ext4_fc_replay_link_internal(), and removes a use of d_alloc() which, it is planned, will be removed. 
Reviewed-by: Jan Kara Signed-off-by: NeilBrown Link: https://patch.msgid.link/20260320000838.3797494-4-neilb@ownmail.net Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 5 +++-- fs/ext4/fast_commit.c | 32 ++++---------------------------- fs/ext4/namei.c | 19 +++++++++++-------- 3 files changed, 18 insertions(+), 38 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7d2564f64226..58fd1ea1e501 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2971,7 +2971,8 @@ void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); -void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_link(handle_t *handle, struct inode *inode, + struct dentry *dentry); void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); @@ -3716,7 +3717,7 @@ extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, struct inode *inode, struct dentry *dentry); extern int __ext4_link(struct inode *dir, struct inode *inode, - struct dentry *dentry); + const struct qstr *d_name, struct dentry *dentry); #define S_SHIFT 12 static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index e58484d69d8e..7bcba3cb550f 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -497,10 +497,9 @@ void __ext4_fc_track_link(handle_t *handle, trace_ext4_fc_track_link(handle, inode, dentry, ret); } -void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) +void ext4_fc_track_link(handle_t *handle, struct inode *inode, + struct dentry *dentry) { - struct inode *inode = d_inode(dentry); - if (ext4_fc_eligible(inode->i_sb)) 
__ext4_fc_track_link(handle, inode, dentry); } @@ -1431,7 +1430,6 @@ static int ext4_fc_replay_link_internal(struct super_block *sb, struct inode *inode) { struct inode *dir = NULL; - struct dentry *dentry_dir = NULL, *dentry_inode = NULL; struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); int ret = 0; @@ -1442,21 +1440,7 @@ static int ext4_fc_replay_link_internal(struct super_block *sb, goto out; } - dentry_dir = d_obtain_alias(dir); - if (IS_ERR(dentry_dir)) { - ext4_debug("Failed to obtain dentry"); - dentry_dir = NULL; - goto out; - } - - dentry_inode = d_alloc(dentry_dir, &qstr_dname); - if (!dentry_inode) { - ext4_debug("Inode dentry not created."); - ret = -ENOMEM; - goto out; - } - - ret = __ext4_link(dir, inode, dentry_inode); + ret = __ext4_link(dir, inode, &qstr_dname, NULL); /* * It's possible that link already existed since data blocks * for the dir in question got persisted before we crashed OR @@ -1470,16 +1454,8 @@ static int ext4_fc_replay_link_internal(struct super_block *sb, ret = 0; out: - if (dentry_dir) { - d_drop(dentry_dir); - dput(dentry_dir); - } else if (dir) { + if (dir) iput(dir); - } - if (dentry_inode) { - d_drop(dentry_inode); - dput(dentry_inode); - } return ret; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ba3d85a9c120..0b8e25198b17 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3452,7 +3452,8 @@ out_retry: return err; } -int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry) +int __ext4_link(struct inode *dir, struct inode *inode, + const struct qstr *d_name, struct dentry *dentry) { handle_t *handle; int err, retries = 0; @@ -3468,9 +3469,8 @@ retry: inode_set_ctime_current(inode); ext4_inc_count(inode); - ihold(inode); - err = ext4_add_entry(handle, dentry, inode); + err = __ext4_add_entry(handle, dir, d_name, inode); if (!err) { err = ext4_mark_inode_dirty(handle, inode); /* this can happen only for tmpfile being @@ -3478,11 +3478,10 @@ retry: */ if (inode->i_nlink == 1) 
ext4_orphan_del(handle, inode); - d_instantiate(dentry, inode); - ext4_fc_track_link(handle, dentry); + if (dentry) + ext4_fc_track_link(handle, inode, dentry); } else { drop_nlink(inode); - iput(inode); } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) @@ -3511,9 +3510,13 @@ static int ext4_link(struct dentry *old_dentry, err = dquot_initialize(dir); if (err) return err; - return __ext4_link(dir, inode, dentry); + err = __ext4_link(dir, inode, &dentry->d_name, dentry); + if (!err) { + ihold(inode); + d_instantiate(dentry, inode); + } + return err; } - /* * Try to find buffer head where contains the parent block. * It should be the inode block if it is inlined or the 1st block From 6ea3b34d8625ef5544d1c619bd67e2c6080ea4c2 Mon Sep 17 00:00:00 2001 From: David Laight Date: Thu, 26 Mar 2026 20:18:04 +0000 Subject: [PATCH 16/37] ext4: fix diagnostic printf formats The formats for non-terminated names should be "%.*s" not "%*.s". The kernel currently treats "%*.s" as equivalent to "%*s" whereas userspace requires it be equivalent to "%*.0s". Neither is correct here. 
Signed-off-by: David Laight Link: https://patch.msgid.link/20260326201804.3881-1-david.laight.linux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 0b8e25198b17..838c01eb46ea 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -647,7 +647,7 @@ static struct stats dx_show_leaf(struct inode *dir, /* Directory is not encrypted */ (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); - printk("%*.s:(U)%x.%u ", len, + printk("%.*s:(U)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); @@ -683,7 +683,7 @@ static struct stats dx_show_leaf(struct inode *dir, (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); - printk("%*.s:(E)%x.%u ", len, name, + printk("%.*s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); fscrypt_fname_free_buffer( @@ -694,7 +694,7 @@ static struct stats dx_show_leaf(struct inode *dir, char *name = de->name; (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); - printk("%*.s:%x.%u ", len, name, h.hash, + printk("%.*s:%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); #endif } From 5447c8b9de7581ca7254d712652678cc460a18c2 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 27 Mar 2026 18:29:27 +0800 Subject: [PATCH 17/37] ext4: add did_zero output parameter to ext4_block_zero_page_range() Add a bool *did_zero output parameter to ext4_block_zero_page_range() and __ext4_block_zero_page_range(). The parameter reports whether a partial block was zeroed out, which is needed for the upcoming iomap buffered I/O conversion. 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260327102939.1095257-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 13cd564f89e1..f0c9c63f618b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4033,7 +4033,8 @@ void ext4_set_aops(struct inode *inode) * racing writeback can come later and flush the stale pagecache to disk. */ static int __ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length) + struct address_space *mapping, loff_t from, loff_t length, + bool *did_zero) { unsigned int offset, blocksize, pos; ext4_lblk_t iblock; @@ -4121,6 +4122,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, err = ext4_jbd2_inode_add_write(handle, inode, from, length); } + if (!err && did_zero) + *did_zero = true; unlock: folio_unlock(folio); @@ -4136,7 +4139,8 @@ unlock: * that corresponds to 'from' */ static int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length) + struct address_space *mapping, loff_t from, loff_t length, + bool *did_zero) { struct inode *inode = mapping->host; unsigned blocksize = inode->i_sb->s_blocksize; @@ -4150,10 +4154,11 @@ static int ext4_block_zero_page_range(handle_t *handle, length = max; if (IS_DAX(inode)) { - return dax_zero_range(inode, from, length, NULL, + return dax_zero_range(inode, from, length, did_zero, &ext4_iomap_ops); } - return __ext4_block_zero_page_range(handle, mapping, from, length); + return __ext4_block_zero_page_range(handle, mapping, from, length, + did_zero); } /* @@ -4176,7 +4181,7 @@ static int ext4_block_truncate_page(handle_t *handle, blocksize = i_blocksize(inode); length = blocksize - (from & (blocksize - 1)); - return ext4_block_zero_page_range(handle, mapping, from, length); + return 
ext4_block_zero_page_range(handle, mapping, from, length, NULL); } int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, @@ -4199,13 +4204,13 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, if (start == end && (partial_start || (partial_end != sb->s_blocksize - 1))) { err = ext4_block_zero_page_range(handle, mapping, - lstart, length); + lstart, length, NULL); return err; } /* Handle partial zero out on the start of the range */ if (partial_start) { - err = ext4_block_zero_page_range(handle, mapping, - lstart, sb->s_blocksize); + err = ext4_block_zero_page_range(handle, mapping, lstart, + sb->s_blocksize, NULL); if (err) return err; } @@ -4213,7 +4218,7 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, if (partial_end != sb->s_blocksize - 1) err = ext4_block_zero_page_range(handle, mapping, byte_end - partial_end, - partial_end + 1); + partial_end + 1, NULL); return err; } From bd099a0565fce5c771e1d0bfcefec26fb5b1c1b7 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 27 Mar 2026 18:29:28 +0800 Subject: [PATCH 18/37] ext4: rename and extend ext4_block_truncate_page() Rename ext4_block_truncate_page() to ext4_block_zero_eof() and extend its signature to accept an explicit 'end' offset instead of calculating the block boundary. This helper function now can replace all cases requiring zeroing of the partial EOF block, including the append buffered write paths in ext4_*_write_end(). 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260327102939.1095257-3-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/extents.c | 4 ++-- fs/ext4/inode.c | 43 ++++++++++++++++++++++++------------------- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 58fd1ea1e501..0149022ace5e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3097,6 +3097,8 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks); extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); +extern int ext4_block_zero_eof(handle_t *handle, struct inode *inode, + loff_t from, loff_t end); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8cce1479be6d..4e1792647937 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4654,8 +4654,8 @@ retry: inode_get_ctime(inode)); if (epos > old_size) { pagecache_isize_extended(inode, old_size, epos); - ext4_zero_partial_blocks(handle, inode, - old_size, epos - old_size); + ext4_block_zero_eof(handle, inode, old_size, + epos); } } ret2 = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f0c9c63f618b..5eade0040e53 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1468,7 +1468,7 @@ static int ext4_write_end(const struct kiocb *iocb, if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); - ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + ext4_block_zero_eof(handle, inode, old_size, pos); } /* * Don't mark the inode dirty under folio lock. 
First, it unnecessarily @@ -1586,7 +1586,7 @@ static int ext4_journalled_write_end(const struct kiocb *iocb, if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); - ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + ext4_block_zero_eof(handle, inode, old_size, pos); } if (size_changed) { @@ -3282,7 +3282,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, if (IS_ERR(handle)) return PTR_ERR(handle); if (zero_len) - ext4_zero_partial_blocks(handle, inode, old_size, zero_len); + ext4_block_zero_eof(handle, inode, old_size, pos); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); @@ -4162,26 +4162,31 @@ static int ext4_block_zero_page_range(handle_t *handle, } /* - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to `from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. + * Zero out a mapping from file offset 'from' up to the end of the block + * which corresponds to 'from' or to the given 'end' inside this block. + * This required during truncate up and performing append writes. We need + * to physically zero the tail end of that block so it doesn't yield old + * data if the file is grown. 
*/ -static int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from) +int ext4_block_zero_eof(handle_t *handle, struct inode *inode, + loff_t from, loff_t end) { - unsigned length; - unsigned blocksize; - struct inode *inode = mapping->host; + unsigned int blocksize = i_blocksize(inode); + unsigned int offset; + loff_t length = end - from; + offset = from & (blocksize - 1); + if (!offset || from >= end) + return 0; /* If we are processing an encrypted inode during orphan list handling */ if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; - blocksize = i_blocksize(inode); - length = blocksize - (from & (blocksize - 1)); + if (length > blocksize - offset) + length = blocksize - offset; - return ext4_block_zero_page_range(handle, mapping, from, length, NULL); + return ext4_block_zero_page_range(handle, inode->i_mapping, from, + length, NULL); } int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, @@ -4535,7 +4540,6 @@ int ext4_truncate(struct inode *inode) unsigned int credits; int err = 0, err2; handle_t *handle; - struct address_space *mapping = inode->i_mapping; /* * There is a possibility that we're either freeing the inode @@ -4578,8 +4582,9 @@ int ext4_truncate(struct inode *inode) goto out_trace; } + /* Zero to the end of the block containing i_size */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) - ext4_block_truncate_page(handle, mapping, inode->i_size); + ext4_block_zero_eof(handle, inode, inode->i_size, LLONG_MAX); /* * We add the inode to the orphan list, so that if this @@ -5968,8 +5973,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (oldsize & (inode->i_sb->s_blocksize - 1)) - ext4_block_truncate_page(handle, - inode->i_mapping, oldsize); + ext4_block_zero_eof(handle, inode, + oldsize, LLONG_MAX); } if (shrink) From 3b312a6f510ca217607ffacf5cbca2f08c402ec0 Mon Sep 17 00:00:00 2001 From: Zhang 
Yi Date: Fri, 27 Mar 2026 18:29:29 +0800 Subject: [PATCH 19/37] ext4: factor out journalled block zeroing range Refactor __ext4_block_zero_page_range() by separating the block zeroing operations for ordered data mode and journal data mode into two distinct functions: - ext4_block_do_zero_range(): handles non-journal data mode with ordered data support - ext4_block_journalled_zero_range(): handles journal data mode Also extract a common helper, ext4_load_tail_bh(), to handle buffer head and folio retrieval, along with the associated error handling. This prepares for converting the partial block zero range to the iomap infrastructure. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260327102939.1095257-4-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 98 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 29 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5eade0040e53..3d4650cfc3e0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4032,13 +4032,11 @@ void ext4_set_aops(struct inode *inode) * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a * racing writeback can come later and flush the stale pagecache to disk. 
*/ -static int __ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length, - bool *did_zero) +static struct buffer_head *ext4_load_tail_bh(struct inode *inode, loff_t from) { unsigned int offset, blocksize, pos; ext4_lblk_t iblock; - struct inode *inode = mapping->host; + struct address_space *mapping = inode->i_mapping; struct buffer_head *bh; struct folio *folio; int err = 0; @@ -4047,7 +4045,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (IS_ERR(folio)) - return PTR_ERR(folio); + return ERR_CAST(folio); blocksize = inode->i_sb->s_blocksize; @@ -4099,33 +4097,73 @@ static int __ext4_block_zero_page_range(handle_t *handle, } } } - if (ext4_should_journal_data(inode)) { - BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, inode->i_sb, bh, - EXT4_JTR_NONE); - if (err) - goto unlock; - } - folio_zero_range(folio, offset, length); + return bh; + +unlock: + folio_unlock(folio); + folio_put(folio); + return err ? ERR_PTR(err) : NULL; +} + +static int ext4_block_do_zero_range(handle_t *handle, struct inode *inode, + loff_t from, loff_t length, bool *did_zero) +{ + struct buffer_head *bh; + struct folio *folio; + int err = 0; + + bh = ext4_load_tail_bh(inode, from); + if (IS_ERR_OR_NULL(bh)) + return PTR_ERR_OR_ZERO(bh); + + folio = bh->b_folio; + folio_zero_range(folio, offset_in_folio(folio, from), length); BUFFER_TRACE(bh, "zeroed end of block"); - if (ext4_should_journal_data(inode)) { - err = ext4_dirty_journalled_data(handle, bh); - } else { - mark_buffer_dirty(bh); - /* - * Only the written block requires ordered data to prevent - * exposing stale data. 
- */ - if (!buffer_unwritten(bh) && !buffer_delay(bh) && - ext4_should_order_data(inode)) - err = ext4_jbd2_inode_add_write(handle, inode, from, - length); - } + mark_buffer_dirty(bh); + /* + * Only the written block requires ordered data to prevent exposing + * stale data. + */ + if (ext4_should_order_data(inode) && + !buffer_unwritten(bh) && !buffer_delay(bh)) + err = ext4_jbd2_inode_add_write(handle, inode, from, length); if (!err && did_zero) *did_zero = true; -unlock: + folio_unlock(folio); + folio_put(folio); + return err; +} + +static int ext4_block_journalled_zero_range(handle_t *handle, + struct inode *inode, loff_t from, loff_t length, bool *did_zero) +{ + struct buffer_head *bh; + struct folio *folio; + int err; + + bh = ext4_load_tail_bh(inode, from); + if (IS_ERR_OR_NULL(bh)) + return PTR_ERR_OR_ZERO(bh); + folio = bh->b_folio; + + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); + if (err) + goto out; + + folio_zero_range(folio, offset_in_folio(folio, from), length); + BUFFER_TRACE(bh, "zeroed end of block"); + + err = ext4_dirty_journalled_data(handle, bh); + if (err) + goto out; + + if (did_zero) + *did_zero = true; +out: folio_unlock(folio); folio_put(folio); return err; @@ -4156,9 +4194,11 @@ static int ext4_block_zero_page_range(handle_t *handle, if (IS_DAX(inode)) { return dax_zero_range(inode, from, length, did_zero, &ext4_iomap_ops); + } else if (ext4_should_journal_data(inode)) { + return ext4_block_journalled_zero_range(handle, inode, from, + length, did_zero); } - return __ext4_block_zero_page_range(handle, mapping, from, length, - did_zero); + return ext4_block_do_zero_range(handle, inode, from, length, did_zero); } /* From ad11526d1504641b632918e202e23c9c80923fff Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 27 Mar 2026 18:29:30 +0800 Subject: [PATCH 20/37] ext4: rename ext4_block_zero_page_range() to ext4_block_zero_range() Rename ext4_block_zero_page_range() to 
ext4_block_zero_range() since the "page" naming is no longer appropriate for the current context. Also change its signature to take an inode pointer instead of an address_space. This aligns with the caller ext4_block_zero_eof() and ext4_zero_partial_blocks(). Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260327102939.1095257-5-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3d4650cfc3e0..0e39c65880aa 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4170,17 +4170,14 @@ out: } /* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that corresponds to 'from' + * Zeros out a mapping of length 'length' starting from file offset + * 'from'. The range to be zero'd must be contained within one block. + * If the specified range exceeds the end of the block it will be + * shortened to end of the block that corresponds to 'from'. 
*/ -static int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length, - bool *did_zero) +static int ext4_block_zero_range(handle_t *handle, struct inode *inode, + loff_t from, loff_t length, bool *did_zero) { - struct inode *inode = mapping->host; unsigned blocksize = inode->i_sb->s_blocksize; unsigned int max = blocksize - (from & (blocksize - 1)); @@ -4225,15 +4222,13 @@ int ext4_block_zero_eof(handle_t *handle, struct inode *inode, if (length > blocksize - offset) length = blocksize - offset; - return ext4_block_zero_page_range(handle, inode->i_mapping, from, - length, NULL); + return ext4_block_zero_range(handle, inode, from, length, NULL); } int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t length) { struct super_block *sb = inode->i_sb; - struct address_space *mapping = inode->i_mapping; unsigned partial_start, partial_end; ext4_fsblk_t start, end; loff_t byte_end = (lstart + length - 1); @@ -4248,22 +4243,22 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, /* Handle partial zero within the single block */ if (start == end && (partial_start || (partial_end != sb->s_blocksize - 1))) { - err = ext4_block_zero_page_range(handle, mapping, - lstart, length, NULL); + err = ext4_block_zero_range(handle, inode, lstart, + length, NULL); return err; } /* Handle partial zero out on the start of the range */ if (partial_start) { - err = ext4_block_zero_page_range(handle, mapping, lstart, - sb->s_blocksize, NULL); + err = ext4_block_zero_range(handle, inode, lstart, + sb->s_blocksize, NULL); if (err) return err; } /* Handle partial zero out on the end of the range */ if (partial_end != sb->s_blocksize - 1) - err = ext4_block_zero_page_range(handle, mapping, - byte_end - partial_end, - partial_end + 1, NULL); + err = ext4_block_zero_range(handle, inode, + byte_end - partial_end, + partial_end + 1, NULL); return err; } From 
69e2d5c1f544982389327ff90b491a0f7d1afe48 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 27 Mar 2026 18:29:31 +0800 Subject: [PATCH 21/37] ext4: move ordered data handling out of ext4_block_do_zero_range() Remove the handle parameter from ext4_block_do_zero_range() and move the ordered data handling to ext4_block_zero_eof(). This is necessary for truncate up and append writes across a range extending beyond EOF. The ordered data must be committed before updating i_disksize to prevent exposing stale on-disk data from concurrent post-EOF mmap writes during previous folio writeback or in case of system crash during append writes. This is unnecessary for partial block hole punching because the entire punch operation does not provide atomicity guarantees and can already expose intermediate results in case of crash. Hole punching can only ever expose data that was there before the punch but missed zeroing during append / truncate could expose data that was not visible in the file before the operation. Since ordered data handling is no longer performed inside ext4_zero_partial_blocks(), ext4_punch_hole() no longer needs to attach jinode. This is prepared for the conversion to the iomap infrastructure, which does not use ordered data mode while zeroing post-EOF partial blocks. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260327102939.1095257-6-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 61 ++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0e39c65880aa..57e36b8b5070 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4105,12 +4105,12 @@ unlock: return err ? 
ERR_PTR(err) : NULL; } -static int ext4_block_do_zero_range(handle_t *handle, struct inode *inode, - loff_t from, loff_t length, bool *did_zero) +static int ext4_block_do_zero_range(struct inode *inode, loff_t from, + loff_t length, bool *did_zero, + bool *zero_written) { struct buffer_head *bh; struct folio *folio; - int err = 0; bh = ext4_load_tail_bh(inode, from); if (IS_ERR_OR_NULL(bh)) @@ -4121,19 +4121,14 @@ static int ext4_block_do_zero_range(handle_t *handle, struct inode *inode, BUFFER_TRACE(bh, "zeroed end of block"); mark_buffer_dirty(bh); - /* - * Only the written block requires ordered data to prevent exposing - * stale data. - */ - if (ext4_should_order_data(inode) && - !buffer_unwritten(bh) && !buffer_delay(bh)) - err = ext4_jbd2_inode_add_write(handle, inode, from, length); - if (!err && did_zero) + if (did_zero) *did_zero = true; + if (zero_written && !buffer_unwritten(bh) && !buffer_delay(bh)) + *zero_written = true; folio_unlock(folio); folio_put(folio); - return err; + return 0; } static int ext4_block_journalled_zero_range(handle_t *handle, @@ -4176,7 +4171,8 @@ out: * shortened to end of the block that corresponds to 'from'. 
*/ static int ext4_block_zero_range(handle_t *handle, struct inode *inode, - loff_t from, loff_t length, bool *did_zero) + loff_t from, loff_t length, bool *did_zero, + bool *zero_written) { unsigned blocksize = inode->i_sb->s_blocksize; unsigned int max = blocksize - (from & (blocksize - 1)); @@ -4195,7 +4191,8 @@ static int ext4_block_zero_range(handle_t *handle, struct inode *inode, return ext4_block_journalled_zero_range(handle, inode, from, length, did_zero); } - return ext4_block_do_zero_range(handle, inode, from, length, did_zero); + return ext4_block_do_zero_range(inode, from, length, did_zero, + zero_written); } /* @@ -4211,6 +4208,9 @@ int ext4_block_zero_eof(handle_t *handle, struct inode *inode, unsigned int blocksize = i_blocksize(inode); unsigned int offset; loff_t length = end - from; + bool did_zero = false; + bool zero_written = false; + int err; offset = from & (blocksize - 1); if (!offset || from >= end) @@ -4222,7 +4222,21 @@ int ext4_block_zero_eof(handle_t *handle, struct inode *inode, if (length > blocksize - offset) length = blocksize - offset; - return ext4_block_zero_range(handle, inode, from, length, NULL); + err = ext4_block_zero_range(handle, inode, from, length, + &did_zero, &zero_written); + if (err) + return err; + /* + * It's necessary to order zeroed data before updating i_disksize when + * truncating up or performing an append write, because stale on-disk + * data might otherwise be exposed by a concurrent post-EOF + * mmap write during folio writeback. 
+ */ + if (ext4_should_order_data(inode) && + did_zero && zero_written && !IS_DAX(inode)) + err = ext4_jbd2_inode_add_write(handle, inode, from, length); + + return err; } int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, @@ -4244,13 +4258,13 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, if (start == end && (partial_start || (partial_end != sb->s_blocksize - 1))) { err = ext4_block_zero_range(handle, inode, lstart, - length, NULL); + length, NULL, NULL); return err; } /* Handle partial zero out on the start of the range */ if (partial_start) { err = ext4_block_zero_range(handle, inode, lstart, - sb->s_blocksize, NULL); + sb->s_blocksize, NULL, NULL); if (err) return err; } @@ -4258,7 +4272,7 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, if (partial_end != sb->s_blocksize - 1) err = ext4_block_zero_range(handle, inode, byte_end - partial_end, - partial_end + 1, NULL); + partial_end + 1, NULL, NULL); return err; } @@ -4433,17 +4447,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) end = max_end; length = end - offset; - /* - * Attach jinode to inode for jbd2 if we do any zeroing of partial - * block. - */ - if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { - ret = ext4_inode_attach_jinode(inode); - if (ret < 0) - return ret; - } - - ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) return ret; From d3609a71b777d073ea6ead2e6eed93e97841fa21 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 27 Mar 2026 18:29:32 +0800 Subject: [PATCH 22/37] ext4: remove handle parameters from zero partial block functions Only journal data mode requires an active journal handle when zeroing partial blocks. Stop passing handle_t *handle to ext4_zero_partial_blocks() and related functions, and make ext4_block_journalled_zero_range() start a handle independently. 
This change has no practical impact now because all callers invoke these functions within the context of an active handle. It prepares for moving ext4_block_zero_eof() out of an active handle in the next patch, which is a prerequisite for converting block zero range operations to iomap infrastructure. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260327102939.1095257-7-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 7 ++--- fs/ext4/extents.c | 5 ++-- fs/ext4/inode.c | 71 ++++++++++++++++++++++++++++------------------- 3 files changed, 48 insertions(+), 35 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0149022ace5e..75559a771a54 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3097,10 +3097,9 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks); extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); -extern int ext4_block_zero_eof(handle_t *handle, struct inode *inode, - loff_t from, loff_t end); -extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, - loff_t lstart, loff_t lend); +extern int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end); +extern int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, + loff_t length); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4e1792647937..477f939828b9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4654,8 +4654,7 @@ retry: inode_get_ctime(inode)); if (epos > old_size) { pagecache_isize_extended(inode, old_size, epos); - ext4_block_zero_eof(handle, inode, old_size, - epos); + ext4_block_zero_eof(inode, old_size, epos); } } ret2 = ext4_mark_inode_dirty(handle, inode); @@ -4773,7 
+4772,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* Zero out partial block at the edges of the range */ - ret = ext4_zero_partial_blocks(handle, inode, offset, len); + ret = ext4_zero_partial_blocks(inode, offset, len); if (ret) goto out_handle; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 57e36b8b5070..cf537a577392 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1468,7 +1468,7 @@ static int ext4_write_end(const struct kiocb *iocb, if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); - ext4_block_zero_eof(handle, inode, old_size, pos); + ext4_block_zero_eof(inode, old_size, pos); } /* * Don't mark the inode dirty under folio lock. First, it unnecessarily @@ -1586,7 +1586,7 @@ static int ext4_journalled_write_end(const struct kiocb *iocb, if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); - ext4_block_zero_eof(handle, inode, old_size, pos); + ext4_block_zero_eof(inode, old_size, pos); } if (size_changed) { @@ -3282,7 +3282,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, if (IS_ERR(handle)) return PTR_ERR(handle); if (zero_len) - ext4_block_zero_eof(handle, inode, old_size, pos); + ext4_block_zero_eof(inode, old_size, pos); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); @@ -4131,16 +4131,23 @@ static int ext4_block_do_zero_range(struct inode *inode, loff_t from, return 0; } -static int ext4_block_journalled_zero_range(handle_t *handle, - struct inode *inode, loff_t from, loff_t length, bool *did_zero) +static int ext4_block_journalled_zero_range(struct inode *inode, loff_t from, + loff_t length, bool *did_zero) { struct buffer_head *bh; struct folio *folio; + handle_t *handle; int err; + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + bh = ext4_load_tail_bh(inode, from); - if (IS_ERR_OR_NULL(bh)) - return PTR_ERR_OR_ZERO(bh); + if (IS_ERR_OR_NULL(bh)) { + err = 
PTR_ERR_OR_ZERO(bh); + goto out_handle; + } folio = bh->b_folio; BUFFER_TRACE(bh, "get write access"); @@ -4161,6 +4168,8 @@ static int ext4_block_journalled_zero_range(handle_t *handle, out: folio_unlock(folio); folio_put(folio); +out_handle: + ext4_journal_stop(handle); return err; } @@ -4170,7 +4179,7 @@ out: * If the specified range exceeds the end of the block it will be * shortened to end of the block that corresponds to 'from'. */ -static int ext4_block_zero_range(handle_t *handle, struct inode *inode, +static int ext4_block_zero_range(struct inode *inode, loff_t from, loff_t length, bool *did_zero, bool *zero_written) { @@ -4188,8 +4197,8 @@ static int ext4_block_zero_range(handle_t *handle, struct inode *inode, return dax_zero_range(inode, from, length, did_zero, &ext4_iomap_ops); } else if (ext4_should_journal_data(inode)) { - return ext4_block_journalled_zero_range(handle, inode, from, - length, did_zero); + return ext4_block_journalled_zero_range(inode, from, length, + did_zero); } return ext4_block_do_zero_range(inode, from, length, did_zero, zero_written); @@ -4202,8 +4211,7 @@ static int ext4_block_zero_range(handle_t *handle, struct inode *inode, * to physically zero the tail end of that block so it doesn't yield old * data if the file is grown. */ -int ext4_block_zero_eof(handle_t *handle, struct inode *inode, - loff_t from, loff_t end) +int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end) { unsigned int blocksize = i_blocksize(inode); unsigned int offset; @@ -4222,7 +4230,7 @@ int ext4_block_zero_eof(handle_t *handle, struct inode *inode, if (length > blocksize - offset) length = blocksize - offset; - err = ext4_block_zero_range(handle, inode, from, length, + err = ext4_block_zero_range(inode, from, length, &did_zero, &zero_written); if (err) return err; @@ -4233,14 +4241,23 @@ int ext4_block_zero_eof(handle_t *handle, struct inode *inode, * mmap write during folio writeback. 
*/ if (ext4_should_order_data(inode) && - did_zero && zero_written && !IS_DAX(inode)) - err = ext4_jbd2_inode_add_write(handle, inode, from, length); + did_zero && zero_written && !IS_DAX(inode)) { + handle_t *handle; - return err; + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_jbd2_inode_add_write(handle, inode, from, length); + ext4_journal_stop(handle); + if (err) + return err; + } + + return 0; } -int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, - loff_t lstart, loff_t length) +int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, loff_t length) { struct super_block *sb = inode->i_sb; unsigned partial_start, partial_end; @@ -4257,21 +4274,19 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, /* Handle partial zero within the single block */ if (start == end && (partial_start || (partial_end != sb->s_blocksize - 1))) { - err = ext4_block_zero_range(handle, inode, lstart, - length, NULL, NULL); + err = ext4_block_zero_range(inode, lstart, length, NULL, NULL); return err; } /* Handle partial zero out on the start of the range */ if (partial_start) { - err = ext4_block_zero_range(handle, inode, lstart, - sb->s_blocksize, NULL, NULL); + err = ext4_block_zero_range(inode, lstart, sb->s_blocksize, + NULL, NULL); if (err) return err; } /* Handle partial zero out on the end of the range */ if (partial_end != sb->s_blocksize - 1) - err = ext4_block_zero_range(handle, inode, - byte_end - partial_end, + err = ext4_block_zero_range(inode, byte_end - partial_end, partial_end + 1, NULL, NULL); return err; } @@ -4467,7 +4482,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) return ret; } - ret = ext4_zero_partial_blocks(handle, inode, offset, length); + ret = ext4_zero_partial_blocks(inode, offset, length); if (ret) goto out_handle; @@ -4622,7 +4637,7 @@ int ext4_truncate(struct inode *inode) /* Zero to the end of the block containing 
i_size */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) - ext4_block_zero_eof(handle, inode, inode->i_size, LLONG_MAX); + ext4_block_zero_eof(inode, inode->i_size, LLONG_MAX); /* * We add the inode to the orphan list, so that if this @@ -6011,8 +6026,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (oldsize & (inode->i_sb->s_blocksize - 1)) - ext4_block_zero_eof(handle, inode, - oldsize, LLONG_MAX); + ext4_block_zero_eof(inode, oldsize, + LLONG_MAX); } if (shrink) From ad1876bc4c4cae59f747b4225007cdc31f834597 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 27 Mar 2026 18:29:33 +0800 Subject: [PATCH 23/37] ext4: pass allocate range as loff_t to ext4_alloc_file_blocks() Change ext4_alloc_file_blocks() to accept offset and len in byte granularity instead of block granularity. This allows callers to pass byte offsets and lengths directly, and this prepares for moving the ext4_zero_partial_blocks() call from the while(len) loop for unaligned append writes, where it only needs to be invoked once before doing block allocation. 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260327102939.1095257-8-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 53 ++++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 477f939828b9..26fa81ee01dd 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4571,15 +4571,15 @@ retry_remove_space: return err; } -static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, - ext4_lblk_t len, loff_t new_size, - int flags) +static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len, + loff_t new_size, int flags) { struct inode *inode = file_inode(file); handle_t *handle; int ret = 0, ret2 = 0, ret3 = 0; int retries = 0; int depth = 0; + ext4_lblk_t len_lblk; struct ext4_map_blocks map; unsigned int credits; loff_t epos, old_size = i_size_read(inode); @@ -4587,14 +4587,14 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, bool alloc_zero = false; BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); - map.m_lblk = offset; - map.m_len = len; + map.m_lblk = offset >> blkbits; + map.m_len = len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); /* * Don't normalize the request if it can fit in one extent so * that it doesn't get unnecessarily split into multiple * extents. */ - if (len <= EXT_UNWRITTEN_MAX_LEN) + if (len_lblk <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; /* @@ -4611,16 +4611,16 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, /* * credits to insert 1 extent into extent tree */ - credits = ext4_chunk_trans_blocks(inode, len); + credits = ext4_chunk_trans_blocks(inode, len_lblk); depth = ext_depth(inode); retry: - while (len) { + while (len_lblk) { /* * Recalculate credits when extent tree depth changes. 
*/ if (depth != ext_depth(inode)) { - credits = ext4_chunk_trans_blocks(inode, len); + credits = ext4_chunk_trans_blocks(inode, len_lblk); depth = ext_depth(inode); } @@ -4677,7 +4677,7 @@ retry: } map.m_lblk += ret; - map.m_len = len = len - ret; + map.m_len = len_lblk = len_lblk - ret; } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -4694,11 +4694,9 @@ static long ext4_zero_range(struct file *file, loff_t offset, { struct inode *inode = file_inode(file); handle_t *handle = NULL; - loff_t new_size = 0; + loff_t align_start, align_end, new_size = 0; loff_t end = offset + len; - ext4_lblk_t start_lblk, end_lblk; unsigned int blocksize = i_blocksize(inode); - unsigned int blkbits = inode->i_blkbits; int ret, flags, credits; trace_ext4_zero_range(inode, offset, len, mode); @@ -4719,11 +4717,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; /* Preallocate the range including the unaligned edges */ if (!IS_ALIGNED(offset | end, blocksize)) { - ext4_lblk_t alloc_lblk = offset >> blkbits; - ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); - - ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk, - new_size, flags); + ret = ext4_alloc_file_blocks(file, offset, len, new_size, + flags); if (ret) return ret; } @@ -4738,18 +4733,17 @@ static long ext4_zero_range(struct file *file, loff_t offset, return ret; /* Zero range excluding the unaligned edges */ - start_lblk = EXT4_B_TO_LBLK(inode, offset); - end_lblk = end >> blkbits; - if (end_lblk > start_lblk) { - ext4_lblk_t zero_blks = end_lblk - start_lblk; - + align_start = round_up(offset, blocksize); + align_end = round_down(end, blocksize); + if (align_end > align_start) { if (mode & FALLOC_FL_WRITE_ZEROES) flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE; else flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); - ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks, - new_size, flags); + 
ret = ext4_alloc_file_blocks(file, align_start, + align_end - align_start, new_size, + flags); if (ret) return ret; } @@ -4797,15 +4791,11 @@ static long ext4_do_fallocate(struct file *file, loff_t offset, struct inode *inode = file_inode(file); loff_t end = offset + len; loff_t new_size = 0; - ext4_lblk_t start_lblk, len_lblk; int ret; trace_ext4_fallocate_enter(inode, offset, len, mode); WARN_ON_ONCE(!inode_is_locked(inode)); - start_lblk = offset >> inode->i_blkbits; - len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits); - /* We only support preallocation for extent-based files only. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; @@ -4820,7 +4810,7 @@ static long ext4_do_fallocate(struct file *file, loff_t offset, goto out; } - ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size, + ret = ext4_alloc_file_blocks(file, offset, len, new_size, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); if (ret) goto out; @@ -4830,7 +4820,8 @@ static long ext4_do_fallocate(struct file *file, loff_t offset, EXT4_I(inode)->i_sync_tid); } out: - trace_ext4_fallocate_exit(inode, offset, len_lblk, ret); + trace_ext4_fallocate_exit(inode, offset, + EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits), ret); return ret; } From c4602a1d09ec7c6dd6f53e5faf3f04e9c02d71eb Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 27 Mar 2026 18:29:34 +0800 Subject: [PATCH 24/37] ext4: move zero partial block range functions out of active handle Move ext4_block_zero_eof() and ext4_zero_partial_blocks() calls out of the active handle context, making them independent operations, and also add return value checks. This is safe because it still ensures data is updated before metadata for data=ordered mode and data=journal mode because we still zero data and ordering data before modifying the metadata. This change is required for iomap infrastructure conversion because the iomap buffered I/O path does not use the same journal infrastructure for partial block zeroing. 
The lock ordering of folio lock and starting transactions is "folio lock -> transaction start", which is opposite of the current path. Therefore, zeroing partial blocks cannot be performed under the active handle. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260327102939.1095257-9-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 32 +++++++++++++++----------------- fs/ext4/inode.c | 47 ++++++++++++++++++++++++++--------------------- 2 files changed, 41 insertions(+), 38 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 26fa81ee01dd..0053e2123fea 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4614,6 +4614,13 @@ static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len, credits = ext4_chunk_trans_blocks(inode, len_lblk); depth = ext_depth(inode); + /* Zero to the end of the block containing i_size */ + if (new_size > old_size) { + ret = ext4_block_zero_eof(inode, old_size, LLONG_MAX); + if (ret) + return ret; + } + retry: while (len_lblk) { /* @@ -4652,10 +4659,8 @@ retry: if (ext4_update_inode_size(inode, epos) & 0x1) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); - if (epos > old_size) { + if (epos > old_size) pagecache_isize_extended(inode, old_size, epos); - ext4_block_zero_eof(inode, old_size, epos); - } } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -4697,7 +4702,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t align_start, align_end, new_size = 0; loff_t end = offset + len; unsigned int blocksize = i_blocksize(inode); - int ret, flags, credits; + int ret, flags; trace_ext4_zero_range(inode, offset, len, mode); WARN_ON_ONCE(!inode_is_locked(inode)); @@ -4751,25 +4756,18 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (IS_ALIGNED(offset | end, blocksize)) return ret; - /* - * In worst case we have to writeout two nonadjacent unwritten - * blocks 
and update the inode - */ - credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; - if (ext4_should_journal_data(inode)) - credits += 2; - handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(inode, offset, len); + if (ret) + return ret; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); return ret; } - /* Zero out partial block at the edges of the range */ - ret = ext4_zero_partial_blocks(inode, offset, len); - if (ret) - goto out_handle; - if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cf537a577392..54a376ee0717 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4471,8 +4471,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (ret) return ret; + ret = ext4_zero_partial_blocks(inode, offset, length); + if (ret) + return ret; + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - credits = ext4_chunk_trans_extent(inode, 2); + credits = ext4_chunk_trans_extent(inode, 0); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); @@ -4482,10 +4486,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) return ret; } - ret = ext4_zero_partial_blocks(inode, offset, length); - if (ret) - goto out_handle; - /* If there are blocks to remove, do it */ start_lblk = EXT4_B_TO_LBLK(inode, offset); end_lblk = end >> inode->i_blkbits; @@ -4622,6 +4622,11 @@ int ext4_truncate(struct inode *inode) err = ext4_inode_attach_jinode(inode); if (err) goto out_trace; + + /* Zero to the end of the block containing i_size */ + err = ext4_block_zero_eof(inode, inode->i_size, LLONG_MAX); + if (err) + goto out_trace; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4635,10 +4640,6 @@ int 
ext4_truncate(struct inode *inode) goto out_trace; } - /* Zero to the end of the block containing i_size */ - if (inode->i_size & (inode->i_sb->s_blocksize - 1)) - ext4_block_zero_eof(inode, inode->i_size, LLONG_MAX); - /* * We add the inode to the orphan list, so that if this * truncate spans multiple transactions, and we crash, we will @@ -6008,15 +6009,6 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, goto out_mmap_sem; } - handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto out_mmap_sem; - } - if (ext4_handle_valid(handle) && shrink) { - error = ext4_orphan_add(handle, inode); - orphan = 1; - } /* * Update c/mtime and tail zero the EOF folio on * truncate up. ext4_truncate() handles the shrink case @@ -6025,9 +6017,22 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (!shrink) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - if (oldsize & (inode->i_sb->s_blocksize - 1)) - ext4_block_zero_eof(inode, oldsize, - LLONG_MAX); + if (oldsize & (inode->i_sb->s_blocksize - 1)) { + error = ext4_block_zero_eof(inode, + oldsize, LLONG_MAX); + if (error) + goto out_mmap_sem; + } + } + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto out_mmap_sem; + } + if (ext4_handle_valid(handle) && shrink) { + error = ext4_orphan_add(handle, inode); + orphan = 1; } if (shrink) From 7d81ec0246ff74b10d92a4617fea84eaf06162c0 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 27 Mar 2026 18:29:35 +0800 Subject: [PATCH 25/37] ext4: ensure zeroed partial blocks are persisted in SYNC mode In ext4_zero_range() and ext4_punch_hole(), when operating in SYNC mode and zeroing a partial block, only data=journal modes guarantee that the zeroed data is synchronously persisted after the operation completes. For data=ordered/writeback mode and non-journal modes, this guarantee is missing. 
Introduce a partial_zero parameter to explicitly trigger writeback for all scenarios where a partial block is zeroed, ensuring the zeroed data is durably persisted. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20260327102939.1095257-10-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 9 ++++++++- fs/ext4/inode.c | 19 ++++++++++++++----- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 75559a771a54..56b82d4a15d7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3099,7 +3099,7 @@ extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); extern int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end); extern int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, - loff_t length); + loff_t length, bool *did_zero); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0053e2123fea..00b9860d3875 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4702,6 +4702,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t align_start, align_end, new_size = 0; loff_t end = offset + len; unsigned int blocksize = i_blocksize(inode); + bool partial_zeroed = false; int ret, flags; trace_ext4_zero_range(inode, offset, len, mode); @@ -4757,9 +4758,15 @@ static long ext4_zero_range(struct file *file, loff_t offset, return ret; /* Zero out partial block at the edges of the range */ - ret = ext4_zero_partial_blocks(inode, offset, len); + ret = ext4_zero_partial_blocks(inode, offset, len, &partial_zeroed); if (ret) return ret; + if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && partial_zeroed) { + ret = filemap_write_and_wait_range(inode->i_mapping, offset, + end - 1); + if (ret) + return ret; + } handle = 
ext4_journal_start(inode, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 54a376ee0717..cb1365bbb843 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4257,7 +4257,8 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end) return 0; } -int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, loff_t length) +int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, loff_t length, + bool *did_zero) { struct super_block *sb = inode->i_sb; unsigned partial_start, partial_end; @@ -4274,20 +4275,21 @@ int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, loff_t length) /* Handle partial zero within the single block */ if (start == end && (partial_start || (partial_end != sb->s_blocksize - 1))) { - err = ext4_block_zero_range(inode, lstart, length, NULL, NULL); + err = ext4_block_zero_range(inode, lstart, length, did_zero, + NULL); return err; } /* Handle partial zero out on the start of the range */ if (partial_start) { err = ext4_block_zero_range(inode, lstart, sb->s_blocksize, - NULL, NULL); + did_zero, NULL); if (err) return err; } /* Handle partial zero out on the end of the range */ if (partial_end != sb->s_blocksize - 1) err = ext4_block_zero_range(inode, byte_end - partial_end, - partial_end + 1, NULL, NULL); + partial_end + 1, did_zero, NULL); return err; } @@ -4436,6 +4438,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) loff_t end = offset + length; handle_t *handle; unsigned int credits; + bool partial_zeroed = false; int ret; trace_ext4_punch_hole(inode, offset, length, 0); @@ -4471,9 +4474,15 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (ret) return ret; - ret = ext4_zero_partial_blocks(inode, offset, length); + ret = ext4_zero_partial_blocks(inode, offset, length, &partial_zeroed); if (ret) return ret; + if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && partial_zeroed) { + ret = 
filemap_write_and_wait_range(inode->i_mapping, offset, + end - 1); + if (ret) + return ret; + } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_chunk_trans_extent(inode, 0); From c3688d212fc6306bbb7136fbc1d0be0f175a5270 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 27 Mar 2026 18:29:36 +0800 Subject: [PATCH 26/37] ext4: unify SYNC mode checks in fallocate paths In the ext4 fallocate call chain, SYNC mode handling is inconsistent: some places check the inode state, while others check the open file descriptor state. Unify these checks by evaluating both conditions to ensure consistent behavior across all fallocate operations. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260327102939.1095257-11-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 9 +++++---- fs/ext4/inode.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 00b9860d3875..053aeb9f0e74 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4782,7 +4782,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); - if (file->f_flags & O_SYNC) + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: @@ -4820,7 +4820,8 @@ static long ext4_do_fallocate(struct file *file, loff_t offset, if (ret) goto out; - if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && + EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } @@ -5593,7 +5594,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); - if (IS_SYNC(inode)) + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: @@ -5717,7 +5718,7 @@ static int 
ext4_insert_range(struct file *file, loff_t offset, loff_t len) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); - if (IS_SYNC(inode)) + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cb1365bbb843..9c1b95c439d5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4531,7 +4531,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); - if (IS_SYNC(inode)) + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); From 116c0bdac2ec059d91045ba3f57cc90cb1e3b71d Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 27 Mar 2026 18:29:37 +0800 Subject: [PATCH 27/37] ext4: remove ctime/mtime update from ext4_alloc_file_blocks() The ctime and mtime update is already handled by file_modified() in ext4_fallocate(), the caller of ext4_alloc_file_blocks(). So remove the redundant calls to inode_set_ctime_current() and inode_set_mtime_to_ts() in ext4_alloc_file_blocks(). 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260327102939.1095257-12-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 053aeb9f0e74..4e7e798a5e49 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4652,13 +4652,10 @@ retry: */ retries = 0; epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret); - inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) epos = new_size; - if (ext4_update_inode_size(inode, epos) & 0x1) - inode_set_mtime_to_ts(inode, - inode_get_ctime(inode)); + ext4_update_inode_size(inode, epos); if (epos > old_size) pagecache_isize_extended(inode, old_size, epos); } From 1ad0f42823291bcac371dafd37533f5e8d92acc3 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 27 Mar 2026 18:29:38 +0800 Subject: [PATCH 28/37] ext4: move pagecache_isize_extended() out of active handle In ext4_alloc_file_blocks(), pagecache_isize_extended() is called under an active handle and may also hold folio lock if the block size is smaller than the folio size. This also breaks the "folio lock -> transaction start" lock ordering for the upcoming iomap buffered I/O path. Therefore, move pagecache_isize_extended() outside of an active handle. Additionally, it is unnecessary to update the file length during each iteration of the allocation loop. Instead, update the file length only to the position where the allocation is successful. Postpone updating the inode size until after the allocation loop completes or is interrupted due to an error. 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260327102939.1095257-13-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 62 +++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4e7e798a5e49..11e76deace4b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4582,7 +4582,7 @@ static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len, ext4_lblk_t len_lblk; struct ext4_map_blocks map; unsigned int credits; - loff_t epos, old_size = i_size_read(inode); + loff_t epos = 0, old_size = i_size_read(inode); unsigned int blkbits = inode->i_blkbits; bool alloc_zero = false; @@ -4647,44 +4647,60 @@ retry: ext4_journal_stop(handle); break; } + ext4_update_inode_fsync_trans(handle, inode, 1); + ret = ext4_journal_stop(handle); + if (unlikely(ret)) + break; + /* * allow a full retry cycle for any remaining allocations */ retries = 0; - epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret); - if (new_size) { - if (epos > new_size) - epos = new_size; - ext4_update_inode_size(inode, epos); - if (epos > old_size) - pagecache_isize_extended(inode, old_size, epos); - } - ret2 = ext4_mark_inode_dirty(handle, inode); - ext4_update_inode_fsync_trans(handle, inode, 1); - ret3 = ext4_journal_stop(handle); - ret2 = ret3 ? 
ret3 : ret2; - if (unlikely(ret2)) - break; if (alloc_zero && (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) { - ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk, - map.m_len); - if (likely(!ret2)) - ret2 = ext4_convert_unwritten_extents(NULL, + ret = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk, + map.m_len); + if (likely(!ret)) + ret = ext4_convert_unwritten_extents(NULL, inode, (loff_t)map.m_lblk << blkbits, (loff_t)map.m_len << blkbits); - if (ret2) + if (ret) break; } - map.m_lblk += ret; - map.m_len = len_lblk = len_lblk - ret; + map.m_lblk += map.m_len; + map.m_len = len_lblk = len_lblk - map.m_len; + epos = EXT4_LBLK_TO_B(inode, map.m_lblk); } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - return ret > 0 ? ret2 : ret; + if (!epos || !new_size) + return ret; + + /* + * Allocate blocks, update the file size to match the size of the + * already successfully allocated blocks. + */ + if (epos > new_size) + epos = new_size; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return ret ? ret : PTR_ERR(handle); + + ext4_update_inode_size(inode, epos); + ret2 = ext4_mark_inode_dirty(handle, inode); + ext4_update_inode_fsync_trans(handle, inode, 1); + ret3 = ext4_journal_stop(handle); + ret2 = ret3 ? ret3 : ret2; + + if (epos > old_size) + pagecache_isize_extended(inode, old_size, epos); + + return ret ? ret : ret2; } static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); From 3f60efd65412dfe4ff33b376a983220ef74056b1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 27 Mar 2026 18:29:39 +0800 Subject: [PATCH 29/37] ext4: zero post-EOF partial block before appending write In cases of appending write beyond EOF, ext4_zero_partial_blocks() is called within ext4_*_write_end() to zero out the partial block beyond EOF. This prevents exposing stale data that might be written through mmap. However, supporting only the regular buffered write path is insufficient. 
It is also necessary to support the DAX path as well as the upcoming iomap buffered write path. Therefore, move this operation to ext4_write_checks(). In addition, this may introduce a race window in which a post-EOF buffered write can race with an mmap write after the old EOF block has been zeroed. As a result, the data in this block written by the buffer-write and the data written by the mmap-write may be mixed. However, this is safe because users should not rely on the result of the race condition. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260327102939.1095257-14-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 17 +++++++++++++++ fs/ext4/inode.c | 21 +++++++-------------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f1dc5ce791a7..ec0d81bea07a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -271,6 +271,8 @@ static ssize_t ext4_generic_write_checks(struct kiocb *iocb, static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) { + struct inode *inode = file_inode(iocb->ki_filp); + loff_t old_size = i_size_read(inode); ssize_t ret, count; count = ext4_generic_write_checks(iocb, from); @@ -280,6 +282,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) ret = file_modified(iocb->ki_filp); if (ret) return ret; + + /* + * If the position is beyond the EOF, it is necessary to zero out the + * partial block that is beyond the existing EOF, as it may contain
+ */ + if (iocb->ki_pos > old_size && !ext4_verity_in_progress(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + ret = ext4_block_zero_eof(inode, old_size, iocb->ki_pos); + if (ret) + return ret; + } + return count; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9c1b95c439d5..7eb4daea3faa 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1466,10 +1466,9 @@ static int ext4_write_end(const struct kiocb *iocb, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) { + if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); - ext4_block_zero_eof(inode, old_size, pos); - } + /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock @@ -1584,10 +1583,8 @@ static int ext4_journalled_write_end(const struct kiocb *iocb, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) { + if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); - ext4_block_zero_eof(inode, old_size, pos); - } if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); @@ -3226,7 +3223,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; - loff_t new_i_size, zero_len = 0; + loff_t new_i_size; handle_t *handle; if (unlikely(!folio_buffers(folio))) { @@ -3270,19 +3267,15 @@ static int ext4_da_do_write_end(struct address_space *mapping, folio_unlock(folio); folio_put(folio); - if (pos > old_size) { + if (pos > old_size) pagecache_isize_extended(inode, old_size, pos); - zero_len = pos - old_size; - } - if (!disksize_changed && !zero_len) + if (!disksize_changed) return copied; - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); - if (zero_len) - ext4_block_zero_eof(inode, old_size, pos); 
ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); From eceafc31ea7b42c984ece10d79d505c0bb6615d5 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Sat, 28 Mar 2026 20:30:38 +0530 Subject: [PATCH 30/37] ext4: fix bounds check in check_xattrs() to prevent out-of-bounds access The bounds check for the next xattr entry in check_xattrs() uses (void *)next >= end, which allows next to point within sizeof(u32) bytes of end. On the next loop iteration, IS_LAST_ENTRY() reads 4 bytes via *(__u32 *)(entry), which can overrun the valid xattr region. For example, if next lands at end - 1, the check passes since next < end, but IS_LAST_ENTRY() reads 4 bytes starting at end - 1, accessing 3 bytes beyond the valid region. Fix this by changing the check to (void *)next + sizeof(u32) > end, ensuring there is always enough space for the IS_LAST_ENTRY() read on the subsequent iteration. Fixes: 3478c83cf26b ("ext4: improve xattr consistency checking and error reporting") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20260224231429.31361-1-kartikey406@gmail.com/T/ [v1] Signed-off-by: Deepanshu Kartikey Link: https://patch.msgid.link/20260328150038.349497-1-kartikey406@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7bf9ba19a89d..c6205b405efe 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -226,7 +226,7 @@ check_xattrs(struct inode *inode, struct buffer_head *bh, /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); - if ((void *)next >= end) { + if ((void *)next + sizeof(u32) > end) { err_str = "e_name out of bounds"; goto errout; } From 5941a072d48841255005e3a5b5a620692d81d1a7 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 30 Mar 2026 21:30:31 +0800 Subject: [PATCH 31/37] ext4: fix miss unlock 'sb->s_umount' in extents_kunit_init() There's warning as follows when do 
ext4 kunit test: WARNING: kunit_try_catch/15923 still has locks held! 7.0.0-rc3-next-20260309-00028-g73f965a1bbb1-dirty #281 Tainted: G E N 1 lock held by kunit_try_catch/15923: #0: ffff888139f860e0 (&type->s_umount_key#70/1){+.+.}-{4:4}, at: alloc_super.constprop.0+0x172/0xa90 Call Trace: dump_stack_lvl+0x180/0x1b0 debug_check_no_locks_held+0xc8/0xd0 do_exit+0x1502/0x2b20 kthread+0x3a9/0x540 ret_from_fork+0xa76/0xdf0 ret_from_fork_asm+0x1a/0x30 As sget() will return 'sb' which holds 's->s_umount' lock. However, "extents-test" miss unlock this lock. So unlock 's->s_umount' in the end of extents_kunit_init(). Fixes: cb1e0c1d1fad ("ext4: kunit tests for extent splitting and conversion") Signed-off-by: Ye Bin Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/20260330133035.287842-2-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents-test.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c index 5496b2c8e2cd..82c59291e045 100644 --- a/fs/ext4/extents-test.c +++ b/fs/ext4/extents-test.c @@ -309,6 +309,8 @@ static int extents_kunit_init(struct kunit *test) kunit_activate_static_stub(test, ext4_ext_zeroout, ext4_ext_zeroout_stub); kunit_activate_static_stub(test, ext4_issue_zeroout, ext4_issue_zeroout_stub); + up_write(&sb->s_umount); + return 0; } From f9c1f7647ac8fb70bebb1615ac112d1568abe339 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 30 Mar 2026 21:30:32 +0800 Subject: [PATCH 32/37] ext4: call deactivate_super() in extents_kunit_exit() Call deactivate_super() is called in extents_kunit_exit() to cleanup the file system resource. 
Fixes: cb1e0c1d1fad ("ext4: kunit tests for extent splitting and conversion") Signed-off-by: Ye Bin Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/20260330133035.287842-3-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents-test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c index 82c59291e045..3d4663d99eb1 100644 --- a/fs/ext4/extents-test.c +++ b/fs/ext4/extents-test.c @@ -146,6 +146,7 @@ static void extents_kunit_exit(struct kunit *test) struct ext4_sb_info *sbi = sb->s_fs_info; ext4_es_unregister_shrinker(sbi); + deactivate_super(sbi->s_sb); kfree(sbi); kfree(k_ctx.k_ei); kfree(k_ctx.k_data); From 17f73c95d47325000ee68492be3ad76ae09f6f19 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 30 Mar 2026 21:30:33 +0800 Subject: [PATCH 33/37] ext4: fix the error handling process in extents_kunit_init(). The error processing in extents_kunit_init() is improper, causing resource leakage.
Reconstruct the error handling process to prevent potential resource leaks Fixes: cb1e0c1d1fad ("ext4: kunit tests for extent splitting and conversion") Signed-off-by: Ye Bin Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/20260330133035.287842-4-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents-test.c | 54 +++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c index 3d4663d99eb1..4042bc8a95e2 100644 --- a/fs/ext4/extents-test.c +++ b/fs/ext4/extents-test.c @@ -225,33 +225,37 @@ static int extents_kunit_init(struct kunit *test) (struct kunit_ext_test_param *)(test->param_value); int err; - sb = sget(&ext_fs_type, NULL, ext_set, 0, NULL); - if (IS_ERR(sb)) - return PTR_ERR(sb); - - sb->s_blocksize = 4096; - sb->s_blocksize_bits = 12; - sbi = kzalloc_obj(struct ext4_sb_info); if (sbi == NULL) return -ENOMEM; + sb = sget(&ext_fs_type, NULL, ext_set, 0, NULL); + if (IS_ERR(sb)) { + kfree(sbi); + return PTR_ERR(sb); + } + sbi->s_sb = sb; sb->s_fs_info = sbi; + sb->s_blocksize = 4096; + sb->s_blocksize_bits = 12; + if (!param || !param->disable_zeroout) sbi->s_extent_max_zeroout_kb = 32; - /* setup the mock inode */ - k_ctx.k_ei = kzalloc_obj(struct ext4_inode_info); - if (k_ctx.k_ei == NULL) - return -ENOMEM; - ei = k_ctx.k_ei; - inode = &ei->vfs_inode; - err = ext4_es_register_shrinker(sbi); if (err) - return err; + goto out_deactivate; + + /* setup the mock inode */ + k_ctx.k_ei = kzalloc_obj(struct ext4_inode_info); + if (k_ctx.k_ei == NULL) { + err = -ENOMEM; + goto out; + } + ei = k_ctx.k_ei; + inode = &ei->vfs_inode; ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); @@ -267,8 +271,10 @@ static int extents_kunit_init(struct kunit *test) inode->i_sb = sb; k_ctx.k_data = kzalloc(EXT_DATA_LEN * 4096, GFP_KERNEL); - if (k_ctx.k_data == NULL) - return -ENOMEM; + if (k_ctx.k_data == NULL) { + err 
= -ENOMEM; + goto out; + } /* * set the data area to a junk value @@ -313,6 +319,20 @@ static int extents_kunit_init(struct kunit *test) up_write(&sb->s_umount); return 0; + +out: + kfree(k_ctx.k_ei); + k_ctx.k_ei = NULL; + + kfree(k_ctx.k_data); + k_ctx.k_data = NULL; + + ext4_es_unregister_shrinker(sbi); +out_deactivate: + deactivate_locked_super(sb); + kfree(sbi); + + return err; } /* From ca78c31af467ffe94b15f6a2e4e1cc1c164db19b Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 30 Mar 2026 21:30:34 +0800 Subject: [PATCH 34/37] ext4: fix possible null-ptr-deref in extents_kunit_exit() There's issue as follows: KASAN: null-ptr-deref in range [0x00000000000002c0-0x00000000000002c7] Tainted: [E]=UNSIGNED_MODULE, [N]=TEST RIP: 0010:extents_kunit_exit+0x2e/0xc0 [ext4_test] Call Trace: kunit_try_run_case_cleanup+0xbc/0x100 [kunit] kunit_generic_run_threadfn_adapter+0x89/0x100 [kunit] kthread+0x408/0x540 ret_from_fork+0xa76/0xdf0 ret_from_fork_asm+0x1a/0x30 Above issue happens as extents_kunit_init() init testcase failed. So test if testcase is inited success. 
Fixes: cb1e0c1d1fad ("ext4: kunit tests for extent splitting and conversion") Signed-off-by: Ye Bin Reviewed-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/20260330133035.287842-5-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents-test.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c index 4042bc8a95e2..6b53a3f39fcd 100644 --- a/fs/ext4/extents-test.c +++ b/fs/ext4/extents-test.c @@ -142,9 +142,12 @@ static struct file_system_type ext_fs_type = { static void extents_kunit_exit(struct kunit *test) { - struct super_block *sb = k_ctx.k_ei->vfs_inode.i_sb; - struct ext4_sb_info *sbi = sb->s_fs_info; + struct ext4_sb_info *sbi; + if (!k_ctx.k_ei) + return; + + sbi = k_ctx.k_ei->vfs_inode.i_sb->s_fs_info; ext4_es_unregister_shrinker(sbi); deactivate_super(sbi->s_sb); kfree(sbi); From 22f53f08d9eb837ce69b1a07641d414aac8d045f Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 30 Mar 2026 21:30:35 +0800 Subject: [PATCH 35/37] ext4: fix possible null-ptr-deref in mbt_kunit_exit() There's issue as follows: # test_new_blocks_simple: failed to initialize: -12 KASAN: null-ptr-deref in range [0x0000000000000638-0x000000000000063f] Tainted: [E]=UNSIGNED_MODULE, [N]=TEST RIP: 0010:mbt_kunit_exit+0x5e/0x3e0 [ext4_test] Call Trace: kunit_try_run_case_cleanup+0xbc/0x100 [kunit] kunit_generic_run_threadfn_adapter+0x89/0x100 [kunit] kthread+0x408/0x540 ret_from_fork+0xa76/0xdf0 ret_from_fork_asm+0x1a/0x30 If mbt_kunit_init() init testcase failed will lead to null-ptr-deref. So add test if 'sb' is inited success in mbt_kunit_exit(). 
Fixes: 7c9fa399a369 ("ext4: add first unit test for ext4_mb_new_blocks_simple in mballoc") Signed-off-by: Ye Bin Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/20260330133035.287842-6-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc-test.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index 6f5bfbb0e8a4..95cb644cd32f 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -362,7 +362,6 @@ static int mbt_kunit_init(struct kunit *test) return ret; } - test->priv = sb; kunit_activate_static_stub(test, ext4_read_block_bitmap_nowait, ext4_read_block_bitmap_nowait_stub); @@ -383,6 +382,8 @@ static int mbt_kunit_init(struct kunit *test) return -ENOMEM; } + test->priv = sb; + return 0; } @@ -390,6 +391,9 @@ static void mbt_kunit_exit(struct kunit *test) { struct super_block *sb = (struct super_block *)test->priv; + if (!sb) + return; + mbt_mb_release(sb); mbt_ctx_release(sb); mbt_ext4_free_super_block(sb); From 77d059519382bd66283e6a4e83ee186e87e7708f Mon Sep 17 00:00:00 2001 From: Sohei Koyama Date: Mon, 6 Apr 2026 16:48:30 +0900 Subject: [PATCH 36/37] ext4: fix missing brelse() in ext4_xattr_inode_dec_ref_all() The commit c8e008b60492 ("ext4: ignore xattrs past end") introduced a refcount leak when block_csum is false. ext4_xattr_inode_dec_ref_all() calls ext4_get_inode_loc() to get iloc.bh, but never releases it with brelse().
Fixes: c8e008b60492 ("ext4: ignore xattrs past end") Signed-off-by: Sohei Koyama Reviewed-by: Andreas Dilger Reviewed-by: Ritesh Harjani (IBM) Cc: stable@vger.kernel.org Reviewed-by: Zhang Yi Reviewed-by: Baokun Li Link: https://patch.msgid.link/20260406074830.8480-1-skoyama@ddn.com Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c6205b405efe..a4eaee58e545 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1165,7 +1165,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, { struct inode *ea_inode; struct ext4_xattr_entry *entry; - struct ext4_iloc iloc; + struct ext4_iloc iloc = { .bh = NULL }; bool dirty = false; unsigned int ea_ino; int err; @@ -1260,6 +1260,8 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, ext4_warning_inode(parent, "handle dirty metadata err=%d", err); } + + brelse(iloc.bh); } /* From 981fcc5674e67158d24d23e841523eccba19d0e7 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 9 Apr 2026 19:42:03 +0800 Subject: [PATCH 37/37] jbd2: fix deadlock in jbd2_journal_cancel_revoke() Commit f76d4c28a46a ("fs/jbd2: use sleeping version of __find_get_block()") changed jbd2_journal_cancel_revoke() to use __find_get_block_nonatomic() which holds the folio lock instead of i_private_lock. This breaks the lock ordering (folio -> buffer) and causes an ABBA deadlock when the filesystem blocksize < pagesize: T1 T2 ext4_mkdir() ext4_init_new_dir() ext4_append() ext4_getblk() lock_buffer() <- A sync_blockdev() blkdev_writepages() writeback_iter() writeback_get_folio() folio_lock() <- B ext4_journal_get_create_access() jbd2_journal_cancel_revoke() __find_get_block_nonatomic() folio_lock() <- B block_write_full_folio() lock_buffer() <- A This can occasionally cause generic/013 to hang. 
Fix by only calling __find_get_block_nonatomic() when the passed buffer_head doesn't belong to the bdev, which is the only case that we need to look up its bdev alias. Otherwise, the lookup is redundant since the found buffer_head is equal to the one we passed in. Fixes: f76d4c28a46a ("fs/jbd2: use sleeping version of __find_get_block()") Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20260409114204.917154-1-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/jbd2/revoke.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 9016ddb82447..e4c2fbd381f1 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -428,6 +428,7 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) journal_t *journal = handle->h_transaction->t_journal; int need_cancel; struct buffer_head *bh = jh2bh(jh); + struct address_space *bh_mapping = bh->b_folio->mapping; jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); @@ -464,13 +465,14 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) * buffer_head? If so, we'd better make sure we clear the * revoked status on any hashed alias too, otherwise the revoke * state machine will get very upset later on. */ - if (need_cancel) { + if (need_cancel && !sb_is_blkdev_sb(bh_mapping->host->i_sb)) { struct buffer_head *bh2; + bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr, bh->b_size); if (bh2) { - if (bh2 != bh) - clear_buffer_revoked(bh2); + WARN_ON_ONCE(bh2 == bh); + clear_buffer_revoked(bh2); __brelse(bh2); } }