diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0cf68f85dfd1..94283a991e5c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -1082,9 +1081,6 @@ struct ext4_inode_info { spinlock_t i_raw_lock; /* protects updates to the raw inode */ - /* Fast commit wait queue for this inode */ - wait_queue_head_t i_fc_wait; - /* * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len * and inode's EXT4_FC_STATE_COMMITTING state bit. @@ -2976,7 +2972,8 @@ void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); -void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_link(handle_t *handle, struct inode *inode, + struct dentry *dentry); void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); @@ -3101,8 +3098,9 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks); extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); -extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, - loff_t lstart, loff_t lend); +extern int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end); +extern int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, + loff_t length, bool *did_zero); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); @@ -3721,7 +3719,7 @@ extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, struct inode *inode, struct dentry *dentry); extern int __ext4_link(struct inode *dir, struct inode *inode, - struct dentry *dentry); + const struct qstr *d_name, struct dentry *dentry); #define S_SHIFT 12 static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c index 5496b2c8e2cd..6b53a3f39fcd 100644 --- a/fs/ext4/extents-test.c +++ b/fs/ext4/extents-test.c @@ -142,10 +142,14 @@ static struct file_system_type ext_fs_type = { static void extents_kunit_exit(struct kunit *test) { - struct super_block *sb = k_ctx.k_ei->vfs_inode.i_sb; - struct ext4_sb_info *sbi = sb->s_fs_info; + struct ext4_sb_info *sbi; + if (!k_ctx.k_ei) + return; + + sbi = k_ctx.k_ei->vfs_inode.i_sb->s_fs_info; ext4_es_unregister_shrinker(sbi); + deactivate_super(sbi->s_sb); kfree(sbi); kfree(k_ctx.k_ei); kfree(k_ctx.k_data); @@ -224,33 +228,37 @@ static int extents_kunit_init(struct kunit *test) (struct kunit_ext_test_param *)(test->param_value); int err; - sb = sget(&ext_fs_type, NULL, ext_set, 0, NULL); - if (IS_ERR(sb)) - return PTR_ERR(sb); - - sb->s_blocksize = 4096; - sb->s_blocksize_bits = 12; - sbi = kzalloc_obj(struct ext4_sb_info); if (sbi == NULL) return -ENOMEM; + sb = sget(&ext_fs_type, NULL, ext_set, 0, NULL); + if (IS_ERR(sb)) { + kfree(sbi); + return PTR_ERR(sb); + } + sbi->s_sb = sb; sb->s_fs_info = sbi; + sb->s_blocksize = 4096; + sb->s_blocksize_bits = 12; + if (!param || !param->disable_zeroout) sbi->s_extent_max_zeroout_kb = 32; - /* setup the mock inode */ - k_ctx.k_ei = kzalloc_obj(struct ext4_inode_info); - if (k_ctx.k_ei == NULL) - return -ENOMEM; - ei = k_ctx.k_ei; - inode = &ei->vfs_inode; - err = ext4_es_register_shrinker(sbi); if (err) - return err; + goto out_deactivate; + + /* setup the mock inode */ + k_ctx.k_ei = kzalloc_obj(struct ext4_inode_info); + if (k_ctx.k_ei == NULL) { + err = -ENOMEM; + goto out; + } + ei = k_ctx.k_ei; + inode = &ei->vfs_inode; ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); @@ -266,8 +274,10 @@ static int extents_kunit_init(struct kunit *test) inode->i_sb = sb; k_ctx.k_data = kzalloc(EXT_DATA_LEN * 4096, GFP_KERNEL); - if (k_ctx.k_data == NULL) - return -ENOMEM; + if (k_ctx.k_data == NULL) { + err = -ENOMEM; + goto out; + } /* * set the data area to a junk value @@ -309,7 +319,23 @@ static int extents_kunit_init(struct kunit *test) kunit_activate_static_stub(test, ext4_ext_zeroout, ext4_ext_zeroout_stub); kunit_activate_static_stub(test, ext4_issue_zeroout, ext4_issue_zeroout_stub); + up_write(&sb->s_umount); + return 0; + +out: + kfree(k_ctx.k_ei); + k_ctx.k_ei = NULL; + + kfree(k_ctx.k_data); + k_ctx.k_data = NULL; + + ext4_es_unregister_shrinker(sbi); +out_deactivate: + deactivate_locked_super(sb); + kfree(sbi); + + return err; } /* diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 95d404486f1a..125f628e738a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4571,30 +4571,30 @@ retry_remove_space: return err; } -static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, - ext4_lblk_t len, loff_t new_size, - int flags) +static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len, + loff_t new_size, int flags) { struct inode *inode = file_inode(file); handle_t *handle; int ret = 0, ret2 = 0, ret3 = 0; int retries = 0; int depth = 0; + ext4_lblk_t len_lblk; struct ext4_map_blocks map; unsigned int credits; - loff_t epos, old_size = i_size_read(inode); + loff_t epos = 0, old_size = i_size_read(inode); unsigned int blkbits = inode->i_blkbits; bool alloc_zero = false; BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); - map.m_lblk = offset; - map.m_len = len; + map.m_lblk = offset >> blkbits; + map.m_len = len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); /* * Don't normalize the request if it can fit in one extent so * that it doesn't get unnecessarily split into multiple * extents. */ - if (len <= EXT_UNWRITTEN_MAX_LEN) + if (len_lblk <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; /* @@ -4611,16 +4611,23 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, /* * credits to insert 1 extent into extent tree */ - credits = ext4_chunk_trans_blocks(inode, len); + credits = ext4_chunk_trans_blocks(inode, len_lblk); depth = ext_depth(inode); + /* Zero to the end of the block containing i_size */ + if (new_size > old_size) { + ret = ext4_block_zero_eof(inode, old_size, LLONG_MAX); + if (ret) + return ret; + } + retry: - while (len) { + while (len_lblk) { /* * Recalculate credits when extent tree depth changes. */ if (depth != ext_depth(inode)) { - credits = ext4_chunk_trans_blocks(inode, len); + credits = ext4_chunk_trans_blocks(inode, len_lblk); depth = ext_depth(inode); } @@ -4640,50 +4647,60 @@ retry: ext4_journal_stop(handle); break; } + ext4_update_inode_fsync_trans(handle, inode, 1); + ret = ext4_journal_stop(handle); + if (unlikely(ret)) + break; + /* * allow a full retry cycle for any remaining allocations */ retries = 0; - epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret); - inode_set_ctime_current(inode); - if (new_size) { - if (epos > new_size) - epos = new_size; - if (ext4_update_inode_size(inode, epos) & 0x1) - inode_set_mtime_to_ts(inode, - inode_get_ctime(inode)); - if (epos > old_size) { - pagecache_isize_extended(inode, old_size, epos); - ext4_zero_partial_blocks(handle, inode, - old_size, epos - old_size); - } - } - ret2 = ext4_mark_inode_dirty(handle, inode); - ext4_update_inode_fsync_trans(handle, inode, 1); - ret3 = ext4_journal_stop(handle); - ret2 = ret3 ? ret3 : ret2; - if (unlikely(ret2)) - break; if (alloc_zero && (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) { - ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk, - map.m_len); - if (likely(!ret2)) - ret2 = ext4_convert_unwritten_extents(NULL, + ret = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk, + map.m_len); + if (likely(!ret)) + ret = ext4_convert_unwritten_extents(NULL, inode, (loff_t)map.m_lblk << blkbits, (loff_t)map.m_len << blkbits); - if (ret2) + if (ret) break; } - map.m_lblk += ret; - map.m_len = len = len - ret; + map.m_lblk += map.m_len; + map.m_len = len_lblk = len_lblk - map.m_len; + epos = EXT4_LBLK_TO_B(inode, map.m_lblk); } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - return ret > 0 ? ret2 : ret; + if (!epos || !new_size) + return ret; + + /* + * Allocate blocks, update the file size to match the size of the + * already successfully allocated blocks. + */ + if (epos > new_size) + epos = new_size; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return ret ? ret : PTR_ERR(handle); + + ext4_update_inode_size(inode, epos); + ret2 = ext4_mark_inode_dirty(handle, inode); + ext4_update_inode_fsync_trans(handle, inode, 1); + ret3 = ext4_journal_stop(handle); + ret2 = ret3 ? ret3 : ret2; + + if (epos > old_size) + pagecache_isize_extended(inode, old_size, epos); + + return ret ? ret : ret2; } static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); @@ -4695,12 +4712,11 @@ static long ext4_zero_range(struct file *file, loff_t offset, { struct inode *inode = file_inode(file); handle_t *handle = NULL; - loff_t new_size = 0; + loff_t align_start, align_end, new_size = 0; loff_t end = offset + len; - ext4_lblk_t start_lblk, end_lblk; unsigned int blocksize = i_blocksize(inode); - unsigned int blkbits = inode->i_blkbits; - int ret, flags, credits; + bool partial_zeroed = false; + int ret, flags; trace_ext4_zero_range(inode, offset, len, mode); WARN_ON_ONCE(!inode_is_locked(inode)); @@ -4720,11 +4736,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; /* Preallocate the range including the unaligned edges */ if (!IS_ALIGNED(offset | end, blocksize)) { - ext4_lblk_t alloc_lblk = offset >> blkbits; - ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); - - ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk, - new_size, flags); + ret = ext4_alloc_file_blocks(file, offset, len, new_size, + flags); if (ret) return ret; } @@ -4739,18 +4752,17 @@ static long ext4_zero_range(struct file *file, loff_t offset, return ret; /* Zero range excluding the unaligned edges */ - start_lblk = EXT4_B_TO_LBLK(inode, offset); - end_lblk = end >> blkbits; - if (end_lblk > start_lblk) { - ext4_lblk_t zero_blks = end_lblk - start_lblk; - + align_start = round_up(offset, blocksize); + align_end = round_down(end, blocksize); + if (align_end > align_start) { if (mode & FALLOC_FL_WRITE_ZEROES) flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE; else flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); - ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks, - new_size, flags); + ret = ext4_alloc_file_blocks(file, align_start, + align_end - align_start, new_size, + flags); if (ret) return ret; } @@ -4758,25 +4770,24 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (IS_ALIGNED(offset | end, blocksize)) return ret; - /* - * In worst case we have to writeout two nonadjacent unwritten - * blocks and update the inode - */ - credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; - if (ext4_should_journal_data(inode)) - credits += 2; - handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(inode, offset, len, &partial_zeroed); + if (ret) + return ret; + if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && partial_zeroed) { + ret = filemap_write_and_wait_range(inode->i_mapping, offset, + end - 1); + if (ret) + return ret; + } + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); return ret; } - /* Zero out partial block at the edges of the range */ - ret = ext4_zero_partial_blocks(handle, inode, offset, len); - if (ret) - goto out_handle; - if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); @@ -4784,7 +4795,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); - if (file->f_flags & O_SYNC) + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: @@ -4798,15 +4809,11 @@ static long ext4_do_fallocate(struct file *file, loff_t offset, struct inode *inode = file_inode(file); loff_t end = offset + len; loff_t new_size = 0; - ext4_lblk_t start_lblk, len_lblk; int ret; trace_ext4_fallocate_enter(inode, offset, len, mode); WARN_ON_ONCE(!inode_is_locked(inode)); - start_lblk = offset >> inode->i_blkbits; - len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits); - /* We only support preallocation for extent-based files only. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; @@ -4821,17 +4828,19 @@ static long ext4_do_fallocate(struct file *file, loff_t offset, goto out; } - ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size, + ret = ext4_alloc_file_blocks(file, offset, len, new_size, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); if (ret) goto out; - if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && + EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } out: - trace_ext4_fallocate_exit(inode, offset, len_lblk, ret); + trace_ext4_fallocate_exit(inode, offset, + EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits), ret); return ret; } @@ -5598,7 +5607,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); - if (IS_SYNC(inode)) + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: @@ -5722,7 +5731,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); - if (IS_SYNC(inode)) + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 390f25dee2b1..b3c22636251d 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -13,6 +13,7 @@ #include "mballoc.h" #include +#include /* * Ext4 Fast Commits * ----------------- @@ -215,7 +216,6 @@ void ext4_fc_init_inode(struct inode *inode) ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); - init_waitqueue_head(&ei->i_fc_wait); } static bool ext4_fc_disabled(struct super_block *sb) @@ -224,6 +224,12 @@ static bool ext4_fc_disabled(struct super_block *sb) (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); } +static bool ext4_fc_eligible(struct super_block *sb) +{ + return !ext4_fc_disabled(sb) && + !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)); +} + /* * Remove inode from fast commit list. If the inode is being committed * we wait until inode commit is done. @@ -320,7 +326,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl if (ext4_fc_disabled(sb)) return; - if (handle && !IS_ERR(handle)) + if (!IS_ERR_OR_NULL(handle)) tid = handle->h_transaction->t_tid; else { read_lock(&sbi->s_journal->j_state_lock); @@ -473,13 +479,8 @@ void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) - return; - - __ext4_fc_track_unlink(handle, inode, dentry); + if (ext4_fc_eligible(inode->i_sb)) + __ext4_fc_track_unlink(handle, inode, dentry); } void __ext4_fc_track_link(handle_t *handle, @@ -496,17 +497,11 @@ void __ext4_fc_track_link(handle_t *handle, trace_ext4_fc_track_link(handle, inode, dentry, ret); } -void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) +void ext4_fc_track_link(handle_t *handle, struct inode *inode, + struct dentry *dentry) { - struct inode *inode = d_inode(dentry); - - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) - return; - - __ext4_fc_track_link(handle, inode, dentry); + if (ext4_fc_eligible(inode->i_sb)) + __ext4_fc_track_link(handle, inode, dentry); } void __ext4_fc_track_create(handle_t *handle, struct inode *inode, @@ -527,13 +522,8 @@ void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) - return; - - __ext4_fc_track_create(handle, inode, dentry); + if (ext4_fc_eligible(inode->i_sb)) + __ext4_fc_track_create(handle, inode, dentry); } /* __track_fn for inode tracking */ @@ -557,16 +547,13 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (S_ISDIR(inode->i_mode)) return; - if (ext4_fc_disabled(inode->i_sb)) - return; - if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) + if (!ext4_fc_eligible(inode->i_sb)) return; /* @@ -644,10 +631,7 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star if (S_ISDIR(inode->i_mode)) return; - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) + if (!ext4_fc_eligible(inode->i_sb)) return; if (ext4_has_inline_data(inode)) { @@ -1446,7 +1430,6 @@ static int ext4_fc_replay_link_internal(struct super_block *sb, struct inode *inode) { struct inode *dir = NULL; - struct dentry *dentry_dir = NULL, *dentry_inode = NULL; struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); int ret = 0; @@ -1457,21 +1440,7 @@ static int ext4_fc_replay_link_internal(struct super_block *sb, goto out; } - dentry_dir = d_obtain_alias(dir); - if (IS_ERR(dentry_dir)) { - ext4_debug("Failed to obtain dentry"); - dentry_dir = NULL; - goto out; - } - - dentry_inode = d_alloc(dentry_dir, &qstr_dname); - if (!dentry_inode) { - ext4_debug("Inode dentry not created."); - ret = -ENOMEM; - goto out; - } - - ret = __ext4_link(dir, inode, dentry_inode); + ret = __ext4_link(dir, inode, &qstr_dname, NULL); /* * It's possible that link already existed since data blocks * for the dir in question got persisted before we crashed OR @@ -1485,16 +1454,8 @@ static int ext4_fc_replay_link_internal(struct super_block *sb, ret = 0; out: - if (dentry_dir) { - d_drop(dentry_dir); - dput(dentry_dir); - } else if (dir) { + if (dir) iput(dir); - } - if (dentry_inode) { - d_drop(dentry_inode); - dput(dentry_inode); - } return ret; } @@ -1759,8 +1720,7 @@ int ext4_fc_record_regions(struct super_block *sb, int ino, } /* Replay add range tag */ -static int ext4_fc_replay_add_range(struct super_block *sb, - struct ext4_fc_tl_mem *tl, u8 *val) +static int ext4_fc_replay_add_range(struct super_block *sb, u8 *val) { struct ext4_fc_add_range fc_add_ex; struct ext4_extent newex, *ex; @@ -1880,8 +1840,7 @@ out: /* Replay DEL_RANGE tag */ static int -ext4_fc_replay_del_range(struct super_block *sb, - struct ext4_fc_tl_mem *tl, u8 *val) +ext4_fc_replay_del_range(struct super_block *sb, u8 *val) { struct inode *inode; struct ext4_fc_del_range lrange; @@ -2251,13 +2210,13 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, ret = ext4_fc_replay_unlink(sb, &tl, val); break; case EXT4_FC_TAG_ADD_RANGE: - ret = ext4_fc_replay_add_range(sb, &tl, val); + ret = ext4_fc_replay_add_range(sb, val); break; case EXT4_FC_TAG_CREAT: ret = ext4_fc_replay_create(sb, &tl, val); break; case EXT4_FC_TAG_DEL_RANGE: - ret = ext4_fc_replay_del_range(sb, &tl, val); + ret = ext4_fc_replay_del_range(sb, val); break; case EXT4_FC_TAG_INODE: ret = ext4_fc_replay_inode(sb, &tl, val); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 5e02f6cf653e..eb1a323962b1 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -270,6 +270,8 @@ static ssize_t ext4_generic_write_checks(struct kiocb *iocb, static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) { + struct inode *inode = file_inode(iocb->ki_filp); + loff_t old_size = i_size_read(inode); ssize_t ret, count; count = ext4_generic_write_checks(iocb, from); @@ -279,6 +281,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) ret = file_modified(iocb->ki_filp); if (ret) return ret; + + /* + * If the position is beyond the EOF, it is necessary to zero out the + * partial block that beyond the existing EOF, as it may contains + * stale data written through mmap. + */ + if (iocb->ki_pos > old_size && !ext4_verity_in_progress(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + ret = ext4_block_zero_eof(inode, old_size, iocb->ki_pos); + if (ret) + return ret; + } + return count; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 64dba7679245..c2c2d6ac7f3d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1468,10 +1468,9 @@ static int ext4_write_end(const struct kiocb *iocb, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) { + if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); - ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); - } + /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock @@ -1586,10 +1585,8 @@ static int ext4_journalled_write_end(const struct kiocb *iocb, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) { + if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); - ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); - } if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); @@ -1759,8 +1756,22 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (invalidate) { - if (folio_mapped(folio)) + if (folio_mapped(folio)) { folio_clear_dirty_for_io(folio); + /* + * Unmap folio from page + * tables to prevent + * subsequent accesses through + * stale PTEs. This ensures + * future accesses trigger new + * page faults rather than + * reusing the invalidated + * folio. + */ + unmap_mapping_pages(folio->mapping, + folio->index, + folio_nr_pages(folio), false); + } block_invalidate_folio(folio, 0, folio_size(folio)); folio_clear_uptodate(folio); @@ -3043,17 +3054,23 @@ static int ext4_writepages(struct address_space *mapping, int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) { + loff_t range_start, range_end; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, - .range_start = jinode->i_dirty_start, - .range_end = jinode->i_dirty_end, }; struct mpage_da_data mpd = { .inode = jinode->i_vfs_inode, .wbc = &wbc, .can_map = 0, }; + + if (!jbd2_jinode_get_dirty_range(jinode, &range_start, &range_end)) + return 0; + + wbc.range_start = range_start; + wbc.range_end = range_end; + return ext4_do_writepages(&mpd); } @@ -3208,7 +3225,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; - loff_t new_i_size, zero_len = 0; + loff_t new_i_size; handle_t *handle; if (unlikely(!folio_buffers(folio))) { @@ -3252,19 +3269,15 @@ static int ext4_da_do_write_end(struct address_space *mapping, folio_unlock(folio); folio_put(folio); - if (pos > old_size) { + if (pos > old_size) pagecache_isize_extended(inode, old_size, pos); - zero_len = pos - old_size; - } - if (!disksize_changed && !zero_len) + if (!disksize_changed) return copied; - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); - if (zero_len) - ext4_zero_partial_blocks(handle, inode, old_size, zero_len); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); @@ -4014,12 +4027,11 @@ void ext4_set_aops(struct inode *inode) * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a * racing writeback can come later and flush the stale pagecache to disk. */ -static int __ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length) +static struct buffer_head *ext4_load_tail_bh(struct inode *inode, loff_t from) { unsigned int offset, blocksize, pos; ext4_lblk_t iblock; - struct inode *inode = mapping->host; + struct address_space *mapping = inode->i_mapping; struct buffer_head *bh; struct folio *folio; int err = 0; @@ -4028,7 +4040,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (IS_ERR(folio)) - return PTR_ERR(folio); + return ERR_CAST(folio); blocksize = inode->i_sb->s_blocksize; @@ -4080,47 +4092,92 @@ static int __ext4_block_zero_page_range(handle_t *handle, } } } - if (ext4_should_journal_data(inode)) { - BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, inode->i_sb, bh, - EXT4_JTR_NONE); - if (err) - goto unlock; - } - folio_zero_range(folio, offset, length); - BUFFER_TRACE(bh, "zeroed end of block"); - - if (ext4_should_journal_data(inode)) { - err = ext4_dirty_journalled_data(handle, bh); - } else { - mark_buffer_dirty(bh); - /* - * Only the written block requires ordered data to prevent - * exposing stale data. - */ - if (!buffer_unwritten(bh) && !buffer_delay(bh) && - ext4_should_order_data(inode)) - err = ext4_jbd2_inode_add_write(handle, inode, from, - length); - } + return bh; unlock: folio_unlock(folio); folio_put(folio); + return err ? ERR_PTR(err) : NULL; +} + +static int ext4_block_do_zero_range(struct inode *inode, loff_t from, + loff_t length, bool *did_zero, + bool *zero_written) +{ + struct buffer_head *bh; + struct folio *folio; + + bh = ext4_load_tail_bh(inode, from); + if (IS_ERR_OR_NULL(bh)) + return PTR_ERR_OR_ZERO(bh); + + folio = bh->b_folio; + folio_zero_range(folio, offset_in_folio(folio, from), length); + BUFFER_TRACE(bh, "zeroed end of block"); + + mark_buffer_dirty(bh); + if (did_zero) + *did_zero = true; + if (zero_written && !buffer_unwritten(bh) && !buffer_delay(bh)) + *zero_written = true; + + folio_unlock(folio); + folio_put(folio); + return 0; +} + +static int ext4_block_journalled_zero_range(struct inode *inode, loff_t from, + loff_t length, bool *did_zero) +{ + struct buffer_head *bh; + struct folio *folio; + handle_t *handle; + int err; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + bh = ext4_load_tail_bh(inode, from); + if (IS_ERR_OR_NULL(bh)) { + err = PTR_ERR_OR_ZERO(bh); + goto out_handle; + } + folio = bh->b_folio; + + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); + if (err) + goto out; + + folio_zero_range(folio, offset_in_folio(folio, from), length); + BUFFER_TRACE(bh, "zeroed end of block"); + + err = ext4_dirty_journalled_data(handle, bh); + if (err) + goto out; + + if (did_zero) + *did_zero = true; +out: + folio_unlock(folio); + folio_put(folio); +out_handle: + ext4_journal_stop(handle); return err; } /* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that corresponds to 'from' + * Zeros out a mapping of length 'length' starting from file offset + * 'from'. The range to be zero'd must be contained with in one block. + * If the specified range exceeds the end of the block it will be + * shortened to end of the block that corresponds to 'from'. */ -static int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length) +static int ext4_block_zero_range(struct inode *inode, + loff_t from, loff_t length, bool *did_zero, + bool *zero_written) { - struct inode *inode = mapping->host; unsigned blocksize = inode->i_sb->s_blocksize; unsigned int max = blocksize - (from & (blocksize - 1)); @@ -4132,40 +4189,73 @@ static int ext4_block_zero_page_range(handle_t *handle, length = max; if (IS_DAX(inode)) { - return dax_zero_range(inode, from, length, NULL, + return dax_zero_range(inode, from, length, did_zero, &ext4_iomap_ops); + } else if (ext4_should_journal_data(inode)) { + return ext4_block_journalled_zero_range(inode, from, length, + did_zero); } - return __ext4_block_zero_page_range(handle, mapping, from, length); + return ext4_block_do_zero_range(inode, from, length, did_zero, + zero_written); } /* - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to `from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. + * Zero out a mapping from file offset 'from' up to the end of the block + * which corresponds to 'from' or to the given 'end' inside this block. + * This required during truncate up and performing append writes. We need + * to physically zero the tail end of that block so it doesn't yield old + * data if the file is grown. */ -static int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from) +int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end) { - unsigned length; - unsigned blocksize; - struct inode *inode = mapping->host; + unsigned int blocksize = i_blocksize(inode); + unsigned int offset; + loff_t length = end - from; + bool did_zero = false; + bool zero_written = false; + int err; + offset = from & (blocksize - 1); + if (!offset || from >= end) + return 0; /* If we are processing an encrypted inode during orphan list handling */ if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; - blocksize = i_blocksize(inode); - length = blocksize - (from & (blocksize - 1)); + if (length > blocksize - offset) + length = blocksize - offset; - return ext4_block_zero_page_range(handle, mapping, from, length); + err = ext4_block_zero_range(inode, from, length, + &did_zero, &zero_written); + if (err) + return err; + /* + * It's necessary to order zeroed data before update i_disksize when + * truncating up or performing an append write, because there might be + * exposing stale on-disk data which may caused by concurrent post-EOF + * mmap write during folio writeback. + */ + if (ext4_should_order_data(inode) && + did_zero && zero_written && !IS_DAX(inode)) { + handle_t *handle; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_jbd2_inode_add_write(handle, inode, from, length); + ext4_journal_stop(handle); + if (err) + return err; + } + + return 0; } -int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, - loff_t lstart, loff_t length) +int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, loff_t length, + bool *did_zero) { struct super_block *sb = inode->i_sb; - struct address_space *mapping = inode->i_mapping; unsigned partial_start, partial_end; ext4_fsblk_t start, end; loff_t byte_end = (lstart + length - 1); @@ -4180,22 +4270,21 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, /* Handle partial zero within the single block */ if (start == end && (partial_start || (partial_end != sb->s_blocksize - 1))) { - err = ext4_block_zero_page_range(handle, mapping, - lstart, length); + err = ext4_block_zero_range(inode, lstart, length, did_zero, + NULL); return err; } /* Handle partial zero out on the start of the range */ if (partial_start) { - err = ext4_block_zero_page_range(handle, mapping, - lstart, sb->s_blocksize); + err = ext4_block_zero_range(inode, lstart, sb->s_blocksize, + did_zero, NULL); if (err) return err; } /* Handle partial zero out on the end of the range */ if (partial_end != sb->s_blocksize - 1) - err = ext4_block_zero_page_range(handle, mapping, - byte_end - partial_end, - partial_end + 1); + err = ext4_block_zero_range(inode, byte_end - partial_end, + partial_end + 1, did_zero, NULL); return err; } @@ -4344,6 +4433,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) loff_t end = offset + length; handle_t *handle; unsigned int credits; + bool partial_zeroed = false; int ret; trace_ext4_punch_hole(inode, offset, length, 0); @@ -4370,17 +4460,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) end = max_end; length = end - offset; - /* - * Attach jinode to inode for jbd2 if we do any zeroing of partial - * block. - */ - if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { - ret = ext4_inode_attach_jinode(inode); - if (ret < 0) - return ret; - } - - ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) return ret; @@ -4390,8 +4469,18 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (ret) return ret; + ret = ext4_zero_partial_blocks(inode, offset, length, &partial_zeroed); + if (ret) + return ret; + if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && partial_zeroed) { + ret = filemap_write_and_wait_range(inode->i_mapping, offset, + end - 1); + if (ret) + return ret; + } + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - credits = ext4_chunk_trans_extent(inode, 2); + credits = ext4_chunk_trans_extent(inode, 0); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); @@ -4401,10 +4490,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) return ret; } - ret = ext4_zero_partial_blocks(handle, inode, offset, length); - if (ret) - goto out_handle; - /* If there are blocks to remove, do it */ start_lblk = EXT4_B_TO_LBLK(inode, offset); end_lblk = end >> inode->i_blkbits; @@ -4441,7 +4526,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); - if (IS_SYNC(inode)) + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); @@ -4512,7 +4597,6 @@ int ext4_truncate(struct inode *inode) unsigned int credits; int err = 0, err2; handle_t *handle; - struct address_space *mapping = inode->i_mapping; /* * There is a possibility that we're either freeing the inode @@ -4542,6 +4626,11 @@ int ext4_truncate(struct inode *inode) err = ext4_inode_attach_jinode(inode); if (err) goto out_trace; + + /* Zero to the end of the block containing i_size */ + err = ext4_block_zero_eof(inode, inode->i_size, LLONG_MAX); + if (err) + goto out_trace; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4555,9 +4644,6 @@ int ext4_truncate(struct inode *inode) goto out_trace; } - if (inode->i_size & (inode->i_sb->s_blocksize - 1)) - ext4_block_truncate_page(handle, mapping, inode->i_size); - /* * We add the inode to the orphan list, so that if this * truncate spans multiple transactions, and we crash, we will @@ -5927,15 +6013,6 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, goto out_mmap_sem; } - handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto out_mmap_sem; - } - if (ext4_handle_valid(handle) && shrink) { - error = ext4_orphan_add(handle, inode); - orphan = 1; - } /* * Update c/mtime and tail zero the EOF folio on * truncate up. ext4_truncate() handles the shrink case @@ -5944,9 +6021,22 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (!shrink) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - if (oldsize & (inode->i_sb->s_blocksize - 1)) - ext4_block_truncate_page(handle, - inode->i_mapping, oldsize); + if (oldsize & (inode->i_sb->s_blocksize - 1)) { + error = ext4_block_zero_eof(inode, + oldsize, LLONG_MAX); + if (error) + goto out_mmap_sem; + } + } + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto out_mmap_sem; + } + if (ext4_handle_valid(handle) && shrink) { + error = ext4_orphan_add(handle, inode); + orphan = 1; } if (shrink) diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index c75b91ae0cf0..90ed505fa4b1 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -362,7 +362,6 @@ static int mbt_kunit_init(struct kunit *test) return ret; } - test->priv = sb; kunit_activate_static_stub(test, ext4_read_block_bitmap_nowait, ext4_read_block_bitmap_nowait_stub); @@ -383,6 +382,8 @@ static int mbt_kunit_init(struct kunit *test) return -ENOMEM; } + test->priv = sb; + return 0; } @@ -390,6 +391,9 @@ static void mbt_kunit_exit(struct kunit *test) { struct super_block *sb = (struct super_block *)test->priv; + if (!sb) + return; + mbt_mb_release(sb); mbt_ctx_release(sb); mbt_ext4_free_super_block(sb); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a44514a3f5fe..ed1bd00e11cd 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2876,7 +2876,7 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0 ) { bh = ext4_read_block_bitmap_nowait(sb, group, true); - if (bh && !IS_ERR(bh)) { + if (!IS_ERR_OR_NULL(bh)) { if (!buffer_uptodate(bh) && cnt) (*cnt)++; brelse(bh); @@ -4561,22 +4561,16 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, (req <= (size) || max <= (chunk_size)) /* first, try to predict filesize */ - /* XXX: should this table be tunable? */ start_off = 0; - if (size <= 16 * 1024) { - size = 16 * 1024; - } else if (size <= 32 * 1024) { - size = 32 * 1024; - } else if (size <= 64 * 1024) { - size = 64 * 1024; - } else if (size <= 128 * 1024) { - size = 128 * 1024; - } else if (size <= 256 * 1024) { - size = 256 * 1024; - } else if (size <= 512 * 1024) { - size = 512 * 1024; - } else if (size <= 1024 * 1024) { - size = 1024 * 1024; + if (size <= SZ_1M) { + /* + * For files up to 1MB, round up the preallocation size to + * the next power of two, with a minimum of 16KB. + */ + if (size <= (unsigned long)SZ_16K) + size = SZ_16K; + else + size = roundup_pow_of_two(size); } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (21 - bsbits)) << 21; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index ab17c1d3a7b5..3329b7ad5dbd 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -224,8 +224,8 @@ static int mext_move_begin(struct mext_data *mext, struct folio *folio[2], } /* Adjust the moving length according to the length of shorter folio. */ - move_len = umin(folio_pos(folio[0]) + folio_size(folio[0]) - orig_pos, - folio_pos(folio[1]) + folio_size(folio[1]) - donor_pos); + move_len = umin(folio_next_pos(folio[0]) - orig_pos, + folio_next_pos(folio[1]) - donor_pos); move_len >>= blkbits; if (move_len < mext->orig_map.m_len) mext->orig_map.m_len = move_len; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 503dc9ffd614..4a47fbd8dd30 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -647,7 +647,7 @@ static struct stats dx_show_leaf(struct inode *dir, /* Directory is not encrypted */ (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); - printk("%*.s:(U)%x.%u ", len, + printk("%.*s:(U)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); @@ -683,7 +683,7 @@ static struct stats dx_show_leaf(struct inode *dir, (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); - printk("%*.s:(E)%x.%u ", len, name, + printk("%.*s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); fscrypt_fname_free_buffer( @@ -694,7 +694,7 @@ static struct stats dx_show_leaf(struct inode *dir, char *name = de->name; (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); - printk("%*.s:%x.%u ", len, name, h.hash, + printk("%.*s:%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); #endif } @@ -723,7 +723,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, struct stats stats; printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); bh = ext4_bread(NULL,dir, block, 0); - if (!bh || IS_ERR(bh)) + if (IS_ERR_OR_NULL(bh)) continue; stats = levels? dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): @@ -2353,10 +2353,10 @@ out_frames: * may not sleep between calling this and putting something into * the entry, as someone else might have used it while you slept. */ -static int ext4_add_entry(handle_t *handle, struct dentry *dentry, +static int __ext4_add_entry(handle_t *handle, struct inode *dir, + const struct qstr *d_name, struct inode *inode) { - struct inode *dir = d_inode(dentry->d_parent); struct buffer_head *bh = NULL; struct ext4_dir_entry_2 *de; struct super_block *sb; @@ -2373,13 +2373,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, sb = dir->i_sb; blocksize = sb->s_blocksize; - if (fscrypt_is_nokey_name(dentry)) - return -ENOKEY; - - if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) + if (!generic_ci_validate_strict_name(dir, d_name)) return -EINVAL; - retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); + retval = ext4_fname_setup_filename(dir, d_name, 0, &fname); if (retval) return retval; @@ -2460,6 +2457,16 @@ out: return retval; } +static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct inode *dir = d_inode(dentry->d_parent); + + if (fscrypt_is_nokey_name(dentry)) + return -ENOKEY; + return __ext4_add_entry(handle, dir, &dentry->d_name, inode); +} + /* * Returns 0 for success, or a negative error value */ @@ -3445,7 +3452,8 @@ out_retry: return err; } -int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry) +int __ext4_link(struct inode *dir, struct inode *inode, + const struct qstr *d_name, struct dentry *dentry) { handle_t *handle; int err, retries = 0; @@ -3461,9 +3469,8 @@ retry: inode_set_ctime_current(inode); ext4_inc_count(inode); - ihold(inode); - err = ext4_add_entry(handle, dentry, inode); + err = __ext4_add_entry(handle, dir, d_name, inode); if (!err) { err = ext4_mark_inode_dirty(handle, inode); /* this can happen only for tmpfile being @@ -3471,11 +3478,10 @@ retry: */ if (inode->i_nlink == 1) ext4_orphan_del(handle, inode); - d_instantiate(dentry, inode); - ext4_fc_track_link(handle, dentry); + if (dentry) + ext4_fc_track_link(handle, inode, dentry); } else { drop_nlink(inode); - iput(inode); } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) @@ -3504,9 +3510,13 @@ static int ext4_link(struct dentry *old_dentry, err = dquot_initialize(dir); if (err) return err; - return __ext4_link(dir, inode, dentry); + err = __ext4_link(dir, inode, &dentry->d_name, dentry); + if (!err) { + ihold(inode); + d_instantiate(dentry, inode); + } + return err; } - /* * Try to find buffer head where contains the parent block. * It should be the inode block if it is inlined or the 1st block diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 578508eb4f1a..6a77db4d3124 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -521,6 +521,7 @@ static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, { struct buffer_head *bh, *head; struct journal_head *jh; + transaction_t *trans = READ_ONCE(jinode->i_transaction); bh = head = folio_buffers(folio); do { @@ -539,7 +540,7 @@ static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, */ jh = bh2jh(bh); if (buffer_dirty(bh) || - (jh && (jh->b_transaction != jinode->i_transaction || + (jh && (jh->b_transaction != trans || jh->b_next_transaction))) return true; } while ((bh = bh->b_this_page) != head); @@ -550,15 +551,20 @@ static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + loff_t range_start, range_end; struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, + .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, - .range_start = jinode->i_dirty_start, - .range_end = jinode->i_dirty_end, - }; + }; struct folio *folio = NULL; int error; + if (!jbd2_jinode_get_dirty_range(jinode, &range_start, &range_end)) + return 0; + + wbc.range_start = range_start; + wbc.range_end = range_end; + /* * writeback_iter() already checks for dirty pages and calls * folio_clear_dirty_for_io(), which we want to write protect the diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 645240cc0229..b612262719ed 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -92,7 +92,7 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode, if (!dentry) { bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); - if (IS_ERR(bh) || !bh) + if (IS_ERR_OR_NULL(bh)) return ERR_PTR(-ECHILD); if (!ext4_buffer_uptodate(bh)) { brelse(bh); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 60aec4712f7f..982a1f831e22 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -226,7 +226,7 @@ check_xattrs(struct inode *inode, struct buffer_head *bh, /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); - if ((void *)next >= end) { + if ((void *)next + sizeof(u32) > end) { err_str = "e_name out of bounds"; goto errout; } @@ -1165,7 +1165,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, { struct inode *ea_inode; struct ext4_xattr_entry *entry; - struct ext4_iloc iloc; + struct ext4_iloc iloc = { .bh = NULL }; bool dirty = false; unsigned int ea_ino; int err; @@ -1260,6 +1260,8 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, ext4_warning_inode(parent, "handle dirty metadata err=%d", err); } + + brelse(iloc.bh); } /* diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7203d2d2624d..8cf61e7185c4 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -180,7 +180,13 @@ static int journal_wait_on_commit_record(journal_t *journal, /* Send all the data buffers related to an inode */ int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) { - if (!jinode || !(jinode->i_flags & JI_WRITE_DATA)) + unsigned long flags; + + if (!jinode) + return 0; + + flags = READ_ONCE(jinode->i_flags); + if (!(flags & JI_WRITE_DATA)) return 0; trace_jbd2_submit_inode_data(jinode->i_vfs_inode); @@ -191,12 +197,30 @@ EXPORT_SYMBOL(jbd2_submit_inode_data); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode) { - if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) || - !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping) + struct address_space *mapping; + struct inode *inode; + unsigned long flags; + loff_t start_byte, end_byte; + + if (!jinode) + return 0; + + flags = READ_ONCE(jinode->i_flags); + if (!(flags & JI_WAIT_DATA)) + return 0; + + inode = jinode->i_vfs_inode; + if (!inode) + return 0; + + mapping = inode->i_mapping; + if (!mapping) + return 0; + + if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte)) return 0; return filemap_fdatawait_range_keep_errors( - jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start, - jinode->i_dirty_end); + mapping, start_byte, end_byte); } EXPORT_SYMBOL(jbd2_wait_inode_data); @@ -218,7 +242,8 @@ static int journal_submit_data_buffers(journal_t *journal, list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { if (!(jinode->i_flags & JI_WRITE_DATA)) continue; - jinode->i_flags |= JI_COMMIT_RUNNING; + WRITE_ONCE(jinode->i_flags, + jinode->i_flags | JI_COMMIT_RUNNING); spin_unlock(&journal->j_list_lock); /* submit the inode data buffers. */ trace_jbd2_submit_inode_data(jinode->i_vfs_inode); @@ -229,7 +254,8 @@ static int journal_submit_data_buffers(journal_t *journal, } spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); - jinode->i_flags &= ~JI_COMMIT_RUNNING; + WRITE_ONCE(jinode->i_flags, + jinode->i_flags & ~JI_COMMIT_RUNNING); smp_mb(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } @@ -240,10 +266,13 @@ static int journal_submit_data_buffers(journal_t *journal, int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) { struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + loff_t start_byte, end_byte; + + if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte)) + return 0; return filemap_fdatawait_range_keep_errors(mapping, - jinode->i_dirty_start, - jinode->i_dirty_end); + start_byte, end_byte); } /* @@ -262,7 +291,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { if (!(jinode->i_flags & JI_WAIT_DATA)) continue; - jinode->i_flags |= JI_COMMIT_RUNNING; + WRITE_ONCE(jinode->i_flags, jinode->i_flags | JI_COMMIT_RUNNING); spin_unlock(&journal->j_list_lock); /* wait for the inode data buffers writeout. */ if (journal->j_finish_inode_data_buffers) { @@ -272,7 +301,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, } cond_resched(); spin_lock(&journal->j_list_lock); - jinode->i_flags &= ~JI_COMMIT_RUNNING; + WRITE_ONCE(jinode->i_flags, jinode->i_flags & ~JI_COMMIT_RUNNING); smp_mb(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } @@ -288,8 +317,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal, &jinode->i_transaction->t_inode_list); } else { jinode->i_transaction = NULL; - jinode->i_dirty_start = 0; - jinode->i_dirty_end = 0; + WRITE_ONCE(jinode->i_dirty_start_page, 0); + WRITE_ONCE(jinode->i_dirty_end_page, 0); } } spin_unlock(&journal->j_list_lock); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b60918ed8a99..4f397fcdb13c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -3018,8 +3018,8 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) jinode->i_next_transaction = NULL; jinode->i_vfs_inode = inode; jinode->i_flags = 0; - jinode->i_dirty_start = 0; - jinode->i_dirty_end = 0; + jinode->i_dirty_start_page = 0; + jinode->i_dirty_end_page = 0; INIT_LIST_HEAD(&jinode->i_list); } @@ -3176,4 +3176,3 @@ MODULE_DESCRIPTION("Generic filesystem journal-writing module"); MODULE_LICENSE("GPL"); module_init(journal_init); module_exit(journal_exit); - diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 9016ddb82447..e4c2fbd381f1 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -428,6 +428,7 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) journal_t *journal = handle->h_transaction->t_journal; int need_cancel; struct buffer_head *bh = jh2bh(jh); + struct address_space *bh_mapping = bh->b_folio->mapping; jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); @@ -464,13 +465,14 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) * buffer_head? If so, we'd better make sure we clear the * revoked status on any hashed alias too, otherwise the revoke * state machine will get very upset later on. */ - if (need_cancel) { + if (need_cancel && !sb_is_blkdev_sb(bh_mapping->host->i_sb)) { struct buffer_head *bh2; + bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr, bh->b_size); if (bh2) { - if (bh2 != bh) - clear_buffer_revoked(bh2); + WARN_ON_ONCE(bh2 == bh); + clear_buffer_revoked(bh2); __brelse(bh2); } } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a90f9092706c..4885903bbd10 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -474,7 +474,8 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, return ERR_PTR(-EROFS); if (handle) { - J_ASSERT(handle->h_transaction->t_journal == journal); + if (WARN_ON_ONCE(handle->h_transaction->t_journal != journal)) + return ERR_PTR(-EINVAL); handle->h_ref++; return handle; } @@ -1036,7 +1037,13 @@ repeat: */ if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); - J_ASSERT_JH(jh, !jh->b_next_transaction); + if (WARN_ON_ONCE(jh->b_next_transaction)) { + spin_unlock(&jh->b_state_lock); + unlock_buffer(bh); + error = -EINVAL; + jbd2_journal_abort(journal, error); + goto out; + } JBUFFER_TRACE(jh, "file as BJ_Reserved"); /* * Make sure all stores to jh (b_modified, b_frozen_data) are @@ -1069,13 +1076,27 @@ repeat: */ if (jh->b_frozen_data) { JBUFFER_TRACE(jh, "has frozen data"); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + if (WARN_ON_ONCE(jh->b_next_transaction)) { + spin_unlock(&jh->b_state_lock); + error = -EINVAL; + jbd2_journal_abort(journal, error); + goto out; + } goto attach_next; } JBUFFER_TRACE(jh, "owned by older transaction"); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction); + if (WARN_ON_ONCE(jh->b_next_transaction || + jh->b_transaction != + journal->j_committing_transaction)) { + pr_err("JBD2: %s: assertion failure: b_next_transaction=%p b_transaction=%p j_committing_transaction=%p\n", + journal->j_devname, jh->b_next_transaction, + jh->b_transaction, journal->j_committing_transaction); + spin_unlock(&jh->b_state_lock); + error = -EINVAL; + jbd2_journal_abort(journal, error); + goto out; + } /* * There is one case we have to be very careful about. If the @@ -1302,7 +1323,12 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) goto out; } - J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); + if (WARN_ON_ONCE(!buffer_locked(jh2bh(jh)))) { + err = -EINVAL; + spin_unlock(&jh->b_state_lock); + jbd2_journal_abort(journal, err); + goto out; + } if (jh->b_transaction == NULL) { /* @@ -1491,7 +1517,7 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; - journal_t *journal; + journal_t *journal = transaction->t_journal; struct journal_head *jh; int ret = 0; @@ -1515,8 +1541,14 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (data_race(jh->b_transaction != transaction && jh->b_next_transaction != transaction)) { spin_lock(&jh->b_state_lock); - J_ASSERT_JH(jh, jh->b_transaction == transaction || - jh->b_next_transaction == transaction); + if (WARN_ON_ONCE(jh->b_transaction != transaction && + jh->b_next_transaction != transaction)) { + pr_err("JBD2: %s: assertion failure: b_transaction=%p transaction=%p b_next_transaction=%p\n", + journal->j_devname, jh->b_transaction, + transaction, jh->b_next_transaction); + ret = -EINVAL; + goto out_unlock_bh; + } spin_unlock(&jh->b_state_lock); } if (data_race(jh->b_modified == 1)) { @@ -1524,15 +1556,15 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (data_race(jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata)) { spin_lock(&jh->b_state_lock); - if (jh->b_transaction == transaction && - jh->b_jlist != BJ_Metadata) - pr_err("JBD2: assertion failure: h_type=%u " - "h_line_no=%u block_no=%llu jlist=%u\n", + if (WARN_ON_ONCE(jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata)) { + pr_err("JBD2: assertion failure: h_type=%u h_line_no=%u block_no=%llu jlist=%u\n", handle->h_type, handle->h_line_no, (unsigned long long) bh->b_blocknr, jh->b_jlist); - J_ASSERT_JH(jh, jh->b_transaction != transaction || - jh->b_jlist == BJ_Metadata); + ret = -EINVAL; + goto out_unlock_bh; + } spin_unlock(&jh->b_state_lock); } goto out; @@ -1552,8 +1584,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) goto out_unlock_bh; } - journal = transaction->t_journal; - if (jh->b_modified == 0) { /* * This buffer's got modified and becoming part @@ -1631,7 +1661,10 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) } /* That test should have eliminated the following case: */ - J_ASSERT_JH(jh, jh->b_frozen_data == NULL); + if (WARN_ON_ONCE(jh->b_frozen_data)) { + ret = -EINVAL; + goto out_unlock_bh; + } JBUFFER_TRACE(jh, "file as BJ_Metadata"); spin_lock(&journal->j_list_lock); @@ -1670,6 +1703,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) int err = 0; int was_modified = 0; int wait_for_writeback = 0; + int abort_journal = 0; if (is_handle_aborted(handle)) return -EROFS; @@ -1703,7 +1737,11 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) jh->b_modified = 0; if (jh->b_transaction == transaction) { - J_ASSERT_JH(jh, !jh->b_frozen_data); + if (WARN_ON_ONCE(jh->b_frozen_data)) { + err = -EINVAL; + abort_journal = 1; + goto drop; + } /* If we are forgetting a buffer which is already part * of this transaction, then we can just drop it from @@ -1742,8 +1780,11 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) } spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction) { - J_ASSERT_JH(jh, (jh->b_transaction == - journal->j_committing_transaction)); + if (WARN_ON_ONCE(jh->b_transaction != journal->j_committing_transaction)) { + err = -EINVAL; + abort_journal = 1; + goto drop; + } /* However, if the buffer is still owned by a prior * (committing) transaction, we can't drop it yet... */ JBUFFER_TRACE(jh, "belongs to older transaction"); @@ -1761,7 +1802,11 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); } else { - J_ASSERT(jh->b_next_transaction == transaction); + if (WARN_ON_ONCE(jh->b_next_transaction != transaction)) { + err = -EINVAL; + abort_journal = 1; + goto drop; + } /* * only drop a reference if this transaction modified @@ -1807,6 +1852,8 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) drop: __brelse(bh); spin_unlock(&jh->b_state_lock); + if (abort_journal) + jbd2_journal_abort(journal, err); if (wait_for_writeback) wait_on_buffer(bh); jbd2_journal_put_journal_head(jh); @@ -2131,7 +2178,8 @@ bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio) struct buffer_head *bh; bool ret = false; - J_ASSERT(folio_test_locked(folio)); + if (WARN_ON_ONCE(!folio_test_locked(folio))) + return false; head = folio_buffers(folio); bh = head; @@ -2646,6 +2694,9 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, { transaction_t *transaction = handle->h_transaction; journal_t *journal; + pgoff_t start_page, end_page; + int err = 0; + int abort_transaction = 0; if (is_handle_aborted(handle)) return -EROFS; @@ -2654,15 +2705,21 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, jbd2_debug(4, "Adding inode %llu, tid:%d\n", jinode->i_vfs_inode->i_ino, transaction->t_tid); - spin_lock(&journal->j_list_lock); - jinode->i_flags |= flags; + start_page = (pgoff_t)(start_byte >> PAGE_SHIFT); + end_page = (pgoff_t)(end_byte >> PAGE_SHIFT) + 1; - if (jinode->i_dirty_end) { - jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte); - jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte); + spin_lock(&journal->j_list_lock); + WRITE_ONCE(jinode->i_flags, jinode->i_flags | flags); + + if (jinode->i_dirty_start_page != jinode->i_dirty_end_page) { + WRITE_ONCE(jinode->i_dirty_start_page, + min(jinode->i_dirty_start_page, start_page)); + WRITE_ONCE(jinode->i_dirty_end_page, + max(jinode->i_dirty_end_page, end_page)); } else { - jinode->i_dirty_start = start_byte; - jinode->i_dirty_end = end_byte; + /* Publish a new non-empty range by making end visible first. */ + WRITE_ONCE(jinode->i_dirty_end_page, end_page); + WRITE_ONCE(jinode->i_dirty_start_page, start_page); } /* Is inode already attached where we need it? */ @@ -2680,20 +2737,33 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, /* On some different transaction's list - should be * the committing one */ if (jinode->i_transaction) { - J_ASSERT(jinode->i_next_transaction == NULL); - J_ASSERT(jinode->i_transaction == - journal->j_committing_transaction); + if (WARN_ON_ONCE(jinode->i_next_transaction || + jinode->i_transaction != + journal->j_committing_transaction)) { + pr_err("JBD2: %s: assertion failure: i_next_transaction=%p i_transaction=%p j_committing_transaction=%p\n", + journal->j_devname, jinode->i_next_transaction, + jinode->i_transaction, + journal->j_committing_transaction); + err = -EINVAL; + abort_transaction = 1; + goto done; + } jinode->i_next_transaction = transaction; goto done; } /* Not on any transaction list... */ - J_ASSERT(!jinode->i_next_transaction); + if (WARN_ON_ONCE(jinode->i_next_transaction)) { + err = -EINVAL; + abort_transaction = 1; + goto done; + } jinode->i_transaction = transaction; list_add(&jinode->i_list, &transaction->t_inode_list); done: spin_unlock(&journal->j_list_lock); - - return 0; + if (abort_transaction) + jbd2_journal_abort(journal, err); + return err; } int jbd2_journal_inode_ranged_write(handle_t *handle, @@ -2739,7 +2809,7 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal, int ret = 0; /* This is a quick check to avoid locking if not necessary */ - if (!jinode->i_transaction) + if (!READ_ONCE(jinode->i_transaction)) goto out; /* Locks are here just to force reading of recent values, it is * enough that the transaction was not committing before we started diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 4c86a9d46870..f9bf3bac085d 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -899,8 +899,13 @@ bail: static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { - return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, - jinode->i_dirty_start, jinode->i_dirty_end); + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + loff_t range_start, range_end; + + if (!jbd2_jinode_get_dirty_range(jinode, &range_start, &range_end)) + return 0; + + return filemap_fdatawrite_range(mapping, range_start, range_end); } int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a53a00d36228..7e785aa6d35d 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -429,22 +429,46 @@ struct jbd2_inode { unsigned long i_flags; /** - * @i_dirty_start: + * @i_dirty_start_page: + * + * Dirty range start in PAGE_SIZE units. + * + * The dirty range is empty if @i_dirty_start_page is greater than or + * equal to @i_dirty_end_page. * - * Offset in bytes where the dirty range for this inode starts. * [j_list_lock] */ - loff_t i_dirty_start; + pgoff_t i_dirty_start_page; /** - * @i_dirty_end: + * @i_dirty_end_page: * - * Inclusive offset in bytes where the dirty range for this inode - * ends. [j_list_lock] + * Dirty range end in PAGE_SIZE units (exclusive). + * + * [j_list_lock] */ - loff_t i_dirty_end; + pgoff_t i_dirty_end_page; }; +/* + * Lockless readers treat start_page >= end_page as an empty range. + * Writers publish a new non-empty range by storing i_dirty_end_page before + * i_dirty_start_page. + */ +static inline bool jbd2_jinode_get_dirty_range(const struct jbd2_inode *jinode, + loff_t *start, loff_t *end) +{ + pgoff_t start_page = READ_ONCE(jinode->i_dirty_start_page); + pgoff_t end_page = READ_ONCE(jinode->i_dirty_end_page); + + if (start_page >= end_page) + return false; + + *start = (loff_t)start_page << PAGE_SHIFT; + *end = ((loff_t)end_page << PAGE_SHIFT) - 1; + return true; +} + struct jbd2_revoke_table_s; /**