mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 06:44:00 -04:00
Under memory pressure, direct reclaim can kick in during compressed readahead. This puts the associated task into D-state. Then shrink_lruvec() disables interrupts when acquiring the LRU lock. Under heavy pressure, we've observed reclaim can run long enough that the CPU becomes prone to CSD lock stalls since it cannot service incoming IPIs. Although the CSD lock stalls are the worst case scenario, we have found many more subtle occurrences of this latency on the order of seconds, over a minute in some cases. Prevent direct reclaim during compressed readahead. This is achieved by using different GFP flags at key points when the bio is marked for readahead. There are two functions that allocate during compressed readahead: btrfs_alloc_compr_folio() and add_ra_bio_pages(). Both currently use GFP_NOFS which includes __GFP_DIRECT_RECLAIM. For the internal API call btrfs_alloc_compr_folio(), the signature changes to accept an additional gfp_t parameter. At the readahead call site, it gets flags similar to GFP_NOFS but stripped of __GFP_DIRECT_RECLAIM. __GFP_NOWARN is added since these allocations are allowed to fail. Demand reads still use full GFP_NOFS and will enter reclaim if needed. All other existing call sites of btrfs_alloc_compr_folio() now explicitly pass GFP_NOFS to retain their current behavior. add_ra_bio_pages() gains a bool parameter which allows callers to specify if they want to allow direct reclaim or not. In either case, the __GFP_NOWARN flag was added unconditionally since the allocations are speculative. There has been some previous work done on calling add_ra_bio_pages() [0]. This patch is complementary: where that patch reduces call frequency, this patch reduces the latency associated with those calls. 
[0] https://lore.kernel.org/linux-btrfs/656838ec1232314a2657716e59f4f15a8eadba64.1751492111.git.boris@bur.io/ Reviewed-by: Mark Harmstone <mark@harmstone.com> Reviewed-by: Qu Wenruo <wqu@suse.com> Signed-off-by: JP Kobryn (Meta) <jp.kobryn@linux.dev> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
515 lines
14 KiB
C
515 lines
14 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Copyright (C) 2008 Oracle. All rights reserved.
|
|
*
|
|
* Based on jffs2 zlib code:
|
|
* Copyright © 2001-2007 Red Hat, Inc.
|
|
* Created by David Woodhouse <dwmw2@infradead.org>
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/zlib.h>
|
|
#include <linux/zutil.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/init.h>
|
|
#include <linux/err.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/bio.h>
|
|
#include <linux/refcount.h>
|
|
#include "btrfs_inode.h"
|
|
#include "compression.h"
|
|
#include "fs.h"
|
|
#include "subpage.h"
|
|
|
|
/* workspace buffer size for s390 zlib hardware support */
#define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE)

/*
 * Per-instance zlib state, cached on a free list by the btrfs compression
 * core so the large zlib scratch area is not reallocated for every bio.
 */
struct workspace {
	z_stream strm;		/* zlib stream state shared by deflate/inflate */
	char *buf;		/* bounce buffer for input/output data */
	unsigned int buf_size;	/* size of @buf in bytes */
	struct list_head list;	/* link into the btrfs workspace free list */
	int level;		/* requested zlib compression level */
};
|
|
|
|
struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level)
|
|
{
|
|
struct list_head *ws = btrfs_get_workspace(fs_info, BTRFS_COMPRESS_ZLIB, level);
|
|
struct workspace *workspace = list_entry(ws, struct workspace, list);
|
|
|
|
workspace->level = level;
|
|
|
|
return ws;
|
|
}
|
|
|
|
void zlib_free_workspace(struct list_head *ws)
|
|
{
|
|
struct workspace *workspace = list_entry(ws, struct workspace, list);
|
|
|
|
kvfree(workspace->strm.workspace);
|
|
kfree(workspace->buf);
|
|
kfree(workspace);
|
|
}
|
|
|
|
/*
|
|
* For s390 hardware acceleration, the buffer size should be at least
|
|
* ZLIB_DFLTCC_BUF_SIZE to achieve the best performance.
|
|
*
|
|
* But if bs > ps we can have large enough folios that meet the s390 hardware
|
|
* handling.
|
|
*/
|
|
static bool need_special_buffer(struct btrfs_fs_info *fs_info)
|
|
{
|
|
if (!zlib_deflate_dfltcc_enabled())
|
|
return false;
|
|
if (btrfs_min_folio_size(fs_info) >= ZLIB_DFLTCC_BUF_SIZE)
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
/*
 * Allocate a fresh zlib workspace: the zlib scratch area sized for both
 * deflate and inflate, plus a bounce buffer.  Returns the workspace's list
 * link, or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level)
{
	struct workspace *workspace;
	int workspacesize;

	workspace = kzalloc_obj(*workspace);
	if (!workspace)
		return ERR_PTR(-ENOMEM);

	/* One scratch area serves both directions; size for the larger. */
	workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
			    zlib_inflate_workspacesize());
	workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN);
	workspace->level = level;
	workspace->buf = NULL;
	/*
	 * The s390 hardware-accelerated path prefers a larger bounce buffer.
	 * This allocation is best-effort (__GFP_NORETRY | __GFP_NOWARN); on
	 * failure we fall back to the sector-sized buffer below.
	 */
	if (need_special_buffer(fs_info)) {
		workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE,
					 __GFP_NOMEMALLOC | __GFP_NORETRY |
					 __GFP_NOWARN | GFP_NOIO);
		workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE;
	}
	if (!workspace->buf) {
		workspace->buf = kmalloc(fs_info->sectorsize, GFP_KERNEL);
		workspace->buf_size = fs_info->sectorsize;
	}
	if (!workspace->strm.workspace || !workspace->buf)
		goto fail;

	INIT_LIST_HEAD(&workspace->list);

	return &workspace->list;
fail:
	/* zlib_free_workspace() copes with partially populated workspaces. */
	zlib_free_workspace(&workspace->list);
	return ERR_PTR(-ENOMEM);
}
|
|
|
|
/*
|
|
* Helper for S390x with hardware zlib compression support.
|
|
*
|
|
* That hardware acceleration requires a buffer size larger than a single page
|
|
* to get ideal performance, thus we need to do the memory copy rather than
|
|
* use the page cache directly as input buffer.
|
|
*/
|
|
static int copy_data_into_buffer(struct address_space *mapping,
|
|
struct workspace *workspace, u64 filepos,
|
|
unsigned long length)
|
|
{
|
|
u64 cur = filepos;
|
|
|
|
/* It's only for hardware accelerated zlib code. */
|
|
ASSERT(zlib_deflate_dfltcc_enabled());
|
|
|
|
while (cur < filepos + length) {
|
|
struct folio *folio;
|
|
void *data_in;
|
|
unsigned int offset;
|
|
unsigned long copy_length;
|
|
int ret;
|
|
|
|
ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio);
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
offset = offset_in_folio(folio, cur);
|
|
copy_length = min(folio_size(folio) - offset,
|
|
filepos + length - cur);
|
|
|
|
data_in = kmap_local_folio(folio, offset);
|
|
memcpy(workspace->buf + cur - filepos, data_in, copy_length);
|
|
kunmap_local(data_in);
|
|
folio_put(folio);
|
|
cur += copy_length;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Compress the range described by @cb (cb->start, cb->len bytes of the
 * inode's page cache) with zlib and attach the output folios to the bio.
 *
 * Returns 0 on success, -E2BIG when compression does not actually shrink
 * the data (caller falls back to storing it uncompressed), -ENOMEM on
 * allocation failure, or -EIO on a zlib error.
 */
int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb)
{
	struct btrfs_inode *inode = cb->bbio.inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct workspace *workspace = list_entry(ws, struct workspace, list);
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	struct bio *bio = &cb->bbio.bio;
	u64 start = cb->start;
	u32 len = cb->len;
	const u32 min_folio_size = btrfs_min_folio_size(fs_info);
	int ret;
	char *data_in = NULL;
	struct folio *in_folio = NULL;
	struct folio *out_folio = NULL;
	const u64 orig_end = start + len;

	ret = zlib_deflateInit(&workspace->strm, workspace->level);
	if (unlikely(ret != Z_OK)) {
		btrfs_err(fs_info,
	"zlib compression init failed, error %d root %llu inode %llu offset %llu",
			  ret, btrfs_root_id(inode->root), btrfs_ino(inode), start);
		ret = -EIO;
		goto out;
	}

	workspace->strm.total_in = 0;
	workspace->strm.total_out = 0;

	/* Demand writeback path: full GFP_NOFS, may enter direct reclaim. */
	out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS);
	if (out_folio == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	workspace->strm.next_in = workspace->buf;
	workspace->strm.avail_in = 0;
	workspace->strm.next_out = folio_address(out_folio);
	workspace->strm.avail_out = min_folio_size;

	while (workspace->strm.total_in < len) {
		/*
		 * Get next input pages and copy the contents to the workspace
		 * buffer if required.
		 */
		if (workspace->strm.avail_in == 0) {
			unsigned long bytes_left = len - workspace->strm.total_in;
			unsigned int copy_length = min(bytes_left, workspace->buf_size);

			/*
			 * For s390 hardware accelerated zlib, and our folio is smaller
			 * than the copy_length, we need to fill the buffer so that
			 * we can take full advantage of hardware acceleration.
			 */
			if (need_special_buffer(fs_info)) {
				ret = copy_data_into_buffer(mapping, workspace,
							    start, copy_length);
				if (ret < 0)
					goto out;
				start += copy_length;
				workspace->strm.next_in = workspace->buf;
				workspace->strm.avail_in = copy_length;
			} else {
				unsigned int cur_len;

				/* Drop the mapping of the previous input folio. */
				if (data_in) {
					kunmap_local(data_in);
					folio_put(in_folio);
					data_in = NULL;
				}
				ret = btrfs_compress_filemap_get_folio(mapping,
						start, &in_folio);
				if (ret < 0)
					goto out;
				cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
				data_in = kmap_local_folio(in_folio,
							   offset_in_folio(in_folio, start));
				start += cur_len;
				workspace->strm.next_in = data_in;
				workspace->strm.avail_in = cur_len;
			}
		}

		ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
		if (unlikely(ret != Z_OK)) {
			btrfs_warn(fs_info,
		"zlib compression failed, error %d root %llu inode %llu offset %llu",
				   ret, btrfs_root_id(inode->root), btrfs_ino(inode),
				   start);
			zlib_deflateEnd(&workspace->strm);
			ret = -EIO;
			goto out;
		}

		/* We're making it bigger, give up. */
		if (workspace->strm.total_in > fs_info->sectorsize * 2 &&
		    workspace->strm.total_in < workspace->strm.total_out) {
			ret = -E2BIG;
			goto out;
		}
		if (workspace->strm.total_out >= len) {
			ret = -E2BIG;
			goto out;
		}
		/* Queue the full folio and allocate a new one. */
		if (workspace->strm.avail_out == 0) {
			if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) {
				ret = -E2BIG;
				goto out;
			}

			out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS);
			if (out_folio == NULL) {
				ret = -ENOMEM;
				goto out;
			}
			workspace->strm.avail_out = min_folio_size;
			workspace->strm.next_out = folio_address(out_folio);
		}
		/* We're all done. */
		if (workspace->strm.total_in >= len)
			break;
	}

	workspace->strm.avail_in = 0;

	/*
	 * Call deflate with Z_FINISH flush parameter providing more output
	 * space but no more input data, until it returns with Z_STREAM_END.
	 */
	while (ret != Z_STREAM_END) {
		ret = zlib_deflate(&workspace->strm, Z_FINISH);
		if (ret == Z_STREAM_END)
			break;
		if (unlikely(ret != Z_OK && ret != Z_BUF_ERROR)) {
			zlib_deflateEnd(&workspace->strm);
			ret = -EIO;
			goto out;
		} else if (workspace->strm.avail_out == 0) {
			if (workspace->strm.total_out >= len) {
				ret = -E2BIG;
				goto out;
			}
			if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) {
				ret = -E2BIG;
				goto out;
			}
			/* Get another folio for the stream end. */
			out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS);
			if (out_folio == NULL) {
				ret = -ENOMEM;
				goto out;
			}
			workspace->strm.avail_out = min_folio_size;
			workspace->strm.next_out = folio_address(out_folio);
		}
	}
	/* Queue the remaining part of the folio. */
	if (workspace->strm.total_out > bio->bi_iter.bi_size) {
		const u32 cur_len = workspace->strm.total_out - bio->bi_iter.bi_size;

		ASSERT(cur_len <= folio_size(out_folio));

		if (!bio_add_folio(bio, out_folio, cur_len, 0)) {
			ret = -E2BIG;
			goto out;
		}
	} else {
		/* The last folio hasn't been utilized. */
		btrfs_free_compr_folio(out_folio);
	}
	/* Ownership transferred to the bio (or freed); avoid double free. */
	out_folio = NULL;
	ASSERT(bio->bi_iter.bi_size == workspace->strm.total_out);
	zlib_deflateEnd(&workspace->strm);

	/* Compressed result is not smaller than the input, give up. */
	if (workspace->strm.total_out >= workspace->strm.total_in) {
		ret = -E2BIG;
		goto out;
	}

	ret = 0;
out:
	if (out_folio)
		btrfs_free_compr_folio(out_folio);
	if (data_in) {
		kunmap_local(data_in);
		folio_put(in_folio);
	}

	return ret;
}
|
|
|
|
/*
 * Decompress a whole compressed extent: inflate the folios attached to the
 * bio in @cb through workspace->buf and copy the output into the destination
 * pages via btrfs_decompress_buf2page().
 *
 * Returns 0 on success or -EIO / -EINVAL on failure.
 */
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
	struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
	struct workspace *workspace = list_entry(ws, struct workspace, list);
	struct folio_iter fi;
	const u32 min_folio_size = btrfs_min_folio_size(fs_info);
	int ret = 0, ret2;
	int wbits = MAX_WBITS;
	char *data_in;
	size_t total_out = 0;
	const size_t srclen = bio_get_size(&cb->bbio.bio);
	unsigned long buf_start;

	bio_first_folio(&fi, &cb->bbio.bio, 0);

	/* We must have at least one folio here, that has the correct size. */
	if (unlikely(!fi.folio))
		return -EINVAL;
	ASSERT(folio_size(fi.folio) == min_folio_size);

	data_in = kmap_local_folio(fi.folio, 0);
	workspace->strm.next_in = data_in;
	workspace->strm.avail_in = min_t(size_t, srclen, min_folio_size);
	workspace->strm.total_in = 0;

	workspace->strm.total_out = 0;
	workspace->strm.next_out = workspace->buf;
	workspace->strm.avail_out = workspace->buf_size;

	/* If it's deflate, and it's got no preset dictionary, then
	   we can tell zlib to skip the adler32 check. */
	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
	    !(((data_in[0]<<8) + data_in[1]) % 31)) {

		/* Negative wbits means raw deflate, no zlib header/checksum. */
		wbits = -((data_in[0] >> 4) + 8);
		workspace->strm.next_in += 2;
		workspace->strm.avail_in -= 2;
	}

	ret = zlib_inflateInit2(&workspace->strm, wbits);
	if (unlikely(ret != Z_OK)) {
		struct btrfs_inode *inode = cb->bbio.inode;

		kunmap_local(data_in);
		btrfs_err(inode->root->fs_info,
	"zlib decompression init failed, error %d root %llu inode %llu offset %llu",
			  ret, btrfs_root_id(inode->root), btrfs_ino(inode), cb->start);
		return -EIO;
	}
	while (workspace->strm.total_in < srclen) {
		ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
		if (ret != Z_OK && ret != Z_STREAM_END)
			break;

		buf_start = total_out;
		total_out = workspace->strm.total_out;

		/* we didn't make progress in this inflate call, we're done */
		if (buf_start == total_out)
			break;

		ret2 = btrfs_decompress_buf2page(workspace->buf,
				total_out - buf_start, cb, buf_start);
		/* ret2 == 0 means the destination range is fully filled. */
		if (ret2 == 0) {
			ret = 0;
			goto done;
		}

		/* Reset the bounce buffer for the next inflate round. */
		workspace->strm.next_out = workspace->buf;
		workspace->strm.avail_out = workspace->buf_size;

		/* Input folio exhausted: advance to the next one in the bio. */
		if (workspace->strm.avail_in == 0) {
			unsigned long tmp;
			kunmap_local(data_in);
			bio_next_folio(&fi, &cb->bbio.bio);
			if (!fi.folio) {
				/* Mark unmapped so the cleanup below skips it. */
				data_in = NULL;
				break;
			}
			ASSERT(folio_size(fi.folio) == min_folio_size);
			data_in = kmap_local_folio(fi.folio, 0);
			workspace->strm.next_in = data_in;
			tmp = srclen - workspace->strm.total_in;
			workspace->strm.avail_in = min(tmp, min_folio_size);
		}
	}
	if (unlikely(ret != Z_STREAM_END)) {
		btrfs_err(cb->bbio.inode->root->fs_info,
	"zlib decompression failed, error %d root %llu inode %llu offset %llu",
			  ret, btrfs_root_id(cb->bbio.inode->root),
			  btrfs_ino(cb->bbio.inode), cb->start);
		ret = -EIO;
	} else {
		ret = 0;
	}
done:
	zlib_inflateEnd(&workspace->strm);
	if (data_in)
		kunmap_local(data_in);
	return ret;
}
|
|
|
|
/*
 * Decompress a single inline/sector-sized chunk: @srclen bytes at @data_in
 * into @dest_folio at @dest_pgoff, expecting exactly @destlen output bytes.
 * Output beyond what could be produced is zero-filled.
 *
 * Returns 0 on success or -EIO on failure.
 */
int zlib_decompress(struct list_head *ws, const u8 *data_in,
		struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
		size_t destlen)
{
	struct workspace *workspace = list_entry(ws, struct workspace, list);
	int ret = 0;
	int wbits = MAX_WBITS;
	unsigned long to_copy;

	workspace->strm.next_in = data_in;
	workspace->strm.avail_in = srclen;
	workspace->strm.total_in = 0;

	workspace->strm.next_out = workspace->buf;
	workspace->strm.avail_out = workspace->buf_size;
	workspace->strm.total_out = 0;
	/* If it's deflate, and it's got no preset dictionary, then
	   we can tell zlib to skip the adler32 check. */
	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
	    !(((data_in[0]<<8) + data_in[1]) % 31)) {

		/* Negative wbits means raw deflate, no zlib header/checksum. */
		wbits = -((data_in[0] >> 4) + 8);
		workspace->strm.next_in += 2;
		workspace->strm.avail_in -= 2;
	}

	ret = zlib_inflateInit2(&workspace->strm, wbits);
	if (unlikely(ret != Z_OK)) {
		struct btrfs_inode *inode = folio_to_inode(dest_folio);

		btrfs_err(inode->root->fs_info,
	"zlib decompression init failed, error %d root %llu inode %llu offset %llu",
			  ret, btrfs_root_id(inode->root), btrfs_ino(inode),
			  folio_pos(dest_folio));
		return -EIO;
	}

	/*
	 * Everything (in/out buf) should be at most one sector, there should
	 * be no need to switch any input/output buffer.
	 */
	ret = zlib_inflate(&workspace->strm, Z_FINISH);
	to_copy = min(workspace->strm.total_out, destlen);
	/*
	 * NOTE(review): when inflate does not reach Z_STREAM_END the memcpy
	 * below is skipped; the error is then reported via the
	 * to_copy != destlen check at out: — confirm to_copy can never equal
	 * destlen on that path.
	 */
	if (ret != Z_STREAM_END)
		goto out;

	memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, to_copy);

out:
	if (unlikely(to_copy != destlen)) {
		struct btrfs_inode *inode = folio_to_inode(dest_folio);

		btrfs_err(inode->root->fs_info,
"zlib decompression failed, error %d root %llu inode %llu offset %llu decompressed %lu expected %zu",
			  ret, btrfs_root_id(inode->root), btrfs_ino(inode),
			  folio_pos(dest_folio), to_copy, destlen);
		ret = -EIO;
	} else {
		ret = 0;
	}

	zlib_inflateEnd(&workspace->strm);

	/* Pad any shortfall with zeros so stale page contents never leak. */
	if (unlikely(to_copy < destlen))
		folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy);
	return ret;
}
|
|
|
|
/* Level bounds and default advertised to the btrfs compression core. */
const struct btrfs_compress_levels btrfs_zlib_compress = {
	.min_level = 1,
	.max_level = 9,
	.default_level = BTRFS_ZLIB_DEFAULT_LEVEL,
};
|