mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 06:44:00 -04:00
net-sysfs: use rps_tag_ptr and remove metadata from rps_sock_flow_table

Instead of storing the @mask at the beginning of rps_sock_flow_table,
use 5 low order bits of the rps_tag_ptr to store the log of the size.

This removes a potential cache line miss to fetch @mask.

More importantly, we can switch to vmalloc_huge() without wasting memory.

Tested with:

    numactl --interleave=all bash -c "echo 4194304 >/proc/sys/net/core/rps_sock_flow_entries"

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20260302181432.1836150-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

This commit is contained in:
committed by
Jakub Kicinski
parent
9cde131cdd
commit
dd378109d2
@@ -403,16 +403,21 @@ Both of these need to be set before RFS is enabled for a receive queue.
|
||||
Values for both are rounded up to the nearest power of two. The
|
||||
suggested flow count depends on the expected number of active connections
|
||||
at any given time, which may be significantly less than the number of open
|
||||
connections. We have found that a value of 32768 for rps_sock_flow_entries
|
||||
works fairly well on a moderately loaded server.
|
||||
connections. We have found that a value of 65536 for rps_sock_flow_entries
|
||||
works fairly well on a moderately loaded server. Big servers might
|
||||
need 1048576 or even higher values.
|
||||
|
||||
On a NUMA host it is advisable to spread rps_sock_flow_entries on all nodes.
|
||||
|
||||
numactl --interleave=all bash -c "echo 1048576 >/proc/sys/net/core/rps_sock_flow_entries"
|
||||
|
||||
For a single queue device, the rps_flow_cnt value for the single queue
|
||||
would normally be configured to the same value as rps_sock_flow_entries.
|
||||
For a multi-queue device, the rps_flow_cnt for each queue might be
|
||||
configured as rps_sock_flow_entries / N, where N is the number of
|
||||
queues. So for instance, if rps_sock_flow_entries is set to 32768 and there
|
||||
queues. So for instance, if rps_sock_flow_entries is set to 131072 and there
|
||||
are 16 configured receive queues, rps_flow_cnt for each queue might be
|
||||
configured as 2048.
|
||||
configured as 8192.
|
||||
|
||||
|
||||
Accelerated RFS
|
||||
|
||||
@@ -6,6 +6,9 @@
|
||||
#include <linux/types.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <net/protocol.h>
|
||||
#ifdef CONFIG_RPS
|
||||
#include <net/rps-types.h>
|
||||
#endif
|
||||
|
||||
struct skb_defer_node {
|
||||
struct llist_head defer_list;
|
||||
@@ -33,7 +36,7 @@ struct net_hotdata {
|
||||
struct kmem_cache *skbuff_fclone_cache;
|
||||
struct kmem_cache *skb_small_head_cache;
|
||||
#ifdef CONFIG_RPS
|
||||
struct rps_sock_flow_table __rcu *rps_sock_flow_table;
|
||||
rps_tag_ptr rps_sock_flow_table;
|
||||
u32 rps_cpu_mask;
|
||||
#endif
|
||||
struct skb_defer_node __percpu *skb_defer_nodes;
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <net/hotdata.h>
|
||||
|
||||
#ifdef CONFIG_RPS
|
||||
#include <net/rps-types.h>
|
||||
|
||||
extern struct static_key_false rps_needed;
|
||||
extern struct static_key_false rfs_needed;
|
||||
@@ -60,45 +61,38 @@ struct rps_dev_flow_table {
|
||||
* meaning we use 32-6=26 bits for the hash.
|
||||
*/
|
||||
struct rps_sock_flow_table {
|
||||
u32 _mask;
|
||||
|
||||
u32 ents[] ____cacheline_aligned_in_smp;
|
||||
u32 ent;
|
||||
};
|
||||
#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
|
||||
|
||||
static inline u32 rps_sock_flow_table_mask(const struct rps_sock_flow_table *table)
|
||||
{
|
||||
return table->_mask;
|
||||
}
|
||||
|
||||
#define RPS_NO_CPU 0xffff
|
||||
|
||||
static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
|
||||
u32 hash)
|
||||
static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
|
||||
{
|
||||
unsigned int index = hash & rps_sock_flow_table_mask(table);
|
||||
unsigned int index = hash & rps_tag_to_mask(tag_ptr);
|
||||
u32 val = hash & ~net_hotdata.rps_cpu_mask;
|
||||
struct rps_sock_flow_table *table;
|
||||
|
||||
/* We only give a hint, preemption can change CPU under us */
|
||||
val |= raw_smp_processor_id();
|
||||
|
||||
table = rps_tag_to_table(tag_ptr);
|
||||
/* The following WRITE_ONCE() is paired with the READ_ONCE()
|
||||
* here, and another one in get_rps_cpu().
|
||||
*/
|
||||
if (READ_ONCE(table->ents[index]) != val)
|
||||
WRITE_ONCE(table->ents[index], val);
|
||||
if (READ_ONCE(table[index].ent) != val)
|
||||
WRITE_ONCE(table[index].ent, val);
|
||||
}
|
||||
|
||||
static inline void _sock_rps_record_flow_hash(__u32 hash)
|
||||
{
|
||||
struct rps_sock_flow_table *sock_flow_table;
|
||||
rps_tag_ptr tag_ptr;
|
||||
|
||||
if (!hash)
|
||||
return;
|
||||
rcu_read_lock();
|
||||
sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
|
||||
if (sock_flow_table)
|
||||
rps_record_sock_flow(sock_flow_table, hash);
|
||||
tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table);
|
||||
if (tag_ptr)
|
||||
rps_record_sock_flow(tag_ptr, hash);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
@@ -125,6 +119,7 @@ static inline void _sock_rps_record_flow(const struct sock *sk)
|
||||
static inline void _sock_rps_delete_flow(const struct sock *sk)
|
||||
{
|
||||
struct rps_sock_flow_table *table;
|
||||
rps_tag_ptr tag_ptr;
|
||||
u32 hash, index;
|
||||
|
||||
hash = READ_ONCE(sk->sk_rxhash);
|
||||
@@ -132,11 +127,12 @@ static inline void _sock_rps_delete_flow(const struct sock *sk)
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
table = rcu_dereference(net_hotdata.rps_sock_flow_table);
|
||||
if (table) {
|
||||
index = hash & rps_sock_flow_table_mask(table);
|
||||
if (READ_ONCE(table->ents[index]) != RPS_NO_CPU)
|
||||
WRITE_ONCE(table->ents[index], RPS_NO_CPU);
|
||||
tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table);
|
||||
if (tag_ptr) {
|
||||
index = hash & rps_tag_to_mask(tag_ptr);
|
||||
table = rps_tag_to_table(tag_ptr);
|
||||
if (READ_ONCE(table[index].ent) != RPS_NO_CPU)
|
||||
WRITE_ONCE(table[index].ent, RPS_NO_CPU);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
@@ -5075,9 +5075,9 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
|
||||
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
|
||||
struct rps_dev_flow **rflowp)
|
||||
{
|
||||
const struct rps_sock_flow_table *sock_flow_table;
|
||||
struct netdev_rx_queue *rxqueue = dev->_rx;
|
||||
struct rps_dev_flow_table *flow_table;
|
||||
rps_tag_ptr global_tag_ptr;
|
||||
struct rps_map *map;
|
||||
int cpu = -1;
|
||||
u32 tcpu;
|
||||
@@ -5108,8 +5108,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
|
||||
if (!hash)
|
||||
goto done;
|
||||
|
||||
sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
|
||||
if (flow_table && sock_flow_table) {
|
||||
global_tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table);
|
||||
if (flow_table && global_tag_ptr) {
|
||||
struct rps_sock_flow_table *sock_flow_table;
|
||||
struct rps_dev_flow *rflow;
|
||||
u32 next_cpu;
|
||||
u32 flow_id;
|
||||
@@ -5118,8 +5119,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
|
||||
/* First check into global flow table if there is a match.
|
||||
* This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
|
||||
*/
|
||||
flow_id = hash & rps_sock_flow_table_mask(sock_flow_table);
|
||||
ident = READ_ONCE(sock_flow_table->ents[flow_id]);
|
||||
flow_id = hash & rps_tag_to_mask(global_tag_ptr);
|
||||
sock_flow_table = rps_tag_to_table(global_tag_ptr);
|
||||
ident = READ_ONCE(sock_flow_table[flow_id].ent);
|
||||
if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
|
||||
goto try_rps;
|
||||
|
||||
|
||||
@@ -138,68 +138,73 @@ done:
|
||||
static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,
|
||||
void *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
struct rps_sock_flow_table *o_sock_table, *sock_table;
|
||||
static DEFINE_MUTEX(sock_flow_mutex);
|
||||
rps_tag_ptr o_tag_ptr, tag_ptr;
|
||||
unsigned int orig_size, size;
|
||||
int ret, i;
|
||||
struct ctl_table tmp = {
|
||||
.data = &size,
|
||||
.maxlen = sizeof(size),
|
||||
.mode = table->mode
|
||||
};
|
||||
struct rps_sock_flow_table *o_sock_table, *sock_table;
|
||||
static DEFINE_MUTEX(sock_flow_mutex);
|
||||
void *tofree = NULL;
|
||||
int ret, i;
|
||||
u8 log;
|
||||
|
||||
mutex_lock(&sock_flow_mutex);
|
||||
|
||||
o_sock_table = rcu_dereference_protected(
|
||||
net_hotdata.rps_sock_flow_table,
|
||||
lockdep_is_held(&sock_flow_mutex));
|
||||
size = o_sock_table ? rps_sock_flow_table_mask(o_sock_table) + 1 : 0;
|
||||
o_tag_ptr = tag_ptr = net_hotdata.rps_sock_flow_table;
|
||||
|
||||
size = o_tag_ptr ? rps_tag_to_mask(o_tag_ptr) + 1 : 0;
|
||||
o_sock_table = rps_tag_to_table(o_tag_ptr);
|
||||
orig_size = size;
|
||||
|
||||
ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
|
||||
|
||||
if (write) {
|
||||
if (size) {
|
||||
if (size > 1<<29) {
|
||||
/* Enforce limit to prevent overflow */
|
||||
if (!write)
|
||||
goto unlock;
|
||||
|
||||
if (size) {
|
||||
if (size > 1<<29) {
|
||||
/* Enforce limit to prevent overflow */
|
||||
mutex_unlock(&sock_flow_mutex);
|
||||
return -EINVAL;
|
||||
}
|
||||
sock_table = o_sock_table;
|
||||
size = roundup_pow_of_two(size);
|
||||
if (size != orig_size) {
|
||||
sock_table = vmalloc_huge(size * sizeof(*sock_table),
|
||||
GFP_KERNEL);
|
||||
if (!sock_table) {
|
||||
mutex_unlock(&sock_flow_mutex);
|
||||
return -EINVAL;
|
||||
}
|
||||
sock_table = o_sock_table;
|
||||
size = roundup_pow_of_two(size);
|
||||
if (size != orig_size) {
|
||||
sock_table =
|
||||
vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
|
||||
if (!sock_table) {
|
||||
mutex_unlock(&sock_flow_mutex);
|
||||
return -ENOMEM;
|
||||
}
|
||||
net_hotdata.rps_cpu_mask =
|
||||
roundup_pow_of_two(nr_cpu_ids) - 1;
|
||||
sock_table->_mask = size - 1;
|
||||
return -ENOMEM;
|
||||
}
|
||||
net_hotdata.rps_cpu_mask =
|
||||
roundup_pow_of_two(nr_cpu_ids) - 1;
|
||||
log = ilog2(size);
|
||||
tag_ptr = (rps_tag_ptr)sock_table | log;
|
||||
}
|
||||
|
||||
for (i = 0; i < size; i++)
|
||||
sock_table->ents[i] = RPS_NO_CPU;
|
||||
} else
|
||||
sock_table = NULL;
|
||||
|
||||
if (sock_table != o_sock_table) {
|
||||
rcu_assign_pointer(net_hotdata.rps_sock_flow_table,
|
||||
sock_table);
|
||||
if (sock_table) {
|
||||
static_branch_inc(&rps_needed);
|
||||
static_branch_inc(&rfs_needed);
|
||||
}
|
||||
if (o_sock_table) {
|
||||
static_branch_dec(&rps_needed);
|
||||
static_branch_dec(&rfs_needed);
|
||||
tofree = o_sock_table;
|
||||
}
|
||||
for (i = 0; i < size; i++)
|
||||
sock_table[i].ent = RPS_NO_CPU;
|
||||
} else {
|
||||
sock_table = NULL;
|
||||
tag_ptr = 0UL;
|
||||
}
|
||||
if (tag_ptr != o_tag_ptr) {
|
||||
smp_store_release(&net_hotdata.rps_sock_flow_table, tag_ptr);
|
||||
if (sock_table) {
|
||||
static_branch_inc(&rps_needed);
|
||||
static_branch_inc(&rfs_needed);
|
||||
}
|
||||
if (o_sock_table) {
|
||||
static_branch_dec(&rps_needed);
|
||||
static_branch_dec(&rfs_needed);
|
||||
tofree = o_sock_table;
|
||||
}
|
||||
}
|
||||
|
||||
unlock:
|
||||
mutex_unlock(&sock_flow_mutex);
|
||||
|
||||
kvfree_rcu_mightsleep(tofree);
|
||||
|
||||
Reference in New Issue
Block a user