Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull rdma updates from Jason Gunthorpe:

 - Various minor code cleanups and fixes for hns, iser, cxgb4, hfi1,
   rxe, erdma, mana_ib

 - Prefetch support for rxe ODP

 - Remove memory window support from hns, as new device FW no longer
   supports it

 - Remove qib; it is very old and obsolete now, and Cornelis wishes to
   restructure the hfi1/qib shared layer

 - Fix a race in destroying CQs where we can still end up with work
   running because the work is canceled before the driver stops
   triggering it

 - Improve interaction with namespaces:
     * Follow the devlink namespace for newly spawned RDMA devices
     * Create ipoib net devices in the parent IB device's namespace
     * Allow CAP_NET_RAW checks to pass in user namespaces

 - A new flow control scheme for IB MADs to try and avoid queue
   overflows in the network

 - Fix 2G message sizes in bnxt_re

 - Optimize mkey layout for mlx5 DMABUF

 - New "DMA Handle" concept to allow controlling PCI TPH and steering
   tags

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (71 commits)
  RDMA/siw: Change maintainer email address
  RDMA/mana_ib: add support of multiple ports
  RDMA/mlx5: Refactor optional counters steering code
  RDMA/mlx5: Add DMAH support for reg_user_mr/reg_user_dmabuf_mr
  IB: Extend UVERBS_METHOD_REG_MR to get DMAH
  RDMA/mlx5: Add DMAH object support
  RDMA/core: Introduce a DMAH object and its alloc/free APIs
  IB/core: Add UVERBS_METHOD_REG_MR on the MR object
  net/mlx5: Add support for device steering tag
  net/mlx5: Expose IFC bits for TPH
  PCI/TPH: Expose pcie_tph_get_st_table_size()
  RDMA/mlx5: Fix incorrect MKEY masking
  RDMA/mlx5: Fix returned type from _mlx5r_umr_zap_mkey()
  RDMA/mlx5: remove redundant check on err on return expression
  RDMA/mana_ib: add additional port counters
  RDMA/mana_ib: Fix DSCP value in modify QP
  RDMA/efa: Add CQ with external memory support
  RDMA/core: Add umem "is_contiguous" and "start_dma_addr" helpers
  RDMA/uverbs: Add a common way to create CQ with umem
  RDMA/mlx5: Optimize DMABUF mkey page size
  ...
This commit is contained in:
Linus Torvalds
2025-07-31 12:19:55 -07:00
153 changed files with 2867 additions and 49165 deletions

View File

@@ -92,9 +92,7 @@ static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
int possible, curr_cpu, i, ht;
cpumask_clear(&node_affinity.real_cpu_mask);
int possible, curr_cpu, ht;
/* Start with cpu online mask as the real cpu mask */
cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);
@@ -110,17 +108,10 @@ void init_real_cpu_mask(void)
* "real" cores. Assumes that HT cores are not enumerated in
* succession (except in the single core case).
*/
curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
for (i = 0; i < possible / ht; i++)
curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
/*
* Step 2. Remove the remaining HT siblings. Use cpumask_next() to
* skip any gaps.
*/
for (; i < possible; i++) {
cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
}
curr_cpu = cpumask_nth(possible / ht, &node_affinity.real_cpu_mask) + 1;
/* Step 2. Remove the remaining HT siblings. */
cpumask_clear_cpus(&node_affinity.real_cpu_mask, curr_cpu, nr_cpu_ids - curr_cpu);
}
int node_affinity_init(void)
@@ -346,9 +337,10 @@ static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
&entry->def_intr.used);
/* If there are non-interrupt CPUs available, use them first */
if (!cpumask_empty(non_intr_cpus))
cpu = cpumask_first(non_intr_cpus);
else /* Otherwise, use interrupt CPUs */
cpu = cpumask_first(non_intr_cpus);
/* Otherwise, use interrupt CPUs */
if (cpu >= nr_cpu_ids)
cpu = cpumask_first(available_cpus);
if (cpu >= nr_cpu_ids) { /* empty */
@@ -963,32 +955,23 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
struct hfi1_affinity_node_list *affinity)
{
int possible, curr_cpu, i;
uint num_cores_per_socket = node_affinity.num_online_cpus /
affinity->num_core_siblings /
node_affinity.num_online_nodes;
int curr_cpu;
uint num_cores;
cpumask_copy(hw_thread_mask, &affinity->proc.mask);
if (affinity->num_core_siblings > 0) {
/* Removing other siblings not needed for now */
possible = cpumask_weight(hw_thread_mask);
curr_cpu = cpumask_first(hw_thread_mask);
for (i = 0;
i < num_cores_per_socket * node_affinity.num_online_nodes;
i++)
curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
for (; i < possible; i++) {
cpumask_clear_cpu(curr_cpu, hw_thread_mask);
curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
}
if (affinity->num_core_siblings == 0)
return;
/* Identifying correct HW threads within physical cores */
cpumask_shift_left(hw_thread_mask, hw_thread_mask,
num_cores_per_socket *
node_affinity.num_online_nodes *
hw_thread_no);
}
num_cores = rounddown(node_affinity.num_online_cpus / affinity->num_core_siblings,
node_affinity.num_online_nodes);
/* Removing other siblings not needed for now */
curr_cpu = cpumask_nth(num_cores * node_affinity.num_online_nodes, hw_thread_mask) + 1;
cpumask_clear_cpus(hw_thread_mask, curr_cpu, nr_cpu_ids - curr_cpu);
/* Identifying correct HW threads within physical cores */
cpumask_shift_left(hw_thread_mask, hw_thread_mask, num_cores * hw_thread_no);
}
int hfi1_get_proc_affinity(int node)
@@ -1087,22 +1070,19 @@ int hfi1_get_proc_affinity(int node)
* If HT cores are enabled, identify which HW threads within the
* physical cores should be used.
*/
if (affinity->num_core_siblings > 0) {
for (i = 0; i < affinity->num_core_siblings; i++) {
find_hw_thread_mask(i, hw_thread_mask, affinity);
for (i = 0; i < affinity->num_core_siblings; i++) {
find_hw_thread_mask(i, hw_thread_mask, affinity);
/*
* If there's at least one available core for this HW
* thread number, stop looking for a core.
*
* diff will always be not empty at least once in this
* loop as the used mask gets reset when
* (set->mask == set->used) before this loop.
*/
cpumask_andnot(diff, hw_thread_mask, &set->used);
if (!cpumask_empty(diff))
break;
}
/*
* If there's at least one available core for this HW
* thread number, stop looking for a core.
*
* diff will always be not empty at least once in this
* loop as the used mask gets reset when
* (set->mask == set->used) before this loop.
*/
if (cpumask_andnot(diff, hw_thread_mask, &set->used))
break;
}
hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
cpumask_pr_args(hw_thread_mask));
@@ -1133,8 +1113,7 @@ int hfi1_get_proc_affinity(int node)
* used for process assignments using the same method as
* the preferred NUMA node.
*/
cpumask_andnot(diff, available_mask, intrs_mask);
if (!cpumask_empty(diff))
if (cpumask_andnot(diff, available_mask, intrs_mask))
cpumask_copy(available_mask, diff);
/* If we don't have CPUs on the preferred node, use other NUMA nodes */
@@ -1150,8 +1129,7 @@ int hfi1_get_proc_affinity(int node)
* At first, we don't want to place processes on the same
* CPUs as interrupt handlers.
*/
cpumask_andnot(diff, available_mask, intrs_mask);
if (!cpumask_empty(diff))
if (cpumask_andnot(diff, available_mask, intrs_mask))
cpumask_copy(available_mask, diff);
}
hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",