From 1e1cd49ded597a7cc89f774ab3f42e22ff24fd57 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Mon, 9 Mar 2026 11:23:13 +1300 Subject: [PATCH 01/37] ACPI: NUMA: Only parse CFMWS at boot when CXL_ACPI is on On CXL platforms, the Static Resource Affinity Table (SRAT) may not cover memory affinity information for all the CXL memory regions. Since each CXL memory region is enumerated via a CXL Fixed Memory Window Structure (CFMWS), during early boot the kernel parses the CFMWS tables to find all CXL memory regions and sets a NUMA node for each of them. This memory affinity information of CXL memory regions is later used by the CXL ACPI driver. The CFMWS table doesn't provide the memory affinity information either. Currently the kernel assigns a 'faked' NUMA node for each CXL memory region, starting from the next node of the highest node that is enumerated via the SRAT. This can potentially increase the maximum NUMA node ID of the platform ('nr_node_ids') a lot. E.g., on a GNR platform with 4 NUMA nodes and 18 CFMWS tables, this bumps the 'nr_node_ids' to 22. Increasing the 'nr_node_ids' has side effects. For instance, it is widely used by the kernel for "highest possible NUMA node" based memory allocations. It also impacts userspace ABIs, e.g., some NUMA memory related system calls such as 'get_mempolicy' which requires 'maxnode' not being smaller than the 'nr_node_ids'. Currently parsing CFMWS tables and assigning faked NUMA node at boot is done unconditionally. However, if the CXL ACPI driver is not enabled, there will be no user of such memory affinity information of CXL memory regions. Change to only parsing the CFMWS tables at boot when CXL_ACPI is enabled in Kconfig to avoid the unnecessary cost of bumping up 'nr_node_ids'. 
E.g., on the aforementioned GNR platform, the "Slab" in /proc/meminfo is reduced with this change (when CXL_ACPI is off): w/ this change w/o Slab 900488 kB 923660 kB Signed-off-by: Kai Huang Reviewed-by: Jonathan Cameron Reviewed-by: Gregory Price Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20260308222313.14014-1-kai.huang@intel.com Signed-off-by: Dave Jiang --- drivers/acpi/numa/srat.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index aa87ee1583a4..62d4a8df0b8c 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -654,8 +654,11 @@ int __init acpi_numa_init(void) } last_real_pxm = fake_pxm; fake_pxm++; - acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, acpi_parse_cfmws, - &fake_pxm); + + /* No need to expand numa nodes if CXL is disabled */ + if (IS_ENABLED(CONFIG_CXL_ACPI)) + acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, acpi_parse_cfmws, + &fake_pxm); if (cnt < 0) return cnt; From 9a775c07bb04384f7c03a35dd04818ed818c1f71 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Fri, 6 Mar 2026 16:47:38 +0000 Subject: [PATCH 02/37] cxl: support Type2 when initializing cxl_dev_state In preparation for type2 drivers add function and macro for differentiating CXL memory expanders (type 3) from CXL device accelerators (type 2) helping drivers built from public headers to embed struct cxl_dev_state inside a private struct. Update type3 driver for using this same initialization. 
Signed-off-by: Alejandro Lucero Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Reviewed-by: Gregory Price Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260306164741.3796372-2-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/mbox.c | 12 +++++------- drivers/cxl/core/memdev.c | 24 ++++++++++++++++++++++++ drivers/cxl/cxlmem.h | 34 +++++++++++++++++++++++++++++++++- drivers/cxl/pci.c | 14 +++++++------- tools/testing/cxl/test/mem.c | 3 +-- 5 files changed, 70 insertions(+), 17 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index e7a6452bf544..451ff2287b44 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1521,23 +1521,21 @@ int cxl_mailbox_init(struct cxl_mailbox *cxl_mbox, struct device *host) } EXPORT_SYMBOL_NS_GPL(cxl_mailbox_init, "CXL"); -struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) +struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev, u64 serial, + u16 dvsec) { struct cxl_memdev_state *mds; int rc; - mds = devm_kzalloc(dev, sizeof(*mds), GFP_KERNEL); + mds = devm_cxl_dev_state_create(dev, CXL_DEVTYPE_CLASSMEM, serial, + dvsec, struct cxl_memdev_state, cxlds, + true); if (!mds) { dev_err(dev, "No memory available\n"); return ERR_PTR(-ENOMEM); } mutex_init(&mds->event.log_lock); - mds->cxlds.dev = dev; - mds->cxlds.reg_map.host = dev; - mds->cxlds.cxl_mbox.host = dev; - mds->cxlds.reg_map.resource = CXL_RESOURCE_NONE; - mds->cxlds.type = CXL_DEVTYPE_CLASSMEM; rc = devm_cxl_register_mce_notifier(dev, &mds->mce_notifier); if (rc == -EOPNOTSUPP) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 273c22118d3d..99e422594885 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -656,6 +656,30 @@ static void detach_memdev(struct work_struct *work) static struct lock_class_key cxl_memdev_key; +struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, + enum cxl_devtype type, + 
u64 serial, u16 dvsec, + size_t size, bool has_mbox) +{ + struct cxl_dev_state *cxlds = devm_kzalloc(dev, size, GFP_KERNEL); + + if (!cxlds) + return NULL; + + cxlds->dev = dev; + cxlds->type = type; + cxlds->serial = serial; + cxlds->cxl_dvsec = dvsec; + cxlds->reg_map.host = dev; + cxlds->reg_map.resource = CXL_RESOURCE_NONE; + + if (has_mbox) + cxlds->cxl_mbox.host = dev; + + return cxlds; +} +EXPORT_SYMBOL_NS_GPL(_devm_cxl_dev_state_create, "CXL"); + static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, const struct file_operations *fops, const struct cxl_memdev_attach *attach) diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index e21d744d639b..71367cb5178c 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -523,6 +523,37 @@ to_cxl_memdev_state(struct cxl_dev_state *cxlds) return container_of(cxlds, struct cxl_memdev_state, cxlds); } +struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, + enum cxl_devtype type, + u64 serial, u16 dvsec, + size_t size, bool has_mbox); + +/** + * cxl_dev_state_create - safely create and cast a cxl dev state embedded in a + * driver specific struct. + * + * @parent: device behind the request + * @type: CXL device type + * @serial: device identification + * @dvsec: dvsec capability offset + * @drv_struct: driver struct embedding a cxl_dev_state struct + * @member: name of the struct cxl_dev_state member in drv_struct + * @mbox: true if mailbox supported + * + * Returns a pointer to the drv_struct allocated and embedding a cxl_dev_state + * struct initialized. + * + * Introduced for Type2 driver support. 
+ */ +#define devm_cxl_dev_state_create(parent, type, serial, dvsec, drv_struct, member, mbox) \ + ({ \ + static_assert(__same_type(struct cxl_dev_state, \ + ((drv_struct *)NULL)->member)); \ + static_assert(offsetof(drv_struct, member) == 0); \ + (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ + sizeof(drv_struct), mbox); \ + }) + enum cxl_opcode { CXL_MBOX_OP_INVALID = 0x0000, CXL_MBOX_OP_RAW = CXL_MBOX_OP_INVALID, @@ -858,7 +889,8 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds); int cxl_await_media_ready(struct cxl_dev_state *cxlds); int cxl_enumerate_cmds(struct cxl_memdev_state *mds); int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info); -struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev); +struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev, u64 serial, + u16 dvsec); void set_exclusive_cxl_commands(struct cxl_memdev_state *mds, unsigned long *cmds); void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds, diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index fbb300a01830..a42f273ff72b 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -865,25 +865,25 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) int rc, pmu_count; unsigned int i; bool irq_avail; + u16 dvsec; rc = pcim_enable_device(pdev); if (rc) return rc; pci_set_master(pdev); - mds = cxl_memdev_state_create(&pdev->dev); + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + pci_warn(pdev, "Device DVSEC not present, skip CXL.mem init\n"); + + mds = cxl_memdev_state_create(&pdev->dev, pci_get_dsn(pdev), dvsec); if (IS_ERR(mds)) return PTR_ERR(mds); cxlds = &mds->cxlds; pci_set_drvdata(pdev, cxlds); cxlds->rcd = is_cxl_restricted(pdev); - cxlds->serial = pci_get_dsn(pdev); - cxlds->cxl_dvsec = pci_find_dvsec_capability( - pdev, PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_DEVICE); - if (!cxlds->cxl_dvsec) - dev_warn(&pdev->dev, 
- "Device DVSEC not present, skip CXL.mem init\n"); rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map); if (rc) diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index cb87e8c0e63c..79f42f4474d4 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1716,7 +1716,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) if (rc) return rc; - mds = cxl_memdev_state_create(dev); + mds = cxl_memdev_state_create(dev, pdev->id + 1, 0); if (IS_ERR(mds)) return PTR_ERR(mds); @@ -1732,7 +1732,6 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) mds->event.buf = (struct cxl_get_event_payload *) mdata->event_buf; INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mockmem_sanitize_work); - cxlds->serial = pdev->id + 1; if (is_rcd(pdev)) cxlds->rcd = true; From 005869886d1d370afb6c10cd40709d956960e9c2 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Fri, 6 Mar 2026 16:47:39 +0000 Subject: [PATCH 03/37] cxl: export internal structs for external Type2 drivers In preparation for type2 support, move structs and functions a type2 driver will need to access to into a new shared header file. Differentiate between public and private data to be preserved by type2 drivers. 
Signed-off-by: Alejandro Lucero Reviewed-by: Dave Jiang Tested-by: Alison Schofield Reviewed-by: Gregory Price Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260306164741.3796372-3-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/cxl.h | 97 +------------------ drivers/cxl/cxlmem.h | 114 ---------------------- include/cxl/cxl.h | 226 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 227 insertions(+), 210 deletions(-) create mode 100644 include/cxl/cxl.h diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 9b947286eb9b..1d94217729f7 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -12,6 +12,7 @@ #include #include #include +#include extern const struct nvdimm_security_ops *cxl_security_ops; @@ -201,97 +202,6 @@ static inline int ways_to_eiw(unsigned int ways, u8 *eiw) #define CXLDEV_MBOX_BG_CMD_COMMAND_VENDOR_MASK GENMASK_ULL(63, 48) #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20 -/* - * Using struct_group() allows for per register-block-type helper routines, - * without requiring block-type agnostic code to include the prefix. 
- */ -struct cxl_regs { - /* - * Common set of CXL Component register block base pointers - * @hdm_decoder: CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure - * @ras: CXL 2.0 8.2.5.9 CXL RAS Capability Structure - */ - struct_group_tagged(cxl_component_regs, component, - void __iomem *hdm_decoder; - void __iomem *ras; - ); - /* - * Common set of CXL Device register block base pointers - * @status: CXL 2.0 8.2.8.3 Device Status Registers - * @mbox: CXL 2.0 8.2.8.4 Mailbox Registers - * @memdev: CXL 2.0 8.2.8.5 Memory Device Registers - */ - struct_group_tagged(cxl_device_regs, device_regs, - void __iomem *status, *mbox, *memdev; - ); - - struct_group_tagged(cxl_pmu_regs, pmu_regs, - void __iomem *pmu; - ); - - /* - * RCH downstream port specific RAS register - * @aer: CXL 3.0 8.2.1.1 RCH Downstream Port RCRB - */ - struct_group_tagged(cxl_rch_regs, rch_regs, - void __iomem *dport_aer; - ); - - /* - * RCD upstream port specific PCIe cap register - * @pcie_cap: CXL 3.0 8.2.1.2 RCD Upstream Port RCRB - */ - struct_group_tagged(cxl_rcd_regs, rcd_regs, - void __iomem *rcd_pcie_cap; - ); -}; - -struct cxl_reg_map { - bool valid; - int id; - unsigned long offset; - unsigned long size; -}; - -struct cxl_component_reg_map { - struct cxl_reg_map hdm_decoder; - struct cxl_reg_map ras; -}; - -struct cxl_device_reg_map { - struct cxl_reg_map status; - struct cxl_reg_map mbox; - struct cxl_reg_map memdev; -}; - -struct cxl_pmu_reg_map { - struct cxl_reg_map pmu; -}; - -/** - * struct cxl_register_map - DVSEC harvested register block mapping parameters - * @host: device for devm operations and logging - * @base: virtual base of the register-block-BAR + @block_offset - * @resource: physical resource base of the register block - * @max_size: maximum mapping size to perform register search - * @reg_type: see enum cxl_regloc_type - * @component_map: cxl_reg_map for component registers - * @device_map: cxl_reg_maps for device registers - * @pmu_map: cxl_reg_maps for CXL 
Performance Monitoring Units - */ -struct cxl_register_map { - struct device *host; - void __iomem *base; - resource_size_t resource; - resource_size_t max_size; - u8 reg_type; - union { - struct cxl_component_reg_map component_map; - struct cxl_device_reg_map device_map; - struct cxl_pmu_reg_map pmu_map; - }; -}; - void cxl_probe_component_regs(struct device *dev, void __iomem *base, struct cxl_component_reg_map *map); void cxl_probe_device_regs(struct device *dev, void __iomem *base, @@ -497,11 +407,6 @@ struct cxl_region_params { resource_size_t cache_size; }; -enum cxl_partition_mode { - CXL_PARTMODE_RAM, - CXL_PARTMODE_PMEM, -}; - /* * Indicate whether this region has been assembled by autodetection or * userspace assembly. Prevent endpoint decoders outside of automatic diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 71367cb5178c..281546de426e 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -113,8 +113,6 @@ int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, resource_size_t base, resource_size_t len, resource_size_t skipped); -#define CXL_NR_PARTITIONS_MAX 2 - struct cxl_dpa_info { u64 size; struct cxl_dpa_part_info { @@ -373,87 +371,6 @@ struct cxl_security_state { struct kernfs_node *sanitize_node; }; -/* - * enum cxl_devtype - delineate type-2 from a generic type-3 device - * @CXL_DEVTYPE_DEVMEM - Vendor specific CXL Type-2 device implementing HDM-D or - * HDM-DB, no requirement that this device implements a - * mailbox, or other memory-device-standard manageability - * flows. - * @CXL_DEVTYPE_CLASSMEM - Common class definition of a CXL Type-3 device with - * HDM-H and class-mandatory memory device registers - */ -enum cxl_devtype { - CXL_DEVTYPE_DEVMEM, - CXL_DEVTYPE_CLASSMEM, -}; - -/** - * struct cxl_dpa_perf - DPA performance property entry - * @dpa_range: range for DPA address - * @coord: QoS performance data (i.e. 
latency, bandwidth) - * @cdat_coord: raw QoS performance data from CDAT - * @qos_class: QoS Class cookies - */ -struct cxl_dpa_perf { - struct range dpa_range; - struct access_coordinate coord[ACCESS_COORDINATE_MAX]; - struct access_coordinate cdat_coord[ACCESS_COORDINATE_MAX]; - int qos_class; -}; - -/** - * struct cxl_dpa_partition - DPA partition descriptor - * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res) - * @perf: performance attributes of the partition from CDAT - * @mode: operation mode for the DPA capacity, e.g. ram, pmem, dynamic... - */ -struct cxl_dpa_partition { - struct resource res; - struct cxl_dpa_perf perf; - enum cxl_partition_mode mode; -}; - -/** - * struct cxl_dev_state - The driver device state - * - * cxl_dev_state represents the CXL driver/device state. It provides an - * interface to mailbox commands as well as some cached data about the device. - * Currently only memory devices are represented. - * - * @dev: The device associated with this CXL state - * @cxlmd: The device representing the CXL.mem capabilities of @dev - * @reg_map: component and ras register mapping parameters - * @regs: Class device "Device" registers - * @cxl_dvsec: Offset to the PCIe device DVSEC - * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) - * @media_ready: Indicate whether the device media is usable - * @dpa_res: Overall DPA resource tree for the device - * @part: DPA partition array - * @nr_partitions: Number of DPA partitions - * @serial: PCIe Device Serial Number - * @type: Generic Memory Class device or Vendor Specific Memory device - * @cxl_mbox: CXL mailbox context - * @cxlfs: CXL features context - */ -struct cxl_dev_state { - struct device *dev; - struct cxl_memdev *cxlmd; - struct cxl_register_map reg_map; - struct cxl_device_regs regs; - int cxl_dvsec; - bool rcd; - bool media_ready; - struct resource dpa_res; - struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX]; - unsigned int nr_partitions; - 
u64 serial; - enum cxl_devtype type; - struct cxl_mailbox cxl_mbox; -#ifdef CONFIG_CXL_FEATURES - struct cxl_features_state *cxlfs; -#endif -}; - static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds) { /* @@ -523,37 +440,6 @@ to_cxl_memdev_state(struct cxl_dev_state *cxlds) return container_of(cxlds, struct cxl_memdev_state, cxlds); } -struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, - enum cxl_devtype type, - u64 serial, u16 dvsec, - size_t size, bool has_mbox); - -/** - * cxl_dev_state_create - safely create and cast a cxl dev state embedded in a - * driver specific struct. - * - * @parent: device behind the request - * @type: CXL device type - * @serial: device identification - * @dvsec: dvsec capability offset - * @drv_struct: driver struct embedding a cxl_dev_state struct - * @member: name of the struct cxl_dev_state member in drv_struct - * @mbox: true if mailbox supported - * - * Returns a pointer to the drv_struct allocated and embedding a cxl_dev_state - * struct initialized. - * - * Introduced for Type2 driver support. - */ -#define devm_cxl_dev_state_create(parent, type, serial, dvsec, drv_struct, member, mbox) \ - ({ \ - static_assert(__same_type(struct cxl_dev_state, \ - ((drv_struct *)NULL)->member)); \ - static_assert(offsetof(drv_struct, member) == 0); \ - (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ - sizeof(drv_struct), mbox); \ - }) - enum cxl_opcode { CXL_MBOX_OP_INVALID = 0x0000, CXL_MBOX_OP_RAW = CXL_MBOX_OP_INVALID, diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h new file mode 100644 index 000000000000..fa7269154620 --- /dev/null +++ b/include/cxl/cxl.h @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. */ +/* Copyright(c) 2026 Advanced Micro Devices, Inc. 
*/ + +#ifndef __CXL_CXL_H__ +#define __CXL_CXL_H__ + +#include +#include +#include + +/** + * enum cxl_devtype - delineate type-2 from a generic type-3 device + * @CXL_DEVTYPE_DEVMEM: Vendor specific CXL Type-2 device implementing HDM-D or + * HDM-DB, no requirement that this device implements a + * mailbox, or other memory-device-standard manageability + * flows. + * @CXL_DEVTYPE_CLASSMEM: Common class definition of a CXL Type-3 device with + * HDM-H and class-mandatory memory device registers + */ +enum cxl_devtype { + CXL_DEVTYPE_DEVMEM, + CXL_DEVTYPE_CLASSMEM, +}; + +struct device; + +/* + * Using struct_group() allows for per register-block-type helper routines, + * without requiring block-type agnostic code to include the prefix. + */ +struct cxl_regs { + /* + * Common set of CXL Component register block base pointers + * @hdm_decoder: CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure + * @ras: CXL 2.0 8.2.5.9 CXL RAS Capability Structure + */ + struct_group_tagged(cxl_component_regs, component, + void __iomem *hdm_decoder; + void __iomem *ras; + ); + /* + * Common set of CXL Device register block base pointers + * @status: CXL 2.0 8.2.8.3 Device Status Registers + * @mbox: CXL 2.0 8.2.8.4 Mailbox Registers + * @memdev: CXL 2.0 8.2.8.5 Memory Device Registers + */ + struct_group_tagged(cxl_device_regs, device_regs, + void __iomem *status, *mbox, *memdev; + ); + + struct_group_tagged(cxl_pmu_regs, pmu_regs, + void __iomem *pmu; + ); + + /* + * RCH downstream port specific RAS register + * @aer: CXL 3.0 8.2.1.1 RCH Downstream Port RCRB + */ + struct_group_tagged(cxl_rch_regs, rch_regs, + void __iomem *dport_aer; + ); + + /* + * RCD upstream port specific PCIe cap register + * @pcie_cap: CXL 3.0 8.2.1.2 RCD Upstream Port RCRB + */ + struct_group_tagged(cxl_rcd_regs, rcd_regs, + void __iomem *rcd_pcie_cap; + ); +}; + +struct cxl_reg_map { + bool valid; + int id; + unsigned long offset; + unsigned long size; +}; + +struct cxl_component_reg_map { + struct 
cxl_reg_map hdm_decoder; + struct cxl_reg_map ras; +}; + +struct cxl_device_reg_map { + struct cxl_reg_map status; + struct cxl_reg_map mbox; + struct cxl_reg_map memdev; +}; + +struct cxl_pmu_reg_map { + struct cxl_reg_map pmu; +}; + +/** + * struct cxl_register_map - DVSEC harvested register block mapping parameters + * @host: device for devm operations and logging + * @base: virtual base of the register-block-BAR + @block_offset + * @resource: physical resource base of the register block + * @max_size: maximum mapping size to perform register search + * @reg_type: see enum cxl_regloc_type + * @component_map: cxl_reg_map for component registers + * @device_map: cxl_reg_maps for device registers + * @pmu_map: cxl_reg_maps for CXL Performance Monitoring Units + */ +struct cxl_register_map { + struct device *host; + void __iomem *base; + resource_size_t resource; + resource_size_t max_size; + u8 reg_type; + union { + struct cxl_component_reg_map component_map; + struct cxl_device_reg_map device_map; + struct cxl_pmu_reg_map pmu_map; + }; +}; + +/** + * struct cxl_dpa_perf - DPA performance property entry + * @dpa_range: range for DPA address + * @coord: QoS performance data (i.e. latency, bandwidth) + * @cdat_coord: raw QoS performance data from CDAT + * @qos_class: QoS Class cookies + */ +struct cxl_dpa_perf { + struct range dpa_range; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; + struct access_coordinate cdat_coord[ACCESS_COORDINATE_MAX]; + int qos_class; +}; + +enum cxl_partition_mode { + CXL_PARTMODE_RAM, + CXL_PARTMODE_PMEM, +}; + +/** + * struct cxl_dpa_partition - DPA partition descriptor + * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res) + * @perf: performance attributes of the partition from CDAT + * @mode: operation mode for the DPA capacity, e.g. ram, pmem, dynamic... 
+ */ +struct cxl_dpa_partition { + struct resource res; + struct cxl_dpa_perf perf; + enum cxl_partition_mode mode; +}; + +#define CXL_NR_PARTITIONS_MAX 2 + +/** + * struct cxl_dev_state - The driver device state + * + * cxl_dev_state represents the CXL driver/device state. It provides an + * interface to mailbox commands as well as some cached data about the device. + * Currently only memory devices are represented. + * + * @dev: The device associated with this CXL state + * @cxlmd: The device representing the CXL.mem capabilities of @dev + * @reg_map: component and ras register mapping parameters + * @regs: Parsed register blocks + * @cxl_dvsec: Offset to the PCIe device DVSEC + * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) + * @media_ready: Indicate whether the device media is usable + * @dpa_res: Overall DPA resource tree for the device + * @part: DPA partition array + * @nr_partitions: Number of DPA partitions + * @serial: PCIe Device Serial Number + * @type: Generic Memory Class device or Vendor Specific Memory device + * @cxl_mbox: CXL mailbox context + * @cxlfs: CXL features context + */ +struct cxl_dev_state { + /* public for Type2 drivers */ + struct device *dev; + struct cxl_memdev *cxlmd; + + /* private for Type2 drivers */ + struct cxl_register_map reg_map; + struct cxl_device_regs regs; + int cxl_dvsec; + bool rcd; + bool media_ready; + struct resource dpa_res; + struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX]; + unsigned int nr_partitions; + u64 serial; + enum cxl_devtype type; + struct cxl_mailbox cxl_mbox; +#ifdef CONFIG_CXL_FEATURES + struct cxl_features_state *cxlfs; +#endif +}; + +struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, + enum cxl_devtype type, + u64 serial, u16 dvsec, + size_t size, bool has_mbox); + +/** + * cxl_dev_state_create - safely create and cast a cxl dev state embedded in a + * driver specific struct. 
+ * + * @parent: device behind the request + * @type: CXL device type + * @serial: device identification + * @dvsec: dvsec capability offset + * @drv_struct: driver struct embedding a cxl_dev_state struct + * @member: name of the struct cxl_dev_state member in drv_struct + * @mbox: true if mailbox supported + * + * Returns a pointer to the drv_struct allocated and embedding a cxl_dev_state + * struct initialized. + * + * Introduced for Type2 driver support. + */ +#define devm_cxl_dev_state_create(parent, type, serial, dvsec, drv_struct, member, mbox) \ + ({ \ + static_assert(__same_type(struct cxl_dev_state, \ + ((drv_struct *)NULL)->member)); \ + static_assert(offsetof(drv_struct, member) == 0); \ + (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ + sizeof(drv_struct), mbox); \ + }) +#endif /* __CXL_CXL_H__ */ From 58f28930c7fb0e24cdf2972a9c3b7c91aeef4539 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Fri, 6 Mar 2026 16:47:40 +0000 Subject: [PATCH 04/37] cxl: Move pci generic code from cxl_pci to core/cxl_pci Inside cxl/core/pci.c there are helpers for CXL PCIe initialization meanwhile cxl/pci_drv.c implements the functionality for a Type3 device initialization. In preparation for type2 support, move helper functions from cxl/pci.c to cxl/core/pci.c in order to be exported and used by type2 drivers. [ dj: Clarified subject. 
] Signed-off-by: Alejandro Lucero Reviewed-by: Dave Jiang Reviewed-by: Gregory Price Reviewed-by: Jonathan Cameron Signed-off-by: Gregory Price Link: https://patch.msgid.link/20260306164741.3796372-4-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/core.h | 2 ++ drivers/cxl/core/pci.c | 62 ++++++++++++++++++++++++++++++++++++ drivers/cxl/core/regs.c | 1 - drivers/cxl/cxl.h | 2 -- drivers/cxl/cxlpci.h | 13 ++++++++ drivers/cxl/pci.c | 70 ----------------------------------------- 6 files changed, 77 insertions(+), 73 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 5b0570df0fd9..5539e941782f 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -224,4 +224,6 @@ int cxl_set_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid, u16 *return_code); #endif +resource_size_t cxl_rcd_component_reg_phys(struct device *dev, + struct cxl_dport *dport); #endif /* __CXL_CORE_H__ */ diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index f96ce884a213..c32cc62c501d 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -696,6 +696,68 @@ bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_reset_detected, "CXL"); +static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev, + struct cxl_register_map *map, + struct cxl_dport *dport) +{ + resource_size_t component_reg_phys; + + *map = (struct cxl_register_map) { + .host = &pdev->dev, + .resource = CXL_RESOURCE_NONE, + }; + + struct cxl_port *port __free(put_cxl_port) = + cxl_pci_find_port(pdev, &dport); + if (!port) + return -EPROBE_DEFER; + + component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport); + if (component_reg_phys == CXL_RESOURCE_NONE) + return -ENXIO; + + map->resource = component_reg_phys; + map->reg_type = CXL_REGLOC_RBI_COMPONENT; + map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE; + + return 0; +} + +int cxl_pci_setup_regs(struct pci_dev *pdev, enum 
cxl_regloc_type type, + struct cxl_register_map *map) +{ + int rc; + + rc = cxl_find_regblock(pdev, type, map); + + /* + * If the Register Locator DVSEC does not exist, check if it + * is an RCH and try to extract the Component Registers from + * an RCRB. + */ + if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev)) { + struct cxl_dport *dport; + struct cxl_port *port __free(put_cxl_port) = + cxl_pci_find_port(pdev, &dport); + if (!port) + return -EPROBE_DEFER; + + rc = cxl_rcrb_get_comp_regs(pdev, map, dport); + if (rc) + return rc; + + rc = cxl_dport_map_rcd_linkcap(pdev, dport); + if (rc) + return rc; + + } else if (rc) { + return rc; + } + + return cxl_setup_regs(map); +} +EXPORT_SYMBOL_NS_GPL(cxl_pci_setup_regs, "CXL"); + int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c) { int speed, bw; diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index a010b3214342..93710cf4f0a6 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -641,4 +641,3 @@ resource_size_t cxl_rcd_component_reg_phys(struct device *dev, return CXL_RESOURCE_NONE; return __rcrb_to_component(dev, &dport->rcrb, CXL_RCRB_UPSTREAM); } -EXPORT_SYMBOL_NS_GPL(cxl_rcd_component_reg_phys, "CXL"); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 1d94217729f7..8194447f75d3 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -222,8 +222,6 @@ int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map); int cxl_setup_regs(struct cxl_register_map *map); struct cxl_dport; -resource_size_t cxl_rcd_component_reg_phys(struct device *dev, - struct cxl_dport *dport); int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); #define CXL_RESOURCE_NONE ((resource_size_t) -1) diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 0cf64218aa16..b826eb53cf7b 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -74,6 +74,17 @@ static inline bool 
cxl_pci_flit_256(struct pci_dev *pdev) return lnksta2 & PCI_EXP_LNKSTA2_FLIT; } +/* + * Assume that the caller has already validated that @pdev has CXL + * capabilities, any RCiEP with CXL capabilities is treated as a + * Restricted CXL Device (RCD) and finds upstream port and endpoint + * registers in a Root Complex Register Block (RCRB). + */ +static inline bool is_cxl_restricted(struct pci_dev *pdev) +{ + return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END; +} + struct cxl_dev_state; void read_cdat_data(struct cxl_port *port); @@ -101,4 +112,6 @@ static inline void devm_cxl_port_ras_setup(struct cxl_port *port) } #endif +int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map); #endif /* __CXL_PCI_H__ */ diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index a42f273ff72b..adc7c4bcb03a 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -465,76 +465,6 @@ static int cxl_pci_setup_mailbox(struct cxl_memdev_state *mds, bool irq_avail) return 0; } -/* - * Assume that any RCIEP that emits the CXL memory expander class code - * is an RCD - */ -static bool is_cxl_restricted(struct pci_dev *pdev) -{ - return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END; -} - -static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev, - struct cxl_register_map *map, - struct cxl_dport *dport) -{ - resource_size_t component_reg_phys; - - *map = (struct cxl_register_map) { - .host = &pdev->dev, - .resource = CXL_RESOURCE_NONE, - }; - - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return -EPROBE_DEFER; - - component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport); - if (component_reg_phys == CXL_RESOURCE_NONE) - return -ENXIO; - - map->resource = component_reg_phys; - map->reg_type = CXL_REGLOC_RBI_COMPONENT; - map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE; - - return 0; -} - -static int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, - struct cxl_register_map *map) 
-{ - int rc; - - rc = cxl_find_regblock(pdev, type, map); - - /* - * If the Register Locator DVSEC does not exist, check if it - * is an RCH and try to extract the Component Registers from - * an RCRB. - */ - if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev)) { - struct cxl_dport *dport; - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return -EPROBE_DEFER; - - rc = cxl_rcrb_get_comp_regs(pdev, map, dport); - if (rc) - return rc; - - rc = cxl_dport_map_rcd_linkcap(pdev, dport); - if (rc) - return rc; - - } else if (rc) { - return rc; - } - - return cxl_setup_regs(map); -} - static void free_event_buf(void *buf) { kvfree(buf); From d537d953c47866bafc89feb66d8ef34baf17659a Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 6 Mar 2026 16:47:41 +0000 Subject: [PATCH 05/37] cxl/pci: Remove redundant cxl_pci_find_port() call Remove the redundant port lookup from cxl_rcrb_get_comp_regs() and use the dport parameter directly. The caller has already validated the port is non-NULL before invoking this function, and dport is given as a param. This is simpler than getting dport in the callee and returning the pointer to the caller, which would require more changes.
Signed-off-by: Gregory Price Reviewed-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Link: https://patch.msgid.link/20260306164741.3796372-5-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index c32cc62c501d..d1f487b3d809 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -707,11 +707,6 @@ static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev, .resource = CXL_RESOURCE_NONE, }; - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return -EPROBE_DEFER; - component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport); if (component_reg_phys == CXL_RESOURCE_NONE) return -ENXIO; From 09d065d256b1d5965fe6512cfd1c23ef44d2efc9 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sat, 28 Feb 2026 17:36:01 +0000 Subject: [PATCH 06/37] cxl: Make region type based on endpoint type Current code is expecting Type3 or CXL_DECODER_HOSTONLYMEM devices only. Support for Type2 implies region type needs to be based on the endpoint type HDM-D[B] instead. 
Signed-off-by: Alejandro Lucero Reviewed-by: Zhi Wang Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Ben Cheatham Reviewed-by: Alison Schofield Reviewed-by: Davidlohr Bueso Reviewed-by: Gregory Price Tested-by: Gregory Price Reviewed-by: Davidlohr Bueso Link: https://patch.msgid.link/20260228173603.1125109-2-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 42874948b589..d1ce4deae499 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2650,7 +2650,8 @@ static ssize_t create_ram_region_show(struct device *dev, } static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, - enum cxl_partition_mode mode, int id) + enum cxl_partition_mode mode, int id, + enum cxl_decoder_type target_type) { int rc; @@ -2672,7 +2673,7 @@ static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, return ERR_PTR(-EBUSY); } - return devm_cxl_add_region(cxlrd, id, mode, CXL_DECODER_HOSTONLYMEM); + return devm_cxl_add_region(cxlrd, id, mode, target_type); } static ssize_t create_region_store(struct device *dev, const char *buf, @@ -2686,7 +2687,7 @@ static ssize_t create_region_store(struct device *dev, const char *buf, if (rc != 1) return -EINVAL; - cxlr = __create_region(cxlrd, mode, id); + cxlr = __create_region(cxlrd, mode, id, CXL_DECODER_HOSTONLYMEM); if (IS_ERR(cxlr)) return PTR_ERR(cxlr); @@ -3902,7 +3903,8 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, do { cxlr = __create_region(cxlrd, cxlds->part[part].mode, - atomic_read(&cxlrd->region_id)); + atomic_read(&cxlrd->region_id), + cxled->cxld.target_type); } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY); if (IS_ERR(cxlr)) { From 29f0724c4592a5ab9076e1ff6e4e39f0de60cc9e Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sat, 28 Feb 2026 17:36:02 +0000 
Subject: [PATCH 07/37] cxl/region: Factor out interleave ways setup Region creation based on Type3 devices can be triggered from user space allowing memory combination through interleaving. In preparation for kernel driven region creation, that is Type2 drivers triggering region creation backed with its advertised CXL memory, factor out a common helper from the user-sysfs region setup for interleave ways. Signed-off-by: Alejandro Lucero Reviewed-by: Dave Jiang Reviewed-by: Gregory Price Tested-by: Gregory Price Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260228173603.1125109-3-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 41 +++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index d1ce4deae499..55cbad38ec89 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -485,22 +485,14 @@ static ssize_t interleave_ways_show(struct device *dev, static const struct attribute_group *get_cxl_region_target_group(void); -static ssize_t interleave_ways_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t len) +static int set_interleave_ways(struct cxl_region *cxlr, int val) { - struct cxl_region *cxlr = to_cxl_region(dev); struct cxl_root_decoder *cxlrd = cxlr->cxlrd; struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; struct cxl_region_params *p = &cxlr->params; - unsigned int val, save; - int rc; + int save, rc; u8 iw; - rc = kstrtouint(buf, 0, &val); - if (rc) - return rc; - rc = ways_to_eiw(val, &iw); if (rc) return rc; @@ -515,9 +507,7 @@ static ssize_t interleave_ways_store(struct device *dev, return -EINVAL; } - ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region); - if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) - return rc; + lockdep_assert_held_write(&cxl_rwsem.region); if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) return -EBUSY; 
@@ -525,10 +515,31 @@ static ssize_t interleave_ways_store(struct device *dev, save = p->interleave_ways; p->interleave_ways = val; rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group()); - if (rc) { + if (rc) p->interleave_ways = save; + + return rc; +} + +static ssize_t interleave_ways_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + int val; + int rc; + + rc = kstrtoint(buf, 0, &val); + if (rc) + return rc; + + ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) + return rc; + + rc = set_interleave_ways(cxlr, val); + if (rc) return rc; - } return len; } From 64584273dfb8a1e5fc7d78094ba22a93c204b44e Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Sat, 28 Feb 2026 17:36:03 +0000 Subject: [PATCH 08/37] cxl/region: Factor out interleave granularity setup Region creation based on Type3 devices can be triggered from user space allowing memory combination through interleaving. In preparation for kernel driven region creation, that is Type2 drivers triggering region creation backed with its advertised CXL memory, factor out a common helper from the user-sysfs region setup for interleave granularity. 
Signed-off-by: Alejandro Lucero Reviewed-by: Gregory Price Reviewed-by: Dave Jiang Tested-by: Gregory Price Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260228173603.1125109-4-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 55cbad38ec89..3edb5703d6de 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -559,21 +559,14 @@ static ssize_t interleave_granularity_show(struct device *dev, return sysfs_emit(buf, "%d\n", p->interleave_granularity); } -static ssize_t interleave_granularity_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t len) +static int set_interleave_granularity(struct cxl_region *cxlr, int val) { - struct cxl_region *cxlr = to_cxl_region(dev); struct cxl_root_decoder *cxlrd = cxlr->cxlrd; struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; struct cxl_region_params *p = &cxlr->params; - int rc, val; + int rc; u16 ig; - rc = kstrtoint(buf, 0, &val); - if (rc) - return rc; - rc = granularity_to_eig(val, &ig); if (rc) return rc; @@ -589,14 +582,33 @@ static ssize_t interleave_granularity_store(struct device *dev, if (cxld->interleave_ways > 1 && val != cxld->interleave_granularity) return -EINVAL; - ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region); - if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) - return rc; + lockdep_assert_held_write(&cxl_rwsem.region); if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) return -EBUSY; p->interleave_granularity = val; + return 0; +} + +static ssize_t interleave_granularity_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + int rc, val; + + rc = kstrtoint(buf, 0, &val); + if (rc) + return rc; + + ACQUIRE(rwsem_write_kill, 
rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) + return rc; + + rc = set_interleave_granularity(cxlr, val); + if (rc) + return rc; return len; } From dc372e5f429ced834d81ff12a945397dc43585a8 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Sat, 14 Mar 2026 15:06:32 +0800 Subject: [PATCH 09/37] cxl/pci: Hold memdev lock in cxl_event_trace_record() cxl_event_config() invokes cxl_mem_get_event_record() to get remaining event logs from CXL device during cxl_pci_probe(). If CXL memdev probing failed before that, it is possible to access an invalid endpoint. So add a cxlmd->driver binding status check inside cxl_dpa_to_region() to ensure the corresponding endpoint is valid. Besides, cxl_event_trace_record() needs to hold memdev lock to invoke cxl_dpa_to_region() to ensure the memdev probing completed. It is possible that cxl_event_trace_record() is invoked during the CXL memdev probing, especially user or cxl_acpi triggers CXL memdev re-probing. Suggested-by: Dan Williams Reviewed-by: Dan Williams Reviewed-by: Dave Jiang Signed-off-by: Li Ming Link: https://patch.msgid.link/20260314-fix_access_endpoint_without_drv_check-v2-3-4c09edf2e1db@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/core/mbox.c | 5 +++-- drivers/cxl/core/region.c | 8 +++++--- drivers/cxl/cxlmem.h | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index e7a6452bf544..3f34bbabf4d3 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -893,7 +893,7 @@ out: } EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, "CXL"); -void cxl_event_trace_record(const struct cxl_memdev *cxlmd, +void cxl_event_trace_record(struct cxl_memdev *cxlmd, enum cxl_event_log_type type, enum cxl_event_type event_type, const uuid_t *uuid, union cxl_event *evt) @@ -920,6 +920,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, * translations.
Take topology mutation locks and lookup * { HPA, REGION } from { DPA, MEMDEV } in the event record. */ + guard(device)(&cxlmd->dev); guard(rwsem_read)(&cxl_rwsem.region); guard(rwsem_read)(&cxl_rwsem.dpa); @@ -968,7 +969,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, } EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, "CXL"); -static void __cxl_event_trace_record(const struct cxl_memdev *cxlmd, +static void __cxl_event_trace_record(struct cxl_memdev *cxlmd, enum cxl_event_log_type type, struct cxl_event_record_raw *record) { diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 42874948b589..840d52a52c4e 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2950,13 +2950,15 @@ static int __cxl_dpa_to_region(struct device *dev, void *arg) struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa) { struct cxl_dpa_to_region_context ctx; - struct cxl_port *port; + struct cxl_port *port = cxlmd->endpoint; + + if (!cxlmd->dev.driver) + return NULL; ctx = (struct cxl_dpa_to_region_context) { .dpa = dpa, }; - port = cxlmd->endpoint; - if (port && is_cxl_endpoint(port) && cxl_num_decoders_committed(port)) + if (cxl_num_decoders_committed(port)) device_for_each_child(&port->dev, &ctx, __cxl_dpa_to_region); return ctx.cxlr; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index e21d744d639b..7a34a19c02c8 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -864,7 +864,7 @@ void set_exclusive_cxl_commands(struct cxl_memdev_state *mds, void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds, unsigned long *cmds); void cxl_mem_get_event_records(struct cxl_memdev_state *mds, u32 status); -void cxl_event_trace_record(const struct cxl_memdev *cxlmd, +void cxl_event_trace_record(struct cxl_memdev *cxlmd, enum cxl_event_log_type type, enum cxl_event_type event_type, const uuid_t *uuid, union cxl_event *evt); From e8069c66d09309579e53567be8ddfa6ccb2f452a Mon Sep 17 00:00:00 2001 From: Li 
Ming Date: Sat, 14 Mar 2026 15:06:33 +0800 Subject: [PATCH 10/37] cxl/pci: Check memdev driver binding status in cxl_reset_done() cxl_reset_done() accesses the endpoint of the corresponding CXL memdev without endpoint validity checking. By default, cxlmd->endpoint is initialized to -ENXIO, if cxl_reset_done() is triggered after the corresponding CXL memdev probing failed, this results in access to an invalid endpoint. CXL subsystem can always check CXL memdev driver binding status to confirm its endpoint validity. So adding the CXL memdev driver checking inside cxl_reset_done() to avoid accessing an invalid endpoint. Fixes: 934edcd436dc ("cxl: Add post-reset warning if reset results in loss of previously committed HDM decoders") Reviewed-by: Dan Williams Reviewed-by: Dave Jiang Signed-off-by: Li Ming Link: https://patch.msgid.link/20260314-fix_access_endpoint_without_drv_check-v2-4-4c09edf2e1db@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index fbb300a01830..a5922116db2a 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1043,6 +1043,9 @@ static void cxl_reset_done(struct pci_dev *pdev) * that no longer exists. */ guard(device)(&cxlmd->dev); + if (!cxlmd->dev.driver) + return; + if (cxlmd->endpoint && cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) { dev_crit(dev, "SBR happened without memory regions removal.\n"); From 7974835aa9d54125a1b6a2948f927d745748bf46 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 19 Mar 2026 08:25:41 -0700 Subject: [PATCH 11/37] cxl: Add endpoint decoder flags clear when PCI reset happens When a PCI reset happens, the lock and enable flags of the CXL device should be cleared to avoid stale state flags after reset. Add flag clearing during cxl_reset_done() to clear the relevant endpoint decoder flags for all decoders of the endpoint device. 
Reported-by: Dan Williams Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260319152541.2739343-1-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/cxl.h | 1 + drivers/cxl/pci.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 9b947286eb9b..d09c84bcc015 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -333,6 +333,7 @@ int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); #define CXL_DECODER_F_LOCK BIT(4) #define CXL_DECODER_F_ENABLE BIT(5) #define CXL_DECODER_F_NORMALIZED_ADDRESSING BIT(6) +#define CXL_DECODER_F_RESET_MASK (CXL_DECODER_F_ENABLE | CXL_DECODER_F_LOCK) enum cxl_decoder_type { CXL_DECODER_DEVMEM = 2, diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index fbb300a01830..84cff73b39e5 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1030,6 +1030,19 @@ static void cxl_error_resume(struct pci_dev *pdev) dev->driver ? 
"successful" : "failed"); } +static int cxl_endpoint_decoder_clear_reset_flags(struct device *dev, void *data) +{ + struct cxl_endpoint_decoder *cxled; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + cxled->cxld.flags &= ~CXL_DECODER_F_RESET_MASK; + + return 0; +} + static void cxl_reset_done(struct pci_dev *pdev) { struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); @@ -1045,6 +1058,9 @@ static void cxl_reset_done(struct pci_dev *pdev) guard(device)(&cxlmd->dev); if (cxlmd->endpoint && cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) { + device_for_each_child(&cxlmd->endpoint->dev, NULL, + cxl_endpoint_decoder_clear_reset_flags); + dev_crit(dev, "SBR happened without memory regions removal.\n"); dev_crit(dev, "System may be unstable if regions hosted system memory.\n"); add_taint(TAINT_USER, LOCKDEP_STILL_OK); From 14f2e2ebf31157a873536a7212502bd955b69647 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Sun, 22 Mar 2026 19:53:34 +0000 Subject: [PATCH 12/37] dax/bus: Use dax_region_put() in alloc_dax_region() error path alloc_dax_region() calls kref_init() on the dax_region early in the function, but the error path for sysfs_create_groups() failure uses kfree() directly to free the dax_region. This bypasses the kref lifecycle. Use dax_region_put() instead to handle kref lifecycle correctly. 
Suggested-by: Jonathan Cameron Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260322195343.206900-2-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/dax/bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index c94c09622516..299134c9b294 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -668,7 +668,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, }; if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { - kfree(dax_region); + dax_region_put(dax_region); return NULL; } From 116be1e112cbcb664887e44b74f27316a5fef861 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Sun, 22 Mar 2026 19:53:35 +0000 Subject: [PATCH 13/37] dax/hmem: Factor HMEM registration into __hmem_register_device() Separate the CXL overlap check from the HMEM registration path and keep the platform-device setup in a dedicated __hmem_register_device(). This makes hmem_register_device() the policy entry point for deciding whether a range should be deferred to CXL, while __hmem_register_device() handles the HMEM registration flow. No functional changes. 
Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260322195343.206900-3-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/dax/hmem/hmem.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index 1cf7c2a0ee1c..a3d45032355c 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -58,21 +58,14 @@ static void release_hmem(void *pdev) platform_device_unregister(pdev); } -static int hmem_register_device(struct device *host, int target_nid, - const struct resource *res) +static int __hmem_register_device(struct device *host, int target_nid, + const struct resource *res) { struct platform_device *pdev; struct memregion_info info; long id; int rc; - if (IS_ENABLED(CONFIG_CXL_REGION) && - region_intersects(res->start, resource_size(res), IORESOURCE_MEM, - IORES_DESC_CXL) != REGION_DISJOINT) { - dev_dbg(host, "deferring range to CXL: %pr\n", res); - return 0; - } - rc = region_intersects_soft_reserve(res->start, resource_size(res)); if (rc != REGION_INTERSECTS) return 0; @@ -123,6 +116,19 @@ out_put: return rc; } +static int hmem_register_device(struct device *host, int target_nid, + const struct resource *res) +{ + if (IS_ENABLED(CONFIG_CXL_REGION) && + region_intersects(res->start, resource_size(res), IORESOURCE_MEM, + IORES_DESC_CXL) != REGION_DISJOINT) { + dev_dbg(host, "deferring range to CXL: %pr\n", res); + return 0; + } + + return __hmem_register_device(host, target_nid, res); +} + static int dax_hmem_platform_probe(struct platform_device *pdev) { return walk_hmem_resources(&pdev->dev, hmem_register_device); From 7b4bcaadfe00e2447c84378291e854ea87a2a41c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 22 Mar 2026 19:53:36 +0000 Subject: [PATCH 14/37] dax/hmem: Request cxl_acpi and cxl_pci before walking Soft Reserved ranges Ensure 
cxl_acpi has published CXL Window resources before HMEM walks Soft Reserved ranges. Replace MODULE_SOFTDEP("pre: cxl_acpi") with an explicit, synchronous request_module("cxl_acpi"). MODULE_SOFTDEP() only guarantees eventual loading, it does not enforce that the dependency has finished init before the current module runs. This can cause HMEM to start before cxl_acpi has populated the resource tree, breaking detection of overlaps between Soft Reserved and CXL Windows. Also, request cxl_pci before HMEM walks Soft Reserved ranges. Unlike cxl_acpi, cxl_pci attach is asynchronous and creates dependent devices that trigger further module loads. Asynchronous probe flushing (wait_for_device_probe()) is added later in the series in a deferred context before HMEM makes ownership decisions for Soft Reserved ranges. Add an additional explicit Kconfig ordering so that CXL_ACPI and CXL_PCI must be initialized before DEV_DAX_HMEM. This prevents HMEM from consuming Soft Reserved ranges before CXL drivers have had a chance to claim them. 
Signed-off-by: Smita Koralahalli Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Tested-by: Tomasz Wolski Link: https://patch.msgid.link/20260322195343.206900-4-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/dax/Kconfig | 2 ++ drivers/dax/hmem/hmem.c | 17 ++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index d656e4c0eb84..3683bb3f2311 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -48,6 +48,8 @@ config DEV_DAX_CXL tristate "CXL DAX: direct access to CXL RAM regions" depends on CXL_BUS && CXL_REGION && DEV_DAX default CXL_REGION && DEV_DAX + depends on CXL_ACPI >= DEV_DAX_HMEM + depends on CXL_PCI >= DEV_DAX_HMEM help CXL RAM regions are either mapped by platform-firmware and published in the initial system-memory map as "System RAM", mapped diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index a3d45032355c..85e751675f65 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -145,6 +145,16 @@ static __init int dax_hmem_init(void) { int rc; + /* + * Ensure that cxl_acpi and cxl_pci have a chance to kick off + * CXL topology discovery at least once before scanning the + * iomem resource tree for IORES_DESC_CXL resources. 
+ */ + if (IS_ENABLED(CONFIG_DEV_DAX_CXL)) { + request_module("cxl_acpi"); + request_module("cxl_pci"); + } + rc = platform_driver_register(&dax_hmem_platform_driver); if (rc) return rc; @@ -165,13 +175,6 @@ static __exit void dax_hmem_exit(void) module_init(dax_hmem_init); module_exit(dax_hmem_exit); -/* Allow for CXL to define its own dax regions */ -#if IS_ENABLED(CONFIG_CXL_REGION) -#if IS_MODULE(CONFIG_CXL_ACPI) -MODULE_SOFTDEP("pre: cxl_acpi"); -#endif -#endif - MODULE_ALIAS("platform:hmem*"); MODULE_ALIAS("platform:hmem_platform*"); MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory"); From edfcf1e21e79ddd6990a1330597c2eb072330832 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 22 Mar 2026 19:53:37 +0000 Subject: [PATCH 15/37] dax/hmem: Gate Soft Reserved deferral on DEV_DAX_CXL Replace IS_ENABLED(CONFIG_CXL_REGION) with IS_ENABLED(CONFIG_DEV_DAX_CXL) so that HMEM only defers Soft Reserved ranges when CXL DAX support is enabled. This makes the coordination between HMEM and the CXL stack more precise and prevents deferral in unrelated CXL configurations. 
Signed-off-by: Smita Koralahalli Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20260322195343.206900-5-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/dax/hmem/hmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index 85e751675f65..ca752db03201 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -119,7 +119,7 @@ out_put: static int hmem_register_device(struct device *host, int target_nid, const struct resource *res) { - if (IS_ENABLED(CONFIG_CXL_REGION) && + if (IS_ENABLED(CONFIG_DEV_DAX_CXL) && region_intersects(res->start, resource_size(res), IORESOURCE_MEM, IORES_DESC_CXL) != REGION_DISJOINT) { dev_dbg(host, "deferring range to CXL: %pr\n", res); From 39aa1d4be12bf9f685adaa06aa2d997c1c611b16 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 22 Mar 2026 19:53:38 +0000 Subject: [PATCH 16/37] dax/cxl, hmem: Initialize hmem early and defer dax_cxl binding Move hmem/ earlier in the dax Makefile so that hmem_init() runs before dax_cxl. In addition, defer registration of the dax_cxl driver to a workqueue instead of using module_cxl_driver(). This ensures that dax_hmem has an opportunity to initialize and register its deferred callback and make ownership decisions before dax_cxl begins probing and claiming Soft Reserved ranges. Mark the dax_cxl driver as PROBE_PREFER_ASYNCHRONOUS so its probe runs out of line from other synchronous probing avoiding ordering dependencies while coordinating ownership decisions with dax_hmem. 
Signed-off-by: Smita Koralahalli Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Tested-by: Tomasz Wolski Link: https://patch.msgid.link/20260322195343.206900-6-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/dax/Makefile | 3 +-- drivers/dax/cxl.c | 27 ++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 5ed5c39857c8..70e996bf1526 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +obj-y += hmem/ obj-$(CONFIG_DAX) += dax.o obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o @@ -10,5 +11,3 @@ dax-y += bus.o device_dax-y := device.o dax_pmem-y := pmem.o dax_cxl-y := cxl.o - -obj-y += hmem/ diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c index 13cd94d32ff7..a2136adfa186 100644 --- a/drivers/dax/cxl.c +++ b/drivers/dax/cxl.c @@ -38,10 +38,35 @@ static struct cxl_driver cxl_dax_region_driver = { .id = CXL_DEVICE_DAX_REGION, .drv = { .suppress_bind_attrs = true, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; -module_cxl_driver(cxl_dax_region_driver); +static void cxl_dax_region_driver_register(struct work_struct *work) +{ + cxl_driver_register(&cxl_dax_region_driver); +} + +static DECLARE_WORK(cxl_dax_region_driver_work, cxl_dax_region_driver_register); + +static int __init cxl_dax_region_init(void) +{ + /* + * Need to resolve a race with dax_hmem wanting to drive regions + * instead of CXL + */ + queue_work(system_long_wq, &cxl_dax_region_driver_work); + return 0; +} +module_init(cxl_dax_region_init); + +static void __exit cxl_dax_region_exit(void) +{ + flush_work(&cxl_dax_region_driver_work); + cxl_driver_unregister(&cxl_dax_region_driver); +} +module_exit(cxl_dax_region_exit); + MODULE_ALIAS_CXL(CXL_DEVICE_DAX_REGION); MODULE_DESCRIPTION("CXL DAX: direct access to CXL regions"); MODULE_LICENSE("GPL"); From 
34f80bb969cc1710f336ea1878781780a59fc8e7 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Sun, 22 Mar 2026 19:53:39 +0000 Subject: [PATCH 17/37] dax: Track all dax_region allocations under a global resource tree Introduce a global "DAX Regions" resource root and register each dax_region->res under it via request_resource(). Release the resource on dax_region teardown. By enforcing a single global namespace for dax_region allocations, this ensures only one of dax_hmem or dax_cxl can successfully register a dax_region for a given range. Suggested-by: Dan Williams Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260322195343.206900-7-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/dax/bus.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 299134c9b294..68437c05e21d 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -10,6 +10,7 @@ #include "dax-private.h" #include "bus.h" +static struct resource dax_regions = DEFINE_RES_MEM_NAMED(0, -1, "DAX Regions"); static DEFINE_MUTEX(dax_bus_lock); /* @@ -627,6 +628,7 @@ static void dax_region_unregister(void *region) sysfs_remove_groups(&dax_region->dev->kobj, dax_region_attribute_groups); + release_resource(&dax_region->res); dax_region_put(dax_region); } @@ -635,6 +637,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, unsigned long flags) { struct dax_region *dax_region; + int rc; /* * The DAX core assumes that it can store its private data in @@ -667,14 +670,25 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, .flags = IORESOURCE_MEM | flags, }; - if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { - dax_region_put(dax_region); - return NULL; + rc = request_resource(&dax_regions, &dax_region->res); + if (rc) { + dev_dbg(parent, 
"dax_region resource conflict for %pR\n", + &dax_region->res); + goto err_res; } + if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) + goto err_sysfs; + if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region)) return NULL; return dax_region; + +err_sysfs: + release_resource(&dax_region->res); +err_res: + dax_region_put(dax_region); + return NULL; } EXPORT_SYMBOL_GPL(alloc_dax_region); From 8e65f99b525b3f49b87db0db0d0e0fc1a0c53e40 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Sun, 22 Mar 2026 19:53:40 +0000 Subject: [PATCH 18/37] cxl/region: Add helper to check Soft Reserved containment by CXL regions Add a helper to determine whether a given Soft Reserved memory range is fully contained within the committed CXL region. This helper provides a primitive for policy decisions in subsequent patches such as co-ordination with dax_hmem to determine whether CXL has fully claimed ownership of Soft Reserved memory ranges. Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260322195343.206900-8-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 30 ++++++++++++++++++++++++++++++ include/cxl/cxl.h | 15 +++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 include/cxl/cxl.h diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 42874948b589..f7b20f60ac5c 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include "core.h" @@ -4173,6 +4174,35 @@ static int cxl_region_setup_poison(struct cxl_region *cxlr) return devm_add_action_or_reset(dev, remove_debugfs, dentry); } +static int region_contains_resource(struct device *dev, void *data) +{ + struct resource *res = data; + struct cxl_region *cxlr; + struct cxl_region_params *p; + + if 
(!is_cxl_region(dev)) + return 0; + + cxlr = to_cxl_region(dev); + p = &cxlr->params; + + if (p->state != CXL_CONFIG_COMMIT) + return 0; + + if (!p->res) + return 0; + + return resource_contains(p->res, res) ? 1 : 0; +} + +bool cxl_region_contains_resource(struct resource *res) +{ + guard(rwsem_read)(&cxl_rwsem.region); + return bus_for_each_dev(&cxl_bus_type, NULL, res, + region_contains_resource) != 0; +} +EXPORT_SYMBOL_GPL(cxl_region_contains_resource); + static int cxl_region_can_probe(struct cxl_region *cxlr) { struct cxl_region_params *p = &cxlr->params; diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h new file mode 100644 index 000000000000..b12d3d0f6658 --- /dev/null +++ b/include/cxl/cxl.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2026 Advanced Micro Devices, Inc. */ +#ifndef _CXL_H_ +#define _CXL_H_ + +#ifdef CONFIG_CXL_REGION +bool cxl_region_contains_resource(struct resource *res); +#else +static inline bool cxl_region_contains_resource(struct resource *res) +{ + return false; +} +#endif + +#endif /* _CXL_H_ */ From e4de6b910bf3645c224cd873d4e03ce3dd81fbe0 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Sun, 22 Mar 2026 19:53:41 +0000 Subject: [PATCH 19/37] dax/hmem, cxl: Defer and resolve Soft Reserved ownership The current probe time ownership check for Soft Reserved memory based solely on CXL window intersection is insufficient. dax_hmem probing is not always guaranteed to run after CXL enumeration and region assembly, which can lead to incorrect ownership decisions before the CXL stack has finished publishing windows and assembling committed regions. Introduce deferred ownership handling for Soft Reserved ranges that intersect CXL windows. When such a range is encountered during the initial dax_hmem probe, schedule deferred work to wait for the CXL stack to complete enumeration and region assembly before deciding ownership. 
Once the deferred work runs, evaluate each Soft Reserved range individually: if a CXL region fully contains the range, skip it and let dax_cxl bind. Otherwise, register it with dax_hmem. This per-range ownership model avoids the need for CXL region teardown, and alloc_dax_region() resource exclusion prevents double claiming. Introduce a boolean flag dax_hmem_initial_probe to live inside device.c so it survives module reload. Ensure dax_cxl defers driver registration until dax_hmem has completed ownership resolution. dax_cxl calls dax_hmem_flush_work() before cxl_driver_register(), which both waits for the deferred work to complete and creates a module symbol dependency that forces dax_hmem.ko to load before dax_cxl. Co-developed-by: Dan Williams Signed-off-by: Smita Koralahalli Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260322195343.206900-9-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/dax/bus.h | 7 ++++ drivers/dax/cxl.c | 1 + drivers/dax/hmem/device.c | 3 ++ drivers/dax/hmem/hmem.c | 74 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+) diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index cbbf64443098..ebbfe2d6da14 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -49,6 +49,13 @@ void dax_driver_unregister(struct dax_device_driver *dax_drv); void kill_dev_dax(struct dev_dax *dev_dax); bool static_dev_dax(struct dev_dax *dev_dax); +#if IS_ENABLED(CONFIG_DEV_DAX_HMEM) +extern bool dax_hmem_initial_probe; +void dax_hmem_flush_work(void); +#else +static inline void dax_hmem_flush_work(void) { } +#endif + #define MODULE_ALIAS_DAX_DEVICE(type) \ MODULE_ALIAS("dax:t" __stringify(type) "*") #define DAX_DEVICE_MODALIAS_FMT "dax:t%d" diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c index a2136adfa186..3ab39b77843d 100644 --- a/drivers/dax/cxl.c +++ b/drivers/dax/cxl.c @@ -44,6 +44,7 @@ static struct cxl_driver cxl_dax_region_driver = {
static void cxl_dax_region_driver_register(struct work_struct *work) { + dax_hmem_flush_work(); cxl_driver_register(&cxl_dax_region_driver); } diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c index 56e3cbd181b5..991a4bf7d969 100644 --- a/drivers/dax/hmem/device.c +++ b/drivers/dax/hmem/device.c @@ -8,6 +8,9 @@ static bool nohmem; module_param_named(disable, nohmem, bool, 0444); +bool dax_hmem_initial_probe; +EXPORT_SYMBOL_GPL(dax_hmem_initial_probe); + static bool platform_initialized; static DEFINE_MUTEX(hmem_resource_lock); static struct resource hmem_active = { diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index ca752db03201..9ceda6b5cadf 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "../bus.h" static bool region_idle; @@ -58,6 +59,23 @@ static void release_hmem(void *pdev) platform_device_unregister(pdev); } +struct dax_defer_work { + struct platform_device *pdev; + struct work_struct work; +}; + +static void process_defer_work(struct work_struct *w); + +static struct dax_defer_work dax_hmem_work = { + .work = __WORK_INITIALIZER(dax_hmem_work.work, process_defer_work), +}; + +void dax_hmem_flush_work(void) +{ + flush_work(&dax_hmem_work.work); +} +EXPORT_SYMBOL_GPL(dax_hmem_flush_work); + static int __hmem_register_device(struct device *host, int target_nid, const struct resource *res) { @@ -122,6 +140,11 @@ static int hmem_register_device(struct device *host, int target_nid, if (IS_ENABLED(CONFIG_DEV_DAX_CXL) && region_intersects(res->start, resource_size(res), IORESOURCE_MEM, IORES_DESC_CXL) != REGION_DISJOINT) { + if (!dax_hmem_initial_probe) { + dev_dbg(host, "await CXL initial probe: %pr\n", res); + queue_work(system_long_wq, &dax_hmem_work.work); + return 0; + } dev_dbg(host, "deferring range to CXL: %pr\n", res); return 0; } @@ -129,8 +152,54 @@ static int hmem_register_device(struct device *host, int target_nid, return 
__hmem_register_device(host, target_nid, res); } +static int hmem_register_cxl_device(struct device *host, int target_nid, + const struct resource *res) +{ + if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM, + IORES_DESC_CXL) == REGION_DISJOINT) + return 0; + + if (cxl_region_contains_resource((struct resource *)res)) { + dev_dbg(host, "CXL claims resource, dropping: %pr\n", res); + return 0; + } + + dev_dbg(host, "CXL did not claim resource, registering: %pr\n", res); + return __hmem_register_device(host, target_nid, res); +} + +static void process_defer_work(struct work_struct *w) +{ + struct dax_defer_work *work = container_of(w, typeof(*work), work); + struct platform_device *pdev; + + if (!work->pdev) + return; + + pdev = work->pdev; + + /* Relies on cxl_acpi and cxl_pci having had a chance to load */ + wait_for_device_probe(); + + guard(device)(&pdev->dev); + if (!pdev->dev.driver) + return; + + if (!dax_hmem_initial_probe) { + dax_hmem_initial_probe = true; + walk_hmem_resources(&pdev->dev, hmem_register_cxl_device); + } +} + static int dax_hmem_platform_probe(struct platform_device *pdev) { + if (work_pending(&dax_hmem_work.work)) + return -EBUSY; + + if (!dax_hmem_work.pdev) + dax_hmem_work.pdev = + to_platform_device(get_device(&pdev->dev)); + return walk_hmem_resources(&pdev->dev, hmem_register_device); } @@ -168,6 +237,11 @@ static __init int dax_hmem_init(void) static __exit void dax_hmem_exit(void) { + if (dax_hmem_work.pdev) { + flush_work(&dax_hmem_work.work); + put_device(&dax_hmem_work.pdev->dev); + } + platform_driver_unregister(&dax_hmem_driver); platform_driver_unregister(&dax_hmem_platform_driver); } From 8a1ec5fb2360d6fc0183cbe7de68c7a4e611d120 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Thu, 26 Mar 2026 22:02:01 -0400 Subject: [PATCH 20/37] cxl/core/region: move pmem region driver logic into region_pmem.c core/region.c is overloaded with per-region control logic (pmem, dax, sysram, etc). 
Move the pmem region driver logic from region.c into region_pmem.c to make it clear that this code only applies to pmem regions. No functional changes. [ dj: Fixed up some tabbing issues, may be from original code. ] Signed-off-by: Gregory Price Co-developed-by: Ira Weiny Signed-off-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20260327020203.876122-2-gourry@gourry.net Signed-off-by: Dave Jiang --- drivers/cxl/core/Makefile | 2 +- drivers/cxl/core/core.h | 1 + drivers/cxl/core/region.c | 184 ------------------------------- drivers/cxl/core/region_pmem.c | 191 +++++++++++++++++++++++++++++++++ tools/testing/cxl/Kbuild | 2 +- 5 files changed, 194 insertions(+), 186 deletions(-) create mode 100644 drivers/cxl/core/region_pmem.c diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index a639a9499972..f73776fe323b 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -15,7 +15,7 @@ cxl_core-y += hdm.o cxl_core-y += pmu.o cxl_core-y += cdat.o cxl_core-$(CONFIG_TRACING) += trace.o -cxl_core-$(CONFIG_CXL_REGION) += region.o +cxl_core-$(CONFIG_CXL_REGION) += region.o region_pmem.o cxl_core-$(CONFIG_CXL_MCE) += mce.o cxl_core-$(CONFIG_CXL_FEATURES) += features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 5b0570df0fd9..2fa5f2f58c9b 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -50,6 +50,7 @@ int cxl_get_poison_by_endpoint(struct cxl_port *port); struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa); u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, u64 dpa); +int devm_cxl_add_pmem_region(struct cxl_region *cxlr); #else static inline u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 42874948b589..cf1b7e0617f3 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ 
-2757,46 +2757,6 @@ static ssize_t delete_region_store(struct device *dev, } DEVICE_ATTR_WO(delete_region); -static void cxl_pmem_region_release(struct device *dev) -{ - struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev); - int i; - - for (i = 0; i < cxlr_pmem->nr_mappings; i++) { - struct cxl_memdev *cxlmd = cxlr_pmem->mapping[i].cxlmd; - - put_device(&cxlmd->dev); - } - - kfree(cxlr_pmem); -} - -static const struct attribute_group *cxl_pmem_region_attribute_groups[] = { - &cxl_base_attribute_group, - NULL, -}; - -const struct device_type cxl_pmem_region_type = { - .name = "cxl_pmem_region", - .release = cxl_pmem_region_release, - .groups = cxl_pmem_region_attribute_groups, -}; - -bool is_cxl_pmem_region(struct device *dev) -{ - return dev->type == &cxl_pmem_region_type; -} -EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, "CXL"); - -struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev) -{ - if (dev_WARN_ONCE(dev, !is_cxl_pmem_region(dev), - "not a cxl_pmem_region device\n")) - return NULL; - return container_of(dev, struct cxl_pmem_region, dev); -} -EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, "CXL"); - struct cxl_poison_context { struct cxl_port *port; int part; @@ -3450,64 +3410,6 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, return -ENXIO; } -static struct lock_class_key cxl_pmem_region_key; - -static int cxl_pmem_region_alloc(struct cxl_region *cxlr) -{ - struct cxl_region_params *p = &cxlr->params; - struct cxl_nvdimm_bridge *cxl_nvb; - struct device *dev; - int i; - - guard(rwsem_read)(&cxl_rwsem.region); - if (p->state != CXL_CONFIG_COMMIT) - return -ENXIO; - - struct cxl_pmem_region *cxlr_pmem __free(kfree) = - kzalloc_flex(*cxlr_pmem, mapping, p->nr_targets); - if (!cxlr_pmem) - return -ENOMEM; - - cxlr_pmem->hpa_range.start = p->res->start; - cxlr_pmem->hpa_range.end = p->res->end; - - /* Snapshot the region configuration underneath the cxl_rwsem.region */ - cxlr_pmem->nr_mappings = p->nr_targets; - for (i = 0; i < 
p->nr_targets; i++) { - struct cxl_endpoint_decoder *cxled = p->targets[i]; - struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i]; - - /* - * Regions never span CXL root devices, so by definition the - * bridge for one device is the same for all. - */ - if (i == 0) { - cxl_nvb = cxl_find_nvdimm_bridge(cxlmd->endpoint); - if (!cxl_nvb) - return -ENODEV; - cxlr->cxl_nvb = cxl_nvb; - } - m->cxlmd = cxlmd; - get_device(&cxlmd->dev); - m->start = cxled->dpa_res->start; - m->size = resource_size(cxled->dpa_res); - m->position = i; - } - - dev = &cxlr_pmem->dev; - device_initialize(dev); - lockdep_set_class(&dev->mutex, &cxl_pmem_region_key); - device_set_pm_not_required(dev); - dev->parent = &cxlr->dev; - dev->bus = &cxl_bus_type; - dev->type = &cxl_pmem_region_type; - cxlr_pmem->cxlr = cxlr; - cxlr->cxlr_pmem = no_free_ptr(cxlr_pmem); - - return 0; -} - static void cxl_dax_region_release(struct device *dev) { struct cxl_dax_region *cxlr_dax = to_cxl_dax_region(dev); @@ -3571,92 +3473,6 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr) return cxlr_dax; } -static void cxlr_pmem_unregister(void *_cxlr_pmem) -{ - struct cxl_pmem_region *cxlr_pmem = _cxlr_pmem; - struct cxl_region *cxlr = cxlr_pmem->cxlr; - struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb; - - /* - * Either the bridge is in ->remove() context under the device_lock(), - * or cxlr_release_nvdimm() is cancelling the bridge's release action - * for @cxlr_pmem and doing it itself (while manually holding the bridge - * lock). 
- */ - device_lock_assert(&cxl_nvb->dev); - cxlr->cxlr_pmem = NULL; - cxlr_pmem->cxlr = NULL; - device_unregister(&cxlr_pmem->dev); -} - -static void cxlr_release_nvdimm(void *_cxlr) -{ - struct cxl_region *cxlr = _cxlr; - struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb; - - scoped_guard(device, &cxl_nvb->dev) { - if (cxlr->cxlr_pmem) - devm_release_action(&cxl_nvb->dev, cxlr_pmem_unregister, - cxlr->cxlr_pmem); - } - cxlr->cxl_nvb = NULL; - put_device(&cxl_nvb->dev); -} - -/** - * devm_cxl_add_pmem_region() - add a cxl_region-to-nd_region bridge - * @cxlr: parent CXL region for this pmem region bridge device - * - * Return: 0 on success negative error code on failure. - */ -static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) -{ - struct cxl_pmem_region *cxlr_pmem; - struct cxl_nvdimm_bridge *cxl_nvb; - struct device *dev; - int rc; - - rc = cxl_pmem_region_alloc(cxlr); - if (rc) - return rc; - cxlr_pmem = cxlr->cxlr_pmem; - cxl_nvb = cxlr->cxl_nvb; - - dev = &cxlr_pmem->dev; - rc = dev_set_name(dev, "pmem_region%d", cxlr->id); - if (rc) - goto err; - - rc = device_add(dev); - if (rc) - goto err; - - dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), - dev_name(dev)); - - scoped_guard(device, &cxl_nvb->dev) { - if (cxl_nvb->dev.driver) - rc = devm_add_action_or_reset(&cxl_nvb->dev, - cxlr_pmem_unregister, - cxlr_pmem); - else - rc = -ENXIO; - } - - if (rc) - goto err_bridge; - - /* @cxlr carries a reference on @cxl_nvb until cxlr_release_nvdimm */ - return devm_add_action_or_reset(&cxlr->dev, cxlr_release_nvdimm, cxlr); - -err: - put_device(dev); -err_bridge: - put_device(&cxl_nvb->dev); - cxlr->cxl_nvb = NULL; - return rc; -} - static void cxlr_dax_unregister(void *_cxlr_dax) { struct cxl_dax_region *cxlr_dax = _cxlr_dax; diff --git a/drivers/cxl/core/region_pmem.c b/drivers/cxl/core/region_pmem.c new file mode 100644 index 000000000000..23d97e3d78b6 --- /dev/null +++ b/drivers/cxl/core/region_pmem.c @@ -0,0 +1,191 @@ +// 
SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include +#include +#include +#include +#include "core.h" + +static void cxl_pmem_region_release(struct device *dev) +{ + struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev); + int i; + + for (i = 0; i < cxlr_pmem->nr_mappings; i++) { + struct cxl_memdev *cxlmd = cxlr_pmem->mapping[i].cxlmd; + + put_device(&cxlmd->dev); + } + + kfree(cxlr_pmem); +} + +static const struct attribute_group *cxl_pmem_region_attribute_groups[] = { + &cxl_base_attribute_group, + NULL +}; + +const struct device_type cxl_pmem_region_type = { + .name = "cxl_pmem_region", + .release = cxl_pmem_region_release, + .groups = cxl_pmem_region_attribute_groups, +}; + +bool is_cxl_pmem_region(struct device *dev) +{ + return dev->type == &cxl_pmem_region_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, "CXL"); + +struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_cxl_pmem_region(dev), + "not a cxl_pmem_region device\n")) + return NULL; + return container_of(dev, struct cxl_pmem_region, dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, "CXL"); + +static struct lock_class_key cxl_pmem_region_key; + +static int cxl_pmem_region_alloc(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_nvdimm_bridge *cxl_nvb; + struct device *dev; + int i; + + guard(rwsem_read)(&cxl_rwsem.region); + if (p->state != CXL_CONFIG_COMMIT) + return -ENXIO; + + struct cxl_pmem_region *cxlr_pmem __free(kfree) = + kzalloc_flex(*cxlr_pmem, mapping, p->nr_targets); + if (!cxlr_pmem) + return -ENOMEM; + + cxlr_pmem->hpa_range.start = p->res->start; + cxlr_pmem->hpa_range.end = p->res->end; + + /* Snapshot the region configuration underneath the cxl_rwsem.region */ + cxlr_pmem->nr_mappings = p->nr_targets; + for (i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = 
cxled_to_memdev(cxled); + struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i]; + + /* + * Regions never span CXL root devices, so by definition the + * bridge for one device is the same for all. + */ + if (i == 0) { + cxl_nvb = cxl_find_nvdimm_bridge(cxlmd->endpoint); + if (!cxl_nvb) + return -ENODEV; + cxlr->cxl_nvb = cxl_nvb; + } + m->cxlmd = cxlmd; + get_device(&cxlmd->dev); + m->start = cxled->dpa_res->start; + m->size = resource_size(cxled->dpa_res); + m->position = i; + } + + dev = &cxlr_pmem->dev; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_pmem_region_key); + device_set_pm_not_required(dev); + dev->parent = &cxlr->dev; + dev->bus = &cxl_bus_type; + dev->type = &cxl_pmem_region_type; + cxlr_pmem->cxlr = cxlr; + cxlr->cxlr_pmem = no_free_ptr(cxlr_pmem); + + return 0; +} + +static void cxlr_pmem_unregister(void *_cxlr_pmem) +{ + struct cxl_pmem_region *cxlr_pmem = _cxlr_pmem; + struct cxl_region *cxlr = cxlr_pmem->cxlr; + struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb; + + /* + * Either the bridge is in ->remove() context under the device_lock(), + * or cxlr_release_nvdimm() is cancelling the bridge's release action + * for @cxlr_pmem and doing it itself (while manually holding the bridge + * lock). + */ + device_lock_assert(&cxl_nvb->dev); + cxlr->cxlr_pmem = NULL; + cxlr_pmem->cxlr = NULL; + device_unregister(&cxlr_pmem->dev); +} + +static void cxlr_release_nvdimm(void *_cxlr) +{ + struct cxl_region *cxlr = _cxlr; + struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb; + + scoped_guard(device, &cxl_nvb->dev) { + if (cxlr->cxlr_pmem) + devm_release_action(&cxl_nvb->dev, cxlr_pmem_unregister, + cxlr->cxlr_pmem); + } + cxlr->cxl_nvb = NULL; + put_device(&cxl_nvb->dev); +} + +/** + * devm_cxl_add_pmem_region() - add a cxl_region-to-nd_region bridge + * @cxlr: parent CXL region for this pmem region bridge device + * + * Return: 0 on success negative error code on failure. 
+ */ +int devm_cxl_add_pmem_region(struct cxl_region *cxlr) +{ + struct cxl_pmem_region *cxlr_pmem; + struct cxl_nvdimm_bridge *cxl_nvb; + struct device *dev; + int rc; + + rc = cxl_pmem_region_alloc(cxlr); + if (rc) + return rc; + cxlr_pmem = cxlr->cxlr_pmem; + cxl_nvb = cxlr->cxl_nvb; + + dev = &cxlr_pmem->dev; + rc = dev_set_name(dev, "pmem_region%d", cxlr->id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), + dev_name(dev)); + + scoped_guard(device, &cxl_nvb->dev) { + if (cxl_nvb->dev.driver) + rc = devm_add_action_or_reset(&cxl_nvb->dev, + cxlr_pmem_unregister, + cxlr_pmem); + else + rc = -ENXIO; + } + + if (rc) + goto err_bridge; + + /* @cxlr carries a reference on @cxl_nvb until cxlr_release_nvdimm */ + return devm_add_action_or_reset(&cxlr->dev, cxlr_release_nvdimm, cxlr); + +err: + put_device(dev); +err_bridge: + put_device(&cxl_nvb->dev); + cxlr->cxl_nvb = NULL; + return rc; +} diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 53d84a6874b7..f53d79a05661 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -59,7 +59,7 @@ cxl_core-y += $(CXL_CORE_SRC)/hdm.o cxl_core-y += $(CXL_CORE_SRC)/pmu.o cxl_core-y += $(CXL_CORE_SRC)/cdat.o cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o -cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o +cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o $(CXL_CORE_SRC)/region_pmem.o cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o From d747cf98f091e56beeed5233e8992fea59401011 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Thu, 26 Mar 2026 22:02:02 -0400 Subject: [PATCH 21/37] cxl/core/region: move dax region device logic into region_dax.c core/region.c is overloaded with per-region control logic (pmem, dax, sysram, etc). 
Move the CXL DAX region device infrastructure from region.c into a new region_dax.c file. This will also allow us to add additional dax-driver integration paths that don't further dirty the core region.c logic. No functional changes. Signed-off-by: Gregory Price Co-developed-by: Ira Weiny Signed-off-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20260327020203.876122-3-gourry@gourry.net Signed-off-by: Dave Jiang --- drivers/cxl/core/Makefile | 2 +- drivers/cxl/core/core.h | 1 + drivers/cxl/core/region.c | 99 ------------------------------ drivers/cxl/core/region_dax.c | 109 ++++++++++++++++++++++++++++++++++ tools/testing/cxl/Kbuild | 2 +- 5 files changed, 112 insertions(+), 101 deletions(-) create mode 100644 drivers/cxl/core/region_dax.c diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index f73776fe323b..ce7213818d3c 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -15,7 +15,7 @@ cxl_core-y += hdm.o cxl_core-y += pmu.o cxl_core-y += cdat.o cxl_core-$(CONFIG_TRACING) += trace.o -cxl_core-$(CONFIG_CXL_REGION) += region.o region_pmem.o +cxl_core-$(CONFIG_CXL_REGION) += region.o region_pmem.o region_dax.o cxl_core-$(CONFIG_CXL_MCE) += mce.o cxl_core-$(CONFIG_CXL_FEATURES) += features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 2fa5f2f58c9b..5d91e8d0e5bc 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -50,6 +50,7 @@ int cxl_get_poison_by_endpoint(struct cxl_port *port); struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa); u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, u64 dpa); +int devm_cxl_add_dax_region(struct cxl_region *cxlr); int devm_cxl_add_pmem_region(struct cxl_region *cxlr); #else diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index cf1b7e0617f3..34e2208ff105 100644 --- 
a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3410,105 +3410,6 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, return -ENXIO; } -static void cxl_dax_region_release(struct device *dev) -{ - struct cxl_dax_region *cxlr_dax = to_cxl_dax_region(dev); - - kfree(cxlr_dax); -} - -static const struct attribute_group *cxl_dax_region_attribute_groups[] = { - &cxl_base_attribute_group, - NULL, -}; - -const struct device_type cxl_dax_region_type = { - .name = "cxl_dax_region", - .release = cxl_dax_region_release, - .groups = cxl_dax_region_attribute_groups, -}; - -static bool is_cxl_dax_region(struct device *dev) -{ - return dev->type == &cxl_dax_region_type; -} - -struct cxl_dax_region *to_cxl_dax_region(struct device *dev) -{ - if (dev_WARN_ONCE(dev, !is_cxl_dax_region(dev), - "not a cxl_dax_region device\n")) - return NULL; - return container_of(dev, struct cxl_dax_region, dev); -} -EXPORT_SYMBOL_NS_GPL(to_cxl_dax_region, "CXL"); - -static struct lock_class_key cxl_dax_region_key; - -static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr) -{ - struct cxl_region_params *p = &cxlr->params; - struct cxl_dax_region *cxlr_dax; - struct device *dev; - - guard(rwsem_read)(&cxl_rwsem.region); - if (p->state != CXL_CONFIG_COMMIT) - return ERR_PTR(-ENXIO); - - cxlr_dax = kzalloc_obj(*cxlr_dax); - if (!cxlr_dax) - return ERR_PTR(-ENOMEM); - - cxlr_dax->hpa_range.start = p->res->start; - cxlr_dax->hpa_range.end = p->res->end; - - dev = &cxlr_dax->dev; - cxlr_dax->cxlr = cxlr; - device_initialize(dev); - lockdep_set_class(&dev->mutex, &cxl_dax_region_key); - device_set_pm_not_required(dev); - dev->parent = &cxlr->dev; - dev->bus = &cxl_bus_type; - dev->type = &cxl_dax_region_type; - - return cxlr_dax; -} - -static void cxlr_dax_unregister(void *_cxlr_dax) -{ - struct cxl_dax_region *cxlr_dax = _cxlr_dax; - - device_unregister(&cxlr_dax->dev); -} - -static int devm_cxl_add_dax_region(struct cxl_region *cxlr) -{ - struct 
cxl_dax_region *cxlr_dax; - struct device *dev; - int rc; - - cxlr_dax = cxl_dax_region_alloc(cxlr); - if (IS_ERR(cxlr_dax)) - return PTR_ERR(cxlr_dax); - - dev = &cxlr_dax->dev; - rc = dev_set_name(dev, "dax_region%d", cxlr->id); - if (rc) - goto err; - - rc = device_add(dev); - if (rc) - goto err; - - dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), - dev_name(dev)); - - return devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister, - cxlr_dax); -err: - put_device(dev); - return rc; -} - static int match_root_decoder(struct device *dev, const void *data) { const struct range *r1, *r2 = data; diff --git a/drivers/cxl/core/region_dax.c b/drivers/cxl/core/region_dax.c new file mode 100644 index 000000000000..fe367759ac69 --- /dev/null +++ b/drivers/cxl/core/region_dax.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2022 Intel Corporation. All rights reserved. + * Copyright(c) 2026 Meta Technologies Inc. All rights reserved. + */ +#include +#include +#include +#include +#include "core.h" + +static void cxl_dax_region_release(struct device *dev) +{ + struct cxl_dax_region *cxlr_dax = to_cxl_dax_region(dev); + + kfree(cxlr_dax); +} + +static const struct attribute_group *cxl_dax_region_attribute_groups[] = { + &cxl_base_attribute_group, + NULL +}; + +const struct device_type cxl_dax_region_type = { + .name = "cxl_dax_region", + .release = cxl_dax_region_release, + .groups = cxl_dax_region_attribute_groups, +}; + +static bool is_cxl_dax_region(struct device *dev) +{ + return dev->type == &cxl_dax_region_type; +} + +struct cxl_dax_region *to_cxl_dax_region(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_cxl_dax_region(dev), + "not a cxl_dax_region device\n")) + return NULL; + return container_of(dev, struct cxl_dax_region, dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_dax_region, "CXL"); + +static struct lock_class_key cxl_dax_region_key; + +static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr) +{ + 
struct cxl_region_params *p = &cxlr->params; + struct cxl_dax_region *cxlr_dax; + struct device *dev; + + guard(rwsem_read)(&cxl_rwsem.region); + if (p->state != CXL_CONFIG_COMMIT) + return ERR_PTR(-ENXIO); + + cxlr_dax = kzalloc_obj(*cxlr_dax); + if (!cxlr_dax) + return ERR_PTR(-ENOMEM); + + cxlr_dax->hpa_range.start = p->res->start; + cxlr_dax->hpa_range.end = p->res->end; + + dev = &cxlr_dax->dev; + cxlr_dax->cxlr = cxlr; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_dax_region_key); + device_set_pm_not_required(dev); + dev->parent = &cxlr->dev; + dev->bus = &cxl_bus_type; + dev->type = &cxl_dax_region_type; + + return cxlr_dax; +} + +static void cxlr_dax_unregister(void *_cxlr_dax) +{ + struct cxl_dax_region *cxlr_dax = _cxlr_dax; + + device_unregister(&cxlr_dax->dev); +} + +int devm_cxl_add_dax_region(struct cxl_region *cxlr) +{ + struct cxl_dax_region *cxlr_dax; + struct device *dev; + int rc; + + cxlr_dax = cxl_dax_region_alloc(cxlr); + if (IS_ERR(cxlr_dax)) + return PTR_ERR(cxlr_dax); + + dev = &cxlr_dax->dev; + rc = dev_set_name(dev, "dax_region%d", cxlr->id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), + dev_name(dev)); + + return devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister, + cxlr_dax); +err: + put_device(dev); + return rc; +} diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index f53d79a05661..d2b291e5f842 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -59,7 +59,7 @@ cxl_core-y += $(CXL_CORE_SRC)/hdm.o cxl_core-y += $(CXL_CORE_SRC)/pmu.o cxl_core-y += $(CXL_CORE_SRC)/cdat.o cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o -cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o $(CXL_CORE_SRC)/region_pmem.o +cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o $(CXL_CORE_SRC)/region_pmem.o $(CXL_CORE_SRC)/region_dax.o cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o 
cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o From 29990ab5cb408d5aa15939d6535e3291aeef748b Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Thu, 26 Mar 2026 22:02:03 -0400 Subject: [PATCH 22/37] cxl/core: use cleanup.h for devm_cxl_add_dax_region Cleanup the gotos in the function. No functional change. Signed-off-by: Gregory Price Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20260327020203.876122-4-gourry@gourry.net Signed-off-by: Dave Jiang --- drivers/cxl/core/region_dax.c | 13 +++++-------- drivers/cxl/cxl.h | 1 + 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/cxl/core/region_dax.c b/drivers/cxl/core/region_dax.c index fe367759ac69..de04f78f6ad8 100644 --- a/drivers/cxl/core/region_dax.c +++ b/drivers/cxl/core/region_dax.c @@ -81,29 +81,26 @@ static void cxlr_dax_unregister(void *_cxlr_dax) int devm_cxl_add_dax_region(struct cxl_region *cxlr) { - struct cxl_dax_region *cxlr_dax; struct device *dev; int rc; - cxlr_dax = cxl_dax_region_alloc(cxlr); + struct cxl_dax_region *cxlr_dax __free(put_cxl_dax_region) = + cxl_dax_region_alloc(cxlr); if (IS_ERR(cxlr_dax)) return PTR_ERR(cxlr_dax); dev = &cxlr_dax->dev; rc = dev_set_name(dev, "dax_region%d", cxlr->id); if (rc) - goto err; + return rc; rc = device_add(dev); if (rc) - goto err; + return rc; dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), dev_name(dev)); return devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister, - cxlr_dax); -err: - put_device(dev); - return rc; + no_free_ptr(cxlr_dax)); } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 9b947286eb9b..7f63a62c2a5f 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -808,6 +808,7 @@ DEFINE_FREE(put_cxl_root, struct cxl_root *, if (_T) put_device(&_T->port.dev)) DEFINE_FREE(put_cxl_port, struct cxl_port *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) 
DEFINE_FREE(put_cxl_root_decoder, struct cxl_root_decoder *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->cxlsd.cxld.dev)) DEFINE_FREE(put_cxl_region, struct cxl_region *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) +DEFINE_FREE(put_cxl_dax_region, struct cxl_dax_region *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd); void cxl_bus_rescan(void); From 261a02b93d9b6dfdc49b3e675be1a0e677cf71f3 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Mon, 30 Mar 2026 17:50:45 -0700 Subject: [PATCH 23/37] cxl/core: Check existence of cxl_memdev_state in poison test Before now, all CXL memdevs were assumed to have a mailbox-backed cxl_memdev_state, so poison command checks could safely dereference the @mds. With the introduction of Type 2 devices, a memdev may not implement a mailbox interface, and so there is no associated cxl_memdev_state. Guard against this case by returning false when @mds is absent. Signed-off-by: Alison Schofield Reviewed-by: Alejandro Lucero Link: https://patch.msgid.link/20260331005047.2813980-1-alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/memdev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 273c22118d3d..591425866045 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -204,6 +204,9 @@ bool cxl_memdev_has_poison_cmd(struct cxl_memdev *cxlmd, { struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + if (!mds) + return 0; + return test_bit(cmd, mds->poison.enabled_cmds); } From 87805c32e6ad7b5ce2d9f7f47e76081857a4a335 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 26 Mar 2026 22:28:13 -0700 Subject: [PATCH 24/37] cxl/region: Fix use-after-free from auto assembly failure The following crash signature results from region destruction while an endpoint decoder is staged, but not fully attached. [ dj: Moved bus_find_device( to next line. 
] Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260327052821.440749-2-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 54 ++++++++++++++++++++++++++++++++++++++- drivers/cxl/cxl.h | 6 +++-- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index f7b20f60ac5c..b89442931277 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1064,6 +1064,14 @@ static int cxl_rr_ep_add(struct cxl_region_ref *cxl_rr, if (!cxld->region) { cxld->region = cxlr; + + /* + * Now that cxld->region is set the intermediate staging state + * can be cleared. + */ + if (cxld == &cxled->cxld && + cxled->state == CXL_DECODER_STATE_AUTO_STAGED) + cxled->state = CXL_DECODER_STATE_AUTO; get_device(&cxlr->dev); } @@ -1805,6 +1813,7 @@ static int cxl_region_attach_auto(struct cxl_region *cxlr, pos = p->nr_targets; p->targets[pos] = cxled; cxled->pos = pos; + cxled->state = CXL_DECODER_STATE_AUTO_STAGED; p->nr_targets++; return 0; @@ -2154,6 +2163,47 @@ static int cxl_region_attach(struct cxl_region *cxlr, return 0; } +static int cxl_region_by_target(struct device *dev, const void *data) +{ + const struct cxl_endpoint_decoder *cxled = data; + struct cxl_region_params *p; + struct cxl_region *cxlr; + + if (!is_cxl_region(dev)) + return 0; + + cxlr = to_cxl_region(dev); + p = &cxlr->params; + return p->targets[cxled->pos] == cxled; +} + +/* + * When an auto-region fails to assemble the decoder may be listed as a target, + * but not fully attached. 
+ */ +static void cxl_cancel_auto_attach(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_region_params *p; + struct cxl_region *cxlr; + int pos = cxled->pos; + + if (cxled->state != CXL_DECODER_STATE_AUTO_STAGED) + return; + + struct device *dev __free(put_device) = + bus_find_device(&cxl_bus_type, NULL, cxled, cxl_region_by_target); + if (!dev) + return; + + cxlr = to_cxl_region(dev); + p = &cxlr->params; + + p->nr_targets--; + cxled->state = CXL_DECODER_STATE_AUTO; + cxled->pos = -1; + p->targets[pos] = NULL; +} + static struct cxl_region * __cxl_decoder_detach(struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled, int pos, @@ -2177,8 +2227,10 @@ __cxl_decoder_detach(struct cxl_region *cxlr, cxled = p->targets[pos]; } else { cxlr = cxled->cxld.region; - if (!cxlr) + if (!cxlr) { + cxl_cancel_auto_attach(cxled); return NULL; + } p = &cxlr->params; } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 9b947286eb9b..30a31968f266 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -378,12 +378,14 @@ struct cxl_decoder { }; /* - * Track whether this decoder is reserved for region autodiscovery, or - * free for userspace provisioning. + * Track whether this decoder is free for userspace provisioning, reserved for + * region autodiscovery, whether it is started connecting (awaiting other + * peers), or has completed auto assembly. */ enum cxl_decoder_state { CXL_DECODER_STATE_MANUAL, CXL_DECODER_STATE_AUTO, + CXL_DECODER_STATE_AUTO_STAGED, }; /** From 1eaef15b2349087d9ce583b9153970d5cf5c5329 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 26 Mar 2026 22:28:14 -0700 Subject: [PATCH 25/37] dax/cxl: Fix HMEM dependencies The expectation is that DEV_DAX_HMEM=y should be disallowed if any of CXL_ACPI, or CXL_PCI are set =m. Also DEV_DAX_CXL=y should be disallowed if DEV_DAX_HMEM=m. Use "$config || !$config" syntax for each dependency. Otherwise, the invalid DEV_DAX_HMEM=m && DEV_DAX_CXL=y configuration is allowed. 
Lastly, dax_hmem depends on the availability of the cxl_region_contains_resource() symbol published by the cxl_core.ko module. So, also prevent DEV_DAX_HMEM from being built-in when the cxl_core module is not built-in. Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260327052821.440749-3-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/dax/Kconfig | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 3683bb3f2311..504f7f735ef5 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -32,6 +32,9 @@ config DEV_DAX_HMEM depends on EFI_SOFT_RESERVE select NUMA_KEEP_MEMINFO if NUMA_MEMBLKS default DEV_DAX + depends on CXL_ACPI || !CXL_ACPI + depends on CXL_PCI || !CXL_PCI + depends on CXL_BUS || !CXL_BUS help EFI 2.8 platforms, and others, may advertise 'specific purpose' memory. For example, a high bandwidth memory pool. The @@ -48,8 +51,7 @@ config DEV_DAX_CXL tristate "CXL DAX: direct access to CXL RAM regions" depends on CXL_BUS && CXL_REGION && DEV_DAX default CXL_REGION && DEV_DAX - depends on CXL_ACPI >= DEV_DAX_HMEM - depends on CXL_PCI >= DEV_DAX_HMEM + depends on DEV_DAX_HMEM || !DEV_DAX_HMEM help CXL RAM regions are either mapped by platform-firmware and published in the initial system-memory map as "System RAM", mapped From b6a61d5baf99c012c61ee93f8295185942cd7495 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 26 Mar 2026 22:28:15 -0700 Subject: [PATCH 26/37] cxl/region: Limit visibility of cxl_region_contains_resource() The dax_hmem dependency on cxl_region_contains_resource() is a one-off special case. It is not suitable for other use cases. Move the definition to the other CONFIG_CXL_REGION guarded definitions in drivers/cxl/cxl.h and include that by a relative path include. This matches what drivers/dax/cxl.c does for its limited private usage of CXL core symbols. 
Reduce the symbol export visibility from global to just dax_hmem, to further clarify its applicability. Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260327052821.440749-4-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 3 +-- drivers/cxl/cxl.h | 5 +++++ drivers/dax/hmem/hmem.c | 2 +- include/cxl/cxl.h | 15 --------------- 4 files changed, 7 insertions(+), 18 deletions(-) delete mode 100644 include/cxl/cxl.h diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index b89442931277..657844cf0379 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include "core.h" @@ -4253,7 +4252,7 @@ bool cxl_region_contains_resource(struct resource *res) return bus_for_each_dev(&cxl_bus_type, NULL, res, region_contains_resource) != 0; } -EXPORT_SYMBOL_GPL(cxl_region_contains_resource); +EXPORT_SYMBOL_FOR_MODULES(cxl_region_contains_resource, "dax_hmem"); static int cxl_region_can_probe(struct cxl_region *cxlr) { diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 30a31968f266..84ad04a02bde 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -941,6 +941,7 @@ struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev); int cxl_add_to_region(struct cxl_endpoint_decoder *cxled); struct cxl_dax_region *to_cxl_dax_region(struct device *dev); u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa); +bool cxl_region_contains_resource(struct resource *res); #else static inline bool is_cxl_pmem_region(struct device *dev) { @@ -963,6 +964,10 @@ static inline u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, { return 0; } +static inline bool cxl_region_contains_resource(struct resource *res) +{ + return false; +} #endif void cxl_endpoint_parse_cdat(struct cxl_port *port); diff --git a/drivers/dax/hmem/hmem.c 
b/drivers/dax/hmem/hmem.c index 9ceda6b5cadf..0051e553c33f 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include "../../cxl/cxl.h" #include "../bus.h" static bool region_idle; diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h deleted file mode 100644 index b12d3d0f6658..000000000000 --- a/include/cxl/cxl.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (c) 2026 Advanced Micro Devices, Inc. */ -#ifndef _CXL_H_ -#define _CXL_H_ - -#ifdef CONFIG_CXL_REGION -bool cxl_region_contains_resource(struct resource *res); -#else -static inline bool cxl_region_contains_resource(struct resource *res) -{ - return false; -} -#endif - -#endif /* _CXL_H_ */ From 471d88441eb990ef1b64713e6975cb3549b1824b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 26 Mar 2026 22:28:16 -0700 Subject: [PATCH 27/37] cxl/region: Constify cxl_region_contains_resource() The call to cxl_region_contains_resource() in hmem_register_cxl_device() need not cast away 'const'. The problem is the usage of the bus_for_each_dev() API which does not mark its @data parameter as 'const'. Switch to bus_find_device() which does take 'const' @data, fixup cxl_region_contains_resource() and its caller.
Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260327052821.440749-5-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 11 ++++++----- drivers/cxl/cxl.h | 4 ++-- drivers/dax/hmem/hmem.c | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 657844cf0379..30787faef352 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -4225,9 +4225,9 @@ static int cxl_region_setup_poison(struct cxl_region *cxlr) return devm_add_action_or_reset(dev, remove_debugfs, dentry); } -static int region_contains_resource(struct device *dev, void *data) +static int region_contains_resource(struct device *dev, const void *data) { - struct resource *res = data; + const struct resource *res = data; struct cxl_region *cxlr; struct cxl_region_params *p; @@ -4246,11 +4246,12 @@ static int region_contains_resource(struct device *dev, void *data) return resource_contains(p->res, res) ? 
1 : 0; } -bool cxl_region_contains_resource(struct resource *res) +bool cxl_region_contains_resource(const struct resource *res) { guard(rwsem_read)(&cxl_rwsem.region); - return bus_for_each_dev(&cxl_bus_type, NULL, res, - region_contains_resource) != 0; + struct device *dev __free(put_device) = bus_find_device( + &cxl_bus_type, NULL, res, region_contains_resource); + return !!dev; } EXPORT_SYMBOL_FOR_MODULES(cxl_region_contains_resource, "dax_hmem"); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 84ad04a02bde..340bdc9fcacc 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -941,7 +941,7 @@ struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev); int cxl_add_to_region(struct cxl_endpoint_decoder *cxled); struct cxl_dax_region *to_cxl_dax_region(struct device *dev); u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa); -bool cxl_region_contains_resource(struct resource *res); +bool cxl_region_contains_resource(const struct resource *res); #else static inline bool is_cxl_pmem_region(struct device *dev) { @@ -964,7 +964,7 @@ static inline u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, { return 0; } -static inline bool cxl_region_contains_resource(struct resource *res) +static inline bool cxl_region_contains_resource(const struct resource *res) { return false; } diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index 0051e553c33f..b2ab1292fa81 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -159,7 +159,7 @@ static int hmem_register_cxl_device(struct device *host, int target_nid, IORES_DESC_CXL) == REGION_DISJOINT) return 0; - if (cxl_region_contains_resource((struct resource *)res)) { + if (cxl_region_contains_resource(res)) { dev_dbg(host, "CXL claims resource, dropping: %pr\n", res); return 0; } From 3cba30eed56df3af80ae8d4fde9cf4039eace82a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 26 Mar 2026 22:28:17 -0700 Subject: [PATCH 28/37] dax/hmem: Reduce visibility of 
dax_cxl coordination symbols No other module or use case should be using dax_hmem_initial_probe or dax_hmem_flush_work(). Limit their use to dax_hmem, and dax_cxl respectively. Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260327052821.440749-6-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/dax/hmem/device.c | 2 +- drivers/dax/hmem/hmem.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c index 991a4bf7d969..675d56276d78 100644 --- a/drivers/dax/hmem/device.c +++ b/drivers/dax/hmem/device.c @@ -9,7 +9,7 @@ static bool nohmem; module_param_named(disable, nohmem, bool, 0444); bool dax_hmem_initial_probe; -EXPORT_SYMBOL_GPL(dax_hmem_initial_probe); +EXPORT_SYMBOL_FOR_MODULES(dax_hmem_initial_probe, "dax_hmem"); static bool platform_initialized; static DEFINE_MUTEX(hmem_resource_lock); diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index b2ab1292fa81..dd3d7f93baee 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -74,7 +74,7 @@ void dax_hmem_flush_work(void) { flush_work(&dax_hmem_work.work); } -EXPORT_SYMBOL_GPL(dax_hmem_flush_work); +EXPORT_SYMBOL_FOR_MODULES(dax_hmem_flush_work, "dax_cxl"); static int __hmem_register_device(struct device *host, int target_nid, const struct resource *res) From f8dc1bde187310e0345beb08df949e0c2a4c86ce Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 26 Mar 2026 22:28:18 -0700 Subject: [PATCH 29/37] dax/hmem: Fix singleton confusion between dax_hmem_work and hmem devices dax_hmem (ab)uses a platform device to allow for a module to autoload in the presence of "Soft Reserved" resources. The dax_hmem driver had no dependencies on the "hmem_platform" device being a singleton until the recent "dax_hmem vs dax_cxl" takeover solution. 
Replace the layering violation of dax_hmem_work assuming that there will never be more than one "hmem_platform" device associated with a global work item with a dax_hmem local workqueue that can theoretically support any number of hmem_platform devices. Fixup the reference counting to only pin the device while it is live in the queue. Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260327052821.440749-7-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/dax/bus.h | 15 +++++- drivers/dax/hmem/device.c | 28 ++++++---- drivers/dax/hmem/hmem.c | 108 +++++++++++++++++++------------------- 3 files changed, 85 insertions(+), 66 deletions(-) diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index ebbfe2d6da14..7b1a83f1ce1f 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -3,7 +3,9 @@ #ifndef __DAX_BUS_H__ #define __DAX_BUS_H__ #include +#include #include +#include struct dev_dax; struct resource; @@ -49,8 +51,19 @@ void dax_driver_unregister(struct dax_device_driver *dax_drv); void kill_dev_dax(struct dev_dax *dev_dax); bool static_dev_dax(struct dev_dax *dev_dax); +struct hmem_platform_device { + struct platform_device pdev; + struct work_struct work; + bool did_probe; +}; + +static inline struct hmem_platform_device * +to_hmem_platform_device(struct platform_device *pdev) +{ + return container_of(pdev, struct hmem_platform_device, pdev); +} + #if IS_ENABLED(CONFIG_DEV_DAX_HMEM) -extern bool dax_hmem_initial_probe; void dax_hmem_flush_work(void); #else static inline void dax_hmem_flush_work(void) { } diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c index 675d56276d78..d70359b4307b 100644 --- a/drivers/dax/hmem/device.c +++ b/drivers/dax/hmem/device.c @@ -4,13 +4,11 @@ #include #include #include +#include "../bus.h" static bool nohmem; module_param_named(disable, nohmem, bool, 0444); -bool dax_hmem_initial_probe; 
-EXPORT_SYMBOL_FOR_MODULES(dax_hmem_initial_probe, "dax_hmem"); - static bool platform_initialized; static DEFINE_MUTEX(hmem_resource_lock); static struct resource hmem_active = { @@ -36,9 +34,21 @@ int walk_hmem_resources(struct device *host, walk_hmem_fn fn) } EXPORT_SYMBOL_GPL(walk_hmem_resources); +static void hmem_work(struct work_struct *work) +{ + /* place holder until dax_hmem driver attaches */ +} + +static struct hmem_platform_device hmem_platform = { + .pdev = { + .name = "hmem_platform", + .id = 0, + }, + .work = __WORK_INITIALIZER(hmem_platform.work, hmem_work), +}; + static void __hmem_register_resource(int target_nid, struct resource *res) { - struct platform_device *pdev; struct resource *new; int rc; @@ -54,17 +64,13 @@ static void __hmem_register_resource(int target_nid, struct resource *res) if (platform_initialized) return; - pdev = platform_device_alloc("hmem_platform", 0); - if (!pdev) { + rc = platform_device_register(&hmem_platform.pdev); + if (rc) { pr_err_once("failed to register device-dax hmem_platform device\n"); return; } - rc = platform_device_add(pdev); - if (rc) - platform_device_put(pdev); - else - platform_initialized = true; + platform_initialized = true; } void hmem_register_resource(int target_nid, struct resource *res) diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index dd3d7f93baee..e1dae83dae8d 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -59,20 +59,11 @@ static void release_hmem(void *pdev) platform_device_unregister(pdev); } -struct dax_defer_work { - struct platform_device *pdev; - struct work_struct work; -}; - -static void process_defer_work(struct work_struct *w); - -static struct dax_defer_work dax_hmem_work = { - .work = __WORK_INITIALIZER(dax_hmem_work.work, process_defer_work), -}; +static struct workqueue_struct *dax_hmem_wq; void dax_hmem_flush_work(void) { - flush_work(&dax_hmem_work.work); + flush_workqueue(dax_hmem_wq); } EXPORT_SYMBOL_FOR_MODULES(dax_hmem_flush_work, 
"dax_cxl"); @@ -134,24 +125,6 @@ out_put: return rc; } -static int hmem_register_device(struct device *host, int target_nid, - const struct resource *res) -{ - if (IS_ENABLED(CONFIG_DEV_DAX_CXL) && - region_intersects(res->start, resource_size(res), IORESOURCE_MEM, - IORES_DESC_CXL) != REGION_DISJOINT) { - if (!dax_hmem_initial_probe) { - dev_dbg(host, "await CXL initial probe: %pr\n", res); - queue_work(system_long_wq, &dax_hmem_work.work); - return 0; - } - dev_dbg(host, "deferring range to CXL: %pr\n", res); - return 0; - } - - return __hmem_register_device(host, target_nid, res); -} - static int hmem_register_cxl_device(struct device *host, int target_nid, const struct resource *res) { @@ -170,35 +143,55 @@ static int hmem_register_cxl_device(struct device *host, int target_nid, static void process_defer_work(struct work_struct *w) { - struct dax_defer_work *work = container_of(w, typeof(*work), work); - struct platform_device *pdev; - - if (!work->pdev) - return; - - pdev = work->pdev; + struct hmem_platform_device *hpdev = container_of(w, typeof(*hpdev), work); + struct device *dev = &hpdev->pdev.dev; /* Relies on cxl_acpi and cxl_pci having had a chance to load */ wait_for_device_probe(); - guard(device)(&pdev->dev); - if (!pdev->dev.driver) - return; + guard(device)(dev); + if (!dev->driver) + goto out; - if (!dax_hmem_initial_probe) { - dax_hmem_initial_probe = true; - walk_hmem_resources(&pdev->dev, hmem_register_cxl_device); + if (!hpdev->did_probe) { + hpdev->did_probe = true; + walk_hmem_resources(dev, hmem_register_cxl_device); } +out: + put_device(dev); +} + +static int hmem_register_device(struct device *host, int target_nid, + const struct resource *res) +{ + struct platform_device *pdev = to_platform_device(host); + struct hmem_platform_device *hpdev = to_hmem_platform_device(pdev); + + if (IS_ENABLED(CONFIG_DEV_DAX_CXL) && + region_intersects(res->start, resource_size(res), IORESOURCE_MEM, + IORES_DESC_CXL) != REGION_DISJOINT) { + if 
(!hpdev->did_probe) { + dev_dbg(host, "await CXL initial probe: %pr\n", res); + hpdev->work.func = process_defer_work; + get_device(host); + if (!queue_work(dax_hmem_wq, &hpdev->work)) + put_device(host); + return 0; + } + dev_dbg(host, "deferring range to CXL: %pr\n", res); + return 0; + } + + return __hmem_register_device(host, target_nid, res); } static int dax_hmem_platform_probe(struct platform_device *pdev) { - if (work_pending(&dax_hmem_work.work)) - return -EBUSY; + struct hmem_platform_device *hpdev = to_hmem_platform_device(pdev); - if (!dax_hmem_work.pdev) - dax_hmem_work.pdev = - to_platform_device(get_device(&pdev->dev)); + /* queue is only flushed on module unload, fail rebind with pending work */ + if (work_pending(&hpdev->work)) + return -EBUSY; return walk_hmem_resources(&pdev->dev, hmem_register_device); } @@ -224,26 +217,33 @@ static __init int dax_hmem_init(void) request_module("cxl_pci"); } + dax_hmem_wq = alloc_ordered_workqueue("dax_hmem_wq", 0); + if (!dax_hmem_wq) + return -ENOMEM; + rc = platform_driver_register(&dax_hmem_platform_driver); if (rc) - return rc; + goto err_platform_driver; rc = platform_driver_register(&dax_hmem_driver); if (rc) - platform_driver_unregister(&dax_hmem_platform_driver); + goto err_driver; + + return 0; + +err_driver: + platform_driver_unregister(&dax_hmem_platform_driver); +err_platform_driver: + destroy_workqueue(dax_hmem_wq); return rc; } static __exit void dax_hmem_exit(void) { - if (dax_hmem_work.pdev) { - flush_work(&dax_hmem_work.work); - put_device(&dax_hmem_work.pdev->dev); - } - platform_driver_unregister(&dax_hmem_driver); platform_driver_unregister(&dax_hmem_platform_driver); + destroy_workqueue(dax_hmem_wq); } module_init(dax_hmem_init); From 059edcc405e46cc10ee65ab2c039aa6bccfbb3a0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 26 Mar 2026 22:28:19 -0700 Subject: [PATCH 30/37] dax/hmem: Parent dax_hmem devices For test purposes it is useful to be able to determine which "hmem_platform" 
device is hosting a given sub-device. Register hmem devices underneath "hmem_platform". Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260327052821.440749-8-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/dax/hmem/hmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index e1dae83dae8d..af21f66bf872 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -96,6 +96,7 @@ static int __hmem_register_device(struct device *host, int target_nid, return -ENOMEM; } + pdev->dev.parent = host; pdev->dev.numa_node = numa_map_to_online_node(target_nid); info = (struct memregion_info) { .target_node = target_nid, From 78b8f1a7a4ab39cecd926d50627db3537e0f2ee9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 26 Mar 2026 22:28:20 -0700 Subject: [PATCH 31/37] tools/testing/cxl: Simulate auto-assembly failure Add a cxl_test module option to skip setting up one of the members of the default auto-assembled region. This simulates a device failing between firmware setup and OS boot, or region configuration interrupted by an event like kexec. 
Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260327052821.440749-9-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- tools/testing/cxl/test/cxl.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 81e2aef3627a..7deeb7ff7bdf 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -16,6 +16,7 @@ static int interleave_arithmetic; static bool extended_linear_cache; +static bool fail_autoassemble; #define FAKE_QTG_ID 42 @@ -819,6 +820,12 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) return; } + /* Simulate missing cxl_mem.4 configuration */ + if (hb0 && pdev->id == 4 && cxld->id == 0 && fail_autoassemble) { + default_mock_decoder(cxld); + return; + } + base = window->base_hpa; if (extended_linear_cache) base += mock_auto_region_size; @@ -1620,6 +1627,8 @@ module_param(interleave_arithmetic, int, 0444); MODULE_PARM_DESC(interleave_arithmetic, "Modulo:0, XOR:1"); module_param(extended_linear_cache, bool, 0444); MODULE_PARM_DESC(extended_linear_cache, "Enable extended linear cache support"); +module_param(fail_autoassemble, bool, 0444); +MODULE_PARM_DESC(fail_autoassemble, "Simulate missing member of an auto-region"); module_init(cxl_test_init); module_exit(cxl_test_exit); MODULE_LICENSE("GPL v2"); From 549b5c12ef06441dbde4718f16e23c547f5592d7 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 26 Mar 2026 22:28:21 -0700 Subject: [PATCH 32/37] tools/testing/cxl: Test dax_hmem takeover of CXL regions When platform firmware is committed to publishing EFI_CONVENTIONAL_MEMORY in the memory map, but CXL fails to assemble the region, dax_hmem can attempt to attach a dax device to the memory range. 
Take advantage of the new ability to support multiple "hmem_platform" devices, and to enable regression testing of several scenarios: * CXL correctly assembles a region, check dax_hmem fails to attach dax * CXL fails to assemble a region, check dax_hmem successfully attaches dax * Check that loading the dax_cxl driver loads the dax_hmem driver * Attempt to race cxl_mock_mem async probe vs dax_hmem probe flushing. Check both positive and negative cases. Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Tested-by: Alison Schofield Link: https://patch.msgid.link/20260327052821.440749-10-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- tools/testing/cxl/Kbuild | 7 ++++ tools/testing/cxl/test/Kbuild | 1 + tools/testing/cxl/test/cxl.c | 57 ++++++++++++++++++++++++++++++ tools/testing/cxl/test/hmem_test.c | 47 ++++++++++++++++++++++++ tools/testing/cxl/test/mem.c | 3 ++ tools/testing/cxl/test/mock.c | 50 ++++++++++++++++++++++++++ tools/testing/cxl/test/mock.h | 8 +++++ 7 files changed, 173 insertions(+) create mode 100644 tools/testing/cxl/test/hmem_test.c diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 53d84a6874b7..540425c7cd41 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -11,8 +11,12 @@ ldflags-y += --wrap=devm_cxl_endpoint_decoders_setup ldflags-y += --wrap=hmat_get_extended_linear_cache_size ldflags-y += --wrap=devm_cxl_add_dport_by_dev ldflags-y += --wrap=devm_cxl_switch_port_decoders_setup +ldflags-y += --wrap=walk_hmem_resources +ldflags-y += --wrap=region_intersects +ldflags-y += --wrap=region_intersects_soft_reserve DRIVERS := ../../../drivers +DAX_HMEM_SRC := $(DRIVERS)/dax/hmem CXL_SRC := $(DRIVERS)/cxl CXL_CORE_SRC := $(DRIVERS)/cxl/core ccflags-y := -I$(srctree)/drivers/cxl/ @@ -70,6 +74,9 @@ cxl_core-y += config_check.o cxl_core-y += cxl_core_test.o cxl_core-y += cxl_core_exports.o +obj-m += dax_hmem.o +dax_hmem-y := $(DAX_HMEM_SRC)/hmem.o +
KBUILD_CFLAGS := $(filter-out -Wmissing-prototypes -Wmissing-declarations, $(KBUILD_CFLAGS)) obj-m += test/ diff --git a/tools/testing/cxl/test/Kbuild b/tools/testing/cxl/test/Kbuild index af50972c8b6d..c168e3c998a7 100644 --- a/tools/testing/cxl/test/Kbuild +++ b/tools/testing/cxl/test/Kbuild @@ -7,6 +7,7 @@ obj-m += cxl_mock_mem.o obj-m += cxl_translate.o cxl_test-y := cxl.o +cxl_test-y += hmem_test.o cxl_mock-y := mock.o cxl_mock_mem-y := mem.o diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 7deeb7ff7bdf..9a9f52090c1d 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -1121,6 +1121,53 @@ static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port) cxl_endpoint_get_perf_coordinates(port, ep_c); } +/* + * Simulate that the first half of mock CXL Window 0 is "Soft Reserve" capacity + */ +static int mock_walk_hmem_resources(struct device *host, walk_hmem_fn fn) +{ + struct acpi_cedt_cfmws *cfmws = mock_cfmws[0]; + struct resource window = + DEFINE_RES_MEM(cfmws->base_hpa, cfmws->window_size / 2); + + dev_dbg(host, "walk cxl_test resource: %pr\n", &window); + return fn(host, 0, &window); +} + +/* + * This should only be called by the dax_hmem case, treat mismatches (negative + * result) as "fallback to base region_intersects()". Simulate that the first + * half of mock CXL Window 0 is IORES_DESC_CXL capacity. 
+ */ +static int mock_region_intersects(resource_size_t start, size_t size, + unsigned long flags, unsigned long desc) +{ + struct resource res = DEFINE_RES_MEM(start, size); + struct acpi_cedt_cfmws *cfmws = mock_cfmws[0]; + struct resource window = + DEFINE_RES_MEM(cfmws->base_hpa, cfmws->window_size / 2); + + if (resource_overlaps(&res, &window)) + return REGION_INTERSECTS; + pr_debug("warning: no cxl_test CXL intersection for %pr\n", &res); + return -1; +} + + +static int +mock_region_intersects_soft_reserve(resource_size_t start, size_t size) +{ + struct resource res = DEFINE_RES_MEM(start, size); + struct acpi_cedt_cfmws *cfmws = mock_cfmws[0]; + struct resource window = + DEFINE_RES_MEM(cfmws->base_hpa, cfmws->window_size / 2); + + if (resource_overlaps(&res, &window)) + return REGION_INTERSECTS; + pr_debug("warning: no cxl_test soft reserve intersection for %pr\n", &res); + return -1; +} + static struct cxl_mock_ops cxl_mock_ops = { .is_mock_adev = is_mock_adev, .is_mock_bridge = is_mock_bridge, @@ -1136,6 +1183,9 @@ static struct cxl_mock_ops cxl_mock_ops = { .devm_cxl_add_dport_by_dev = mock_cxl_add_dport_by_dev, .hmat_get_extended_linear_cache_size = mock_hmat_get_extended_linear_cache_size, + .walk_hmem_resources = mock_walk_hmem_resources, + .region_intersects = mock_region_intersects, + .region_intersects_soft_reserve = mock_region_intersects_soft_reserve, .list = LIST_HEAD_INIT(cxl_mock_ops.list), }; @@ -1561,8 +1611,14 @@ static __init int cxl_test_init(void) if (rc) goto err_root; + rc = hmem_test_init(); + if (rc) + goto err_mem; + return 0; +err_mem: + cxl_mem_exit(); err_root: platform_device_put(cxl_acpi); err_rch: @@ -1600,6 +1656,7 @@ static __exit void cxl_test_exit(void) { int i; + hmem_test_exit(); cxl_mem_exit(); platform_device_unregister(cxl_acpi); cxl_rch_topo_exit(); diff --git a/tools/testing/cxl/test/hmem_test.c b/tools/testing/cxl/test/hmem_test.c new file mode 100644 index 000000000000..3a1a089e1721 --- /dev/null +++ 
b/tools/testing/cxl/test/hmem_test.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2026 Intel Corporation */ +#include +#include +#include "../../../drivers/dax/bus.h" + +static bool hmem_test; + +static void hmem_test_work(struct work_struct *work) +{ +} + +static void hmem_test_release(struct device *dev) +{ + struct hmem_platform_device *hpdev = + container_of(dev, typeof(*hpdev), pdev.dev); + + memset(hpdev, 0, sizeof(*hpdev)); +} + +static struct hmem_platform_device hmem_test_device = { + .pdev = { + .name = "hmem_platform", + .id = 1, + .dev = { + .release = hmem_test_release, + }, + }, + .work = __WORK_INITIALIZER(hmem_test_device.work, hmem_test_work), +}; + +int hmem_test_init(void) +{ + if (!hmem_test) + return 0; + + return platform_device_register(&hmem_test_device.pdev); +} + +void hmem_test_exit(void) +{ + if (hmem_test) + platform_device_unregister(&hmem_test_device.pdev); +} + +module_param(hmem_test, bool, 0444); +MODULE_PARM_DESC(hmem_test, "Enable/disable the dax_hmem test platform device"); diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index cb87e8c0e63c..cc847e9aeceb 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1695,6 +1695,9 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) struct cxl_dpa_info range_info = { 0 }; int rc; + /* Increase async probe race window */ + usleep_range(500*1000, 1000*1000); + mdata = devm_kzalloc(dev, sizeof(*mdata), GFP_KERNEL); if (!mdata) return -ENOMEM; diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index b8fcb50c1027..6454b868b122 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -251,6 +251,56 @@ struct cxl_dport *__wrap_devm_cxl_add_dport_by_dev(struct cxl_port *port, } EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_add_dport_by_dev, "CXL"); +int __wrap_region_intersects(resource_size_t start, size_t size, + unsigned long flags, unsigned long desc) +{ + int rc = 
-1; + int index; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (ops) + rc = ops->region_intersects(start, size, flags, desc); + if (rc < 0) + rc = region_intersects(start, size, flags, desc); + put_cxl_mock_ops(index); + + return rc; +} +EXPORT_SYMBOL_GPL(__wrap_region_intersects); + +int __wrap_region_intersects_soft_reserve(resource_size_t start, size_t size) +{ + int rc = -1; + int index; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (ops) + rc = ops->region_intersects_soft_reserve(start, size); + if (rc < 0) + rc = region_intersects_soft_reserve(start, size); + put_cxl_mock_ops(index); + + return rc; +} +EXPORT_SYMBOL_GPL(__wrap_region_intersects_soft_reserve); + +int __wrap_walk_hmem_resources(struct device *host, walk_hmem_fn fn) +{ + int index, rc = 0; + bool is_mock = strcmp(dev_name(host), "hmem_platform.1") == 0; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (is_mock) { + if (ops) + rc = ops->walk_hmem_resources(host, fn); + } else { + rc = walk_hmem_resources(host, fn); + } + put_cxl_mock_ops(index); + return rc; +} +EXPORT_SYMBOL_GPL(__wrap_walk_hmem_resources); + MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("cxl_test: emulation module"); MODULE_IMPORT_NS("ACPI"); diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h index 2684b89c8aa2..4f57dc80ae7d 100644 --- a/tools/testing/cxl/test/mock.h +++ b/tools/testing/cxl/test/mock.h @@ -2,6 +2,7 @@ #include #include +#include #include struct cxl_mock_ops { @@ -27,8 +28,15 @@ struct cxl_mock_ops { int (*hmat_get_extended_linear_cache_size)(struct resource *backing_res, int nid, resource_size_t *cache_size); + int (*walk_hmem_resources)(struct device *host, walk_hmem_fn fn); + int (*region_intersects)(resource_size_t start, size_t size, + unsigned long flags, unsigned long desc); + int (*region_intersects_soft_reserve)(resource_size_t start, + size_t size); }; +int hmem_test_init(void); +void hmem_test_exit(void); void 
register_cxl_mock_ops(struct cxl_mock_ops *ops); void unregister_cxl_mock_ops(struct cxl_mock_ops *ops); struct cxl_mock_ops *get_cxl_mock_ops(int *index); From d585bc86fb9f405ed1f2f56cc50c82d9aaada297 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Wed, 1 Apr 2026 20:49:51 +0800 Subject: [PATCH 33/37] cxl/region: Add a region sysfs interface for region lock status There are 3 scenarios that lead to a locked region: 1. A region is created on a root decoder with Fixed Device Configuration attribute. 2. CXL_HDM_DECODER0_CTRL_LOCK. Both 1 & 2 are well described in: commit 2230c4bdc412 ("cxl: Add handling of locked CXL decoder") 3. Platform that has region creation with PRMT address translation always locks the region, regardless of the FIXED attribute or decoder ctrl bit. Region locked means region destroy operations are not permitted. CXL region driver returns -EPERM for region destroy operations. Although the locked status of the corresponding root decoder implies the region is also locked, exposing the region lock status directly to userspace improves usability for users who may not be aware of this relationship. [ dj: Amended commit log with additional locking scenarios. ] Signed-off-by: Li Ming Reviewed-by: Dave Jiang Reviewed-by: Alejandro Lucero Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20260401124951.1290041-1-ming.li@zohomail.com Signed-off-by: Dave Jiang --- Documentation/ABI/testing/sysfs-bus-cxl | 13 +++++++++++++ drivers/cxl/core/region.c | 17 +++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index c80a1b5a03db..16a9b3d2e2c0 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -508,6 +508,19 @@ Description: (RO) The size of extended linear cache, if there is an extended linear cache. Otherwise the attribute will not be visible. 
+ +What: /sys/bus/cxl/devices/regionZ/locked +Date: Mar, 2026 +KernelVersion: v7.1 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) The CXL driver has the capability to lock a region based on + a BIOS or platform dependent configuration. Regions created as + locked are never permitted to be destroyed. Resets to participating + decoders will not result in a region destroy and will not free the + decoder resources. + + What: /sys/bus/cxl/devices/regionZ/mode Date: January, 2023 KernelVersion: v6.3 diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 42874948b589..95d81816008e 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -767,6 +767,22 @@ static ssize_t extended_linear_cache_size_show(struct device *dev, } static DEVICE_ATTR_RO(extended_linear_cache_size); +static ssize_t locked_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + int rc; + + ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) + return rc; + + rc = test_bit(CXL_REGION_F_LOCK, &cxlr->flags); + return sysfs_emit(buf, "%d\n", rc); +} +static DEVICE_ATTR_RO(locked); + static struct attribute *cxl_region_attrs[] = { &dev_attr_uuid.attr, &dev_attr_commit.attr, @@ -776,6 +792,7 @@ static struct attribute *cxl_region_attrs[] = { &dev_attr_size.attr, &dev_attr_mode.attr, &dev_attr_extended_linear_cache_size.attr, + &dev_attr_locked.attr, NULL, }; From f3b1d2260703f8fb39fd667a26d931d63d2dd10e Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Fri, 13 Mar 2026 23:19:50 -0700 Subject: [PATCH 34/37] tools/testing/cxl: Enable replay of user regions as auto regions The cxl_test module currently hard-codes auto regions in the mock topology, limiting coverage of the driver's region auto-assembly logic. Teach cxl_test to replay previously committed decoder programming across a cxl_acpi unbind/bind cycle. 
Decoder programming is recorded in a registry keyed by a stable port identity and decoder id. The registry is updated on decoder commit and reset events and consulted during enumeration to restore previously enabled decoders. This allows regions created through the user interface to be replayed during enumeration and treated as auto-discovered regions, enabling testing of region auto-assembly using configurations created in the cxl_test topology. Example workflow: # cxl create-region ... # echo 1 > /sys/bus/platform/devices/cxl_acpi.0/decoder_reset_preserve_registry # echo cxl_acpi.0 > /sys/bus/platform/drivers/cxl_acpi/unbind # echo cxl_acpi.0 > /sys/bus/platform/drivers/cxl_acpi/bind # echo 0 > /sys/bus/platform/devices/cxl_acpi.0/decoder_reset_preserve_registry The NDCTL CXL unit test, cxl-region-replay.sh, demonstrates the usage. Co-developed-by: Dan Williams Signed-off-by: Dan Williams Co-developed-by: Dave Jiang Signed-off-by: Alison Schofield Link: https://patch.msgid.link/20260314061952.2221030-1-alison.schofield@intel.com Signed-off-by: Dave Jiang --- tools/testing/cxl/test/cxl.c | 383 ++++++++++++++++++++++++++++++++++- 1 file changed, 373 insertions(+), 10 deletions(-) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 81e2aef3627a..cd47fdd7ccb5 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -51,6 +51,31 @@ struct platform_device *cxl_mem_single[NR_MEM_SINGLE]; static struct platform_device *cxl_rch[NR_CXL_RCH]; static struct platform_device *cxl_rcd[NR_CXL_RCH]; +/* + * Decoder registry + * + * Record decoder programming so that the topology can be reconstructed + * after cxl_acpi unbind/bind. This allows a user-created region config + * to be replayed as if firmware had provided the region at enumeration + * time. + * + * Entries are keyed by a stable port identity (port->uport_dev) combined + * with the decoder id. Decoder state is saved at initialization and + * updated on commit and reset. 
+ * + * On re-enumeration mock_init_hdm_decoder() consults this registry to + * restore enabled decoders. Disabled decoders are reinitialized to a + * clean default state rather than replaying stale programming. + */ +static DEFINE_XARRAY(decoder_registry); + +/* + * When set, decoder reset will not update the registry. This allows + * region destroy operations to reset live decoders without erasing + * the saved programming needed for replay after re-enumeration. + */ +static bool decoder_reset_preserve_registry; + static inline bool is_multi_bridge(struct device *dev) { int i; @@ -704,6 +729,194 @@ static int map_targets(struct device *dev, void *data) return 0; } +/* + * Build a stable registry key from the decoder's upstream port identity + * and decoder id. + * + * Decoder objects and cxl_port objects are reallocated on each enumeration, + * so their addresses cannot be used directly as replay keys. However, + * port->uport_dev is stable for a given topology across cxl_acpi unbind/bind + * in cxl_test, so use that as the port identity and pack the local decoder + * id into the low bits. + * + * The key is formed as: + * ((unsigned long)port->uport_dev << 4) | cxld->id + * + * The low bits hold the decoder id (which must fit in 4 bits) while + * the remaining bits identify the upstream port. This key is only used + * within cxl_test to locate saved decoder state during replay. 
+ */ +static unsigned long cxld_registry_index(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + + dev_WARN_ONCE(&port->dev, cxld->id >= 16, + "decoder id:%d out of range\n", cxld->id); + return (((unsigned long)port->uport_dev) << 4) | cxld->id; +} + +struct cxl_test_decoder { + union { + struct cxl_switch_decoder cxlsd; + struct cxl_endpoint_decoder cxled; + }; + struct range dpa_range; +}; + +static struct cxl_test_decoder *cxld_registry_find(struct cxl_decoder *cxld) +{ + return xa_load(&decoder_registry, cxld_registry_index(cxld)); +} + +#define dbg_cxld(port, msg, cxld) \ + do { \ + struct cxl_decoder *___d = (cxld); \ + dev_dbg((port)->uport_dev, \ + "decoder%d: %s range: %#llx-%#llx iw: %d ig: %d flags: %#lx\n", \ + ___d->id, msg, ___d->hpa_range.start, \ + ___d->hpa_range.end + 1, ___d->interleave_ways, \ + ___d->interleave_granularity, ___d->flags); \ + } while (0) + +static int mock_decoder_commit(struct cxl_decoder *cxld); +static void mock_decoder_reset(struct cxl_decoder *cxld); +static void init_disabled_mock_decoder(struct cxl_decoder *cxld); + +static void cxld_copy(struct cxl_decoder *a, struct cxl_decoder *b) +{ + a->id = b->id; + a->hpa_range = b->hpa_range; + a->interleave_ways = b->interleave_ways; + a->interleave_granularity = b->interleave_granularity; + a->target_type = b->target_type; + a->flags = b->flags; + a->commit = mock_decoder_commit; + a->reset = mock_decoder_reset; +} + +/* + * Restore decoder programming saved in the registry. + * + * Only decoders that were saved enabled are restored. Disabled decoders + * are left in their default inactive state so that stale programming is + * not resurrected after topology replay. + * + * For endpoint decoders this also restores the DPA reservation needed + * to reconstruct committed mappings. 
+ */ +static int cxld_registry_restore(struct cxl_decoder *cxld, + struct cxl_test_decoder *td) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + int rc; + + if (is_switch_decoder(&cxld->dev)) { + struct cxl_switch_decoder *cxlsd = + to_cxl_switch_decoder(&cxld->dev); + + if (!(td->cxlsd.cxld.flags & CXL_DECODER_F_ENABLE)) + return 0; + + dbg_cxld(port, "restore", &td->cxlsd.cxld); + cxld_copy(cxld, &td->cxlsd.cxld); + WARN_ON(cxlsd->nr_targets != td->cxlsd.nr_targets); + + /* Restore saved target intent; live dport binding happens later */ + for (int i = 0; i < cxlsd->nr_targets; i++) { + cxlsd->target[i] = NULL; + cxld->target_map[i] = td->cxlsd.cxld.target_map[i]; + } + + port->commit_end = cxld->id; + + } else { + struct cxl_endpoint_decoder *cxled = + to_cxl_endpoint_decoder(&cxld->dev); + + if (!(td->cxled.cxld.flags & CXL_DECODER_F_ENABLE)) + return 0; + + dbg_cxld(port, "restore", &td->cxled.cxld); + cxld_copy(cxld, &td->cxled.cxld); + cxled->state = td->cxled.state; + cxled->skip = td->cxled.skip; + if (range_len(&td->dpa_range)) { + rc = devm_cxl_dpa_reserve(cxled, td->dpa_range.start, + range_len(&td->dpa_range), + td->cxled.skip); + if (rc) { + init_disabled_mock_decoder(cxld); + return rc; + } + } + port->commit_end = cxld->id; + } + + return 0; +} + +static void __cxld_registry_save(struct cxl_test_decoder *td, + struct cxl_decoder *cxld) +{ + if (is_switch_decoder(&cxld->dev)) { + struct cxl_switch_decoder *cxlsd = + to_cxl_switch_decoder(&cxld->dev); + + cxld_copy(&td->cxlsd.cxld, cxld); + td->cxlsd.nr_targets = cxlsd->nr_targets; + + /* Save target port_id as a stable identify for the dport */ + for (int i = 0; i < cxlsd->nr_targets; i++) { + struct cxl_dport *dport; + + if (!cxlsd->target[i]) + continue; + + dport = cxlsd->target[i]; + td->cxlsd.cxld.target_map[i] = dport->port_id; + } + } else { + struct cxl_endpoint_decoder *cxled = + to_cxl_endpoint_decoder(&cxld->dev); + + cxld_copy(&td->cxled.cxld, cxld); + td->cxled.state = 
cxled->state; + td->cxled.skip = cxled->skip; + + if (!(cxld->flags & CXL_DECODER_F_ENABLE)) { + td->dpa_range.start = 0; + td->dpa_range.end = -1; + } else if (cxled->dpa_res) { + td->dpa_range.start = cxled->dpa_res->start; + td->dpa_range.end = cxled->dpa_res->end; + } else { + td->dpa_range.start = 0; + td->dpa_range.end = -1; + } + } +} + +static void cxld_registry_save(struct cxl_test_decoder *td, + struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + + dbg_cxld(port, "save", cxld); + __cxld_registry_save(td, cxld); +} + +static void cxld_registry_update(struct cxl_decoder *cxld) +{ + struct cxl_test_decoder *td = cxld_registry_find(cxld); + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + + if (WARN_ON_ONCE(!td)) + return; + + dbg_cxld(port, "update", cxld); + __cxld_registry_save(td, cxld); +} + static int mock_decoder_commit(struct cxl_decoder *cxld) { struct cxl_port *port = to_cxl_port(cxld->dev.parent); @@ -723,6 +936,13 @@ static int mock_decoder_commit(struct cxl_decoder *cxld) port->commit_end++; cxld->flags |= CXL_DECODER_F_ENABLE; + if (is_endpoint_decoder(&cxld->dev)) { + struct cxl_endpoint_decoder *cxled = + to_cxl_endpoint_decoder(&cxld->dev); + + cxled->state = CXL_DECODER_STATE_AUTO; + } + cxld_registry_update(cxld); return 0; } @@ -743,6 +963,65 @@ static void mock_decoder_reset(struct cxl_decoder *cxld) "%s: out of order reset, expected decoder%d.%d\n", dev_name(&cxld->dev), port->id, port->commit_end); cxld->flags &= ~CXL_DECODER_F_ENABLE; + + if (is_endpoint_decoder(&cxld->dev)) { + struct cxl_endpoint_decoder *cxled = + to_cxl_endpoint_decoder(&cxld->dev); + + cxled->state = CXL_DECODER_STATE_MANUAL; + cxled->skip = 0; + } + if (decoder_reset_preserve_registry) + dev_dbg(port->uport_dev, "decoder%d: skip registry update\n", + cxld->id); + else + cxld_registry_update(cxld); +} + +static struct cxl_test_decoder *cxld_registry_new(struct cxl_decoder *cxld) +{ + struct cxl_test_decoder *td 
__free(kfree) = + kzalloc(sizeof(*td), GFP_KERNEL); + unsigned long key = cxld_registry_index(cxld); + + if (!td) + return NULL; + + if (xa_insert(&decoder_registry, key, td, GFP_KERNEL)) { + WARN_ON(1); + return NULL; + } + + cxld_registry_save(td, cxld); + return no_free_ptr(td); +} + +static void init_disabled_mock_decoder(struct cxl_decoder *cxld) +{ + cxld->hpa_range.start = 0; + cxld->hpa_range.end = -1; + cxld->interleave_ways = 1; + cxld->interleave_granularity = 0; + cxld->target_type = CXL_DECODER_HOSTONLYMEM; + cxld->flags = 0; + cxld->commit = mock_decoder_commit; + cxld->reset = mock_decoder_reset; + + if (is_switch_decoder(&cxld->dev)) { + struct cxl_switch_decoder *cxlsd = + to_cxl_switch_decoder(&cxld->dev); + + for (int i = 0; i < cxlsd->nr_targets; i++) { + cxlsd->target[i] = NULL; + cxld->target_map[i] = 0; + } + } else { + struct cxl_endpoint_decoder *cxled = + to_cxl_endpoint_decoder(&cxld->dev); + + cxled->state = CXL_DECODER_STATE_MANUAL; + cxled->skip = 0; + } } static void default_mock_decoder(struct cxl_decoder *cxld) @@ -757,6 +1036,8 @@ static void default_mock_decoder(struct cxl_decoder *cxld) cxld->target_type = CXL_DECODER_HOSTONLYMEM; cxld->commit = mock_decoder_commit; cxld->reset = mock_decoder_reset; + + WARN_ON_ONCE(!cxld_registry_new(cxld)); } static int first_decoder(struct device *dev, const void *data) @@ -771,13 +1052,29 @@ static int first_decoder(struct device *dev, const void *data) return 0; } -static void mock_init_hdm_decoder(struct cxl_decoder *cxld) +/* + * Initialize a decoder during HDM enumeration. + * + * If a saved registry entry exists: + * - enabled decoders are restored from the saved programming + * - disabled decoders are initialized in a clean disabled state + * + * If no registry entry exists the decoder follows the normal mock + * initialization path, including the special auto-region setup for + * the first endpoints under host-bridge0. 
+ * + * Returns true if decoder state was restored from the registry. In + * that case the saved decode configuration (including target mapping) + * has already been applied and the map_targets() is skipped. + */ +static bool mock_init_hdm_decoder(struct cxl_decoder *cxld) { struct acpi_cedt_cfmws *window = mock_cfmws[0]; struct platform_device *pdev = NULL; struct cxl_endpoint_decoder *cxled; struct cxl_switch_decoder *cxlsd; struct cxl_port *port, *iter; + struct cxl_test_decoder *td; struct cxl_memdev *cxlmd; struct cxl_dport *dport; struct device *dev; @@ -804,6 +1101,24 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) port = NULL; } while (port); port = cxled_to_port(cxled); + } else { + port = to_cxl_port(cxld->dev.parent); + } + + td = cxld_registry_find(cxld); + if (td) { + bool enabled; + + if (is_switch_decoder(&cxld->dev)) + enabled = td->cxlsd.cxld.flags & CXL_DECODER_F_ENABLE; + else + enabled = td->cxled.cxld.flags & CXL_DECODER_F_ENABLE; + + if (enabled) + return !cxld_registry_restore(cxld, td); + + init_disabled_mock_decoder(cxld); + return false; } /* @@ -814,9 +1129,10 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) * * See 'cxl list -BMPu -m cxl_mem.0,cxl_mem.4' */ - if (!hb0 || pdev->id % 4 || pdev->id > 4 || cxld->id > 0) { + if (!is_endpoint_decoder(&cxld->dev) || !hb0 || pdev->id % 4 || + pdev->id > 4 || cxld->id > 0) { default_mock_decoder(cxld); - return; + return false; } base = window->base_hpa; @@ -838,6 +1154,7 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) cxld->commit = mock_decoder_commit; cxld->reset = mock_decoder_reset; + WARN_ON_ONCE(!cxld_registry_new(cxld)); /* * Now that endpoint decoder is set up, walk up the hierarchy * and setup the switch and root port decoders targeting @cxlmd. 
@@ -859,14 +1176,14 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) /* put cxl_mem.4 second in the decode order */ if (pdev->id == 4) { cxlsd->target[1] = dport; - cxld->target_map[1] = dport->port_id; + cxlsd->cxld.target_map[1] = dport->port_id; } else { cxlsd->target[0] = dport; - cxld->target_map[0] = dport->port_id; + cxlsd->cxld.target_map[0] = dport->port_id; } } else { cxlsd->target[0] = dport; - cxld->target_map[0] = dport->port_id; + cxlsd->cxld.target_map[0] = dport->port_id; } cxld = &cxlsd->cxld; cxld->target_type = CXL_DECODER_HOSTONLYMEM; @@ -885,8 +1202,14 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) .start = base, .end = base + mock_auto_region_size - 1, }; + cxld->commit = mock_decoder_commit; + cxld->reset = mock_decoder_reset; + + cxld_registry_update(cxld); put_device(dev); } + + return false; } static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, @@ -895,6 +1218,7 @@ static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, struct cxl_port *port = cxlhdm->port; struct cxl_port *parent_port = to_cxl_port(port->dev.parent); int target_count, i; + bool restored; if (is_cxl_endpoint(port)) target_count = 0; @@ -934,10 +1258,8 @@ static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, } ctx.target_map = cxld->target_map; - - mock_init_hdm_decoder(cxld); - - if (target_count) { + restored = mock_init_hdm_decoder(cxld); + if (target_count && !restored) { rc = device_for_each_child(port->uport_dev, &ctx, map_targets); if (rc) { @@ -1415,6 +1737,33 @@ err_mem: return rc; } +static ssize_t +decoder_reset_preserve_registry_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", decoder_reset_preserve_registry); +} + +static ssize_t +decoder_reset_preserve_registry_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int rc; + + rc = kstrtobool(buf, &decoder_reset_preserve_registry); + if (rc) + return rc; + 
return count; +} + +static DEVICE_ATTR_RW(decoder_reset_preserve_registry); + +static struct attribute *cxl_acpi_attrs[] = { + &dev_attr_decoder_reset_preserve_registry.attr, NULL +}; +ATTRIBUTE_GROUPS(cxl_acpi); + static __init int cxl_test_init(void) { int rc, i; @@ -1545,6 +1894,7 @@ static __init int cxl_test_init(void) mock_companion(&acpi0017_mock, &cxl_acpi->dev); acpi0017_mock.dev.bus = &platform_bus_type; + cxl_acpi->dev.groups = cxl_acpi_groups; rc = platform_device_add(cxl_acpi); if (rc) @@ -1589,6 +1939,17 @@ err_gen_pool_create: return rc; } +static void free_decoder_registry(void) +{ + unsigned long index; + void *entry; + + xa_for_each(&decoder_registry, index, entry) { + xa_erase(&decoder_registry, index); + kfree(entry); + } +} + static __exit void cxl_test_exit(void) { int i; @@ -1614,6 +1975,8 @@ static __exit void cxl_test_exit(void) depopulate_all_mock_resources(); gen_pool_destroy(cxl_mock_pool); unregister_cxl_mock_ops(&cxl_mock_ops); + free_decoder_registry(); + xa_destroy(&decoder_registry); } module_param(interleave_arithmetic, int, 0444); From 9b6e1ed28a7f239cc9184101cedc6fec4c3b3dc9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 3 Apr 2026 14:48:46 -0700 Subject: [PATCH 35/37] MAINTAINERS: Update address for Dan Williams Update MAINTAINERS and .mailmap to point to my kernel.org address: djbw@kernel.org. 
Signed-off-by: Dan Williams Link: https://patch.msgid.link/20260403214846.1062341-1-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- .mailmap | 1 + MAINTAINERS | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.mailmap b/.mailmap index 63c11ea7e35d..215fe2329cf4 100644 --- a/.mailmap +++ b/.mailmap @@ -204,6 +204,7 @@ Colin Ian King Corey Minyard Damian Hobson-Garcia Dan Carpenter +Dan Williams Daniel Borkmann Daniel Borkmann Daniel Borkmann diff --git a/MAINTAINERS b/MAINTAINERS index 96ea84948d76..1a58c53f7627 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4058,7 +4058,7 @@ S: Maintained F: crypto/rsa* ASYNCHRONOUS TRANSFERS/TRANSFORMS (IOAT) API -R: Dan Williams +R: Dan Williams S: Odd fixes W: http://sourceforge.net/projects/xscaleiop F: Documentation/crypto/async-tx-api.rst @@ -6429,7 +6429,7 @@ M: Dave Jiang M: Alison Schofield M: Vishal Verma M: Ira Weiny -M: Dan Williams +M: Dan Williams L: linux-cxl@vger.kernel.org S: Maintained F: Documentation/driver-api/cxl @@ -7290,7 +7290,7 @@ S: Maintained F: scripts/dev-needs.sh DEVICE DIRECT ACCESS (DAX) -M: Dan Williams +M: Dan Williams M: Vishal Verma M: Dave Jiang L: nvdimm@lists.linux.dev @@ -9803,7 +9803,7 @@ F: include/linux/fcntl.h F: include/uapi/linux/fcntl.h FILESYSTEM DIRECT ACCESS (DAX) -M: Dan Williams +M: Dan Williams R: Matthew Wilcox R: Jan Kara L: linux-fsdevel@vger.kernel.org @@ -12861,7 +12861,7 @@ F: drivers/platform/x86/intel/hid.c INTEL I/OAT DMA DRIVER M: Dave Jiang -R: Dan Williams +R: Dan Williams L: dmaengine@vger.kernel.org S: Supported Q: https://patchwork.kernel.org/project/linux-dmaengine/list/ @@ -14564,7 +14564,7 @@ K: libie LIBNVDIMM BTT: BLOCK TRANSLATION TABLE M: Vishal Verma -M: Dan Williams +M: Dan Williams M: Dave Jiang L: nvdimm@lists.linux.dev S: Supported @@ -14573,7 +14573,7 @@ P: Documentation/nvdimm/maintainer-entry-profile.rst F: drivers/nvdimm/btt* LIBNVDIMM PMEM: PERSISTENT MEMORY DRIVER -M: Dan Williams +M: Dan Williams M: 
Vishal Verma M: Dave Jiang L: nvdimm@lists.linux.dev @@ -14591,7 +14591,7 @@ F: Documentation/devicetree/bindings/pmem/pmem-region.yaml F: drivers/nvdimm/of_pmem.c LIBNVDIMM: NON-VOLATILE MEMORY DEVICE SUBSYSTEM -M: Dan Williams +M: Dan Williams M: Vishal Verma M: Dave Jiang M: Ira Weiny @@ -26858,7 +26858,7 @@ S: Maintained F: Documentation/devicetree/bindings/trigger-source/* TRUSTED EXECUTION ENVIRONMENT SECURITY MANAGER (TSM) -M: Dan Williams +M: Dan Williams L: linux-coco@lists.linux.dev S: Maintained F: Documentation/ABI/testing/configfs-tsm-report From 3624a22783b74ffebaa7d9f286e203604baa06c7 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Sat, 21 Mar 2026 14:14:59 +0800 Subject: [PATCH 36/37] cxl/hdm: Add support for 32 switch decoders Per CXL r4.0 section 8.2.4.20.1, CXL host bridge and switch ports can support 32 HDM decoders. Current implementation misses some decoders on CXL host bridge and switch in the case that the value of Decoder Count field in CXL HDM Decoder Capability Register is greater than or equal to 9. Update calculation implementation to ensure the decoder count calculation is correct for CXL host bridge/switch ports. Signed-off-by: Li Ming Reviewed-by: Gregory Price Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20260321061459.1910205-1-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/core/hdm.c | 2 +- drivers/cxl/cxl.h | 11 ++++++++++- drivers/cxl/cxlmem.h | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index c222e98ae736..3930e130d6b6 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -177,7 +177,7 @@ static struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port, } parse_hdm_decoder_caps(cxlhdm); - if (cxlhdm->decoder_count == 0) { + if (cxlhdm->decoder_count < 0) { dev_err(dev, "Spec violation. 
Caps invalid\n"); return ERR_PTR(-ENXIO); } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index d09c84bcc015..4e7923811f94 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -77,7 +77,16 @@ static inline int cxl_hdm_decoder_count(u32 cap_hdr) { int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); - return val ? val * 2 : 1; + switch (val) { + case 0: + return 1; + case 1 ... 8: + return val * 2; + case 9 ... 12: + return (val - 4) * 4; + default: + return -ENXIO; + } } /* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */ diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index e21d744d639b..399b150b404c 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -923,7 +923,7 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd); */ struct cxl_hdm { struct cxl_component_regs regs; - unsigned int decoder_count; + int decoder_count; unsigned int target_count; unsigned int interleave_mask; unsigned long iw_cap_mask; From dc989bb79380194917351284167f78c3aa084c94 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 15 Apr 2026 13:43:57 +0100 Subject: [PATCH 37/37] MAINTAINERS: Update Jonathan Cameron's email address Update my email address for CXL, FWCTL and Cache subsystems to use my kernel.org account. Also update .mailmap. Separate patches will replace maintainers for HiSilicon specific hardware. 
Signed-off-by: Jonathan Cameron Acked-by: Conor Dooley Link: https://patch.msgid.link/20260415124357.12539-1-Jonathan.Cameron@huawei.com Signed-off-by: Dave Jiang --- .mailmap | 1 + MAINTAINERS | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.mailmap b/.mailmap index 215fe2329cf4..2ffd23c127b9 100644 --- a/.mailmap +++ b/.mailmap @@ -418,6 +418,7 @@ John Stultz Jonas Gorski +Jonathan Cameron Jordan Crouse diff --git a/MAINTAINERS b/MAINTAINERS index 1a58c53f7627..ebeb41c67ada 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6424,7 +6424,7 @@ F: include/linux/compiler_attributes.h COMPUTE EXPRESS LINK (CXL) M: Davidlohr Bueso -M: Jonathan Cameron +M: Jonathan Cameron M: Dave Jiang M: Alison Schofield M: Vishal Verma @@ -6440,7 +6440,7 @@ F: include/uapi/linux/cxl_mem.h F: tools/testing/cxl/ COMPUTE EXPRESS LINK PMU (CPMU) -M: Jonathan Cameron +M: Jonathan Cameron L: linux-cxl@vger.kernel.org S: Maintained F: Documentation/admin-guide/perf/cxl.rst @@ -10524,7 +10524,7 @@ FWCTL SUBSYSTEM M: Dave Jiang M: Jason Gunthorpe M: Saeed Mahameed -R: Jonathan Cameron +R: Jonathan Cameron S: Maintained F: Documentation/userspace-api/fwctl/ F: drivers/fwctl/ @@ -25136,7 +25136,7 @@ F: drivers/staging/ STANDALONE CACHE CONTROLLER DRIVERS M: Conor Dooley -M: Jonathan Cameron +M: Jonathan Cameron S: Maintained T: git https://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git/ F: Documentation/devicetree/bindings/cache/