From 9cc60ec453fe5d58d4faa70829814769a8af24d4 Mon Sep 17 00:00:00 2001
From: Qinxin Xia
Date: Wed, 25 Feb 2026 17:37:58 +0800
Subject: [PATCH 01/24] dma-mapping: benchmark: modify the framework to adapt to more map modes

Adjust the DMA map benchmark framework to make it more flexible and
adaptable to other mapping modes in the future. By abstracting the
framework into five interfaces (prepare, unprepare, initialize_data,
do_map, and do_unmap), a new mapping scheme can be introduced without
major modifications to the existing code structure.

Reviewed-by: Barry Song
Signed-off-by: Qinxin Xia
Signed-off-by: Marek Szyprowski
Link: https://lore.kernel.org/r/20260225093800.3625054-2-xiaqinxin@huawei.com
---
 include/uapi/linux/map_benchmark.h |   8 +-
 kernel/dma/map_benchmark.c         | 131 ++++++++++++++++++++++++-----
 2 files changed, 115 insertions(+), 24 deletions(-)

diff --git a/include/uapi/linux/map_benchmark.h b/include/uapi/linux/map_benchmark.h
index c2d91088a40d..e076748f2120 100644
--- a/include/uapi/linux/map_benchmark.h
+++ b/include/uapi/linux/map_benchmark.h
@@ -17,6 +17,11 @@
 #define DMA_MAP_TO_DEVICE	1
 #define DMA_MAP_FROM_DEVICE	2
 
+enum {
+	DMA_MAP_BENCH_SINGLE_MODE,
+	DMA_MAP_BENCH_MODE_MAX
+};
+
 struct map_benchmark {
 	__u64 avg_map_100ns;	/* average map latency in 100ns */
 	__u64 map_stddev;	/* standard deviation of map latency */
@@ -29,7 +34,8 @@ struct map_benchmark {
 	__u32 dma_dir;		/* DMA data direction */
 	__u32 dma_trans_ns;	/* time for DMA transmission in ns */
 	__u32 granule;	/* how many PAGE_SIZE will do map/unmap once a time */
-	__u8 expansion[76];	/* For future use */
+	__u8 map_mode;	/* the mode of dma map */
+	__u8 expansion[75];	/* For future use */
 };
 
 #endif /* _UAPI_DMA_BENCHMARK_H */
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index 0f33b3ea7daf..b80e0fb399b1 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -5,6 +5,7 @@
 
 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
 
+#include <linux/cleanup.h>
 #include
 #include
 #include
@@ -31,17 +32,105 @@ struct map_benchmark_data {
 	atomic64_t loops;
 };
 
+struct map_benchmark_ops {
+	void *(*prepare)(struct map_benchmark_data *map);
+	void (*unprepare)(void *mparam);
+	void (*initialize_data)(void *mparam);
+	int (*do_map)(void *mparam);
+	void (*do_unmap)(void *mparam);
+};
+
+struct dma_single_map_param {
+	struct device *dev;
+	dma_addr_t addr;
+	void *xbuf;
+	u32 npages;
+	u32 dma_dir;
+};
+
+static void *dma_single_map_benchmark_prepare(struct map_benchmark_data *map)
+{
+	struct dma_single_map_param *params __free(kfree) = kzalloc(sizeof(*params),
+								    GFP_KERNEL);
+	if (!params)
+		return NULL;
+
+	params->npages = map->bparam.granule;
+	params->dma_dir = map->bparam.dma_dir;
+	params->dev = map->dev;
+	params->xbuf = alloc_pages_exact(params->npages * PAGE_SIZE, GFP_KERNEL);
+	if (!params->xbuf)
+		return NULL;
+
+	return_ptr(params);
+}
+
+static void dma_single_map_benchmark_unprepare(void *mparam)
+{
+	struct dma_single_map_param *params = mparam;
+
+	free_pages_exact(params->xbuf, params->npages * PAGE_SIZE);
+	kfree(params);
+}
+
+static void dma_single_map_benchmark_initialize_data(void *mparam)
+{
+	struct dma_single_map_param *params = mparam;
+
+	/*
+	 * for a non-coherent device, if we don't stain them in the
+	 * cache, this will give an underestimate of the real-world
+	 * overhead of BIDIRECTIONAL or TO_DEVICE mappings;
+	 * 66 means everything goes well! 66 is lucky.
+	 */
+	if (params->dma_dir != DMA_FROM_DEVICE)
+		memset(params->xbuf, 0x66, params->npages * PAGE_SIZE);
+}
+
+static int dma_single_map_benchmark_do_map(void *mparam)
+{
+	struct dma_single_map_param *params = mparam;
+
+	params->addr = dma_map_single(params->dev, params->xbuf,
+				      params->npages * PAGE_SIZE, params->dma_dir);
+	if (unlikely(dma_mapping_error(params->dev, params->addr))) {
+		pr_err("dma_map_single failed on %s\n", dev_name(params->dev));
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void dma_single_map_benchmark_do_unmap(void *mparam)
+{
+	struct dma_single_map_param *params = mparam;
+
+	dma_unmap_single(params->dev, params->addr,
+			 params->npages * PAGE_SIZE, params->dma_dir);
+}
+
+static struct map_benchmark_ops dma_single_map_benchmark_ops = {
+	.prepare = dma_single_map_benchmark_prepare,
+	.unprepare = dma_single_map_benchmark_unprepare,
+	.initialize_data = dma_single_map_benchmark_initialize_data,
+	.do_map = dma_single_map_benchmark_do_map,
+	.do_unmap = dma_single_map_benchmark_do_unmap,
+};
+
+static struct map_benchmark_ops *dma_map_benchmark_ops[DMA_MAP_BENCH_MODE_MAX] = {
+	[DMA_MAP_BENCH_SINGLE_MODE] = &dma_single_map_benchmark_ops,
+};
+
 static int map_benchmark_thread(void *data)
 {
-	void *buf;
-	dma_addr_t dma_addr;
 	struct map_benchmark_data *map = data;
-	int npages = map->bparam.granule;
-	u64 size = npages * PAGE_SIZE;
+	__u8 map_mode = map->bparam.map_mode;
 	int ret = 0;
 
-	buf = alloc_pages_exact(size, GFP_KERNEL);
-	if (!buf)
+	struct map_benchmark_ops *mb_ops = dma_map_benchmark_ops[map_mode];
+	void *mparam = mb_ops->prepare(map);
+
+	if (!mparam)
 		return -ENOMEM;
 
 	while (!kthread_should_stop()) {
@@ -49,23 +138,12 @@ static int map_benchmark_thread(void *data)
 		ktime_t map_stime, map_etime, unmap_stime, unmap_etime;
 		ktime_t map_delta, unmap_delta;
 
-		/*
-		 * for a non-coherent device, if we don't stain them in the
-		 * cache, this will give an underestimate of the real-world
-		 * overhead of BIDIRECTIONAL or TO_DEVICE mappings;
-		 * 66 means evertything goes well! 66 is lucky.
-		 */
-		if (map->dir != DMA_FROM_DEVICE)
-			memset(buf, 0x66, size);
-
+		mb_ops->initialize_data(mparam);
 		map_stime = ktime_get();
-		dma_addr = dma_map_single(map->dev, buf, size, map->dir);
-		if (unlikely(dma_mapping_error(map->dev, dma_addr))) {
-			pr_err("dma_map_single failed on %s\n",
-				dev_name(map->dev));
-			ret = -ENOMEM;
+		ret = mb_ops->do_map(mparam);
+		if (ret)
 			goto out;
-		}
+
 		map_etime = ktime_get();
 		map_delta = ktime_sub(map_etime, map_stime);
 
@@ -73,7 +151,8 @@
 		ndelay(map->bparam.dma_trans_ns);
 
 		unmap_stime = ktime_get();
-		dma_unmap_single(map->dev, dma_addr, size, map->dir);
+		mb_ops->do_unmap(mparam);
+
 		unmap_etime = ktime_get();
 		unmap_delta = ktime_sub(unmap_etime, unmap_stime);
 
@@ -108,7 +187,7 @@
 	}
 
 out:
-	free_pages_exact(buf, size);
+	mb_ops->unprepare(mparam);
 	return ret;
 }
 
@@ -209,6 +288,12 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
 
 	switch (cmd) {
 	case DMA_MAP_BENCHMARK:
+		if (map->bparam.map_mode < 0 ||
+		    map->bparam.map_mode >= DMA_MAP_BENCH_MODE_MAX) {
+			pr_err("invalid map mode\n");
+			return -EINVAL;
+		}
+
 		if (map->bparam.threads == 0 ||
 		    map->bparam.threads > DMA_MAP_MAX_THREADS) {
 			pr_err("invalid thread number\n");

From a8d14dd6e621f47344d0eda72f7ce9203bdef4f1 Mon Sep 17 00:00:00 2001
From: Qinxin Xia
Date: Wed, 25 Feb 2026 17:37:59 +0800
Subject: [PATCH 02/24] dma-mapping: benchmark: add support for dma_map_sg

Add support for DMA scatter-gather mapping, intended for testing
mapping performance. This is achieved by introducing the
dma_sg_map_param structure and related functions, which implement the
scatter-gather mapping preparation, mapping, and unmapping operations.
Additionally, the dma_map_benchmark_ops array is updated to include the
operations for scatter-gather mapping. This commit provides a wider
range of mapping performance tests to cater to different scenarios.
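For orientation, the dispatch path through these per-mode ops can be
sketched as follows (an illustrative fragment only, not part of the
patch; the helper name run_one_iteration() is invented here, and error
handling and timing are trimmed):

  static int run_one_iteration(struct map_benchmark_data *map)
  {
      struct map_benchmark_ops *mb_ops =
          dma_map_benchmark_ops[map->bparam.map_mode];
      void *mparam = mb_ops->prepare(map);  /* pages or sg_table */
      int ret;

      if (!mparam)
          return -ENOMEM;

      mb_ops->initialize_data(mparam);  /* dirty cache lines if needed */
      ret = mb_ops->do_map(mparam);     /* dma_map_single()/dma_map_sg() */
      if (!ret)
          mb_ops->do_unmap(mparam);     /* matching unmap */
      mb_ops->unprepare(mparam);        /* free the buffers */
      return ret;
  }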
Reviewed-by: Barry Song
Signed-off-by: Qinxin Xia
Signed-off-by: Marek Szyprowski
Link: https://lore.kernel.org/r/20260225093800.3625054-3-xiaqinxin@huawei.com
---
 include/uapi/linux/map_benchmark.h |   5 +-
 kernel/dma/map_benchmark.c         | 115 +++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/map_benchmark.h b/include/uapi/linux/map_benchmark.h
index e076748f2120..4b17829a9f17 100644
--- a/include/uapi/linux/map_benchmark.h
+++ b/include/uapi/linux/map_benchmark.h
@@ -19,6 +19,7 @@
 
 enum {
 	DMA_MAP_BENCH_SINGLE_MODE,
+	DMA_MAP_BENCH_SG_MODE,
 	DMA_MAP_BENCH_MODE_MAX
 };
 
@@ -33,7 +34,9 @@ struct map_benchmark {
 	__u32 dma_bits;		/* DMA addressing capability */
 	__u32 dma_dir;		/* DMA data direction */
 	__u32 dma_trans_ns;	/* time for DMA transmission in ns */
-	__u32 granule;	/* how many PAGE_SIZE will do map/unmap once a time */
+	__u32 granule;	/* - SINGLE_MODE: number of pages mapped/unmapped per operation
+			 * - SG_MODE: number of scatterlist entries (each maps one page)
+			 */
 	__u8 map_mode;	/* the mode of dma map */
 	__u8 expansion[75];	/* For future use */
 };
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index b80e0fb399b1..29eeb5fdf199 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include <linux/scatterlist.h>
 #include
 #include
 #include
@@ -117,8 +118,122 @@ static struct map_benchmark_ops dma_single_map_benchmark_ops = {
 	.do_unmap = dma_single_map_benchmark_do_unmap,
 };
 
+struct dma_sg_map_param {
+	struct sg_table sgt;
+	struct device *dev;
+	void **buf;
+	u32 npages;
+	u32 dma_dir;
+};
+
+static void *dma_sg_map_benchmark_prepare(struct map_benchmark_data *map)
+{
+	struct scatterlist *sg;
+	int i;
+
+	struct dma_sg_map_param *params = kzalloc(sizeof(*params), GFP_KERNEL);
+
+	if (!params)
+		return NULL;
+	/*
+	 * Set the number of scatterlist entries based on the granule.
+	 * In SG mode, 'granule' represents the number of scatterlist entries.
+	 * Each scatterlist entry corresponds to a single page.
+	 */
+	params->npages = map->bparam.granule;
+	params->dma_dir = map->bparam.dma_dir;
+	params->dev = map->dev;
+	params->buf = kmalloc_array(params->npages, sizeof(*params->buf),
+				    GFP_KERNEL);
+	if (!params->buf)
+		goto out;
+
+	if (sg_alloc_table(&params->sgt, params->npages, GFP_KERNEL))
+		goto free_buf;
+
+	for_each_sgtable_sg(&params->sgt, sg, i) {
+		params->buf[i] = (void *)__get_free_page(GFP_KERNEL);
+		if (!params->buf[i])
+			goto free_page;
+
+		sg_set_buf(sg, params->buf[i], PAGE_SIZE);
+	}
+
+	return params;
+
+free_page:
+	while (i-- > 0)
+		free_page((unsigned long)params->buf[i]);
+
+	sg_free_table(&params->sgt);
+free_buf:
+	kfree(params->buf);
+out:
+	kfree(params);
+	return NULL;
+}
+
+static void dma_sg_map_benchmark_unprepare(void *mparam)
+{
+	struct dma_sg_map_param *params = mparam;
+	int i;
+
+	for (i = 0; i < params->npages; i++)
+		free_page((unsigned long)params->buf[i]);
+
+	sg_free_table(&params->sgt);
+
+	kfree(params->buf);
+	kfree(params);
+}
+
+static void dma_sg_map_benchmark_initialize_data(void *mparam)
+{
+	struct dma_sg_map_param *params = mparam;
+	struct scatterlist *sg;
+	int i = 0;
+
+	if (params->dma_dir == DMA_FROM_DEVICE)
+		return;
+
+	for_each_sgtable_sg(&params->sgt, sg, i)
+		memset(params->buf[i], 0x66, PAGE_SIZE);
+}
+
+static int dma_sg_map_benchmark_do_map(void *mparam)
+{
+	struct dma_sg_map_param *params = mparam;
+	int ret = 0;
+
+	int sg_mapped = dma_map_sg(params->dev, params->sgt.sgl,
+				   params->npages, params->dma_dir);
+	if (!sg_mapped) {
+		pr_err("dma_map_sg failed on %s\n", dev_name(params->dev));
+		ret = -ENOMEM;
+	}
+
+	return ret;
+}
+
+static void dma_sg_map_benchmark_do_unmap(void *mparam)
+{
+	struct dma_sg_map_param *params = mparam;
+
+	dma_unmap_sg(params->dev, params->sgt.sgl, params->npages,
+		     params->dma_dir);
+}
+
+static struct map_benchmark_ops dma_sg_map_benchmark_ops = {
+	.prepare = dma_sg_map_benchmark_prepare,
+	.unprepare = dma_sg_map_benchmark_unprepare,
+	.initialize_data = dma_sg_map_benchmark_initialize_data,
+	.do_map = dma_sg_map_benchmark_do_map,
+	.do_unmap = dma_sg_map_benchmark_do_unmap,
+};
+
 static struct map_benchmark_ops *dma_map_benchmark_ops[DMA_MAP_BENCH_MODE_MAX] = {
 	[DMA_MAP_BENCH_SINGLE_MODE] = &dma_single_map_benchmark_ops,
+	[DMA_MAP_BENCH_SG_MODE] = &dma_sg_map_benchmark_ops,
 };
 
 static int map_benchmark_thread(void *data)

From a54302ccfd38afba7b297566f0d414b961ca97bf Mon Sep 17 00:00:00 2001
From: Qinxin Xia
Date: Wed, 25 Feb 2026 17:38:00 +0800
Subject: [PATCH 03/24] tools/dma: Add dma_map_sg support

Add support for dma_map_sg and an option '-m' to select the mapping mode.

i) Users can set option '-m' to select the mode:
DMA_MAP_BENCH_SINGLE_MODE=0, DMA_MAP_BENCH_SG_MODE=1 (the mode is also
shown in the test result).

ii) Users can set option '-g' to set sg_nents (the total count of
entries in the scatterlist); the maximum number is 1024. Each sg
buffer's size is PAGE_SIZE.
e.g.
[root@localhost]# ./dma_map_benchmark -m 1 -g 8 -t 8 -s 30 -d 2
dma mapping mode: DMA_MAP_BENCH_SG_MODE
dma mapping benchmark: threads:8 seconds:30 node:-1 dir:FROM_DEVICE granule/sg_nents: 8
average map latency(us):1.4 standard deviation:0.3
average unmap latency(us):1.3 standard deviation:0.3

[root@localhost]# ./dma_map_benchmark -m 0 -g 8 -t 8 -s 30 -d 2
dma mapping mode: DMA_MAP_BENCH_SINGLE_MODE
dma mapping benchmark: threads:8 seconds:30 node:-1 dir:FROM_DEVICE granule/sg_nents: 8
average map latency(us):1.0 standard deviation:0.3
average unmap latency(us):1.3 standard deviation:0.5

Reviewed-by: Barry Song
Signed-off-by: Qinxin Xia
Signed-off-by: Marek Szyprowski
Link: https://lore.kernel.org/r/20260225093800.3625054-4-xiaqinxin@huawei.com
---
 tools/dma/dma_map_benchmark.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/tools/dma/dma_map_benchmark.c b/tools/dma/dma_map_benchmark.c
index dd0ed528e6df..eab0ac611a23 100644
--- a/tools/dma/dma_map_benchmark.c
+++ b/tools/dma/dma_map_benchmark.c
@@ -20,12 +20,19 @@ static char *directions[] = {
 	"FROM_DEVICE",
 };
 
+static char *mode[] = {
+	"SINGLE_MODE",
+	"SG_MODE",
+};
+
 int main(int argc, char **argv)
 {
 	struct map_benchmark map;
 	int fd, opt;
 	/* default single thread, run 20 seconds on NUMA_NO_NODE */
 	int threads = 1, seconds = 20, node = -1;
+	/* default single map mode */
+	int map_mode = DMA_MAP_BENCH_SINGLE_MODE;
 	/* default dma mask 32bit, bidirectional DMA */
 	int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL;
 	/* default granule 1 PAGESIZE */
@@ -33,7 +40,7 @@ int main(int argc, char **argv)
 
 	int cmd = DMA_MAP_BENCHMARK;
 
-	while ((opt = getopt(argc, argv, "t:s:n:b:d:x:g:")) != -1) {
+	while ((opt = getopt(argc, argv, "t:s:n:b:d:x:g:m:")) != -1) {
 		switch (opt) {
 		case 't':
 			threads = atoi(optarg);
@@ -56,11 +63,20 @@ int main(int argc, char **argv)
 		case 'g':
 			granule = atoi(optarg);
 			break;
+		case 'm':
+			map_mode = atoi(optarg);
+			break;
 		default:
 			return -1;
 		}
 	}
 
+	if (map_mode < 0 || map_mode >= DMA_MAP_BENCH_MODE_MAX) {
+		fprintf(stderr, "invalid map mode, SINGLE_MODE:%d, SG_MODE: %d\n",
+			DMA_MAP_BENCH_SINGLE_MODE, DMA_MAP_BENCH_SG_MODE);
+		exit(1);
+	}
+
 	if (threads <= 0 || threads > DMA_MAP_MAX_THREADS) {
 		fprintf(stderr, "invalid number of threads, must be in 1-%d\n",
 			DMA_MAP_MAX_THREADS);
@@ -110,14 +126,15 @@ int main(int argc, char **argv)
 	map.dma_dir = dir;
 	map.dma_trans_ns = xdelay;
 	map.granule = granule;
+	map.map_mode = map_mode;
 
 	if (ioctl(fd, cmd, &map)) {
 		perror("ioctl");
 		exit(1);
 	}
 
-	printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n",
-	       threads, seconds, node, directions[dir], granule);
+	printf("dma mapping benchmark(%s): threads:%d seconds:%d node:%d dir:%s granule:%d\n",
+	       mode[map_mode], threads, seconds, node, directions[dir], granule);
 	printf("average map latency(us):%.1f standard deviation:%.1f\n",
 	       map.avg_map_100ns/10.0, map.map_stddev/10.0);
 	printf("average unmap latency(us):%.1f standard deviation:%.1f\n",

From 2c92eff008a253a5ec0af7e9fa9c5a41e238ea50 Mon Sep 17 00:00:00 2001
From: Barry Song
Date: Sun, 1 Mar 2026 06:12:16 +0800
Subject: [PATCH 04/24] arm64: Provide dcache_by_myline_op_nosync helper

dcache_by_myline_op ensures completion of the data cache operations
for a region, while dcache_by_myline_op_nosync only issues them
without waiting. This enables deferred synchronization so completion
for multiple regions can be handled together later.
Cc: Leon Romanovsky
Cc: Catalin Marinas
Cc: Will Deacon
Cc: Marek Szyprowski
Cc: Robin Murphy
Cc: Ada Couprie Diaz
Cc: Ard Biesheuvel
Cc: Marc Zyngier
Cc: Anshuman Khandual
Cc: Ryan Roberts
Cc: Suren Baghdasaryan
Cc: Tangquan Zheng
Tested-by: Xueyuan Chen
Signed-off-by: Barry Song
Reviewed-by: Catalin Marinas
Signed-off-by: Marek Szyprowski
Link: https://lore.kernel.org/r/20260228221216.59886-1-21cnbao@gmail.com
---
 arch/arm64/include/asm/assembler.h  | 25 +++++++++++++++++++------
 arch/arm64/kernel/relocate_kernel.S |  3 ++-
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index d3d46e5f7188..cdbaad41bddb 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -371,14 +371,13 @@ alternative_endif
  * [start, end) with dcache line size explicitly provided.
  *
  * op:		operation passed to dc instruction
- * domain:	domain used in dsb instruction
 * start:	starting virtual address of the region
 * end:		end virtual address of the region
 * linesz:	dcache line size
 * fixup:	optional label to branch to on user fault
 * Corrupts:	start, end, tmp
 */
-	.macro dcache_by_myline_op op, domain, start, end, linesz, tmp, fixup
+	.macro dcache_by_myline_op_nosync op, start, end, linesz, tmp, fixup
 	sub	\tmp, \linesz, #1
 	bic	\start, \start, \tmp
 alternative_if ARM64_WORKAROUND_4311569
@@ -412,14 +411,28 @@ alternative_if ARM64_WORKAROUND_4311569
 	cbnz	\start, .Ldcache_op\@
 	.endif
 alternative_else_nop_endif
-	dsb	\domain
 
 	_cond_uaccess_extable .Ldcache_op\@, \fixup
 	.endm
 
 /*
  * Macro to perform a data cache maintenance for the interval
- * [start, end)
+ * [start, end) without waiting for completion
+ *
+ * op:		operation passed to dc instruction
+ * start:	starting virtual address of the region
+ * end:		end virtual address of the region
+ * fixup:	optional label to branch to on user fault
+ * Corrupts:	start, end, tmp1, tmp2
+ */
+	.macro dcache_by_line_op_nosync op, start, end, tmp1, tmp2, fixup
+	dcache_line_size \tmp1, \tmp2
+	dcache_by_myline_op_nosync \op, \start, \end, \tmp1, \tmp2, \fixup
+	.endm
+
+/*
+ * Macro to perform a data cache maintenance for the interval
+ * [start, end) and wait for completion
 *
 * op:		operation passed to dc instruction
 * domain:	domain used in dsb instruction
@@ -429,8 +442,8 @@
 * Corrupts:	start, end, tmp1, tmp2
 */
 	.macro dcache_by_line_op op, domain, start, end, tmp1, tmp2, fixup
-	dcache_line_size \tmp1, \tmp2
-	dcache_by_myline_op \op, \domain, \start, \end, \tmp1, \tmp2, \fixup
+	dcache_by_line_op_nosync \op, \start, \end, \tmp1, \tmp2, \fixup
+	dsb	\domain
 	.endm
 
 /*
diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S
index 413f899e4ac6..6cb4209f5dab 100644
--- a/arch/arm64/kernel/relocate_kernel.S
+++ b/arch/arm64/kernel/relocate_kernel.S
@@ -64,7 +64,8 @@ SYM_CODE_START(arm64_relocate_new_kernel)
 	mov	x19, x13
 	copy_page x13, x12, x1, x2, x3, x4, x5, x6, x7, x8
 	add	x1, x19, #PAGE_SIZE
-	dcache_by_myline_op civac, sy, x19, x1, x15, x20
+	dcache_by_myline_op_nosync civac, x19, x1, x15, x20
+	dsb	sy
 	b	.Lnext
 .Ltest_indirection:
 	tbz	x16, IND_INDIRECTION_BIT, .Ltest_destination

From 1c3a7f9e6bac8993946d384ee4c2f79910e93cd8 Mon Sep 17 00:00:00 2001
From: Barry Song
Date: Sun, 1 Mar 2026 06:12:39 +0800
Subject: [PATCH 05/24] arm64: Provide dcache_clean_poc_nosync helper

dcache_clean_poc_nosync does not wait for the data cache clean to
complete.
Later, we wait for completion of all scatter-gather entries together.

Cc: Leon Romanovsky
Cc: Catalin Marinas
Cc: Will Deacon
Cc: Marek Szyprowski
Cc: Robin Murphy
Cc: Ada Couprie Diaz
Cc: Ard Biesheuvel
Cc: Marc Zyngier
Cc: Anshuman Khandual
Cc: Ryan Roberts
Cc: Suren Baghdasaryan
Cc: Tangquan Zheng
Tested-by: Xueyuan Chen
Signed-off-by: Barry Song
Reviewed-by: Catalin Marinas
Signed-off-by: Marek Szyprowski
Link: https://lore.kernel.org/r/20260228221239.59903-1-21cnbao@gmail.com
---
 arch/arm64/include/asm/cacheflush.h |  1 +
 arch/arm64/mm/cache.S               | 15 +++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 28ab96e808ef..9b6d0a62cf3d 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -74,6 +74,7 @@ extern void icache_inval_pou(unsigned long start, unsigned long end);
 extern void dcache_clean_inval_poc(unsigned long start, unsigned long end);
 extern void dcache_inval_poc(unsigned long start, unsigned long end);
 extern void dcache_clean_poc(unsigned long start, unsigned long end);
+extern void dcache_clean_poc_nosync(unsigned long start, unsigned long end);
 extern void dcache_clean_pop(unsigned long start, unsigned long end);
 extern void dcache_clean_pou(unsigned long start, unsigned long end);
 extern long caches_clean_inval_user_pou(unsigned long start, unsigned long end);
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 503567c864fd..4a7c7e03785d 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -178,6 +178,21 @@ SYM_FUNC_START(__pi_dcache_clean_poc)
 SYM_FUNC_END(__pi_dcache_clean_poc)
 SYM_FUNC_ALIAS(dcache_clean_poc, __pi_dcache_clean_poc)
 
+/*
+ * dcache_clean_poc_nosync(start, end)
+ *
+ *	Issue the clean instructions for the D-cache lines in the interval
+ *	[start, end); the lines are not necessarily cleaned to the PoC till
+ *	an explicit dsb sy afterwards.
+ *
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+SYM_FUNC_START(__pi_dcache_clean_poc_nosync)
+	dcache_by_line_op_nosync cvac, x0, x1, x2, x3
+	ret
+SYM_FUNC_END(__pi_dcache_clean_poc_nosync)
+SYM_FUNC_ALIAS(dcache_clean_poc_nosync, __pi_dcache_clean_poc_nosync)
+
 /*
  * dcache_clean_pop(start, end)
 *

From cf875c4b6863fd64054e1c3550c349eac09c4f35 Mon Sep 17 00:00:00 2001
From: Barry Song
Date: Sun, 1 Mar 2026 06:12:58 +0800
Subject: [PATCH 06/24] arm64: Provide dcache_inval_poc_nosync helper

dcache_inval_poc_nosync does not wait for the data cache invalidation
to complete. Later, we defer the synchronization so we can wait for
all SG entries together.
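As a minimal illustration of the intended usage pattern (a sketch
assuming kernel context, not code from this patch; the helper name
inval_sg_deferred() is invented here), maintenance can be issued for
several regions back to back and completed with a single barrier:

  /* Sketch: batch D-cache invalidation over an sg_table, one dsb at the end. */
  static void inval_sg_deferred(struct sg_table *sgt)
  {
      struct scatterlist *sg;
      int i;

      for_each_sgtable_sg(sgt, sg, i) {
          unsigned long start = (unsigned long)sg_virt(sg);

          /* issue the invalidate ops only; no dsb per entry */
          dcache_inval_poc_nosync(start, start + sg->length);
      }
      dsb(sy);    /* wait once for all issued maintenance ops */
  }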
Cc: Leon Romanovsky
Cc: Catalin Marinas
Cc: Will Deacon
Cc: Marek Szyprowski
Cc: Robin Murphy
Cc: Ada Couprie Diaz
Cc: Ard Biesheuvel
Cc: Marc Zyngier
Cc: Anshuman Khandual
Cc: Ryan Roberts
Cc: Suren Baghdasaryan
Cc: Tangquan Zheng
Tested-by: Xueyuan Chen
Signed-off-by: Barry Song
Reviewed-by: Catalin Marinas
Signed-off-by: Marek Szyprowski
Link: https://lore.kernel.org/r/20260228221258.59918-1-21cnbao@gmail.com
---
 arch/arm64/include/asm/cacheflush.h |  1 +
 arch/arm64/mm/cache.S               | 42 +++++++++++++++++++++--------
 2 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 9b6d0a62cf3d..382b4ac3734d 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -74,6 +74,7 @@ extern void icache_inval_pou(unsigned long start, unsigned long end);
 extern void dcache_clean_inval_poc(unsigned long start, unsigned long end);
 extern void dcache_inval_poc(unsigned long start, unsigned long end);
 extern void dcache_clean_poc(unsigned long start, unsigned long end);
+extern void dcache_inval_poc_nosync(unsigned long start, unsigned long end);
 extern void dcache_clean_poc_nosync(unsigned long start, unsigned long end);
 extern void dcache_clean_pop(unsigned long start, unsigned long end);
 extern void dcache_clean_pou(unsigned long start, unsigned long end);
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 4a7c7e03785d..ab75c050f559 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -132,17 +132,7 @@ alternative_else_nop_endif
 	ret
 SYM_FUNC_END(dcache_clean_pou)
 
-/*
- * dcache_inval_poc(start, end)
- *
- *	Ensure that any D-cache lines for the interval [start, end)
- *	are invalidated. Any partial lines at the ends of the interval are
- *	also cleaned to PoC to prevent data loss.
- *
- *	- start   - kernel start address of region
- *	- end     - kernel end address of region
- */
-SYM_FUNC_START(__pi_dcache_inval_poc)
+.macro __dcache_inval_poc_nosync
 	dcache_line_size x2, x3
 	sub	x3, x2, #1
 	tst	x1, x3				// end cache line aligned?
 	bic	x1, x1, x3
 	b.eq	1f
 	dc	civac, x1			// clean & invalidate D / U line
1:	tst	x0, x3				// start cache line aligned?
 	bic	x0, x0, x3
 	b.eq	2f
 	dc	civac, x0			// clean & invalidate D / U line
 	b	3f
2:	dc	ivac, x0			// invalidate D / U line
3:	add	x0, x0, x2
 	cmp	x0, x1
 	b.lo	2b
+.endm
+
+/*
+ * dcache_inval_poc(start, end)
+ *
+ *	Ensure that any D-cache lines for the interval [start, end)
+ *	are invalidated. Any partial lines at the ends of the interval are
+ *	also cleaned to PoC to prevent data loss.
+ *
+ *	- start   - kernel start address of region
+ *	- end     - kernel end address of region
+ */
+SYM_FUNC_START(__pi_dcache_inval_poc)
+	__dcache_inval_poc_nosync
+	dsb	sy
 	ret
 SYM_FUNC_END(__pi_dcache_inval_poc)
 SYM_FUNC_ALIAS(dcache_inval_poc, __pi_dcache_inval_poc)
 
+/*
+ * dcache_inval_poc_nosync(start, end)
+ *
+ *	Issue the invalidate instructions for the D-cache lines in the
+ *	interval [start, end); the invalidation does not necessarily
+ *	complete till an explicit dsb sy is issued later.
+ *
+ *	- start   - kernel start address of region
+ *	- end     - kernel end address of region
+ */
+SYM_FUNC_START(__pi_dcache_inval_poc_nosync)
+	__dcache_inval_poc_nosync
+	ret
+SYM_FUNC_END(__pi_dcache_inval_poc_nosync)
+SYM_FUNC_ALIAS(dcache_inval_poc_nosync, __pi_dcache_inval_poc_nosync)
+
 /*
  * dcache_clean_poc(start, end)
 *

From d7eafe655b741dfc241d5b920f6d2cea45b568d9 Mon Sep 17 00:00:00 2001
From: Barry Song
Date: Sun, 1 Mar 2026 06:13:16 +0800
Subject: [PATCH 07/24] dma-mapping: Separate DMA sync issuing and completion waiting

Currently, arch_sync_dma_for_cpu and arch_sync_dma_for_device always
wait for the completion of each DMA buffer.
That is, issuing the DMA sync and waiting for completion is done in a
single API call. For scatter-gather lists with multiple entries, this
means issuing and waiting is repeated for each entry, which can hurt
performance.

Architectures like ARM64 may be able to issue all DMA sync operations
for all entries first and then wait for completion together. To address
this, arch_sync_dma_for_* now batches DMA operations and performs a
flush afterward. On ARM64, the flush is implemented with a dsb
instruction in arch_sync_dma_flush(). On other architectures,
arch_sync_dma_flush() is currently a nop.

Cc: Leon Romanovsky
Cc: Catalin Marinas
Cc: Will Deacon
Cc: Marek Szyprowski
Cc: Robin Murphy
Cc: Ada Couprie Diaz
Cc: Ard Biesheuvel
Cc: Marc Zyngier
Cc: Anshuman Khandual
Cc: Ryan Roberts
Cc: Suren Baghdasaryan
Cc: Joerg Roedel
Cc: Stefano Stabellini
Cc: Oleksandr Tyshchenko
Cc: Tangquan Zheng
Reviewed-by: Juergen Gross # drivers/xen/swiotlb-xen.c
Tested-by: Xueyuan Chen
Signed-off-by: Barry Song
Reviewed-by: Leon Romanovsky
Signed-off-by: Marek Szyprowski
Link: https://lore.kernel.org/r/20260228221316.59934-1-21cnbao@gmail.com
---
 arch/arm64/Kconfig             |  1 +
 arch/arm64/include/asm/cache.h |  5 +++++
 arch/arm64/mm/dma-mapping.c    |  4 ++--
 drivers/iommu/dma-iommu.c      | 35 ++++++++++++++++++++++++++--------
 drivers/xen/swiotlb-xen.c      | 24 +++++++++++++++--------
 include/linux/dma-map-ops.h    |  6 ++++++
 kernel/dma/Kconfig             |  3 +++
 kernel/dma/direct.c            |  6 +++++-
 kernel/dma/direct.h            |  9 +++++++--
 kernel/dma/swiotlb.c           |  7 ++++++-
 10 files changed, 78 insertions(+), 22 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 38dba5f7e4d2..ceafaac6532c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -55,6 +55,7 @@ config ARM64
 	select ARCH_HAS_STRICT_MODULE_RWX
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
+	select ARCH_HAS_BATCHED_DMA_SYNC
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_ZONE_DMA_SET if EXPERT
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index dd2c8586a725..10a7ffadee3d 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -87,6 +87,11 @@ int cache_line_size(void);
 
 #define dma_get_cache_alignment	cache_line_size
 
+static inline void arch_sync_dma_flush(void)
+{
+	dsb(sy);
+}
+
 /* Compress a u64 MPIDR value into 32 bits. */
 static inline u64 arch_compact_of_hwid(u64 id)
 {
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index b2b5792b2caa..ae1ae0280eef 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
 {
 	unsigned long start = (unsigned long)phys_to_virt(paddr);
 
-	dcache_clean_poc(start, start + size);
+	dcache_clean_poc_nosync(start, start + size);
 }
 
 void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
@@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
 	if (dir == DMA_TO_DEVICE)
 		return;
 
-	dcache_inval_poc(start, start + size);
+	dcache_inval_poc_nosync(start, start + size);
 }
 
 void arch_dma_prep_coherent(struct page *page, size_t size)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 5dac64be61bb..66fc25bae85b 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1095,8 +1095,10 @@ void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
 		return;
 
 	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
-	if (!dev_is_dma_coherent(dev))
+	if (!dev_is_dma_coherent(dev)) {
 		arch_sync_dma_for_cpu(phys, size, dir);
+		arch_sync_dma_flush();
+	}
 
 	swiotlb_sync_single_for_cpu(dev, phys, size, dir);
 }
@@ -1112,8 +1114,10 @@ void iommu_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
 
 	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
 	swiotlb_sync_single_for_device(dev, phys, size, dir);
-	if (!dev_is_dma_coherent(dev))
+	if (!dev_is_dma_coherent(dev)) {
 		arch_sync_dma_for_device(phys, size, dir);
+		arch_sync_dma_flush();
+	}
 }
 
 void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
@@ -1122,13 +1126,15 @@ void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
 	struct scatterlist *sg;
 	int i;
 
-	if (sg_dma_is_swiotlb(sgl))
+	if (sg_dma_is_swiotlb(sgl)) {
 		for_each_sg(sgl, sg, nelems, i)
 			iommu_dma_sync_single_for_cpu(dev, sg_dma_address(sg),
 						      sg->length, dir);
-	else if (!dev_is_dma_coherent(dev))
+	} else if (!dev_is_dma_coherent(dev)) {
 		for_each_sg(sgl, sg, nelems, i)
 			arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
+		arch_sync_dma_flush();
+	}
 }
 
 void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
@@ -1137,14 +1143,16 @@ void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
 	struct scatterlist *sg;
 	int i;
 
-	if (sg_dma_is_swiotlb(sgl))
+	if (sg_dma_is_swiotlb(sgl)) {
 		for_each_sg(sgl, sg, nelems, i)
 			iommu_dma_sync_single_for_device(dev,
 							 sg_dma_address(sg),
 							 sg->length, dir);
-	else if (!dev_is_dma_coherent(dev))
+	} else if (!dev_is_dma_coherent(dev)) {
 		for_each_sg(sgl, sg, nelems, i)
 			arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);
+		arch_sync_dma_flush();
+	}
 }
 
 static phys_addr_t iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys,
@@ -1219,8 +1227,10 @@ dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
 		return DMA_MAPPING_ERROR;
 	}
 
-	if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+	if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
 		arch_sync_dma_for_device(phys, size, dir);
+		arch_sync_dma_flush();
+	}
 
 	iova = __iommu_dma_map(dev, phys, size, prot, dma_mask);
 	if (iova == DMA_MAPPING_ERROR && !(attrs & DMA_ATTR_MMIO))
@@ -1242,8 +1252,10 @@ void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle,
 	if (WARN_ON(!phys))
 		return;
 
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
-	    !dev_is_dma_coherent(dev))
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) {
 		arch_sync_dma_for_cpu(phys, size, dir);
+		arch_sync_dma_flush();
+	}
 
 	__iommu_dma_unmap(dev, dma_handle, size);
 
@@ -1980,6 +1992,8 @@ int dma_iova_sync(struct device *dev, struct dma_iova_state *state,
 	dma_addr_t addr = state->addr + offset;
 	size_t iova_start_pad = iova_offset(iovad, addr);
 
+	if (!dev_is_dma_coherent(dev))
+		arch_sync_dma_flush();
 	return iommu_sync_map(domain, addr - iova_start_pad,
 			      iova_align(iovad, size + iova_start_pad));
 }
@@ -1993,6 +2007,8 @@ static void iommu_dma_iova_unlink_range_slow(struct device *dev,
 	struct iommu_dma_cookie *cookie = domain->iova_cookie;
 	struct iova_domain *iovad = &cookie->iovad;
 	size_t iova_start_pad = iova_offset(iovad, addr);
+	bool need_sync_dma = !dev_is_dma_coherent(dev) &&
+			     !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO));
 	dma_addr_t end = addr + size;
 
 	do {
@@ -2016,6 +2032,9 @@ static void iommu_dma_iova_unlink_range_slow(struct device *dev,
 		addr += len;
 		iova_start_pad = 0;
 	} while (addr < end);
+
+	if (need_sync_dma)
+		arch_sync_dma_flush();
 }
 
 static void __iommu_dma_iova_unlink(struct device *dev,
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index ccf25027bec1..b79917e785a5 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -262,10 +262,12 @@ static dma_addr_t xen_swiotlb_map_phys(struct device *dev, phys_addr_t phys,
 done:
 	if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
-		if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr))))
+		if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr)))) {
 			arch_sync_dma_for_device(phys, size, dir);
-		else
+			arch_sync_dma_flush();
+		} else {
 			xen_dma_sync_for_device(dev, dev_addr, size, dir);
+		}
 	}
 	return dev_addr;
 }
@@ -287,10 +289,12 @@ static void xen_swiotlb_unmap_phys(struct device *hwdev, dma_addr_t dev_addr,
 	BUG_ON(dir == DMA_NONE);
 
 	if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
-		if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr))))
+		if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr)))) {
 			arch_sync_dma_for_cpu(paddr, size, dir);
-		else
+			arch_sync_dma_flush();
+		} else {
 			xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir);
+		}
 	}
 
 	/* NOTE: We use dev_addr here, not paddr! */
@@ -308,10 +312,12 @@ xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr,
 	struct io_tlb_pool *pool;
 
 	if (!dev_is_dma_coherent(dev)) {
-		if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr))))
+		if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) {
 			arch_sync_dma_for_cpu(paddr, size, dir);
-		else
+			arch_sync_dma_flush();
+		} else {
 			xen_dma_sync_for_cpu(dev, dma_addr, size, dir);
+		}
 	}
 
 	pool = xen_swiotlb_find_pool(dev, dma_addr);
@@ -331,10 +337,12 @@ xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr,
 		__swiotlb_sync_single_for_device(dev, paddr, size, dir, pool);
 
 	if (!dev_is_dma_coherent(dev)) {
-		if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr))))
+		if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) {
 			arch_sync_dma_for_device(paddr, size, dir);
-		else
+			arch_sync_dma_flush();
+		} else {
 			xen_dma_sync_for_device(dev, dma_addr, size, dir);
+		}
 	}
 }
 
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 60b63756df82..8a07df5a9ef6 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -361,6 +361,12 @@ static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
 }
 #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
 
+#ifndef CONFIG_ARCH_HAS_BATCHED_DMA_SYNC
+static inline void arch_sync_dma_flush(void)
+{
+}
+#endif
+
 #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
 void arch_sync_dma_for_cpu_all(void);
 #else
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 159900736f25..bfef21b4a9ae 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -72,6 +72,9 @@ config ARCH_HAS_DMA_PREP_COHERENT
 config ARCH_HAS_FORCE_DMA_UNENCRYPTED
 	bool
 
+config ARCH_HAS_BATCHED_DMA_SYNC
+	bool
+
 #
 # Select this option if the architecture assumes DMA devices are coherent
 # by default.
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 8f43a930716d..c7666e5d5e7c 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -406,6 +406,8 @@ void dma_direct_sync_sg_for_device(struct device *dev,
 			arch_sync_dma_for_device(paddr, sg->length, dir);
 	}
+
+	if (!dev_is_dma_coherent(dev))
+		arch_sync_dma_flush();
 }
 #endif
 
@@ -427,8 +429,10 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 			swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
 	}
 
-	if (!dev_is_dma_coherent(dev))
+	if (!dev_is_dma_coherent(dev)) {
+		arch_sync_dma_flush();
 		arch_sync_dma_for_cpu_all();
+	}
 }
 
 /*
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index f476c63b668c..f925a7e8b000 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -60,8 +60,10 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
 
 	swiotlb_sync_single_for_device(dev, paddr, size, dir);
 
-	if (!dev_is_dma_coherent(dev))
+	if (!dev_is_dma_coherent(dev)) {
 		arch_sync_dma_for_device(paddr, size, dir);
+		arch_sync_dma_flush();
+	}
 }
 
 static inline void dma_direct_sync_single_for_cpu(struct device *dev,
@@ -71,6 +73,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
 
 	if (!dev_is_dma_coherent(dev)) {
 		arch_sync_dma_for_cpu(paddr, size, dir);
+		arch_sync_dma_flush();
 		arch_sync_dma_for_cpu_all();
 	}
 
@@ -106,8 +109,10 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
 	}
 
 	if (!dev_is_dma_coherent(dev) &&
-	    !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+	    !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
 		arch_sync_dma_for_device(phys, size, dir);
+		arch_sync_dma_flush();
+	}
 
 	return dma_addr;
 err_overflow:
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index d8e6f1d889d5..1105db1689d5 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -867,6 +867,9 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
 	if (orig_addr == INVALID_PHYS_ADDR)
 		return;
 
+	if (dir == DMA_FROM_DEVICE && !dev_is_dma_coherent(dev))
+		arch_sync_dma_flush();
+
 	/*
 	 * It's valid for tlb_offset to be negative. This can happen when the
 	 * "offset" returned by swiotlb_align_offset() is non-zero, and the
@@ -1595,8 +1598,10 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
 		return DMA_MAPPING_ERROR;
 	}
 
-	if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+	if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
 		arch_sync_dma_for_device(swiotlb_addr, size, dir);
+		arch_sync_dma_flush();
+	}
 
 	return dma_addr;
 }

From 661f8a193d48d123aedcbd401ace137333d02523 Mon Sep 17 00:00:00 2001
From: Barry Song
Date: Sun, 1 Mar 2026 06:13:37 +0800
Subject: [PATCH 08/24] dma-mapping: Support batch mode for dma_direct_{map,unmap}_sg

Extend these APIs with a flush argument: dma_direct_unmap_phys(),
dma_direct_map_phys(), and dma_direct_sync_single_for_cpu(). For
single-buffer cases, flush=true is used, while for SG cases flush=false
is used, followed by a single flush after all cache operations have
been issued in dma_direct_{map,unmap}_sg(). This ultimately benefits
dma_map_sg() and dma_unmap_sg().
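The contract of the new flush argument can be illustrated with a short
sketch (a hypothetical caller, not part of the patch; the function name
example_unmap_two_bufs() is invented):

  static void example_unmap_two_bufs(struct device *dev, dma_addr_t a,
                                     dma_addr_t b, size_t len)
  {
      /* flush=false: only issue the cache maintenance operations */
      dma_direct_unmap_phys(dev, a, len, DMA_FROM_DEVICE, 0, false);
      dma_direct_unmap_phys(dev, b, len, DMA_FROM_DEVICE, 0, false);

      /* complete the maintenance for both buffers with one flush */
      if (!dev_is_dma_coherent(dev))
          arch_sync_dma_flush();
  }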
Cc: Catalin Marinas
Cc: Will Deacon
Cc: Marek Szyprowski
Cc: Robin Murphy
Cc: Ada Couprie Diaz
Cc: Ard Biesheuvel
Cc: Marc Zyngier
Cc: Anshuman Khandual
Cc: Ryan Roberts
Cc: Suren Baghdasaryan
Cc: Tangquan Zheng
Reviewed-by: Leon Romanovsky
Tested-by: Xueyuan Chen
Signed-off-by: Barry Song
Signed-off-by: Marek Szyprowski
Link: https://lore.kernel.org/r/20260228221337.59951-1-21cnbao@gmail.com
---
 kernel/dma/direct.c  | 17 +++++++++++++----
 kernel/dma/direct.h  | 16 ++++++++++------
 kernel/dma/mapping.c |  6 +++---
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index c7666e5d5e7c..ec887f443741 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -444,14 +444,19 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 {
 	struct scatterlist *sg;
 	int i;
+	bool need_sync = false;
 
 	for_each_sg(sgl, sg, nents, i) {
-		if (sg_dma_is_bus_address(sg))
+		if (sg_dma_is_bus_address(sg)) {
 			sg_dma_unmark_bus_address(sg);
-		else
+		} else {
+			need_sync = true;
 			dma_direct_unmap_phys(dev, sg->dma_address,
-					sg_dma_len(sg), dir, attrs);
+					sg_dma_len(sg), dir, attrs, false);
+		}
 	}
+
+	if (need_sync && !dev_is_dma_coherent(dev))
+		arch_sync_dma_flush();
 }
 #endif
 
@@ -461,6 +466,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 	struct pci_p2pdma_map_state p2pdma_state = {};
 	struct scatterlist *sg;
 	int i, ret;
+	bool need_sync = false;
 
 	for_each_sg(sgl, sg, nents, i) {
 		switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
@@ -472,8 +478,9 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 			 */
 			break;
 		case PCI_P2PDMA_MAP_NONE:
+			need_sync = true;
 			sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
-					sg->length, dir, attrs);
+					sg->length, dir, attrs, false);
 			if (sg->dma_address == DMA_MAPPING_ERROR) {
 				ret = -EIO;
 				goto out_unmap;
@@ -492,6 +499,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		sg_dma_len(sg) = sg->length;
 	}
 
+	if (need_sync && !dev_is_dma_coherent(dev))
+		arch_sync_dma_flush();
 	return nents;
 
 out_unmap:
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index f925a7e8b000..52b361e66700 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -67,13 +67,15 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
 }
 
 static inline void dma_direct_sync_single_for_cpu(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+		dma_addr_t addr, size_t size, enum dma_data_direction dir,
+		bool flush)
 {
 	phys_addr_t paddr = dma_to_phys(dev, addr);
 
 	if (!dev_is_dma_coherent(dev)) {
 		arch_sync_dma_for_cpu(paddr, size, dir);
-		arch_sync_dma_flush();
+		if (flush)
+			arch_sync_dma_flush();
 		arch_sync_dma_for_cpu_all();
 	}
 
@@ -82,7 +84,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
 
 static inline dma_addr_t dma_direct_map_phys(struct device *dev,
 		phys_addr_t phys, size_t size, enum dma_data_direction dir,
-		unsigned long attrs)
+		unsigned long attrs, bool flush)
 {
 	dma_addr_t dma_addr;
 
@@ -111,7 +113,8 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
 	if (!dev_is_dma_coherent(dev) &&
 	    !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
 		arch_sync_dma_for_device(phys, size, dir);
-		arch_sync_dma_flush();
+		if (flush)
+			arch_sync_dma_flush();
 	}
 
 	return dma_addr;
@@ -124,7 +127,8 @@ err_overflow:
 }
 
 static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
-		size_t size, enum dma_data_direction dir, unsigned long attrs)
+		size_t size, enum dma_data_direction dir, unsigned long attrs,
+		bool flush)
 {
 	phys_addr_t phys;
 
@@ -134,7 +138,7 @@ static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
 	phys = dma_to_phys(dev, addr);
 
 	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+		dma_direct_sync_single_for_cpu(dev, addr, size, dir, flush);
 
 	swiotlb_tbl_unmap_single(dev, phys, size, dir,
 				 attrs | DMA_ATTR_SKIP_CPU_SYNC);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 3928a509c44c..78d8b4039c3e 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -166,7 +166,7 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
 
 	if (dma_map_direct(dev, ops) ||
 	    (!is_mmio && arch_dma_map_phys_direct(dev, phys + size)))
-		addr = dma_direct_map_phys(dev, phys, size, dir, attrs);
+		addr = dma_direct_map_phys(dev, phys, size, dir, attrs, true);
 	else if (use_dma_iommu(dev))
 		addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
 	else if (ops->map_phys)
@@ -207,7 +207,7 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size,
 	BUG_ON(!valid_dma_direction(dir));
 	if (dma_map_direct(dev, ops) ||
 	    (!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size)))
-		dma_direct_unmap_phys(dev, addr, size, dir, attrs);
+		dma_direct_unmap_phys(dev, addr, size, dir, attrs, true);
 	else if (use_dma_iommu(dev))
 		iommu_dma_unmap_phys(dev, addr, size, dir, attrs);
 	else if (ops->unmap_phys)
@@ -373,7 +373,7 @@ void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
 	BUG_ON(!valid_dma_direction(dir));
 
 	if (dma_map_direct(dev, ops))
-		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+		dma_direct_sync_single_for_cpu(dev, addr, size, dir, true);
 	else if (use_dma_iommu(dev))
 		iommu_dma_sync_single_for_cpu(dev, addr, size, dir);
 	else if (ops->sync_single_for_cpu)

From d9794c0600f95b226b6672c5b364e44c80d660c5 Mon Sep 17 00:00:00 2001
From: Kit Dallege
Date: Sun, 15 Mar 2026 18:10:01 +0100
Subject: [PATCH 09/24] dma-mapping: fix false kernel-doc comment marker

Change /** to /* for the DMA attributes list comment in dma-mapping.h.
The comment is not a kernel-doc structured comment and should not use
the kernel-doc opening marker.

Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Kit Dallege
Signed-off-by: Marek Szyprowski
Link: https://lore.kernel.org/r/20260315171001.66010-1-xaum.io@gmail.com
---
 include/linux/dma-mapping.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 29973baa0581..0c2807e50bdf 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -9,7 +9,7 @@
 #include
 #include
 
-/**
+/*
 * List of possible attributes associated with a DMA mapping. The semantics
 * of each attribute should be defined in Documentation/core-api/dma-attributes.rst.
 */

From abdd23c8849d45c6bdef0ab6facbbc63bddebbe1 Mon Sep 17 00:00:00 2001
From: Marek Szyprowski
Date: Wed, 25 Mar 2026 10:00:17 +0100
Subject: [PATCH 10/24] of: reserved_mem: remove fdt node from the structure

The FDT node is not needed for anything besides initialization, so it
can simply be passed as an argument to the reserved memory region init
function.
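To illustrate the new signature, a hypothetical driver (names invented
for this sketch; it is not part of the patch) now receives the FDT node
directly instead of reading rmem->fdt_node:

  static int __init my_pool_setup(unsigned long node, struct reserved_mem *rmem)
  {
      /* the node argument replaces the old rmem->fdt_node lookup */
      if (of_get_flat_dt_prop(node, "reusable", NULL))
          return -EINVAL;

      pr_info("my-pool at %pa, size %lu bytes\n",
              &rmem->base, (unsigned long)rmem->size);
      return 0;
  }
  RESERVEDMEM_OF_DECLARE(my_pool, "vendor,my-pool", my_pool_setup);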
Signed-off-by: Marek Szyprowski
Link: https://patch.msgid.link/20260325090023.3175348-2-m.szyprowski@samsung.com
Signed-off-by: Rob Herring (Arm)
---
 drivers/memory/tegra/tegra210-emc-table.c |  3 ++-
 drivers/of/of_reserved_mem.c              | 25 ++++++++++-------------
 include/linux/of_reserved_mem.h           |  4 ++--
 kernel/dma/coherent.c                     |  4 +---
 kernel/dma/contiguous.c                   |  3 +--
 kernel/dma/swiotlb.c                      |  5 ++---
 6 files changed, 19 insertions(+), 25 deletions(-)

diff --git a/drivers/memory/tegra/tegra210-emc-table.c b/drivers/memory/tegra/tegra210-emc-table.c
index 34a8785d2861..ac1d1e13482a 100644
--- a/drivers/memory/tegra/tegra210-emc-table.c
+++ b/drivers/memory/tegra/tegra210-emc-table.c
@@ -75,7 +75,8 @@ static const struct reserved_mem_ops tegra210_emc_table_ops = {
 	.device_release = tegra210_emc_table_device_release,
 };
 
-static int tegra210_emc_table_init(struct reserved_mem *rmem)
+static int tegra210_emc_table_init(unsigned long node,
+				   struct reserved_mem *rmem)
 {
 	pr_debug("Tegra210 EMC table at %pa, size %lu bytes\n", &rmem->base,
 		 (unsigned long)rmem->size);
diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c
index 1fd28f805610..6705b7afebf0 100644
--- a/drivers/of/of_reserved_mem.c
+++ b/drivers/of/of_reserved_mem.c
@@ -104,7 +104,8 @@ static void __init alloc_reserved_mem_array(void)
 	reserved_mem = new_array;
 }
 
-static void __init fdt_init_reserved_mem_node(struct reserved_mem *rmem);
+static void __init fdt_init_reserved_mem_node(struct reserved_mem *rmem,
+					      unsigned long node);
 /*
 * fdt_reserved_mem_save_node() - save fdt node for second pass initialization
 */
@@ -118,13 +119,12 @@ static void __init fdt_reserved_mem_save_node(unsigned long node, const char *un
 		return;
 	}
 
-	rmem->fdt_node = node;
 	rmem->name = uname;
 	rmem->base = base;
 	rmem->size = size;
 
 	/* Call the region specific initialization function */
-	fdt_init_reserved_mem_node(rmem);
+	fdt_init_reserved_mem_node(rmem, node);
 
 	reserved_mem_count++;
 }
@@ -483,7 +483,8 @@ static const struct of_device_id __rmem_of_table_sentinel
 /*
 * __reserved_mem_init_node() - call region specific reserved memory init code
 */
-static int __init __reserved_mem_init_node(struct reserved_mem *rmem)
+static int __init __reserved_mem_init_node(struct reserved_mem *rmem,
+					   unsigned long node)
 {
 	extern const struct of_device_id __reservedmem_of_table[];
 	const struct of_device_id *i;
@@ -493,10 +494,10 @@ static int __init __reserved_mem_init_node(struct reserved_mem *rmem)
 		reservedmem_of_init_fn initfn = i->data;
 		const char *compat = i->compatible;
 
-		if (!of_flat_dt_is_compatible(rmem->fdt_node, compat))
+		if (!of_flat_dt_is_compatible(node, compat))
 			continue;
 
-		ret = initfn(rmem);
+		ret = initfn(node, rmem);
 		if (ret == 0) {
 			pr_info("initialized node %s, compatible id %s\n",
 				rmem->name, compat);
@@ -526,11 +527,6 @@ static int __init __rmem_cmp(const void *a, const void *b)
 	if (ra->size > rb->size)
 		return 1;
 
-	if (ra->fdt_node < rb->fdt_node)
-		return -1;
-	if (ra->fdt_node > rb->fdt_node)
-		return 1;
-
 	return 0;
 }
 
@@ -564,19 +560,20 @@ static void __init __rmem_check_for_overlap(void)
 /**
 * fdt_init_reserved_mem_node() - Initialize a reserved memory region
 * @rmem: reserved_mem struct of the memory region to be initialized.
+ * @node: fdt node of the initialized region
 *
 * This function is used to call the region specific initialization
 * function for a reserved memory region.
 */
-static void __init fdt_init_reserved_mem_node(struct reserved_mem *rmem)
+static void __init fdt_init_reserved_mem_node(struct reserved_mem *rmem,
+					      unsigned long node)
 {
-	unsigned long node = rmem->fdt_node;
 	int err = 0;
 	bool nomap;
 
 	nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL;
 
-	err = __reserved_mem_init_node(rmem);
+	err = __reserved_mem_init_node(rmem, node);
 	if (err != 0 && err != -ENOENT) {
 		pr_info("node %s compatible matching fail\n", rmem->name);
 		if (nomap)
diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h
index f573423359f4..5159938bfe03 100644
--- a/include/linux/of_reserved_mem.h
+++ b/include/linux/of_reserved_mem.h
@@ -11,7 +11,6 @@ struct resource;
 
 struct reserved_mem {
 	const char *name;
-	unsigned long fdt_node;
 	const struct reserved_mem_ops *ops;
 	phys_addr_t base;
 	phys_addr_t size;
@@ -25,7 +24,8 @@ struct reserved_mem_ops {
 			       struct device *dev);
 };
 
-typedef int (*reservedmem_of_init_fn)(struct reserved_mem *rmem);
+typedef int (*reservedmem_of_init_fn)(unsigned long node,
+				      struct reserved_mem *rmem);
 
 #ifdef CONFIG_OF_RESERVED_MEM
 
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 1147497bc512..34621acbd3c5 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -367,10 +367,8 @@ static const struct reserved_mem_ops rmem_dma_ops = {
 	.device_release	= rmem_dma_device_release,
 };
 
-static int __init rmem_dma_setup(struct reserved_mem *rmem)
+static int __init rmem_dma_setup(unsigned long node, struct reserved_mem *rmem)
 {
-	unsigned long node = rmem->fdt_node;
-
 	if (of_get_flat_dt_prop(node, "reusable", NULL))
 		return -EINVAL;
 
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index c56004d314dc..81a2fa4971ee 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -475,9 +475,8 @@ static const struct reserved_mem_ops rmem_cma_ops = {
 	.device_release = rmem_cma_device_release,
 };
 
-static int __init rmem_cma_setup(struct reserved_mem *rmem)
+static int __init rmem_cma_setup(unsigned long node, struct reserved_mem *rmem)
 {
-	unsigned long node = rmem->fdt_node;
 	bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
 	struct cma *cma;
 	int err;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index d8e6f1d889d5..f3a12e15a951 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1860,10 +1860,9 @@ static const struct reserved_mem_ops rmem_swiotlb_ops = {
 	.device_release = rmem_swiotlb_device_release,
 };
 
-static int __init rmem_swiotlb_setup(struct reserved_mem *rmem)
+static int __init rmem_swiotlb_setup(unsigned long node,
+				     struct reserved_mem *rmem)
 {
-	unsigned long node = rmem->fdt_node;
-
 	if (of_get_flat_dt_prop(node, "reusable", NULL) ||
 	    of_get_flat_dt_prop(node, "linux,cma-default", NULL) ||
 	    of_get_flat_dt_prop(node, "linux,dma-default", NULL) ||

From 9d5149b3f2e7e80378907a8d3e4f7a94dfbbbdb8 Mon Sep 17 00:00:00 2001
From: Marek Szyprowski
Date: Wed, 25 Mar 2026 10:00:18 +0100
Subject: [PATCH 11/24] of: reserved_mem: use -ENODEV instead of -ENOENT

When a given reserved memory driver doesn't really support the given
node, return -ENODEV instead of -ENOENT. Then fix the
__reserved_mem_init_node() function to properly propagate error codes
different from -ENODEV instead of silently ignoring them.
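The resulting error contract can be summarized with a sketch (the
hypothetical driver from the earlier example, not code from this
patch): -ENODEV means "not my kind of region, keep scanning", while any
other error is now propagated and stops the matching loop:

  static int __init my_pool_setup(unsigned long node, struct reserved_mem *rmem)
  {
      /* a property this driver cannot handle: let other drivers match */
      if (of_get_flat_dt_prop(node, "reusable", NULL))
          return -ENODEV;

      /* a real failure: propagated instead of being silently ignored */
      if (!IS_ALIGNED(rmem->base, PAGE_SIZE))
          return -EINVAL;

      return 0;
  }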
Signed-off-by: Marek Szyprowski
Link: https://patch.msgid.link/20260325090023.3175348-3-m.szyprowski@samsung.com
Signed-off-by: Rob Herring (Arm)
---
 drivers/of/of_reserved_mem.c | 7 ++++---
 kernel/dma/coherent.c        | 2 +-
 kernel/dma/contiguous.c      | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c
index 6705b7afebf0..9aff460a0420 100644
--- a/drivers/of/of_reserved_mem.c
+++ b/drivers/of/of_reserved_mem.c
@@ -488,9 +488,10 @@ static int __init __reserved_mem_init_node(struct reserved_mem *rmem,
 {
 	extern const struct of_device_id __reservedmem_of_table[];
 	const struct of_device_id *i;
-	int ret = -ENOENT;
+	int ret = -ENODEV;
 
-	for (i = __reservedmem_of_table; i < &__rmem_of_table_sentinel; i++) {
+	for (i = __reservedmem_of_table; ret == -ENODEV &&
+	     i < &__rmem_of_table_sentinel; i++) {
 		reservedmem_of_init_fn initfn = i->data;
 		const char *compat = i->compatible;
 
@@ -574,7 +575,7 @@ static void __init fdt_init_reserved_mem_node(struct reserved_mem *rmem,
 	nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL;
 
 	err = __reserved_mem_init_node(rmem, node);
-	if (err != 0 && err != -ENOENT) {
+	if (err != 0 && err != -ENODEV) {
 		pr_info("node %s compatible matching fail\n", rmem->name);
 		if (nomap)
 			memblock_clear_nomap(rmem->base, rmem->size);
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 34621acbd3c5..64f9ba618e19 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -370,7 +370,7 @@ static const struct reserved_mem_ops rmem_dma_ops = {
 static int __init rmem_dma_setup(unsigned long node, struct reserved_mem *rmem)
 {
 	if (of_get_flat_dt_prop(node, "reusable", NULL))
-		return -EINVAL;
+		return -ENODEV;
 
 #ifdef CONFIG_ARM
 	if (!of_get_flat_dt_prop(node, "no-map", NULL)) {
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 81a2fa4971ee..e6fc6906b5c0 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -483,7 +483,7 @@ static int __init rmem_cma_setup(unsigned long node, struct reserved_mem *rmem)
 
 	if (!of_get_flat_dt_prop(node, "reusable", NULL) ||
 	    of_get_flat_dt_prop(node, "no-map", NULL))
-		return -EINVAL;
+		return -ENODEV;
 
 	if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) {
 		pr_err("Reserved memory: incorrect alignment of CMA region\n");

From c640cad6a5382ea08a4e052156cfefc8021c51b7 Mon Sep 17 00:00:00 2001
From: Marek Szyprowski
Date: Wed, 25 Mar 2026 10:00:19 +0100
Subject: [PATCH 12/24] of: reserved_mem: switch to ops based OF_DECLARE()

Move the init function from the OF_DECLARE() argument to the given
reserved memory region's ops structure and then pass that structure to
the OF_DECLARE() initializer. The node_init callback is mandatory for a
reserved mem driver. This change makes it possible in the future to add
more functions called by the generic code before a given memory region
is initialized and its rmem object is created.
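Continuing the hypothetical driver sketch from above (names invented;
not part of the patch), the declaration after this change registers an
ops structure instead of a bare init function:

  static const struct reserved_mem_ops my_pool_ops = {
      .node_init      = my_pool_setup,          /* mandatory */
      .device_init    = my_pool_device_init,    /* optional, as before */
      .device_release = my_pool_device_release, /* optional, as before */
  };

  RESERVEDMEM_OF_DECLARE(my_pool, "vendor,my-pool", &my_pool_ops);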
Signed-off-by: Marek Szyprowski
Link: https://patch.msgid.link/20260325090023.3175348-4-m.szyprowski@samsung.com
Signed-off-by: Rob Herring (Arm)
---
 drivers/memory/tegra/tegra210-emc-table.c | 16 ++++++++--------
 drivers/of/of_reserved_mem.c              | 17 +++++++++++++----
 include/linux/of_reserved_mem.h           | 13 ++++++-------
 kernel/dma/coherent.c                     | 13 +++++++------
 kernel/dma/contiguous.c                   | 15 ++++++++-------
 kernel/dma/swiotlb.c                      | 14 +++++++-------
 6 files changed, 49 insertions(+), 39 deletions(-)

diff --git a/drivers/memory/tegra/tegra210-emc-table.c b/drivers/memory/tegra/tegra210-emc-table.c
index ac1d1e13482a..4b3c478b2743 100644
--- a/drivers/memory/tegra/tegra210-emc-table.c
+++ b/drivers/memory/tegra/tegra210-emc-table.c
@@ -70,20 +70,20 @@ static void tegra210_emc_table_device_release(struct reserved_mem *rmem,
 	memunmap(timings);
 }
 
-static const struct reserved_mem_ops tegra210_emc_table_ops = {
-	.device_init = tegra210_emc_table_device_init,
-	.device_release = tegra210_emc_table_device_release,
-};
-
 static int tegra210_emc_table_init(unsigned long node,
 				   struct reserved_mem *rmem)
 {
 	pr_debug("Tegra210 EMC table at %pa, size %lu bytes\n", &rmem->base,
 		 (unsigned long)rmem->size);
 
-	rmem->ops = &tegra210_emc_table_ops;
-
 	return 0;
 }
+
+static const struct reserved_mem_ops tegra210_emc_table_ops = {
+	.node_init = tegra210_emc_table_init,
+	.device_init = tegra210_emc_table_device_init,
+	.device_release = tegra210_emc_table_device_release,
+};
+
 RESERVEDMEM_OF_DECLARE(tegra210_emc_table, "nvidia,tegra210-emc-table",
-		       tegra210_emc_table_init);
+		       &tegra210_emc_table_ops);
diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c
index 9aff460a0420..4dd0d6f6a4b0 100644
--- a/drivers/of/of_reserved_mem.c
+++ b/drivers/of/of_reserved_mem.c
@@ -480,8 +480,16 @@ static int __init __reserved_mem_alloc_size(unsigned long node, const char *unam
 static const struct of_device_id __rmem_of_table_sentinel
 	__used __section("__reservedmem_of_table_end");
 
-/*
- * __reserved_mem_init_node() - call region specific reserved memory init code
+/**
+ * __reserved_mem_init_node() - initialize a reserved memory region
+ * @rmem: reserved_mem structure to initialize
+ * @node: FDT node describing the reserved memory region
+ *
+ * This function iterates through the reserved memory drivers and calls the
+ * node_init callback for the compatible entry matching the node. On success,
+ * the operations pointer is stored in the reserved_mem structure.
+ * + * Return: 0 on success, -ENODEV if no compatible match found */ static int __init __reserved_mem_init_node(struct reserved_mem *rmem, unsigned long node) @@ -492,14 +500,15 @@ static int __init __reserved_mem_init_node(struct reserved_mem *rmem, for (i = __reservedmem_of_table; ret == -ENODEV && i < &__rmem_of_table_sentinel; i++) { - reservedmem_of_init_fn initfn = i->data; + const struct reserved_mem_ops *ops = i->data; const char *compat = i->compatible; if (!of_flat_dt_is_compatible(node, compat)) continue; - ret = initfn(node, rmem); + ret = ops->node_init(node, rmem); if (ret == 0) { + rmem->ops = ops; pr_info("initialized node %s, compatible id %s\n", rmem->name, compat); break; diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h index 5159938bfe03..747a1e73d5dd 100644 --- a/include/linux/of_reserved_mem.h +++ b/include/linux/of_reserved_mem.h @@ -18,19 +18,17 @@ struct reserved_mem { }; struct reserved_mem_ops { + int (*node_init)(unsigned long fdt_node, struct reserved_mem *rmem); int (*device_init)(struct reserved_mem *rmem, struct device *dev); void (*device_release)(struct reserved_mem *rmem, struct device *dev); }; -typedef int (*reservedmem_of_init_fn)(unsigned long node, - struct reserved_mem *rmem); - #ifdef CONFIG_OF_RESERVED_MEM -#define RESERVEDMEM_OF_DECLARE(name, compat, init) \ - _OF_DECLARE(reservedmem, name, compat, init, reservedmem_of_init_fn) +#define RESERVEDMEM_OF_DECLARE(name, compat, ops) \ + _OF_DECLARE(reservedmem, name, compat, ops, struct reserved_mem_ops *) int of_reserved_mem_device_init_by_idx(struct device *dev, struct device_node *np, int idx); @@ -48,8 +46,9 @@ int of_reserved_mem_region_count(const struct device_node *np); #else -#define RESERVEDMEM_OF_DECLARE(name, compat, init) \ - _OF_DECLARE_STUB(reservedmem, name, compat, init, reservedmem_of_init_fn) +#define RESERVEDMEM_OF_DECLARE(name, compat, ops) \ + _OF_DECLARE_STUB(reservedmem, name, compat, ops, \ + struct reserved_mem_ops *) static inline int of_reserved_mem_device_init_by_idx(struct device *dev, struct device_node *np, int idx) diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 64f9ba618e19..bcdc0f76d2e8 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -362,10 +362,6 @@ static void rmem_dma_device_release(struct reserved_mem *rmem, dev->dma_mem = NULL; } -static const struct reserved_mem_ops rmem_dma_ops = { - .device_init = rmem_dma_device_init, - .device_release = rmem_dma_device_release, -}; static int __init rmem_dma_setup(unsigned long node, struct reserved_mem *rmem) { @@ -388,7 +384,6 @@ static int __init rmem_dma_setup(unsigned long node, struct reserved_mem *rmem) } #endif - rmem->ops = &rmem_dma_ops; pr_info("Reserved memory: created DMA memory pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); return 0; @@ -405,5 +400,11 @@ static int __init dma_init_reserved_memory(void) core_initcall(dma_init_reserved_memory); #endif /* CONFIG_DMA_GLOBAL_POOL */ -RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup); +static const struct reserved_mem_ops rmem_dma_ops = { + .node_init = rmem_dma_setup, + .device_init = rmem_dma_device_init, + .device_release = rmem_dma_device_release, +}; + +RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", &rmem_dma_ops); #endif diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index e6fc6906b5c0..efeebda92537 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -470,11 +470,6 @@ static void rmem_cma_device_release(struct 
reserved_mem *rmem, dev->cma_area = NULL; } -static const struct reserved_mem_ops rmem_cma_ops = { - .device_init = rmem_cma_device_init, - .device_release = rmem_cma_device_release, -}; - static int __init rmem_cma_setup(unsigned long node, struct reserved_mem *rmem) { bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); @@ -499,7 +494,6 @@ static int __init rmem_cma_setup(unsigned long node, struct reserved_mem *rmem) if (default_cma) dma_contiguous_default_area = cma; - rmem->ops = &rmem_cma_ops; rmem->priv = cma; pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n", @@ -511,5 +505,12 @@ static int __init rmem_cma_setup(unsigned long node, struct reserved_mem *rmem) return 0; } -RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup); + +static const struct reserved_mem_ops rmem_cma_ops = { + .node_init = rmem_cma_setup, + .device_init = rmem_cma_device_init, + .device_release = rmem_cma_device_release, +}; + +RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", &rmem_cma_ops); #endif diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f3a12e15a951..44b566d20e04 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1855,11 +1855,6 @@ static void rmem_swiotlb_device_release(struct reserved_mem *rmem, dev->dma_io_tlb_mem = &io_tlb_default_mem; } -static const struct reserved_mem_ops rmem_swiotlb_ops = { - .device_init = rmem_swiotlb_device_init, - .device_release = rmem_swiotlb_device_release, -}; - static int __init rmem_swiotlb_setup(unsigned long node, struct reserved_mem *rmem) { @@ -1869,11 +1864,16 @@ static int __init rmem_swiotlb_setup(unsigned long node, of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; - rmem->ops = &rmem_swiotlb_ops; pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); return 0; } -RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup); +static const struct reserved_mem_ops rmem_swiotlb_ops = { + .node_init = rmem_swiotlb_setup, + .device_init = rmem_swiotlb_device_init, + .device_release = rmem_swiotlb_device_release, +}; + +RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", &rmem_swiotlb_ops); #endif /* CONFIG_DMA_RESTRICTED_POOL */ From 7fd3981202b9aef8862aa06ca0d75496c0f9681f Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 25 Mar 2026 10:00:20 +0100 Subject: [PATCH 13/24] of: reserved_mem: replace CMA quirks by generic methods Add optional reserved memory callbacks to perform region verification and early fixup, then move all CMA related code in of_reserved_mem.c to them. 
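Both new hooks are optional; a driver that needs them extends its ops structure as sketched below (the hypothetical foo names continue from the previous patch's example, with foo_rmem_setup as defined there; the real conversion for CMA follows in this patch):

	static int __init foo_rmem_validate(unsigned long node,
					    phys_addr_t *align)
	{
		/*
		 * Reject unsuitable nodes early; may raise the requested
		 * alignment when the caller passes a non-NULL pointer.
		 */
		if (align)
			*align = max_t(phys_addr_t, *align, SZ_1M);
		return 0;
	}

	static int __init foo_rmem_fixup(unsigned long node, phys_addr_t base,
					 phys_addr_t size)
	{
		/* driver-specific fixup, run right after reserve/alloc */
		return 0;
	}

	static const struct reserved_mem_ops foo_rmem_ops = {
		.node_validate	= foo_rmem_validate,
		.node_fixup	= foo_rmem_fixup,
		.node_init	= foo_rmem_setup,
	};

When no compatible entry (or no callback) matches, the generic iterators return -ENODEV and the callers treat the region as an ordinary reserved memory node.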
Signed-off-by: Marek Szyprowski Link: https://patch.msgid.link/20260325090023.3175348-5-m.szyprowski@samsung.com Signed-off-by: Rob Herring (Arm) --- drivers/of/of_reserved_mem.c | 118 ++++++++++++++++++++++---------- include/linux/cma.h | 10 --- include/linux/dma-map-ops.h | 3 - include/linux/of_reserved_mem.h | 3 + kernel/dma/contiguous.c | 70 ++++++++++++++----- 5 files changed, 137 insertions(+), 67 deletions(-) diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 4dd0d6f6a4b0..5dd585bcf8a8 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -24,8 +24,6 @@ #include #include #include -#include -#include #include "of_private.h" @@ -106,6 +104,11 @@ static void __init alloc_reserved_mem_array(void) static void __init fdt_init_reserved_mem_node(struct reserved_mem *rmem, unsigned long node); +static int fdt_validate_reserved_mem_node(unsigned long node, + phys_addr_t *align); +static int fdt_fixup_reserved_mem_node(unsigned long node, + phys_addr_t base, phys_addr_t size); + /* * fdt_reserved_mem_save_node() - save fdt node for second pass initialization */ @@ -154,21 +157,19 @@ static int __init __reserved_mem_reserve_reg(unsigned long node, const char *uname) { phys_addr_t base, size; - int i, len; + int i, len, err; const __be32 *prop; - bool nomap, default_cma; + bool nomap; prop = of_flat_dt_get_addr_size_prop(node, "reg", &len); if (!prop) return -ENOENT; nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL; - default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); - if (default_cma && cma_skip_dt_default_reserved_mem()) { - pr_err("Skipping dt linux,cma-default for \"cma=\" kernel param.\n"); - return -EINVAL; - } + err = fdt_validate_reserved_mem_node(node, NULL); + if (err && err != -ENODEV) + return err; for (i = 0; i < len; i++) { u64 b, s; @@ -179,10 +180,7 @@ static int __init __reserved_mem_reserve_reg(unsigned long node, size = s; if (size && early_init_dt_reserve_memory(base, size, nomap) == 0) { - /* Architecture specific contiguous memory fixup. 
*/ - if (of_flat_dt_is_compatible(node, "shared-dma-pool") && - of_get_flat_dt_prop(node, "reusable", NULL)) - dma_contiguous_early_fixup(base, size); + fdt_fixup_reserved_mem_node(node, base, size); pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); } else { @@ -253,17 +251,19 @@ void __init fdt_scan_reserved_mem_reg_nodes(void) fdt_for_each_subnode(child, fdt, node) { const char *uname; - bool default_cma = of_get_flat_dt_prop(child, "linux,cma-default", NULL); u64 b, s; + int ret; if (!of_fdt_device_is_available(fdt, child)) continue; - if (default_cma && cma_skip_dt_default_reserved_mem()) - continue; if (!of_flat_dt_get_addr_size(child, "reg", &b, &s)) continue; + ret = fdt_validate_reserved_mem_node(child, NULL); + if (ret && ret != -ENODEV) + continue; + base = b; size = s; @@ -397,7 +397,7 @@ static int __init __reserved_mem_alloc_size(unsigned long node, const char *unam phys_addr_t base = 0, align = 0, size; int i, len; const __be32 *prop; - bool nomap, default_cma; + bool nomap; int ret; prop = of_get_flat_dt_prop(node, "size", &len); @@ -421,19 +421,10 @@ static int __init __reserved_mem_alloc_size(unsigned long node, const char *unam } nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL; - default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); - if (default_cma && cma_skip_dt_default_reserved_mem()) { - pr_err("Skipping dt linux,cma-default for \"cma=\" kernel param.\n"); - return -EINVAL; - } - - /* Need adjust the alignment to satisfy the CMA requirement */ - if (IS_ENABLED(CONFIG_CMA) - && of_flat_dt_is_compatible(node, "shared-dma-pool") - && of_get_flat_dt_prop(node, "reusable", NULL) - && !nomap) - align = max_t(phys_addr_t, align, CMA_MIN_ALIGNMENT_BYTES); + ret = fdt_validate_reserved_mem_node(node, &align); + if (ret && ret != -ENODEV) + return ret; prop = of_flat_dt_get_addr_size_prop(node, "alloc-ranges", &len); if (prop) { @@ -468,18 +459,76 @@ static int __init __reserved_mem_alloc_size(unsigned long node, const char *unam uname, (unsigned long)(size / SZ_1M)); return -ENOMEM; } - /* Architecture specific contiguous memory fixup. */ - if (of_flat_dt_is_compatible(node, "shared-dma-pool") && - of_get_flat_dt_prop(node, "reusable", NULL)) - dma_contiguous_early_fixup(base, size); + + fdt_fixup_reserved_mem_node(node, base, size); + /* Save region in the reserved_mem array */ fdt_reserved_mem_save_node(node, uname, base, size); return 0; } +extern const struct of_device_id __reservedmem_of_table[]; static const struct of_device_id __rmem_of_table_sentinel __used __section("__reservedmem_of_table_end"); +/** + * fdt_fixup_reserved_mem_node() - call fixup function for a reserved memory node + * @node: FDT node to fixup + * @base: base address of the reserved memory region + * @size: size of the reserved memory region + * + * This function iterates through the reserved memory drivers and calls + * the node_fixup callback for the compatible entry matching the node. 
+ * + * Return: 0 on success, -ENODEV if no compatible match found + */ +static int __init fdt_fixup_reserved_mem_node(unsigned long node, + phys_addr_t base, phys_addr_t size) +{ + const struct of_device_id *i; + int ret = -ENODEV; + + for (i = __reservedmem_of_table; ret == -ENODEV && + i < &__rmem_of_table_sentinel; i++) { + const struct reserved_mem_ops *ops = i->data; + + if (!of_flat_dt_is_compatible(node, i->compatible)) + continue; + + if (ops->node_fixup) + ret = ops->node_fixup(node, base, size); + } + return ret; +} + +/** + * fdt_validate_reserved_mem_node() - validate a reserved memory node + * @node: FDT node to validate + * @align: pointer to store the validated alignment (may be modified by callback) + * + * This function iterates through the reserved memory drivers and calls + * the node_validate callback for the compatible entry matching the node. + * + * Return: 0 on success, -ENODEV if no compatible match found + */ +static int __init fdt_validate_reserved_mem_node(unsigned long node, phys_addr_t *align) +{ + const struct of_device_id *i; + int ret = -ENODEV; + + for (i = __reservedmem_of_table; ret == -ENODEV && + i < &__rmem_of_table_sentinel; i++) { + const struct reserved_mem_ops *ops = i->data; + + if (!of_flat_dt_is_compatible(node, i->compatible)) + continue; + + if (ops->node_validate) + ret = ops->node_validate(node, align); + } + return ret; +} + /** * __reserved_mem_init_node() - initialize a reserved memory region * @rmem: reserved_mem structure to initialize @@ -494,7 +543,6 @@ static const struct of_device_id __rmem_of_table_sentinel static int __init __reserved_mem_init_node(struct reserved_mem *rmem, unsigned long node) { - extern const struct of_device_id __reservedmem_of_table[]; const struct of_device_id *i; int ret = -ENODEV; @@ -511,7 +559,7 @@ static int __init __reserved_mem_init_node(struct reserved_mem *rmem, rmem->ops = ops; pr_info("initialized node %s, compatible id %s\n", rmem->name, compat); - break; + return ret; } } return ret; diff --git a/include/linux/cma.h b/include/linux/cma.h index d0793eaaadaa..8555d38a97b1 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -61,14 +61,4 @@ extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) extern bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end); extern void cma_reserve_pages_on_error(struct cma *cma); - -#ifdef CONFIG_DMA_CMA -extern bool cma_skip_dt_default_reserved_mem(void); -#else -static inline bool cma_skip_dt_default_reserved_mem(void) -{ - return false; -} -#endif - #endif diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 60b63756df82..55ecd2934225 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -147,9 +147,6 @@ static inline void dma_free_contiguous(struct device *dev, struct page *page, { __free_pages(page, get_order(size)); } -static inline void dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) -{ -} #endif /* CONFIG_DMA_CMA*/ #ifdef CONFIG_DMA_DECLARE_COHERENT diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h index 747a1e73d5dd..e8b20b29fa68 100644 --- a/include/linux/of_reserved_mem.h +++ b/include/linux/of_reserved_mem.h @@ -18,6 +18,9 @@ struct reserved_mem { }; struct reserved_mem_ops { + int (*node_validate)(unsigned long fdt_node, phys_addr_t *align); + int (*node_fixup)(unsigned long fdt_node, phys_addr_t base, + phys_addr_t size); int (*node_init)(unsigned long fdt_node, struct reserved_mem *rmem); int 
(*device_init)(struct reserved_mem *rmem, struct device *dev); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index efeebda92537..65d216663e81 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -91,16 +91,6 @@ static int __init early_cma(char *p) } early_param("cma", early_cma); -/* - * cma_skip_dt_default_reserved_mem - This is called from the - * reserved_mem framework to detect if the default cma region is being - * set by the "cma=" kernel parameter. - */ -bool __init cma_skip_dt_default_reserved_mem(void) -{ - return size_cmdline != -1; -} - #ifdef CONFIG_DMA_NUMA_CMA static struct cma *dma_contiguous_numa_area[MAX_NUMNODES]; @@ -470,25 +460,65 @@ static void rmem_cma_device_release(struct reserved_mem *rmem, dev->cma_area = NULL; } +static int __init __rmem_cma_verify_node(unsigned long node) +{ + if (!of_get_flat_dt_prop(node, "reusable", NULL) || + of_get_flat_dt_prop(node, "no-map", NULL)) + return -ENODEV; + + if (size_cmdline != -1 && + of_get_flat_dt_prop(node, "linux,cma-default", NULL)) { + pr_err("Skipping dt linux,cma-default node in favor for \"cma=\" kernel param.\n"); + return -EBUSY; + } + return 0; +} + +static int __init rmem_cma_validate(unsigned long node, phys_addr_t *align) +{ + int ret = __rmem_cma_verify_node(node); + + if (ret) + return ret; + + if (align) + *align = max_t(phys_addr_t, *align, CMA_MIN_ALIGNMENT_BYTES); + + return 0; +} + +static int __init rmem_cma_fixup(unsigned long node, phys_addr_t base, + phys_addr_t size) +{ + int ret = __rmem_cma_verify_node(node); + + if (ret) + return ret; + + /* Architecture specific contiguous memory fixup. */ + dma_contiguous_early_fixup(base, size); + return 0; +} + static int __init rmem_cma_setup(unsigned long node, struct reserved_mem *rmem) { bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; - int err; + int ret; - if (!of_get_flat_dt_prop(node, "reusable", NULL) || - of_get_flat_dt_prop(node, "no-map", NULL)) - return -ENODEV; + ret = __rmem_cma_verify_node(node); + if (ret) + return ret; if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) { pr_err("Reserved memory: incorrect alignment of CMA region\n"); return -EINVAL; } - err = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma); - if (err) { + ret = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma); + if (ret) { pr_err("Reserved memory: unable to setup CMA region\n"); - return err; + return ret; } if (default_cma) @@ -499,14 +529,16 @@ static int __init rmem_cma_setup(unsigned long node, struct reserved_mem *rmem) pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); - err = dma_heap_cma_register_heap(cma); - if (err) + ret = dma_heap_cma_register_heap(cma); + if (ret) pr_warn("Couldn't register CMA heap."); return 0; } static const struct reserved_mem_ops rmem_cma_ops = { + .node_validate = rmem_cma_validate, + .node_fixup = rmem_cma_fixup, .node_init = rmem_cma_setup, .device_init = rmem_cma_device_init, .device_release = rmem_cma_device_release, From 427864f793eb69ae3f33aed0fcbe625809412366 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 25 Mar 2026 10:00:21 +0100 Subject: [PATCH 14/24] of: reserved_mem: rearrange code a bit Move __rmem_check_for_overlap() and __rmem_cmp() functions before fdt_scan_reserved_mem_reg_nodes() to avoid forward declaration and keep related code close together. 
Signed-off-by: Marek Szyprowski Link: https://patch.msgid.link/20260325090023.3175348-6-m.szyprowski@samsung.com Signed-off-by: Rob Herring (Arm) --- drivers/of/of_reserved_mem.c | 99 ++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 50 deletions(-) diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 5dd585bcf8a8..f9b6d3ebcc20 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -214,7 +214,55 @@ static int __init __reserved_mem_check_root(unsigned long node) return 0; } -static void __init __rmem_check_for_overlap(void); +static int __init __rmem_cmp(const void *a, const void *b) +{ + const struct reserved_mem *ra = a, *rb = b; + + if (ra->base < rb->base) + return -1; + + if (ra->base > rb->base) + return 1; + + /* + * Put the dynamic allocations (address == 0, size == 0) before static + * allocations at address 0x0 so that overlap detection works + * correctly. + */ + if (ra->size < rb->size) + return -1; + if (ra->size > rb->size) + return 1; + + return 0; +} + +static void __init __rmem_check_for_overlap(void) +{ + int i; + + if (reserved_mem_count < 2) + return; + + sort(reserved_mem, reserved_mem_count, sizeof(reserved_mem[0]), + __rmem_cmp, NULL); + for (i = 0; i < reserved_mem_count - 1; i++) { + struct reserved_mem *this, *next; + + this = &reserved_mem[i]; + next = &reserved_mem[i + 1]; + + if (this->base + this->size > next->base) { + phys_addr_t this_end, next_end; + + this_end = this->base + this->size; + next_end = next->base + next->size; + pr_err("OVERLAP DETECTED!\n%s (%pa--%pa) overlaps with %s (%pa--%pa)\n", + this->name, &this->base, &this_end, + next->name, &next->base, &next_end); + } + } +} /** * fdt_scan_reserved_mem_reg_nodes() - Store info for the "reg" defined @@ -565,55 +613,6 @@ static int __init __reserved_mem_init_node(struct reserved_mem *rmem, return ret; } -static int __init __rmem_cmp(const void *a, const void *b) -{ - const struct reserved_mem *ra = a, *rb = b; - - if (ra->base < rb->base) - return -1; - - if (ra->base > rb->base) - return 1; - - /* - * Put the dynamic allocations (address == 0, size == 0) before static - * allocations at address 0x0 so that overlap detection works - * correctly. - */ - if (ra->size < rb->size) - return -1; - if (ra->size > rb->size) - return 1; - - return 0; -} - -static void __init __rmem_check_for_overlap(void) -{ - int i; - - if (reserved_mem_count < 2) - return; - - sort(reserved_mem, reserved_mem_count, sizeof(reserved_mem[0]), - __rmem_cmp, NULL); - for (i = 0; i < reserved_mem_count - 1; i++) { - struct reserved_mem *this, *next; - - this = &reserved_mem[i]; - next = &reserved_mem[i + 1]; - - if (this->base + this->size > next->base) { - phys_addr_t this_end, next_end; - - this_end = this->base + this->size; - next_end = next->base + next->size; - pr_err("OVERLAP DETECTED!\n%s (%pa--%pa) overlaps with %s (%pa--%pa)\n", - this->name, &this->base, &this_end, - next->name, &next->base, &next_end); - } - } -} /** * fdt_init_reserved_mem_node() - Initialize a reserved memory region From bf66171579ce738d3dccce78b7dd37de2ba947f2 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 25 Mar 2026 10:00:22 +0100 Subject: [PATCH 15/24] of: reserved_mem: clarify fdt_scan_reserved_mem*() functions Rename fdt_scan_reserved_mem_reg_nodes() to fdt_scan_reserved_mem_late() to clearly show how it differs from fdt_scan_reserved_mem() and update description of both functions. 
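The rename makes the two-pass structure easier to see; roughly, boot code drives the passes as sketched below (simplified; the exact call site of the late pass is in the unflatten_device_tree() hunk in this patch):

	/*
	 * Pass 1: during the early FDT scan, while memblock is still the
	 * only allocator: reserve the static ("reg") regions, then
	 * allocate and initialize the dynamic (size/alignment based) ones.
	 */
	fdt_scan_reserved_mem();

	/*
	 * Pass 2: called from unflatten_device_tree() later in boot:
	 * initialize the remaining static regions and store them in the
	 * global reserved_mem array.
	 */
	fdt_scan_reserved_mem_late();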
Signed-off-by: Marek Szyprowski Link: https://patch.msgid.link/20260325090023.3175348-7-m.szyprowski@samsung.com Signed-off-by: Rob Herring (Arm) --- drivers/of/fdt.c | 2 +- drivers/of/of_private.h | 2 +- drivers/of/of_reserved_mem.c | 24 +++++++++++++++--------- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 331646d667b9..43a0944ca462 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1274,7 +1274,7 @@ void __init unflatten_device_tree(void) void *fdt = initial_boot_params; /* Save the statically-placed regions in the reserved_mem array */ - fdt_scan_reserved_mem_reg_nodes(); + fdt_scan_reserved_mem_late(); /* Populate an empty root node when bootloader doesn't provide one */ if (!fdt) { diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h index df0bb00349e0..0ae16da066e2 100644 --- a/drivers/of/of_private.h +++ b/drivers/of/of_private.h @@ -186,7 +186,7 @@ static inline struct device_node *__of_get_dma_parent(const struct device_node * #endif int fdt_scan_reserved_mem(void); -void __init fdt_scan_reserved_mem_reg_nodes(void); +void __init fdt_scan_reserved_mem_late(void); bool of_fdt_device_is_available(const void *blob, unsigned long node); diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index f9b6d3ebcc20..037e3d74dde1 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -265,16 +265,15 @@ static void __init __rmem_check_for_overlap(void) } /** - * fdt_scan_reserved_mem_reg_nodes() - Store info for the "reg" defined - * reserved memory regions. + * fdt_scan_reserved_mem_late() - Scan FDT and initialize remaining reserved + * memory regions. * - * This function is used to scan through the DT and store the - * information for the reserved memory regions that are defined using - * the "reg" property. The region node number, name, base address, and - * size are all stored in the reserved_mem array by calling the - * fdt_reserved_mem_save_node() function. + * This function is used to scan again through the DT and initialize the + * "static" reserved memory regions, that are defined using the "reg" + * property. Each such region is then initialized with its specific init + * function and stored in the global reserved_mem array. */ -void __init fdt_scan_reserved_mem_reg_nodes(void) +void __init fdt_scan_reserved_mem_late(void) { const void *fdt = initial_boot_params; phys_addr_t base, size; @@ -328,7 +327,14 @@ void __init fdt_scan_reserved_mem_reg_nodes(void) static int __init __reserved_mem_alloc_size(unsigned long node, const char *uname); /* - * fdt_scan_reserved_mem() - scan a single FDT node for reserved memory + * fdt_scan_reserved_mem() - reserve and allocate memory occupied by + * reserved memory regions. + * + * This function is used to scan through the FDT and mark memory occupied + * by all static (defined by the "reg" property) reserved memory regions. + * Then memory for all dynamic regions (defined by size & alignment) is + * allocated, a region specific init function is called and region information + * is stored in the reserved_mem array. */ int __init fdt_scan_reserved_mem(void) { From 34e0e2a8ea9e9e4f4dceb33072103dffaa1366b3 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 25 Mar 2026 10:00:23 +0100 Subject: [PATCH 16/24] of: reserved_mem: rework fdt_init_reserved_mem_node() Move the content of fdt_reserved_mem_save_node() to fdt_init_reserved_mem_node() function. 
Initialization is no longer performed in two steps as it was initially, so the fdt_reserved_mem_save_node() name is a bit misleading: that function now performs the full initialization of the reserved memory region. This also fixes the problem of keeping pointers to regions that failed to initialize, which might cause issues when such a region is assigned to a device. Signed-off-by: Marek Szyprowski Link: https://patch.msgid.link/20260325090023.3175348-8-m.szyprowski@samsung.com Signed-off-by: Rob Herring (Arm) --- drivers/of/of_reserved_mem.c | 62 ++++++++++++++-------------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 037e3d74dde1..8d5777cb5d1b 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -102,36 +102,13 @@ static void __init alloc_reserved_mem_array(void) reserved_mem = new_array; } -static void __init fdt_init_reserved_mem_node(struct reserved_mem *rmem, - unsigned long node); +static void fdt_init_reserved_mem_node(unsigned long node, const char *uname, + phys_addr_t base, phys_addr_t size); static int fdt_validate_reserved_mem_node(unsigned long node, phys_addr_t *align); static int fdt_fixup_reserved_mem_node(unsigned long node, phys_addr_t base, phys_addr_t size); -/* - * fdt_reserved_mem_save_node() - save fdt node for second pass initialization - */ -static void __init fdt_reserved_mem_save_node(unsigned long node, const char *uname, - phys_addr_t base, phys_addr_t size) -{ - struct reserved_mem *rmem = &reserved_mem[reserved_mem_count]; - - if (reserved_mem_count == total_reserved_mem_cnt) { - pr_err("not enough space for all defined regions.\n"); - return; - } - - rmem->name = uname; - rmem->base = base; - rmem->size = size; - - /* Call the region specific initialization function */ - fdt_init_reserved_mem_node(rmem, node); - - reserved_mem_count++; -} - static int __init early_init_dt_reserve_memory(phys_addr_t base, phys_addr_t size, bool nomap) { @@ -316,7 +293,7 @@ void __init fdt_scan_reserved_mem_late(void) if (size) { uname = fdt_get_name(fdt, child, NULL); - fdt_reserved_mem_save_node(child, uname, base, size); + fdt_init_reserved_mem_node(child, uname, base, size); } } @@ -515,9 +492,8 @@ static int __init __reserved_mem_alloc_size(unsigned long node, const char *unam } fdt_fixup_reserved_mem_node(node, base, size); + fdt_init_reserved_mem_node(node, uname, base, size); - /* Save region in the reserved_mem array */ - fdt_reserved_mem_save_node(node, uname, base, size); return 0; } @@ -619,30 +595,46 @@ static int __init __reserved_mem_init_node(struct reserved_mem *rmem, return ret; } - /** * fdt_init_reserved_mem_node() - Initialize a reserved memory region - * @rmem: reserved_mem struct of the memory region to be initialized. * @node: fdt node of the initialized region + * @uname: name of the reserved memory node + * @base: base address of the reserved memory region + * @size: size of the reserved memory region * - * This function is used to call the region specific initialization - * function for a reserved memory region. + * This function calls the region-specific initialization function for a + * reserved memory region and saves all region-specific data to the + * reserved_mem array to allow of_reserved_mem_lookup() to find it.
*/ -static void __init fdt_init_reserved_mem_node(struct reserved_mem *rmem, - unsigned long node) +static void __init fdt_init_reserved_mem_node(unsigned long node, const char *uname, + phys_addr_t base, phys_addr_t size) { int err = 0; bool nomap; + struct reserved_mem *rmem = &reserved_mem[reserved_mem_count]; + + if (reserved_mem_count == total_reserved_mem_cnt) { + pr_err("not enough space for all defined regions.\n"); + return; + } + + rmem->name = uname; + rmem->base = base; + rmem->size = size; + nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL; err = __reserved_mem_init_node(rmem, node); if (err != 0 && err != -ENODEV) { pr_info("node %s compatible matching fail\n", rmem->name); + rmem->name = NULL; + if (nomap) memblock_clear_nomap(rmem->base, rmem->size); else memblock_phys_free(rmem->base, rmem->size); + return; } else { phys_addr_t end = rmem->base + rmem->size - 1; bool reusable = @@ -654,6 +646,8 @@ static void __init fdt_init_reserved_mem_node(struct reserved_mem *rmem, reusable ? "reusable" : "non-reusable", rmem->name ? rmem->name : "unknown"); } + + reserved_mem_count++; } struct rmem_assigned_device { From 25bd73562941b04cfba1a278d8c84f2b1c69b8e9 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 31 Mar 2026 12:00:10 +0200 Subject: [PATCH 17/24] dma: contiguous: Turn heap registration logic around The CMA heap instantiation was initially developed by having the contiguous DMA code call into the CMA heap to create a new instance every time a reserved memory area is probed. Turning the CMA heap into a module would create a dependency of the kernel on a module, which doesn't work. Let's turn the logic around and do the opposite: store all the reserved memory CMA regions into the contiguous DMA code, and provide an iterator for the heap to use when it probes. 
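With the list stored on the DMA side, the heap probe path becomes a plain indexed walk. A sketch of the consumer loop, mirroring the cma_heap.c hunk below (__add_cma_heap() is the heap's existing instantiation helper):

	struct cma *cma;
	unsigned int i;

	for (i = 0; (cma = dma_contiguous_get_area_by_idx(i)) != NULL; i++) {
		/* one dma-buf heap per recorded CMA region */
		if (__add_cma_heap(cma, cma_get_name(cma)))
			pr_warn("Failed to add CMA heap %s", cma_get_name(cma));
	}

Because the iterator simply returns NULL past the end of the array, the heap module no longer needs to know MAX_CMA_AREAS or the registration count.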
Signed-off-by: Maxime Ripard Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260331-dma-buf-heaps-as-modules-v4-1-e18fda504419@kernel.org --- drivers/dma-buf/heaps/cma_heap.c | 19 ++--------- include/linux/dma-buf/heaps/cma.h | 16 --------- include/linux/dma-map-ops.h | 5 +++ kernel/dma/contiguous.c | 55 ++++++++++++++++++++++++++++--- 4 files changed, 57 insertions(+), 38 deletions(-) delete mode 100644 include/linux/dma-buf/heaps/cma.h diff --git a/drivers/dma-buf/heaps/cma_heap.c b/drivers/dma-buf/heaps/cma_heap.c index bd3370b9a3f6..33cac626da11 100644 --- a/drivers/dma-buf/heaps/cma_heap.c +++ b/drivers/dma-buf/heaps/cma_heap.c @@ -14,7 +14,6 @@ #include #include -#include #include #include #include @@ -30,19 +29,6 @@ #define DEFAULT_CMA_NAME "default_cma_region" -static struct cma *dma_areas[MAX_CMA_AREAS] __initdata; -static unsigned int dma_areas_num __initdata; - -int __init dma_heap_cma_register_heap(struct cma *cma) -{ - if (dma_areas_num >= ARRAY_SIZE(dma_areas)) - return -EINVAL; - - dma_areas[dma_areas_num++] = cma; - - return 0; -} - struct cma_heap { struct dma_heap *heap; struct cma *cma; @@ -414,6 +400,7 @@ static int __init __add_cma_heap(struct cma *cma, const char *name) static int __init add_cma_heaps(void) { struct cma *default_cma = dev_get_cma_area(NULL); + struct cma *cma; unsigned int i; int ret; @@ -423,9 +410,7 @@ static int __init add_cma_heaps(void) return ret; } - for (i = 0; i < dma_areas_num; i++) { - struct cma *cma = dma_areas[i]; - + for (i = 0; (cma = dma_contiguous_get_area_by_idx(i)) != NULL; i++) { ret = __add_cma_heap(cma, cma_get_name(cma)); if (ret) { pr_warn("Failed to add CMA heap %s", cma_get_name(cma)); diff --git a/include/linux/dma-buf/heaps/cma.h b/include/linux/dma-buf/heaps/cma.h deleted file mode 100644 index e751479e21e7..000000000000 --- a/include/linux/dma-buf/heaps/cma.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef DMA_BUF_HEAP_CMA_H_ -#define DMA_BUF_HEAP_CMA_H_ - -struct cma; - -#ifdef CONFIG_DMABUF_HEAPS_CMA -int dma_heap_cma_register_heap(struct cma *cma); -#else -static inline int dma_heap_cma_register_heap(struct cma *cma) -{ - return 0; -} -#endif // CONFIG_DMABUF_HEAPS_CMA - -#endif // DMA_BUF_HEAP_CMA_H_ diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 60b63756df82..c4c93c72ff6f 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -99,6 +99,7 @@ static inline struct cma *dev_get_cma_area(struct device *dev) return dev->cma_area; return dma_contiguous_default_area; } +struct cma *dma_contiguous_get_area_by_idx(unsigned int idx); void dma_contiguous_reserve(phys_addr_t addr_limit); int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, @@ -117,6 +118,10 @@ static inline struct cma *dev_get_cma_area(struct device *dev) { return NULL; } +static inline struct cma *dma_contiguous_get_area_by_idx(unsigned int idx) +{ + return NULL; +} static inline void dma_contiguous_reserve(phys_addr_t limit) { } diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index c56004d314dc..afa9fd313040 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include @@ -53,6 +52,37 @@ #define CMA_SIZE_MBYTES 0 #endif +static struct cma *dma_contiguous_areas[MAX_CMA_AREAS]; +static unsigned int dma_contiguous_areas_num; + +static int dma_contiguous_insert_area(struct cma *cma) +{ + if (dma_contiguous_areas_num >= 
ARRAY_SIZE(dma_contiguous_areas)) + return -EINVAL; + + dma_contiguous_areas[dma_contiguous_areas_num++] = cma; + + return 0; +} + +/** + * dma_contiguous_get_area_by_idx() - Get contiguous area at given index + * @idx: index of the area we query + * + * Queries for the contiguous area located at index @idx. + * + * Returns: + * A pointer to the requested contiguous area, or NULL otherwise. + */ +struct cma *dma_contiguous_get_area_by_idx(unsigned int idx) +{ + if (idx >= dma_contiguous_areas_num) + return NULL; + + return dma_contiguous_areas[idx]; +} +EXPORT_SYMBOL_GPL(dma_contiguous_get_area_by_idx); + struct cma *dma_contiguous_default_area; /* @@ -264,9 +294,24 @@ void __init dma_contiguous_reserve(phys_addr_t limit) if (ret) return; - ret = dma_heap_cma_register_heap(dma_contiguous_default_area); + /* + * We need to insert the new area in our list to avoid + * any inconsistencies between having the default area + * listed in the DT or not. + * + * The DT case is handled by rmem_cma_setup() and will + * always insert all its areas in our list. However, if + * it didn't run (because OF_RESERVED_MEM isn't set, or + * there's no DT region specified), then we don't have a + * default area yet, and no area in our list. + * + * This block creates the default area in such a case, + * but we also need to insert it in our list to avoid + * having a default area but an empty list. + */ + ret = dma_contiguous_insert_area(dma_contiguous_default_area); if (ret) - pr_warn("Couldn't register default CMA heap."); + pr_warn("Couldn't queue default CMA region for heap creation."); } } @@ -506,9 +551,9 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); - err = dma_heap_cma_register_heap(cma); + err = dma_contiguous_insert_area(cma); if (err) - pr_warn("Couldn't register CMA heap."); + pr_warn("Couldn't store CMA reserved area."); return 0; } From b3707be95f045c4e526e419435af29dc9dd1c267 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 31 Mar 2026 12:00:11 +0200 Subject: [PATCH 18/24] dma: contiguous: Make dev_get_cma_area() a proper function As we try to enable dma-buf heaps, and the CMA one in particular, to compile as modules, we need to export dev_get_cma_area(). It's currently implemented as an inline function that returns either the content of device->cma_area or dma_contiguous_default_area. Thus, it means we need to export dma_contiguous_default_area, which isn't really something we want any module to have access to. Instead, let's make dev_get_cma_area() a proper function we will be able to export so we can avoid exporting dma_contiguous_default_area. 
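The dependency is easiest to see from the module side: with the static inline definition, the function body, including its load of dma_contiguous_default_area, was compiled into every caller, so a modular caller would have referenced that data symbol directly and forced its export. A before/after sketch of the header:

	/* before: body lands in each caller, modules included */
	static inline struct cma *dev_get_cma_area(struct device *dev)
	{
		if (dev && dev->cma_area)
			return dev->cma_area;
		return dma_contiguous_default_area; /* would need an export */
	}

	/* after: only a declaration; the body lives in contiguous.c */
	struct cma *dev_get_cma_area(struct device *dev);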
Signed-off-by: Maxime Ripard Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260331-dma-buf-heaps-as-modules-v4-2-e18fda504419@kernel.org --- include/linux/dma-map-ops.h | 7 +------ kernel/dma/contiguous.c | 8 ++++++++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index c4c93c72ff6f..8604106c0c01 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -93,12 +93,7 @@ static inline void set_dma_ops(struct device *dev, #ifdef CONFIG_DMA_CMA extern struct cma *dma_contiguous_default_area; -static inline struct cma *dev_get_cma_area(struct device *dev) -{ - if (dev && dev->cma_area) - return dev->cma_area; - return dma_contiguous_default_area; -} +struct cma *dev_get_cma_area(struct device *dev); struct cma *dma_contiguous_get_area_by_idx(unsigned int idx); void dma_contiguous_reserve(phys_addr_t addr_limit); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index afa9fd313040..40a0ead24979 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -131,6 +131,14 @@ bool __init cma_skip_dt_default_reserved_mem(void) return size_cmdline != -1; } +struct cma *dev_get_cma_area(struct device *dev) +{ + if (dev && dev->cma_area) + return dev->cma_area; + + return dma_contiguous_default_area; +} + #ifdef CONFIG_DMA_NUMA_CMA static struct cma *dma_contiguous_numa_area[MAX_NUMNODES]; From 633040f853467a490437ace26d6a5413e64c0dd0 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 31 Mar 2026 12:00:12 +0200 Subject: [PATCH 19/24] dma: contiguous: Make dma_contiguous_default_area static Now that dev_get_cma_area() is no longer inline, we don't have any user of dma_contiguous_default_area() outside of contiguous.c so we can make it static. Signed-off-by: Maxime Ripard Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260331-dma-buf-heaps-as-modules-v4-3-e18fda504419@kernel.org --- include/linux/dma-map-ops.h | 2 -- kernel/dma/contiguous.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 8604106c0c01..bef279ebeae7 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -91,8 +91,6 @@ static inline void set_dma_ops(struct device *dev, #endif /* CONFIG_ARCH_HAS_DMA_OPS */ #ifdef CONFIG_DMA_CMA -extern struct cma *dma_contiguous_default_area; - struct cma *dev_get_cma_area(struct device *dev); struct cma *dma_contiguous_get_area_by_idx(unsigned int idx); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 40a0ead24979..fd8d3518a232 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -83,7 +83,7 @@ struct cma *dma_contiguous_get_area_by_idx(unsigned int idx) } EXPORT_SYMBOL_GPL(dma_contiguous_get_area_by_idx); -struct cma *dma_contiguous_default_area; +static struct cma *dma_contiguous_default_area; /* * Default global CMA area size can be defined in kernel's .config. From 6207948f389ec1b938a39aa43fb4aedd58d65e0d Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 31 Mar 2026 12:00:13 +0200 Subject: [PATCH 20/24] dma: contiguous: Export dev_get_cma_area() The CMA dma-buf heap uses the dev_get_cma_area() function to retrieve the default contiguous area. Now that this function is no longer inlined, and since we want to turn the CMA heap into a module, let's export it. 
Signed-off-by: Maxime Ripard Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260331-dma-buf-heaps-as-modules-v4-4-e18fda504419@kernel.org --- kernel/dma/contiguous.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index fd8d3518a232..83a5bd9488e1 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -138,6 +138,7 @@ struct cma *dev_get_cma_area(struct device *dev) return dma_contiguous_default_area; } +EXPORT_SYMBOL_GPL(dev_get_cma_area); #ifdef CONFIG_DMA_NUMA_CMA From 7e72a8f8bb0d5ba89027d64e3e2aad1984d2e20d Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 31 Mar 2026 12:00:14 +0200 Subject: [PATCH 21/24] mm: cma: Export cma_alloc(), cma_release() and cma_get_name() The CMA dma-buf heap uses cma_alloc() and cma_release() to allocate and free, respectively, its CMA buffers, and cma_get_name() to get the name of the heap instance it's going to create. However, these functions are not exported. Since we want to turn the CMA heap into a module, let's export them both. Reviewed-by: T.J. Mercier Acked-by: David Hildenbrand (Arm) Signed-off-by: Maxime Ripard Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260331-dma-buf-heaps-as-modules-v4-5-e18fda504419@kernel.org --- mm/cma.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/cma.c b/mm/cma.c index 94b5da468a7d..550effb9c4e0 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -52,6 +52,7 @@ const char *cma_get_name(const struct cma *cma) { return cma->name; } +EXPORT_SYMBOL_GPL(cma_get_name); static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, unsigned int align_order) @@ -951,6 +952,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, return page; } +EXPORT_SYMBOL_GPL(cma_alloc); static struct cma_memrange *find_cma_memrange(struct cma *cma, const struct page *pages, unsigned long count) @@ -1027,6 +1029,7 @@ bool cma_release(struct cma *cma, const struct page *pages, return true; } +EXPORT_SYMBOL_GPL(cma_release); bool cma_release_frozen(struct cma *cma, const struct page *pages, unsigned long count) From f0548044a02630402d374df195ed3af4cc5e4711 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 25 Mar 2026 20:23:51 +0100 Subject: [PATCH 22/24] dma-mapping: introduce DMA_ATTR_CC_SHARED for shared memory Current CC designs don't place a vIOMMU in front of untrusted devices. Instead, the DMA API forces all untrusted device DMA through swiotlb bounce buffers (is_swiotlb_force_bounce()) which copies data into shared memory on behalf of the device. When a caller has already arranged for the memory to be shared via set_memory_decrypted(), the DMA API needs to know so it can map directly using the unencrypted physical address rather than bounce buffering. Following the pattern of DMA_ATTR_MMIO, add DMA_ATTR_CC_SHARED for this purpose. Like the MMIO case, only the caller knows what kind of memory it has and must inform the DMA API for it to work correctly. 
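From the caller's perspective the contract is: share the memory first, then tell the DMA API about it. A minimal sketch, assuming a page the caller owns (error unwinding trimmed; on failure the pages must not silently return to the allocator while still decrypted):

	/* make the pages shared and notify the hypervisor */
	if (set_memory_decrypted((unsigned long)page_address(page), nr_pages))
		return -EIO;

	/* map using the unencrypted physical address, no bounce buffering */
	dma_addr = dma_map_page_attrs(dev, page, 0, size, DMA_TO_DEVICE,
				      DMA_ATTR_CC_SHARED);
	if (dma_mapping_error(dev, dma_addr))
		goto err_reencrypt;	/* set_memory_encrypted() before free */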
Signed-off-by: Jiri Pirko Reviewed-by: Jason Gunthorpe Acked-by: Sumit Semwal Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260325192352.437608-2-jiri@resnulli.us --- include/linux/dma-mapping.h | 10 ++++++++++ include/trace/events/dma.h | 3 ++- kernel/dma/direct.h | 14 +++++++++++--- kernel/dma/mapping.c | 13 +++++++++++-- 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 677c51ab7510..db8ab24a54f4 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -92,6 +92,16 @@ * flushing. */ #define DMA_ATTR_REQUIRE_COHERENT (1UL << 12) +/* + * DMA_ATTR_CC_SHARED: Indicates the DMA mapping is shared (decrypted) for + * confidential computing guests. For normal system memory the caller must have + * called set_memory_decrypted(), and pgprot_decrypted must be used when + * creating CPU PTEs for the mapping. The same shared semantic may be passed + * to the vIOMMU when it sets up the IOPTE. For MMIO use together with + * DMA_ATTR_MMIO to indicate shared MMIO. Unless DMA_ATTR_MMIO is provided + * a struct page is required. + */ +#define DMA_ATTR_CC_SHARED (1UL << 13) /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index 63597b004424..31c9ddf72c9d 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -34,7 +34,8 @@ TRACE_DEFINE_ENUM(DMA_NONE); { DMA_ATTR_PRIVILEGED, "PRIVILEGED" }, \ { DMA_ATTR_MMIO, "MMIO" }, \ { DMA_ATTR_DEBUGGING_IGNORE_CACHELINES, "CACHELINES_OVERLAP" }, \ - { DMA_ATTR_REQUIRE_COHERENT, "REQUIRE_COHERENT" }) + { DMA_ATTR_REQUIRE_COHERENT, "REQUIRE_COHERENT" }, \ + { DMA_ATTR_CC_SHARED, "CC_SHARED" }) DECLARE_EVENT_CLASS(dma_map, TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr, diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index b86ff65496fc..7140c208c123 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -89,16 +89,24 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev, dma_addr_t dma_addr; if (is_swiotlb_force_bounce(dev)) { - if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT)) - return DMA_MAPPING_ERROR; + if (!(attrs & DMA_ATTR_CC_SHARED)) { + if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT)) + return DMA_MAPPING_ERROR; - return swiotlb_map(dev, phys, size, dir, attrs); + return swiotlb_map(dev, phys, size, dir, attrs); + } + } else if (attrs & DMA_ATTR_CC_SHARED) { + return DMA_MAPPING_ERROR; } if (attrs & DMA_ATTR_MMIO) { dma_addr = phys; if (unlikely(!dma_capable(dev, dma_addr, size, false))) goto err_overflow; + } else if (attrs & DMA_ATTR_CC_SHARED) { + dma_addr = phys_to_dma_unencrypted(dev, phys); + if (unlikely(!dma_capable(dev, dma_addr, size, false))) + goto err_overflow; } else { dma_addr = phys_to_dma(dev, phys); if (unlikely(!dma_capable(dev, dma_addr, size, true)) || diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index df3eccc7d4ca..23ed8eb9233e 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -157,6 +157,7 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, { const struct dma_map_ops *ops = get_dma_ops(dev); bool is_mmio = attrs & DMA_ATTR_MMIO; + bool is_cc_shared = attrs & DMA_ATTR_CC_SHARED; dma_addr_t addr = DMA_MAPPING_ERROR; BUG_ON(!valid_dma_direction(dir)); @@ -168,8 +169,11 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, return DMA_MAPPING_ERROR; if 
(dma_map_direct(dev, ops) || - (!is_mmio && arch_dma_map_phys_direct(dev, phys + size))) + (!is_mmio && !is_cc_shared && + arch_dma_map_phys_direct(dev, phys + size))) addr = dma_direct_map_phys(dev, phys, size, dir, attrs, true); + else if (is_cc_shared) + return DMA_MAPPING_ERROR; else if (use_dma_iommu(dev)) addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); else if (ops->map_phys) @@ -206,11 +210,16 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, { const struct dma_map_ops *ops = get_dma_ops(dev); bool is_mmio = attrs & DMA_ATTR_MMIO; + bool is_cc_shared = attrs & DMA_ATTR_CC_SHARED; BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops) || - (!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size))) + (!is_mmio && !is_cc_shared && + arch_dma_unmap_phys_direct(dev, addr + size))) dma_direct_unmap_phys(dev, addr, size, dir, attrs, true); + else if (is_cc_shared) + return; else if (use_dma_iommu(dev)) iommu_dma_unmap_phys(dev, addr, size, dir, attrs); else if (ops->unmap_phys) From 78b30c50a7ac9b8fbec678d71f81dec80bf8eed6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 25 Mar 2026 20:23:52 +0100 Subject: [PATCH 23/24] dma-buf: heaps: system: add system_cc_shared heap for explicitly shared memory Add a new "system_cc_shared" dma-buf heap to allow userspace to allocate shared (decrypted) memory for confidential computing (CoCo) VMs. On CoCo VMs, guest memory is private by default. The hardware uses an encryption bit in page table entries (C-bit on AMD SEV, "shared" bit on Intel TDX) to control whether a given memory access is private or shared. The kernel's direct map is set up as private, so pages returned by alloc_pages() are private in the direct map by default. To make this memory usable for devices that do not support DMA to private memory (no TDISP support), it has to be explicitly shared. A couple of things are needed to properly handle shared memory for the dma-buf use case: - set_memory_decrypted() on the direct map after allocation: Besides clearing the encryption bit in the direct map PTEs, this also notifies the hypervisor about the page state change. On free, the inverse set_memory_encrypted() must be called before returning pages to the allocator. If re-encryption fails, pages are intentionally leaked to prevent shared memory from being reused as private. - pgprot_decrypted() for userspace and kernel virtual mappings: Any new mapping of the shared pages, be it to userspace via mmap or to kernel vmalloc space via vmap, creates PTEs independent of the direct map. These must also have the encryption bit cleared, otherwise accesses through them would see encrypted (garbage) data. - DMA_ATTR_CC_SHARED for DMA mapping: Since the pages are already shared, the DMA API needs to be informed via DMA_ATTR_CC_SHARED so it can map them correctly as unencrypted for device access. On non-CoCo VMs, the system_cc_shared heap is not registered to prevent misuse by userspace that does not understand the security implications of explicitly shared memory. Signed-off-by: Jiri Pirko Reviewed-by: T.J. 
Mercier Reviewed-by: Jason Gunthorpe Acked-by: Sumit Semwal Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260325192352.437608-3-jiri@resnulli.us --- drivers/dma-buf/heaps/system_heap.c | 103 ++++++++++++++++++++++++++-- 1 file changed, 98 insertions(+), 5 deletions(-) diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c index b3650d8fd651..03c2b87cb111 100644 --- a/drivers/dma-buf/heaps/system_heap.c +++ b/drivers/dma-buf/heaps/system_heap.c @@ -10,17 +10,25 @@ * Andrew F. Davis */ +#include #include #include #include #include #include +#include #include +#include #include +#include #include #include #include +struct system_heap_priv { + bool cc_shared; +}; + struct system_heap_buffer { struct dma_heap *heap; struct list_head attachments; @@ -29,6 +37,7 @@ struct system_heap_buffer { struct sg_table sg_table; int vmap_cnt; void *vaddr; + bool cc_shared; }; struct dma_heap_attachment { @@ -36,6 +45,7 @@ struct dma_heap_attachment { struct sg_table table; struct list_head list; bool mapped; + bool cc_shared; }; #define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO) @@ -52,6 +62,34 @@ static gfp_t order_flags[] = {HIGH_ORDER_GFP, HIGH_ORDER_GFP, LOW_ORDER_GFP}; static const unsigned int orders[] = {8, 4, 0}; #define NUM_ORDERS ARRAY_SIZE(orders) +static int system_heap_set_page_decrypted(struct page *page) +{ + unsigned long addr = (unsigned long)page_address(page); + unsigned int nr_pages = 1 << compound_order(page); + int ret; + + ret = set_memory_decrypted(addr, nr_pages); + if (ret) + pr_warn_ratelimited("dma-buf system heap: failed to decrypt page at %p\n", + page_address(page)); + + return ret; +} + +static int system_heap_set_page_encrypted(struct page *page) +{ + unsigned long addr = (unsigned long)page_address(page); + unsigned int nr_pages = 1 << compound_order(page); + int ret; + + ret = set_memory_encrypted(addr, nr_pages); + if (ret) + pr_warn_ratelimited("dma-buf system heap: failed to re-encrypt page at %p, leaking memory\n", + page_address(page)); + + return ret; +} + static int dup_sg_table(struct sg_table *from, struct sg_table *to) { struct scatterlist *sg, *new_sg; @@ -90,6 +128,7 @@ static int system_heap_attach(struct dma_buf *dmabuf, a->dev = attachment->dev; INIT_LIST_HEAD(&a->list); a->mapped = false; + a->cc_shared = buffer->cc_shared; attachment->priv = a; @@ -119,9 +158,11 @@ static struct sg_table *system_heap_map_dma_buf(struct dma_buf_attachment *attac { struct dma_heap_attachment *a = attachment->priv; struct sg_table *table = &a->table; + unsigned long attrs; int ret; - ret = dma_map_sgtable(attachment->dev, table, direction, 0); + attrs = a->cc_shared ? 
DMA_ATTR_CC_SHARED : 0; + ret = dma_map_sgtable(attachment->dev, table, direction, attrs); if (ret) return ERR_PTR(ret); @@ -188,8 +229,13 @@ static int system_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma) unsigned long addr = vma->vm_start; unsigned long pgoff = vma->vm_pgoff; struct scatterlist *sg; + pgprot_t prot; int i, ret; + prot = vma->vm_page_prot; + if (buffer->cc_shared) + prot = pgprot_decrypted(prot); + for_each_sgtable_sg(table, sg, i) { unsigned long n = sg->length >> PAGE_SHIFT; @@ -206,8 +252,7 @@ static int system_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma) if (addr + size > vma->vm_end) size = vma->vm_end - addr; - ret = remap_pfn_range(vma, addr, page_to_pfn(page), - size, vma->vm_page_prot); + ret = remap_pfn_range(vma, addr, page_to_pfn(page), size, prot); if (ret) return ret; @@ -225,6 +270,7 @@ static void *system_heap_do_vmap(struct system_heap_buffer *buffer) struct page **pages = vmalloc(sizeof(struct page *) * npages); struct page **tmp = pages; struct sg_page_iter piter; + pgprot_t prot; void *vaddr; if (!pages) @@ -235,7 +281,10 @@ static void *system_heap_do_vmap(struct system_heap_buffer *buffer) *tmp++ = sg_page_iter_page(&piter); } - vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL); + prot = PAGE_KERNEL; + if (buffer->cc_shared) + prot = pgprot_decrypted(prot); + vaddr = vmap(pages, npages, VM_MAP, prot); vfree(pages); if (!vaddr) @@ -296,6 +345,14 @@ static void system_heap_dma_buf_release(struct dma_buf *dmabuf) for_each_sgtable_sg(table, sg, i) { struct page *page = sg_page(sg); + /* + * Intentionally leak pages that cannot be re-encrypted + * to prevent shared memory from being reused. + */ + if (buffer->cc_shared && + system_heap_set_page_encrypted(page)) + continue; + __free_pages(page, compound_order(page)); } sg_free_table(table); @@ -347,6 +404,8 @@ static struct dma_buf *system_heap_allocate(struct dma_heap *heap, DEFINE_DMA_BUF_EXPORT_INFO(exp_info); unsigned long size_remaining = len; unsigned int max_order = orders[0]; + struct system_heap_priv *priv = dma_heap_get_drvdata(heap); + bool cc_shared = priv->cc_shared; struct dma_buf *dmabuf; struct sg_table *table; struct scatterlist *sg; @@ -362,6 +421,7 @@ static struct dma_buf *system_heap_allocate(struct dma_heap *heap, mutex_init(&buffer->lock); buffer->heap = heap; buffer->len = len; + buffer->cc_shared = cc_shared; INIT_LIST_HEAD(&pages); i = 0; @@ -396,6 +456,14 @@ static struct dma_buf *system_heap_allocate(struct dma_heap *heap, list_del(&page->lru); } + if (cc_shared) { + for_each_sgtable_sg(table, sg, i) { + ret = system_heap_set_page_decrypted(sg_page(sg)); + if (ret) + goto free_pages; + } + } + /* create the dmabuf */ exp_info.exp_name = dma_heap_get_name(heap); exp_info.ops = &system_heap_buf_ops; @@ -413,6 +481,13 @@ free_pages: for_each_sgtable_sg(table, sg, i) { struct page *p = sg_page(sg); + /* + * Intentionally leak pages that cannot be re-encrypted + * to prevent shared memory from being reused. 
+ */ + if (buffer->cc_shared && + system_heap_set_page_encrypted(p)) + continue; __free_pages(p, compound_order(p)); } sg_free_table(table); @@ -428,6 +503,14 @@ static const struct dma_heap_ops system_heap_ops = { .allocate = system_heap_allocate, }; +static struct system_heap_priv system_heap_priv = { + .cc_shared = false, +}; + +static struct system_heap_priv system_heap_cc_shared_priv = { + .cc_shared = true, +}; + static int __init system_heap_create(void) { struct dma_heap_export_info exp_info; @@ -435,8 +518,18 @@ static int __init system_heap_create(void) exp_info.name = "system"; exp_info.ops = &system_heap_ops; - exp_info.priv = NULL; + exp_info.priv = &system_heap_priv; + sys_heap = dma_heap_add(&exp_info); + if (IS_ERR(sys_heap)) + return PTR_ERR(sys_heap); + + if (IS_ENABLED(CONFIG_HIGHMEM) || + !cc_platform_has(CC_ATTR_MEM_ENCRYPT)) + return 0; + + exp_info.name = "system_cc_shared"; + exp_info.priv = &system_heap_cc_shared_priv; sys_heap = dma_heap_add(&exp_info); if (IS_ERR(sys_heap)) return PTR_ERR(sys_heap); From 15818b2cd42df3cc886f4cc46acfab4d072dcacc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 7 Apr 2026 11:26:17 +0200 Subject: [PATCH 24/24] dma-buf: heaps: system: document system_cc_shared heap Document the system_cc_shared dma-buf heap that was introduced recently. Describe its purpose, availability conditions and relation to confidential computing VMs. Signed-off-by: Jiri Pirko Reviewed-by: T.J.Mercier Acked-by: Sumit Semwal Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260407092617.635223-1-jiri@resnulli.us --- Documentation/userspace-api/dma-buf-heaps.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/userspace-api/dma-buf-heaps.rst b/Documentation/userspace-api/dma-buf-heaps.rst index 05445c83b79a..f56b743cdb36 100644 --- a/Documentation/userspace-api/dma-buf-heaps.rst +++ b/Documentation/userspace-api/dma-buf-heaps.rst @@ -16,6 +16,13 @@ following heaps: - The ``system`` heap allocates virtually contiguous, cacheable, buffers. + - The ``system_cc_shared`` heap allocates virtually contiguous, cacheable, + buffers using shared (decrypted) memory. It is only present on + confidential computing (CoCo) VMs where memory encryption is active + (e.g., AMD SEV, Intel TDX). The allocated pages have the encryption + bit cleared, making them accessible for device DMA without TDISP + support. On non-CoCo VM configurations, this heap is not registered. + - The ``default_cma_region`` heap allocates physically contiguous, cacheable, buffers. Only present if a CMA region is present. Such a region is usually created either through the kernel commandline