Merge tag 'dma-mapping-7.1-2026-04-16' of git://git.kernel.org/pub/scm/linux/kernel/git/mszyprowski/linux

Pull dma-mapping updates from Marek Szyprowski:

 - added support for batched cache sync, which improves the performance
   of dma_map/unmap_sg() operations on the ARM64 architecture (Barry Song)

 - introduced DMA_ATTR_CC_SHARED attribute for explicitly shared memory
   used in confidential computing (Jiri Pirko)

 - refactored spaghetti-like code in drivers/of/of_reserved_mem.c and
   its clients (Marek Szyprowski, shared branch with device-tree updates
   to avoid merge conflicts)

 - prepared Contiguous Memory Allocator related code for making dma-buf
   drivers modularized (Maxime Ripard)

 - added support for benchmarking dma_map_sg() calls to tools/dma
   utility (Qinxin Xia)

* tag 'dma-mapping-7.1-2026-04-16' of git://git.kernel.org/pub/scm/linux/kernel/git/mszyprowski/linux: (24 commits)
  dma-buf: heaps: system: document system_cc_shared heap
  dma-buf: heaps: system: add system_cc_shared heap for explicitly shared memory
  dma-mapping: introduce DMA_ATTR_CC_SHARED for shared memory
  mm: cma: Export cma_alloc(), cma_release() and cma_get_name()
  dma: contiguous: Export dev_get_cma_area()
  dma: contiguous: Make dma_contiguous_default_area static
  dma: contiguous: Make dev_get_cma_area() a proper function
  dma: contiguous: Turn heap registration logic around
  of: reserved_mem: rework fdt_init_reserved_mem_node()
  of: reserved_mem: clarify fdt_scan_reserved_mem*() functions
  of: reserved_mem: rearrange code a bit
  of: reserved_mem: replace CMA quirks by generic methods
  of: reserved_mem: switch to ops based OF_DECLARE()
  of: reserved_mem: use -ENODEV instead of -ENOENT
  of: reserved_mem: remove fdt node from the structure
  dma-mapping: fix false kernel-doc comment marker
  dma-mapping: Support batch mode for dma_direct_{map,unmap}_sg
  dma-mapping: Separate DMA sync issuing and completion waiting
  arm64: Provide dcache_inval_poc_nosync helper
  arm64: Provide dcache_clean_poc_nosync helper
  ...
This commit is contained in:
Linus Torvalds
2026-04-17 11:12:42 -07:00
33 changed files with 931 additions and 359 deletions

View File

@@ -16,6 +16,13 @@ following heaps:
- The ``system`` heap allocates virtually contiguous, cacheable, buffers. - The ``system`` heap allocates virtually contiguous, cacheable, buffers.
- The ``system_cc_shared`` heap allocates virtually contiguous, cacheable,
buffers using shared (decrypted) memory. It is only present on
confidential computing (CoCo) VMs where memory encryption is active
(e.g., AMD SEV, Intel TDX). The allocated pages have the encryption
bit cleared, making them accessible for device DMA without TDISP
support. On non-CoCo VM configurations, this heap is not registered.
- The ``default_cma_region`` heap allocates physically contiguous, - The ``default_cma_region`` heap allocates physically contiguous,
cacheable, buffers. Only present if a CMA region is present. Such a cacheable, buffers. Only present if a CMA region is present. Such a
region is usually created either through the kernel commandline region is usually created either through the kernel commandline

View File

@@ -54,6 +54,7 @@ config ARM64
select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_STRICT_MODULE_RWX
select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_SYNC_DMA_FOR_DEVICE
select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_CPU
select ARCH_HAS_BATCHED_DMA_SYNC
select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_SYSCALL_WRAPPER
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAS_ZONE_DMA_SET if EXPERT

View File

@@ -371,14 +371,13 @@ alternative_endif
* [start, end) with dcache line size explicitly provided. * [start, end) with dcache line size explicitly provided.
* *
* op: operation passed to dc instruction * op: operation passed to dc instruction
* domain: domain used in dsb instruction
* start: starting virtual address of the region * start: starting virtual address of the region
* end: end virtual address of the region * end: end virtual address of the region
* linesz: dcache line size * linesz: dcache line size
* fixup: optional label to branch to on user fault * fixup: optional label to branch to on user fault
* Corrupts: start, end, tmp * Corrupts: start, end, tmp
*/ */
.macro dcache_by_myline_op op, domain, start, end, linesz, tmp, fixup .macro dcache_by_myline_op_nosync op, start, end, linesz, tmp, fixup
sub \tmp, \linesz, #1 sub \tmp, \linesz, #1
bic \start, \start, \tmp bic \start, \start, \tmp
alternative_if ARM64_WORKAROUND_4311569 alternative_if ARM64_WORKAROUND_4311569
@@ -412,14 +411,28 @@ alternative_if ARM64_WORKAROUND_4311569
cbnz \start, .Ldcache_op\@ cbnz \start, .Ldcache_op\@
.endif .endif
alternative_else_nop_endif alternative_else_nop_endif
dsb \domain
_cond_uaccess_extable .Ldcache_op\@, \fixup _cond_uaccess_extable .Ldcache_op\@, \fixup
.endm .endm
/* /*
* Macro to perform a data cache maintenance for the interval * Macro to perform a data cache maintenance for the interval
* [start, end) * [start, end) without waiting for completion
*
* op: operation passed to dc instruction
* start: starting virtual address of the region
* end: end virtual address of the region
* fixup: optional label to branch to on user fault
* Corrupts: start, end, tmp1, tmp2
*/
.macro dcache_by_line_op_nosync op, start, end, tmp1, tmp2, fixup
dcache_line_size \tmp1, \tmp2
dcache_by_myline_op_nosync \op, \start, \end, \tmp1, \tmp2, \fixup
.endm
/*
* Macro to perform a data cache maintenance for the interval
* [start, end) and wait for completion
* *
* op: operation passed to dc instruction * op: operation passed to dc instruction
* domain: domain used in dsb instruction * domain: domain used in dsb instruction
@@ -429,8 +442,8 @@ alternative_else_nop_endif
* Corrupts: start, end, tmp1, tmp2 * Corrupts: start, end, tmp1, tmp2
*/ */
.macro dcache_by_line_op op, domain, start, end, tmp1, tmp2, fixup .macro dcache_by_line_op op, domain, start, end, tmp1, tmp2, fixup
dcache_line_size \tmp1, \tmp2 dcache_by_line_op_nosync \op, \start, \end, \tmp1, \tmp2, \fixup
dcache_by_myline_op \op, \domain, \start, \end, \tmp1, \tmp2, \fixup dsb \domain
.endm .endm
/* /*

View File

@@ -87,6 +87,11 @@ int cache_line_size(void);
#define dma_get_cache_alignment cache_line_size #define dma_get_cache_alignment cache_line_size
static inline void arch_sync_dma_flush(void)
{
dsb(sy);
}
/* Compress a u64 MPIDR value into 32 bits. */ /* Compress a u64 MPIDR value into 32 bits. */
static inline u64 arch_compact_of_hwid(u64 id) static inline u64 arch_compact_of_hwid(u64 id)
{ {

View File

@@ -74,6 +74,8 @@ extern void icache_inval_pou(unsigned long start, unsigned long end);
extern void dcache_clean_inval_poc(unsigned long start, unsigned long end); extern void dcache_clean_inval_poc(unsigned long start, unsigned long end);
extern void dcache_inval_poc(unsigned long start, unsigned long end); extern void dcache_inval_poc(unsigned long start, unsigned long end);
extern void dcache_clean_poc(unsigned long start, unsigned long end); extern void dcache_clean_poc(unsigned long start, unsigned long end);
extern void dcache_inval_poc_nosync(unsigned long start, unsigned long end);
extern void dcache_clean_poc_nosync(unsigned long start, unsigned long end);
extern void dcache_clean_pop(unsigned long start, unsigned long end); extern void dcache_clean_pop(unsigned long start, unsigned long end);
extern void dcache_clean_pou(unsigned long start, unsigned long end); extern void dcache_clean_pou(unsigned long start, unsigned long end);
extern long caches_clean_inval_user_pou(unsigned long start, unsigned long end); extern long caches_clean_inval_user_pou(unsigned long start, unsigned long end);

View File

@@ -64,7 +64,8 @@ SYM_CODE_START(arm64_relocate_new_kernel)
mov x19, x13 mov x19, x13
copy_page x13, x12, x1, x2, x3, x4, x5, x6, x7, x8 copy_page x13, x12, x1, x2, x3, x4, x5, x6, x7, x8
add x1, x19, #PAGE_SIZE add x1, x19, #PAGE_SIZE
dcache_by_myline_op civac, sy, x19, x1, x15, x20 dcache_by_myline_op_nosync civac, x19, x1, x15, x20
dsb sy
b .Lnext b .Lnext
.Ltest_indirection: .Ltest_indirection:
tbz x16, IND_INDIRECTION_BIT, .Ltest_destination tbz x16, IND_INDIRECTION_BIT, .Ltest_destination

View File

@@ -132,17 +132,7 @@ alternative_else_nop_endif
ret ret
SYM_FUNC_END(dcache_clean_pou) SYM_FUNC_END(dcache_clean_pou)
/* .macro __dcache_inval_poc_nosync
* dcache_inval_poc(start, end)
*
* Ensure that any D-cache lines for the interval [start, end)
* are invalidated. Any partial lines at the ends of the interval are
* also cleaned to PoC to prevent data loss.
*
* - start - kernel start address of region
* - end - kernel end address of region
*/
SYM_FUNC_START(__pi_dcache_inval_poc)
dcache_line_size x2, x3 dcache_line_size x2, x3
sub x3, x2, #1 sub x3, x2, #1
tst x1, x3 // end cache line aligned? tst x1, x3 // end cache line aligned?
@@ -158,11 +148,41 @@ SYM_FUNC_START(__pi_dcache_inval_poc)
3: add x0, x0, x2 3: add x0, x0, x2
cmp x0, x1 cmp x0, x1
b.lo 2b b.lo 2b
.endm
/*
* dcache_inval_poc(start, end)
*
* Ensure that any D-cache lines for the interval [start, end)
* are invalidated. Any partial lines at the ends of the interval are
* also cleaned to PoC to prevent data loss.
*
* - start - kernel start address of region
* - end - kernel end address of region
*/
SYM_FUNC_START(__pi_dcache_inval_poc)
__dcache_inval_poc_nosync
dsb sy dsb sy
ret ret
SYM_FUNC_END(__pi_dcache_inval_poc) SYM_FUNC_END(__pi_dcache_inval_poc)
SYM_FUNC_ALIAS(dcache_inval_poc, __pi_dcache_inval_poc) SYM_FUNC_ALIAS(dcache_inval_poc, __pi_dcache_inval_poc)
/*
* dcache_inval_poc_nosync(start, end)
*
* Issue the instructions of D-cache lines for the interval [start, end)
* for invalidation. Not necessarily cleaned to PoC till an explicit dsb
* sy is issued later
*
* - start - kernel start address of region
* - end - kernel end address of region
*/
SYM_FUNC_START(__pi_dcache_inval_poc_nosync)
__dcache_inval_poc_nosync
ret
SYM_FUNC_END(__pi_dcache_inval_poc_nosync)
SYM_FUNC_ALIAS(dcache_inval_poc_nosync, __pi_dcache_inval_poc_nosync)
/* /*
* dcache_clean_poc(start, end) * dcache_clean_poc(start, end)
* *
@@ -178,6 +198,21 @@ SYM_FUNC_START(__pi_dcache_clean_poc)
SYM_FUNC_END(__pi_dcache_clean_poc) SYM_FUNC_END(__pi_dcache_clean_poc)
SYM_FUNC_ALIAS(dcache_clean_poc, __pi_dcache_clean_poc) SYM_FUNC_ALIAS(dcache_clean_poc, __pi_dcache_clean_poc)
/*
* dcache_clean_poc_nosync(start, end)
*
* Issue the instructions of D-cache lines for the interval [start, end).
* not necessarily cleaned to the PoC till an explicit dsb sy afterward.
*
* - start - virtual start address of region
* - end - virtual end address of region
*/
SYM_FUNC_START(__pi_dcache_clean_poc_nosync)
dcache_by_line_op_nosync cvac, x0, x1, x2, x3
ret
SYM_FUNC_END(__pi_dcache_clean_poc_nosync)
SYM_FUNC_ALIAS(dcache_clean_poc_nosync, __pi_dcache_clean_poc_nosync)
/* /*
* dcache_clean_pop(start, end) * dcache_clean_pop(start, end)
* *

View File

@@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
{ {
unsigned long start = (unsigned long)phys_to_virt(paddr); unsigned long start = (unsigned long)phys_to_virt(paddr);
dcache_clean_poc(start, start + size); dcache_clean_poc_nosync(start, start + size);
} }
void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
@@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
if (dir == DMA_TO_DEVICE) if (dir == DMA_TO_DEVICE)
return; return;
dcache_inval_poc(start, start + size); dcache_inval_poc_nosync(start, start + size);
} }
void arch_dma_prep_coherent(struct page *page, size_t size) void arch_dma_prep_coherent(struct page *page, size_t size)

View File

@@ -14,7 +14,6 @@
#include <linux/cma.h> #include <linux/cma.h>
#include <linux/dma-buf.h> #include <linux/dma-buf.h>
#include <linux/dma-buf/heaps/cma.h>
#include <linux/dma-heap.h> #include <linux/dma-heap.h>
#include <linux/dma-map-ops.h> #include <linux/dma-map-ops.h>
#include <linux/err.h> #include <linux/err.h>
@@ -30,19 +29,6 @@
#define DEFAULT_CMA_NAME "default_cma_region" #define DEFAULT_CMA_NAME "default_cma_region"
static struct cma *dma_areas[MAX_CMA_AREAS] __initdata;
static unsigned int dma_areas_num __initdata;
int __init dma_heap_cma_register_heap(struct cma *cma)
{
if (dma_areas_num >= ARRAY_SIZE(dma_areas))
return -EINVAL;
dma_areas[dma_areas_num++] = cma;
return 0;
}
struct cma_heap { struct cma_heap {
struct dma_heap *heap; struct dma_heap *heap;
struct cma *cma; struct cma *cma;
@@ -411,6 +397,7 @@ static int __init __add_cma_heap(struct cma *cma, const char *name)
static int __init add_cma_heaps(void) static int __init add_cma_heaps(void)
{ {
struct cma *default_cma = dev_get_cma_area(NULL); struct cma *default_cma = dev_get_cma_area(NULL);
struct cma *cma;
unsigned int i; unsigned int i;
int ret; int ret;
@@ -420,9 +407,7 @@ static int __init add_cma_heaps(void)
return ret; return ret;
} }
for (i = 0; i < dma_areas_num; i++) { for (i = 0; (cma = dma_contiguous_get_area_by_idx(i)) != NULL; i++) {
struct cma *cma = dma_areas[i];
ret = __add_cma_heap(cma, cma_get_name(cma)); ret = __add_cma_heap(cma, cma_get_name(cma));
if (ret) { if (ret) {
pr_warn("Failed to add CMA heap %s", cma_get_name(cma)); pr_warn("Failed to add CMA heap %s", cma_get_name(cma));

View File

@@ -10,17 +10,25 @@
* Andrew F. Davis <afd@ti.com> * Andrew F. Davis <afd@ti.com>
*/ */
#include <linux/cc_platform.h>
#include <linux/dma-buf.h> #include <linux/dma-buf.h>
#include <linux/dma-mapping.h> #include <linux/dma-mapping.h>
#include <linux/dma-heap.h> #include <linux/dma-heap.h>
#include <linux/err.h> #include <linux/err.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/mem_encrypt.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/set_memory.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/pgtable.h>
#include <linux/scatterlist.h> #include <linux/scatterlist.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
struct system_heap_priv {
bool cc_shared;
};
struct system_heap_buffer { struct system_heap_buffer {
struct dma_heap *heap; struct dma_heap *heap;
struct list_head attachments; struct list_head attachments;
@@ -29,6 +37,7 @@ struct system_heap_buffer {
struct sg_table sg_table; struct sg_table sg_table;
int vmap_cnt; int vmap_cnt;
void *vaddr; void *vaddr;
bool cc_shared;
}; };
struct dma_heap_attachment { struct dma_heap_attachment {
@@ -36,6 +45,7 @@ struct dma_heap_attachment {
struct sg_table table; struct sg_table table;
struct list_head list; struct list_head list;
bool mapped; bool mapped;
bool cc_shared;
}; };
#define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO) #define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO)
@@ -52,6 +62,34 @@ static gfp_t order_flags[] = {HIGH_ORDER_GFP, HIGH_ORDER_GFP, LOW_ORDER_GFP};
static const unsigned int orders[] = {8, 4, 0}; static const unsigned int orders[] = {8, 4, 0};
#define NUM_ORDERS ARRAY_SIZE(orders) #define NUM_ORDERS ARRAY_SIZE(orders)
static int system_heap_set_page_decrypted(struct page *page)
{
unsigned long addr = (unsigned long)page_address(page);
unsigned int nr_pages = 1 << compound_order(page);
int ret;
ret = set_memory_decrypted(addr, nr_pages);
if (ret)
pr_warn_ratelimited("dma-buf system heap: failed to decrypt page at %p\n",
page_address(page));
return ret;
}
static int system_heap_set_page_encrypted(struct page *page)
{
unsigned long addr = (unsigned long)page_address(page);
unsigned int nr_pages = 1 << compound_order(page);
int ret;
ret = set_memory_encrypted(addr, nr_pages);
if (ret)
pr_warn_ratelimited("dma-buf system heap: failed to re-encrypt page at %p, leaking memory\n",
page_address(page));
return ret;
}
static int dup_sg_table(struct sg_table *from, struct sg_table *to) static int dup_sg_table(struct sg_table *from, struct sg_table *to)
{ {
struct scatterlist *sg, *new_sg; struct scatterlist *sg, *new_sg;
@@ -90,6 +128,7 @@ static int system_heap_attach(struct dma_buf *dmabuf,
a->dev = attachment->dev; a->dev = attachment->dev;
INIT_LIST_HEAD(&a->list); INIT_LIST_HEAD(&a->list);
a->mapped = false; a->mapped = false;
a->cc_shared = buffer->cc_shared;
attachment->priv = a; attachment->priv = a;
@@ -119,9 +158,11 @@ static struct sg_table *system_heap_map_dma_buf(struct dma_buf_attachment *attac
{ {
struct dma_heap_attachment *a = attachment->priv; struct dma_heap_attachment *a = attachment->priv;
struct sg_table *table = &a->table; struct sg_table *table = &a->table;
unsigned long attrs;
int ret; int ret;
ret = dma_map_sgtable(attachment->dev, table, direction, 0); attrs = a->cc_shared ? DMA_ATTR_CC_SHARED : 0;
ret = dma_map_sgtable(attachment->dev, table, direction, attrs);
if (ret) if (ret)
return ERR_PTR(ret); return ERR_PTR(ret);
@@ -188,8 +229,13 @@ static int system_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
unsigned long addr = vma->vm_start; unsigned long addr = vma->vm_start;
unsigned long pgoff = vma->vm_pgoff; unsigned long pgoff = vma->vm_pgoff;
struct scatterlist *sg; struct scatterlist *sg;
pgprot_t prot;
int i, ret; int i, ret;
prot = vma->vm_page_prot;
if (buffer->cc_shared)
prot = pgprot_decrypted(prot);
for_each_sgtable_sg(table, sg, i) { for_each_sgtable_sg(table, sg, i) {
unsigned long n = sg->length >> PAGE_SHIFT; unsigned long n = sg->length >> PAGE_SHIFT;
@@ -206,8 +252,7 @@ static int system_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
if (addr + size > vma->vm_end) if (addr + size > vma->vm_end)
size = vma->vm_end - addr; size = vma->vm_end - addr;
ret = remap_pfn_range(vma, addr, page_to_pfn(page), ret = remap_pfn_range(vma, addr, page_to_pfn(page), size, prot);
size, vma->vm_page_prot);
if (ret) if (ret)
return ret; return ret;
@@ -225,6 +270,7 @@ static void *system_heap_do_vmap(struct system_heap_buffer *buffer)
struct page **pages = vmalloc(sizeof(struct page *) * npages); struct page **pages = vmalloc(sizeof(struct page *) * npages);
struct page **tmp = pages; struct page **tmp = pages;
struct sg_page_iter piter; struct sg_page_iter piter;
pgprot_t prot;
void *vaddr; void *vaddr;
if (!pages) if (!pages)
@@ -235,7 +281,10 @@ static void *system_heap_do_vmap(struct system_heap_buffer *buffer)
*tmp++ = sg_page_iter_page(&piter); *tmp++ = sg_page_iter_page(&piter);
} }
vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL); prot = PAGE_KERNEL;
if (buffer->cc_shared)
prot = pgprot_decrypted(prot);
vaddr = vmap(pages, npages, VM_MAP, prot);
vfree(pages); vfree(pages);
if (!vaddr) if (!vaddr)
@@ -296,6 +345,14 @@ static void system_heap_dma_buf_release(struct dma_buf *dmabuf)
for_each_sgtable_sg(table, sg, i) { for_each_sgtable_sg(table, sg, i) {
struct page *page = sg_page(sg); struct page *page = sg_page(sg);
/*
* Intentionally leak pages that cannot be re-encrypted
* to prevent shared memory from being reused.
*/
if (buffer->cc_shared &&
system_heap_set_page_encrypted(page))
continue;
__free_pages(page, compound_order(page)); __free_pages(page, compound_order(page));
} }
sg_free_table(table); sg_free_table(table);
@@ -347,6 +404,8 @@ static struct dma_buf *system_heap_allocate(struct dma_heap *heap,
DEFINE_DMA_BUF_EXPORT_INFO(exp_info); DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
unsigned long size_remaining = len; unsigned long size_remaining = len;
unsigned int max_order = orders[0]; unsigned int max_order = orders[0];
struct system_heap_priv *priv = dma_heap_get_drvdata(heap);
bool cc_shared = priv->cc_shared;
struct dma_buf *dmabuf; struct dma_buf *dmabuf;
struct sg_table *table; struct sg_table *table;
struct scatterlist *sg; struct scatterlist *sg;
@@ -362,6 +421,7 @@ static struct dma_buf *system_heap_allocate(struct dma_heap *heap,
mutex_init(&buffer->lock); mutex_init(&buffer->lock);
buffer->heap = heap; buffer->heap = heap;
buffer->len = len; buffer->len = len;
buffer->cc_shared = cc_shared;
INIT_LIST_HEAD(&pages); INIT_LIST_HEAD(&pages);
i = 0; i = 0;
@@ -396,6 +456,14 @@ static struct dma_buf *system_heap_allocate(struct dma_heap *heap,
list_del(&page->lru); list_del(&page->lru);
} }
if (cc_shared) {
for_each_sgtable_sg(table, sg, i) {
ret = system_heap_set_page_decrypted(sg_page(sg));
if (ret)
goto free_pages;
}
}
/* create the dmabuf */ /* create the dmabuf */
exp_info.exp_name = dma_heap_get_name(heap); exp_info.exp_name = dma_heap_get_name(heap);
exp_info.ops = &system_heap_buf_ops; exp_info.ops = &system_heap_buf_ops;
@@ -413,6 +481,13 @@ free_pages:
for_each_sgtable_sg(table, sg, i) { for_each_sgtable_sg(table, sg, i) {
struct page *p = sg_page(sg); struct page *p = sg_page(sg);
/*
* Intentionally leak pages that cannot be re-encrypted
* to prevent shared memory from being reused.
*/
if (buffer->cc_shared &&
system_heap_set_page_encrypted(p))
continue;
__free_pages(p, compound_order(p)); __free_pages(p, compound_order(p));
} }
sg_free_table(table); sg_free_table(table);
@@ -428,6 +503,14 @@ static const struct dma_heap_ops system_heap_ops = {
.allocate = system_heap_allocate, .allocate = system_heap_allocate,
}; };
static struct system_heap_priv system_heap_priv = {
.cc_shared = false,
};
static struct system_heap_priv system_heap_cc_shared_priv = {
.cc_shared = true,
};
static int __init system_heap_create(void) static int __init system_heap_create(void)
{ {
struct dma_heap_export_info exp_info; struct dma_heap_export_info exp_info;
@@ -435,8 +518,18 @@ static int __init system_heap_create(void)
exp_info.name = "system"; exp_info.name = "system";
exp_info.ops = &system_heap_ops; exp_info.ops = &system_heap_ops;
exp_info.priv = NULL; exp_info.priv = &system_heap_priv;
sys_heap = dma_heap_add(&exp_info);
if (IS_ERR(sys_heap))
return PTR_ERR(sys_heap);
if (IS_ENABLED(CONFIG_HIGHMEM) ||
!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
return 0;
exp_info.name = "system_cc_shared";
exp_info.priv = &system_heap_cc_shared_priv;
sys_heap = dma_heap_add(&exp_info); sys_heap = dma_heap_add(&exp_info);
if (IS_ERR(sys_heap)) if (IS_ERR(sys_heap))
return PTR_ERR(sys_heap); return PTR_ERR(sys_heap);

View File

@@ -1106,8 +1106,10 @@ void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
return; return;
phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
if (!dev_is_dma_coherent(dev)) if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu(phys, size, dir); arch_sync_dma_for_cpu(phys, size, dir);
arch_sync_dma_flush();
}
swiotlb_sync_single_for_cpu(dev, phys, size, dir); swiotlb_sync_single_for_cpu(dev, phys, size, dir);
} }
@@ -1123,8 +1125,10 @@ void iommu_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
swiotlb_sync_single_for_device(dev, phys, size, dir); swiotlb_sync_single_for_device(dev, phys, size, dir);
if (!dev_is_dma_coherent(dev)) if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_device(phys, size, dir); arch_sync_dma_for_device(phys, size, dir);
arch_sync_dma_flush();
}
} }
void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
@@ -1133,13 +1137,15 @@ void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
struct scatterlist *sg; struct scatterlist *sg;
int i; int i;
if (sg_dma_is_swiotlb(sgl)) if (sg_dma_is_swiotlb(sgl)) {
for_each_sg(sgl, sg, nelems, i) for_each_sg(sgl, sg, nelems, i)
iommu_dma_sync_single_for_cpu(dev, sg_dma_address(sg), iommu_dma_sync_single_for_cpu(dev, sg_dma_address(sg),
sg->length, dir); sg->length, dir);
else if (!dev_is_dma_coherent(dev)) } else if (!dev_is_dma_coherent(dev)) {
for_each_sg(sgl, sg, nelems, i) for_each_sg(sgl, sg, nelems, i)
arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
arch_sync_dma_flush();
}
} }
void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
@@ -1148,14 +1154,16 @@ void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
struct scatterlist *sg; struct scatterlist *sg;
int i; int i;
if (sg_dma_is_swiotlb(sgl)) if (sg_dma_is_swiotlb(sgl)) {
for_each_sg(sgl, sg, nelems, i) for_each_sg(sgl, sg, nelems, i)
iommu_dma_sync_single_for_device(dev, iommu_dma_sync_single_for_device(dev,
sg_dma_address(sg), sg_dma_address(sg),
sg->length, dir); sg->length, dir);
else if (!dev_is_dma_coherent(dev)) } else if (!dev_is_dma_coherent(dev)) {
for_each_sg(sgl, sg, nelems, i) for_each_sg(sgl, sg, nelems, i)
arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);
arch_sync_dma_flush();
}
} }
static phys_addr_t iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys, static phys_addr_t iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys,
@@ -1230,8 +1238,10 @@ dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
return DMA_MAPPING_ERROR; return DMA_MAPPING_ERROR;
} }
if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
arch_sync_dma_for_device(phys, size, dir); arch_sync_dma_for_device(phys, size, dir);
arch_sync_dma_flush();
}
iova = __iommu_dma_map(dev, phys, size, prot, dma_mask); iova = __iommu_dma_map(dev, phys, size, prot, dma_mask);
if (iova == DMA_MAPPING_ERROR && if (iova == DMA_MAPPING_ERROR &&
@@ -1254,8 +1264,10 @@ void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle,
if (WARN_ON(!phys)) if (WARN_ON(!phys))
return; return;
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu(phys, size, dir); arch_sync_dma_for_cpu(phys, size, dir);
arch_sync_dma_flush();
}
__iommu_dma_unmap(dev, dma_handle, size); __iommu_dma_unmap(dev, dma_handle, size);
@@ -2004,6 +2016,8 @@ int dma_iova_sync(struct device *dev, struct dma_iova_state *state,
dma_addr_t addr = state->addr + offset; dma_addr_t addr = state->addr + offset;
size_t iova_start_pad = iova_offset(iovad, addr); size_t iova_start_pad = iova_offset(iovad, addr);
if (!dev_is_dma_coherent(dev))
arch_sync_dma_flush();
return iommu_sync_map(domain, addr - iova_start_pad, return iommu_sync_map(domain, addr - iova_start_pad,
iova_align(iovad, size + iova_start_pad)); iova_align(iovad, size + iova_start_pad));
} }
@@ -2017,6 +2031,8 @@ static void iommu_dma_iova_unlink_range_slow(struct device *dev,
struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iommu_dma_cookie *cookie = domain->iova_cookie;
struct iova_domain *iovad = &cookie->iovad; struct iova_domain *iovad = &cookie->iovad;
size_t iova_start_pad = iova_offset(iovad, addr); size_t iova_start_pad = iova_offset(iovad, addr);
bool need_sync_dma = !dev_is_dma_coherent(dev) &&
!(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO));
dma_addr_t end = addr + size; dma_addr_t end = addr + size;
do { do {
@@ -2040,6 +2056,9 @@ static void iommu_dma_iova_unlink_range_slow(struct device *dev,
addr += len; addr += len;
iova_start_pad = 0; iova_start_pad = 0;
} while (addr < end); } while (addr < end);
if (need_sync_dma)
arch_sync_dma_flush();
} }
static void __iommu_dma_iova_unlink(struct device *dev, static void __iommu_dma_iova_unlink(struct device *dev,

View File

@@ -70,19 +70,20 @@ static void tegra210_emc_table_device_release(struct reserved_mem *rmem,
memunmap(timings); memunmap(timings);
} }
static const struct reserved_mem_ops tegra210_emc_table_ops = { static int tegra210_emc_table_init(unsigned long node,
.device_init = tegra210_emc_table_device_init, struct reserved_mem *rmem)
.device_release = tegra210_emc_table_device_release,
};
static int tegra210_emc_table_init(struct reserved_mem *rmem)
{ {
pr_debug("Tegra210 EMC table at %pa, size %lu bytes\n", &rmem->base, pr_debug("Tegra210 EMC table at %pa, size %lu bytes\n", &rmem->base,
(unsigned long)rmem->size); (unsigned long)rmem->size);
rmem->ops = &tegra210_emc_table_ops;
return 0; return 0;
} }
static const struct reserved_mem_ops tegra210_emc_table_ops = {
.node_init = tegra210_emc_table_init,
.device_init = tegra210_emc_table_device_init,
.device_release = tegra210_emc_table_device_release,
};
RESERVEDMEM_OF_DECLARE(tegra210_emc_table, "nvidia,tegra210-emc-table", RESERVEDMEM_OF_DECLARE(tegra210_emc_table, "nvidia,tegra210-emc-table",
tegra210_emc_table_init); &tegra210_emc_table_ops);

View File

@@ -1295,7 +1295,7 @@ void __init unflatten_device_tree(void)
void *fdt = initial_boot_params; void *fdt = initial_boot_params;
/* Save the statically-placed regions in the reserved_mem array */ /* Save the statically-placed regions in the reserved_mem array */
fdt_scan_reserved_mem_reg_nodes(); fdt_scan_reserved_mem_late();
/* Populate an empty root node when bootloader doesn't provide one */ /* Populate an empty root node when bootloader doesn't provide one */
if (!fdt) { if (!fdt) {

View File

@@ -186,7 +186,7 @@ static inline struct device_node *__of_get_dma_parent(const struct device_node *
#endif #endif
int fdt_scan_reserved_mem(void); int fdt_scan_reserved_mem(void);
void __init fdt_scan_reserved_mem_reg_nodes(void); void __init fdt_scan_reserved_mem_late(void);
bool of_fdt_device_is_available(const void *blob, unsigned long node); bool of_fdt_device_is_available(const void *blob, unsigned long node);

View File

@@ -24,8 +24,6 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/memblock.h> #include <linux/memblock.h>
#include <linux/kmemleak.h> #include <linux/kmemleak.h>
#include <linux/cma.h>
#include <linux/dma-map-ops.h>
#include "of_private.h" #include "of_private.h"
@@ -104,30 +102,12 @@ static void __init alloc_reserved_mem_array(void)
reserved_mem = new_array; reserved_mem = new_array;
} }
static void __init fdt_init_reserved_mem_node(struct reserved_mem *rmem); static void fdt_init_reserved_mem_node(unsigned long node, const char *uname,
/* phys_addr_t base, phys_addr_t size);
* fdt_reserved_mem_save_node() - save fdt node for second pass initialization static int fdt_validate_reserved_mem_node(unsigned long node,
*/ phys_addr_t *align);
static void __init fdt_reserved_mem_save_node(unsigned long node, const char *uname, static int fdt_fixup_reserved_mem_node(unsigned long node,
phys_addr_t base, phys_addr_t size) phys_addr_t base, phys_addr_t size);
{
struct reserved_mem *rmem = &reserved_mem[reserved_mem_count];
if (reserved_mem_count == total_reserved_mem_cnt) {
pr_err("not enough space for all defined regions.\n");
return;
}
rmem->fdt_node = node;
rmem->name = uname;
rmem->base = base;
rmem->size = size;
/* Call the region specific initialization function */
fdt_init_reserved_mem_node(rmem);
reserved_mem_count++;
}
static int __init early_init_dt_reserve_memory(phys_addr_t base, static int __init early_init_dt_reserve_memory(phys_addr_t base,
phys_addr_t size, bool nomap) phys_addr_t size, bool nomap)
@@ -154,21 +134,19 @@ static int __init __reserved_mem_reserve_reg(unsigned long node,
const char *uname) const char *uname)
{ {
phys_addr_t base, size; phys_addr_t base, size;
int i, len; int i, len, err;
const __be32 *prop; const __be32 *prop;
bool nomap, default_cma; bool nomap;
prop = of_flat_dt_get_addr_size_prop(node, "reg", &len); prop = of_flat_dt_get_addr_size_prop(node, "reg", &len);
if (!prop) if (!prop)
return -ENOENT; return -ENOENT;
nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL; nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL;
default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
if (default_cma && cma_skip_dt_default_reserved_mem()) { err = fdt_validate_reserved_mem_node(node, NULL);
pr_err("Skipping dt linux,cma-default for \"cma=\" kernel param.\n"); if (err && err != -ENODEV)
return -EINVAL; return err;
}
for (i = 0; i < len; i++) { for (i = 0; i < len; i++) {
u64 b, s; u64 b, s;
@@ -179,10 +157,7 @@ static int __init __reserved_mem_reserve_reg(unsigned long node,
size = s; size = s;
if (size && early_init_dt_reserve_memory(base, size, nomap) == 0) { if (size && early_init_dt_reserve_memory(base, size, nomap) == 0) {
/* Architecture specific contiguous memory fixup. */ fdt_fixup_reserved_mem_node(node, base, size);
if (of_flat_dt_is_compatible(node, "shared-dma-pool") &&
of_get_flat_dt_prop(node, "reusable", NULL))
dma_contiguous_early_fixup(base, size);
pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n",
uname, &base, (unsigned long)(size / SZ_1M)); uname, &base, (unsigned long)(size / SZ_1M));
} else { } else {
@@ -216,19 +191,66 @@ static int __init __reserved_mem_check_root(unsigned long node)
return 0; return 0;
} }
static void __init __rmem_check_for_overlap(void); static int __init __rmem_cmp(const void *a, const void *b)
{
const struct reserved_mem *ra = a, *rb = b;
if (ra->base < rb->base)
return -1;
if (ra->base > rb->base)
return 1;
/*
* Put the dynamic allocations (address == 0, size == 0) before static
* allocations at address 0x0 so that overlap detection works
* correctly.
*/
if (ra->size < rb->size)
return -1;
if (ra->size > rb->size)
return 1;
return 0;
}
static void __init __rmem_check_for_overlap(void)
{
int i;
if (reserved_mem_count < 2)
return;
sort(reserved_mem, reserved_mem_count, sizeof(reserved_mem[0]),
__rmem_cmp, NULL);
for (i = 0; i < reserved_mem_count - 1; i++) {
struct reserved_mem *this, *next;
this = &reserved_mem[i];
next = &reserved_mem[i + 1];
if (this->base + this->size > next->base) {
phys_addr_t this_end, next_end;
this_end = this->base + this->size;
next_end = next->base + next->size;
pr_err("OVERLAP DETECTED!\n%s (%pa--%pa) overlaps with %s (%pa--%pa)\n",
this->name, &this->base, &this_end,
next->name, &next->base, &next_end);
}
}
}
/** /**
* fdt_scan_reserved_mem_reg_nodes() - Store info for the "reg" defined * fdt_scan_reserved_mem_late() - Scan FDT and initialize remaining reserved
* reserved memory regions. * memory regions.
* *
* This function is used to scan through the DT and store the * This function is used to scan again through the DT and initialize the
* information for the reserved memory regions that are defined using * "static" reserved memory regions, that are defined using the "reg"
* the "reg" property. The region node number, name, base address, and * property. Each such region is then initialized with its specific init
* size are all stored in the reserved_mem array by calling the * function and stored in the global reserved_mem array.
* fdt_reserved_mem_save_node() function.
*/ */
void __init fdt_scan_reserved_mem_reg_nodes(void) void __init fdt_scan_reserved_mem_late(void)
{ {
const void *fdt = initial_boot_params; const void *fdt = initial_boot_params;
phys_addr_t base, size; phys_addr_t base, size;
@@ -253,23 +275,25 @@ void __init fdt_scan_reserved_mem_reg_nodes(void)
fdt_for_each_subnode(child, fdt, node) { fdt_for_each_subnode(child, fdt, node) {
const char *uname; const char *uname;
bool default_cma = of_get_flat_dt_prop(child, "linux,cma-default", NULL);
u64 b, s; u64 b, s;
int ret;
if (!of_fdt_device_is_available(fdt, child)) if (!of_fdt_device_is_available(fdt, child))
continue; continue;
if (default_cma && cma_skip_dt_default_reserved_mem())
continue;
if (!of_flat_dt_get_addr_size(child, "reg", &b, &s)) if (!of_flat_dt_get_addr_size(child, "reg", &b, &s))
continue; continue;
ret = fdt_validate_reserved_mem_node(child, NULL);
if (ret && ret != -ENODEV)
continue;
base = b; base = b;
size = s; size = s;
if (size) { if (size) {
uname = fdt_get_name(fdt, child, NULL); uname = fdt_get_name(fdt, child, NULL);
fdt_reserved_mem_save_node(child, uname, base, size); fdt_init_reserved_mem_node(child, uname, base, size);
} }
} }
@@ -280,7 +304,14 @@ void __init fdt_scan_reserved_mem_reg_nodes(void)
static int __init __reserved_mem_alloc_size(unsigned long node, const char *uname); static int __init __reserved_mem_alloc_size(unsigned long node, const char *uname);
/* /*
* fdt_scan_reserved_mem() - scan a single FDT node for reserved memory * fdt_scan_reserved_mem() - reserve and allocate memory occupied by
* reserved memory regions.
*
* This function is used to scan through the FDT and mark memory occupied
* by all static (defined by the "reg" property) reserved memory regions.
* Then memory for all dynamic regions (defined by size & alignment) is
* allocated, a region specific init function is called and region information
* is stored in the reserved_mem array.
*/ */
int __init fdt_scan_reserved_mem(void) int __init fdt_scan_reserved_mem(void)
{ {
@@ -397,7 +428,7 @@ static int __init __reserved_mem_alloc_size(unsigned long node, const char *unam
phys_addr_t base = 0, align = 0, size; phys_addr_t base = 0, align = 0, size;
int i, len; int i, len;
const __be32 *prop; const __be32 *prop;
bool nomap, default_cma; bool nomap;
int ret; int ret;
prop = of_get_flat_dt_prop(node, "size", &len); prop = of_get_flat_dt_prop(node, "size", &len);
@@ -421,19 +452,10 @@ static int __init __reserved_mem_alloc_size(unsigned long node, const char *unam
} }
nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL; nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL;
default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
if (default_cma && cma_skip_dt_default_reserved_mem()) { ret = fdt_validate_reserved_mem_node(node, &align);
pr_err("Skipping dt linux,cma-default for \"cma=\" kernel param.\n"); if (ret && ret != -ENODEV)
return -EINVAL; return ret;
}
/* Need adjust the alignment to satisfy the CMA requirement */
if (IS_ENABLED(CONFIG_CMA)
&& of_flat_dt_is_compatible(node, "shared-dma-pool")
&& of_get_flat_dt_prop(node, "reusable", NULL)
&& !nomap)
align = max_t(phys_addr_t, align, CMA_MIN_ALIGNMENT_BYTES);
prop = of_flat_dt_get_addr_size_prop(node, "alloc-ranges", &len); prop = of_flat_dt_get_addr_size_prop(node, "alloc-ranges", &len);
if (prop) { if (prop) {
@@ -468,121 +490,151 @@ static int __init __reserved_mem_alloc_size(unsigned long node, const char *unam
uname, (unsigned long)(size / SZ_1M)); uname, (unsigned long)(size / SZ_1M));
return -ENOMEM; return -ENOMEM;
} }
/* Architecture specific contiguous memory fixup. */
if (of_flat_dt_is_compatible(node, "shared-dma-pool") && fdt_fixup_reserved_mem_node(node, base, size);
of_get_flat_dt_prop(node, "reusable", NULL)) fdt_init_reserved_mem_node(node, uname, base, size);
dma_contiguous_early_fixup(base, size);
/* Save region in the reserved_mem array */
fdt_reserved_mem_save_node(node, uname, base, size);
return 0; return 0;
} }
extern const struct of_device_id __reservedmem_of_table[];
static const struct of_device_id __rmem_of_table_sentinel static const struct of_device_id __rmem_of_table_sentinel
__used __section("__reservedmem_of_table_end"); __used __section("__reservedmem_of_table_end");
/* /**
* __reserved_mem_init_node() - call region specific reserved memory init code * fdt_fixup_reserved_mem_node() - call fixup function for a reserved memory node
* @node: FDT node to fixup
* @base: base address of the reserved memory region
* @size: size of the reserved memory region
*
* This function iterates through the reserved memory drivers and calls
* the node_fixup callback for the compatible entry matching the node.
*
* Return: 0 on success, -ENODEV if no compatible match found
*/ */
static int __init __reserved_mem_init_node(struct reserved_mem *rmem) static int __init fdt_fixup_reserved_mem_node(unsigned long node,
phys_addr_t base, phys_addr_t size)
{ {
extern const struct of_device_id __reservedmem_of_table[];
const struct of_device_id *i; const struct of_device_id *i;
int ret = -ENOENT; int ret = -ENODEV;
for (i = __reservedmem_of_table; i < &__rmem_of_table_sentinel; i++) { for (i = __reservedmem_of_table; ret == -ENODEV &&
reservedmem_of_init_fn initfn = i->data; i < &__rmem_of_table_sentinel; i++) {
const char *compat = i->compatible; const struct reserved_mem_ops *ops = i->data;
if (!of_flat_dt_is_compatible(rmem->fdt_node, compat)) if (!of_flat_dt_is_compatible(node, i->compatible))
continue; continue;
ret = initfn(rmem); if (ops->node_fixup)
ret = ops->node_fixup(node, base, size);
}
return ret;
}
/**
* fdt_validate_reserved_mem_node() - validate a reserved memory node
* @node: FDT node to validate
* @align: pointer to store the validated alignment (may be modified by callback)
*
* This function iterates through the reserved memory drivers and calls
* the node_validate callback for the compatible entry matching the node.
*
* Return: 0 on success, -ENODEV if no compatible match found
*/
static int __init fdt_validate_reserved_mem_node(unsigned long node, phys_addr_t *align)
{
const struct of_device_id *i;
int ret = -ENODEV;
for (i = __reservedmem_of_table; ret == -ENODEV &&
i < &__rmem_of_table_sentinel; i++) {
const struct reserved_mem_ops *ops = i->data;
if (!of_flat_dt_is_compatible(node, i->compatible))
continue;
if (ops->node_validate)
ret = ops->node_validate(node, align);
}
return ret;
}
/**
* __reserved_mem_init_node() - initialize a reserved memory region
* @rmem: reserved_mem structure to initialize
* @node: FDT node describing the reserved memory region
*
* This function iterates through the reserved memory drivers and calls the
* node_init callback for the compatible entry matching the node. On success,
* the operations pointer is stored in the reserved_mem structure.
*
* Return: 0 on success, -ENODEV if no compatible match found
*/
static int __init __reserved_mem_init_node(struct reserved_mem *rmem,
unsigned long node)
{
const struct of_device_id *i;
int ret = -ENODEV;
for (i = __reservedmem_of_table; ret == -ENODEV &&
i < &__rmem_of_table_sentinel; i++) {
const struct reserved_mem_ops *ops = i->data;
const char *compat = i->compatible;
if (!of_flat_dt_is_compatible(node, compat))
continue;
ret = ops->node_init(node, rmem);
if (ret == 0) { if (ret == 0) {
rmem->ops = ops;
pr_info("initialized node %s, compatible id %s\n", pr_info("initialized node %s, compatible id %s\n",
rmem->name, compat); rmem->name, compat);
break; return ret;
} }
} }
return ret; return ret;
} }
static int __init __rmem_cmp(const void *a, const void *b)
{
const struct reserved_mem *ra = a, *rb = b;
if (ra->base < rb->base)
return -1;
if (ra->base > rb->base)
return 1;
/*
* Put the dynamic allocations (address == 0, size == 0) before static
* allocations at address 0x0 so that overlap detection works
* correctly.
*/
if (ra->size < rb->size)
return -1;
if (ra->size > rb->size)
return 1;
if (ra->fdt_node < rb->fdt_node)
return -1;
if (ra->fdt_node > rb->fdt_node)
return 1;
return 0;
}
static void __init __rmem_check_for_overlap(void)
{
int i;
if (reserved_mem_count < 2)
return;
sort(reserved_mem, reserved_mem_count, sizeof(reserved_mem[0]),
__rmem_cmp, NULL);
for (i = 0; i < reserved_mem_count - 1; i++) {
struct reserved_mem *this, *next;
this = &reserved_mem[i];
next = &reserved_mem[i + 1];
if (this->base + this->size > next->base) {
phys_addr_t this_end, next_end;
this_end = this->base + this->size;
next_end = next->base + next->size;
pr_err("OVERLAP DETECTED!\n%s (%pa--%pa) overlaps with %s (%pa--%pa)\n",
this->name, &this->base, &this_end,
next->name, &next->base, &next_end);
}
}
}
/** /**
* fdt_init_reserved_mem_node() - Initialize a reserved memory region * fdt_init_reserved_mem_node() - Initialize a reserved memory region
* @rmem: reserved_mem struct of the memory region to be initialized. * @node: fdt node of the initialized region
* @uname: name of the reserved memory node
* @base: base address of the reserved memory region
* @size: size of the reserved memory region
* *
* This function is used to call the region specific initialization * This function calls the region-specific initialization function for a
* function for a reserved memory region. * reserved memory region and saves all region-specific data to the
* reserved_mem array to allow of_reserved_mem_lookup() to find it.
*/ */
static void __init fdt_init_reserved_mem_node(struct reserved_mem *rmem) static void __init fdt_init_reserved_mem_node(unsigned long node, const char *uname,
phys_addr_t base, phys_addr_t size)
{ {
unsigned long node = rmem->fdt_node;
int err = 0; int err = 0;
bool nomap; bool nomap;
struct reserved_mem *rmem = &reserved_mem[reserved_mem_count];
if (reserved_mem_count == total_reserved_mem_cnt) {
pr_err("not enough space for all defined regions.\n");
return;
}
rmem->name = uname;
rmem->base = base;
rmem->size = size;
nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL; nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL;
err = __reserved_mem_init_node(rmem); err = __reserved_mem_init_node(rmem, node);
if (err != 0 && err != -ENOENT) { if (err != 0 && err != -ENODEV) {
pr_info("node %s compatible matching fail\n", rmem->name); pr_info("node %s compatible matching fail\n", rmem->name);
rmem->name = NULL;
if (nomap) if (nomap)
memblock_clear_nomap(rmem->base, rmem->size); memblock_clear_nomap(rmem->base, rmem->size);
else else
memblock_phys_free(rmem->base, rmem->size); memblock_phys_free(rmem->base, rmem->size);
return;
} else { } else {
phys_addr_t end = rmem->base + rmem->size - 1; phys_addr_t end = rmem->base + rmem->size - 1;
bool reusable = bool reusable =
@@ -594,6 +646,8 @@ static void __init fdt_init_reserved_mem_node(struct reserved_mem *rmem)
reusable ? "reusable" : "non-reusable", reusable ? "reusable" : "non-reusable",
rmem->name ? rmem->name : "unknown"); rmem->name ? rmem->name : "unknown");
} }
reserved_mem_count++;
} }
struct rmem_assigned_device { struct rmem_assigned_device {

View File

@@ -262,11 +262,13 @@ static dma_addr_t xen_swiotlb_map_phys(struct device *dev, phys_addr_t phys,
done: done:
if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr)))) if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr)))) {
arch_sync_dma_for_device(phys, size, dir); arch_sync_dma_for_device(phys, size, dir);
else arch_sync_dma_flush();
} else {
xen_dma_sync_for_device(dev, dev_addr, size, dir); xen_dma_sync_for_device(dev, dev_addr, size, dir);
} }
}
return dev_addr; return dev_addr;
} }
@@ -287,11 +289,13 @@ static void xen_swiotlb_unmap_phys(struct device *hwdev, dma_addr_t dev_addr,
BUG_ON(dir == DMA_NONE); BUG_ON(dir == DMA_NONE);
if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr)))) if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr)))) {
arch_sync_dma_for_cpu(paddr, size, dir); arch_sync_dma_for_cpu(paddr, size, dir);
else arch_sync_dma_flush();
} else {
xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir); xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir);
} }
}
/* NOTE: We use dev_addr here, not paddr! */ /* NOTE: We use dev_addr here, not paddr! */
pool = xen_swiotlb_find_pool(hwdev, dev_addr); pool = xen_swiotlb_find_pool(hwdev, dev_addr);
@@ -308,11 +312,13 @@ xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr,
struct io_tlb_pool *pool; struct io_tlb_pool *pool;
if (!dev_is_dma_coherent(dev)) { if (!dev_is_dma_coherent(dev)) {
if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) {
arch_sync_dma_for_cpu(paddr, size, dir); arch_sync_dma_for_cpu(paddr, size, dir);
else arch_sync_dma_flush();
} else {
xen_dma_sync_for_cpu(dev, dma_addr, size, dir); xen_dma_sync_for_cpu(dev, dma_addr, size, dir);
} }
}
pool = xen_swiotlb_find_pool(dev, dma_addr); pool = xen_swiotlb_find_pool(dev, dma_addr);
if (pool) if (pool)
@@ -331,12 +337,14 @@ xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr,
__swiotlb_sync_single_for_device(dev, paddr, size, dir, pool); __swiotlb_sync_single_for_device(dev, paddr, size, dir, pool);
if (!dev_is_dma_coherent(dev)) { if (!dev_is_dma_coherent(dev)) {
if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) {
arch_sync_dma_for_device(paddr, size, dir); arch_sync_dma_for_device(paddr, size, dir);
else arch_sync_dma_flush();
} else {
xen_dma_sync_for_device(dev, dma_addr, size, dir); xen_dma_sync_for_device(dev, dma_addr, size, dir);
} }
} }
}
/* /*
* Unmap a set of streaming mode DMA translations. Again, cpu read rules * Unmap a set of streaming mode DMA translations. Again, cpu read rules

View File

@@ -61,14 +61,4 @@ extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)
extern bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end); extern bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end);
extern void cma_reserve_pages_on_error(struct cma *cma); extern void cma_reserve_pages_on_error(struct cma *cma);
#ifdef CONFIG_DMA_CMA
extern bool cma_skip_dt_default_reserved_mem(void);
#else
static inline bool cma_skip_dt_default_reserved_mem(void)
{
return false;
}
#endif
#endif #endif

View File

@@ -1,16 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef DMA_BUF_HEAP_CMA_H_
#define DMA_BUF_HEAP_CMA_H_
struct cma;
#ifdef CONFIG_DMABUF_HEAPS_CMA
int dma_heap_cma_register_heap(struct cma *cma);
#else
static inline int dma_heap_cma_register_heap(struct cma *cma)
{
return 0;
}
#endif // CONFIG_DMABUF_HEAPS_CMA
#endif // DMA_BUF_HEAP_CMA_H_

View File

@@ -91,14 +91,8 @@ static inline void set_dma_ops(struct device *dev,
#endif /* CONFIG_ARCH_HAS_DMA_OPS */ #endif /* CONFIG_ARCH_HAS_DMA_OPS */
#ifdef CONFIG_DMA_CMA #ifdef CONFIG_DMA_CMA
extern struct cma *dma_contiguous_default_area; struct cma *dev_get_cma_area(struct device *dev);
struct cma *dma_contiguous_get_area_by_idx(unsigned int idx);
static inline struct cma *dev_get_cma_area(struct device *dev)
{
if (dev && dev->cma_area)
return dev->cma_area;
return dma_contiguous_default_area;
}
void dma_contiguous_reserve(phys_addr_t addr_limit); void dma_contiguous_reserve(phys_addr_t addr_limit);
int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
@@ -117,6 +111,10 @@ static inline struct cma *dev_get_cma_area(struct device *dev)
{ {
return NULL; return NULL;
} }
static inline struct cma *dma_contiguous_get_area_by_idx(unsigned int idx)
{
return NULL;
}
static inline void dma_contiguous_reserve(phys_addr_t limit) static inline void dma_contiguous_reserve(phys_addr_t limit)
{ {
} }
@@ -147,9 +145,6 @@ static inline void dma_free_contiguous(struct device *dev, struct page *page,
{ {
__free_pages(page, get_order(size)); __free_pages(page, get_order(size));
} }
static inline void dma_contiguous_early_fixup(phys_addr_t base, unsigned long size)
{
}
#endif /* CONFIG_DMA_CMA*/ #endif /* CONFIG_DMA_CMA*/
#ifdef CONFIG_DMA_DECLARE_COHERENT #ifdef CONFIG_DMA_DECLARE_COHERENT
@@ -361,6 +356,12 @@ static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
} }
#endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */ #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
#ifndef CONFIG_ARCH_HAS_BATCHED_DMA_SYNC
static inline void arch_sync_dma_flush(void)
{
}
#endif
#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
void arch_sync_dma_for_cpu_all(void); void arch_sync_dma_for_cpu_all(void);
#else #else

View File

@@ -9,7 +9,7 @@
#include <linux/bug.h> #include <linux/bug.h>
#include <linux/cache.h> #include <linux/cache.h>
/** /*
* List of possible attributes associated with a DMA mapping. The semantics * List of possible attributes associated with a DMA mapping. The semantics
* of each attribute should be defined in Documentation/core-api/dma-attributes.rst. * of each attribute should be defined in Documentation/core-api/dma-attributes.rst.
*/ */
@@ -92,6 +92,16 @@
* flushing. * flushing.
*/ */
#define DMA_ATTR_REQUIRE_COHERENT (1UL << 12) #define DMA_ATTR_REQUIRE_COHERENT (1UL << 12)
/*
* DMA_ATTR_CC_SHARED: Indicates the DMA mapping is shared (decrypted) for
* confidential computing guests. For normal system memory the caller must have
* called set_memory_decrypted(), and pgprot_decrypted must be used when
* creating CPU PTEs for the mapping. The same shared semantic may be passed
* to the vIOMMU when it sets up the IOPTE. For MMIO use together with
* DMA_ATTR_MMIO to indicate shared MMIO. Unless DMA_ATTR_MMIO is provided
* a struct page is required.
*/
#define DMA_ATTR_CC_SHARED (1UL << 13)
/* /*
* A dma_addr_t can hold any valid DMA or bus address for the platform. It can * A dma_addr_t can hold any valid DMA or bus address for the platform. It can

View File

@@ -11,7 +11,6 @@ struct resource;
struct reserved_mem { struct reserved_mem {
const char *name; const char *name;
unsigned long fdt_node;
const struct reserved_mem_ops *ops; const struct reserved_mem_ops *ops;
phys_addr_t base; phys_addr_t base;
phys_addr_t size; phys_addr_t size;
@@ -19,18 +18,20 @@ struct reserved_mem {
}; };
struct reserved_mem_ops { struct reserved_mem_ops {
int (*node_validate)(unsigned long fdt_node, phys_addr_t *align);
int (*node_fixup)(unsigned long fdt_node, phys_addr_t base,
phys_addr_t size);
int (*node_init)(unsigned long fdt_node, struct reserved_mem *rmem);
int (*device_init)(struct reserved_mem *rmem, int (*device_init)(struct reserved_mem *rmem,
struct device *dev); struct device *dev);
void (*device_release)(struct reserved_mem *rmem, void (*device_release)(struct reserved_mem *rmem,
struct device *dev); struct device *dev);
}; };
typedef int (*reservedmem_of_init_fn)(struct reserved_mem *rmem);
#ifdef CONFIG_OF_RESERVED_MEM #ifdef CONFIG_OF_RESERVED_MEM
#define RESERVEDMEM_OF_DECLARE(name, compat, init) \ #define RESERVEDMEM_OF_DECLARE(name, compat, ops) \
_OF_DECLARE(reservedmem, name, compat, init, reservedmem_of_init_fn) _OF_DECLARE(reservedmem, name, compat, ops, struct reserved_mem_ops *)
int of_reserved_mem_device_init_by_idx(struct device *dev, int of_reserved_mem_device_init_by_idx(struct device *dev,
struct device_node *np, int idx); struct device_node *np, int idx);
@@ -48,8 +49,9 @@ int of_reserved_mem_region_count(const struct device_node *np);
#else #else
#define RESERVEDMEM_OF_DECLARE(name, compat, init) \ #define RESERVEDMEM_OF_DECLARE(name, compat, ops) \
_OF_DECLARE_STUB(reservedmem, name, compat, init, reservedmem_of_init_fn) _OF_DECLARE_STUB(reservedmem, name, compat, ops, \
struct reserved_mem_ops *)
static inline int of_reserved_mem_device_init_by_idx(struct device *dev, static inline int of_reserved_mem_device_init_by_idx(struct device *dev,
struct device_node *np, int idx) struct device_node *np, int idx)

View File

@@ -34,7 +34,8 @@ TRACE_DEFINE_ENUM(DMA_NONE);
{ DMA_ATTR_PRIVILEGED, "PRIVILEGED" }, \ { DMA_ATTR_PRIVILEGED, "PRIVILEGED" }, \
{ DMA_ATTR_MMIO, "MMIO" }, \ { DMA_ATTR_MMIO, "MMIO" }, \
{ DMA_ATTR_DEBUGGING_IGNORE_CACHELINES, "CACHELINES_OVERLAP" }, \ { DMA_ATTR_DEBUGGING_IGNORE_CACHELINES, "CACHELINES_OVERLAP" }, \
{ DMA_ATTR_REQUIRE_COHERENT, "REQUIRE_COHERENT" }) { DMA_ATTR_REQUIRE_COHERENT, "REQUIRE_COHERENT" }, \
{ DMA_ATTR_CC_SHARED, "CC_SHARED" })
DECLARE_EVENT_CLASS(dma_map, DECLARE_EVENT_CLASS(dma_map,
TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr, TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr,

View File

@@ -17,6 +17,12 @@
#define DMA_MAP_TO_DEVICE 1 #define DMA_MAP_TO_DEVICE 1
#define DMA_MAP_FROM_DEVICE 2 #define DMA_MAP_FROM_DEVICE 2
enum {
DMA_MAP_BENCH_SINGLE_MODE,
DMA_MAP_BENCH_SG_MODE,
DMA_MAP_BENCH_MODE_MAX
};
struct map_benchmark { struct map_benchmark {
__u64 avg_map_100ns; /* average map latency in 100ns */ __u64 avg_map_100ns; /* average map latency in 100ns */
__u64 map_stddev; /* standard deviation of map latency */ __u64 map_stddev; /* standard deviation of map latency */
@@ -28,8 +34,11 @@ struct map_benchmark {
__u32 dma_bits; /* DMA addressing capability */ __u32 dma_bits; /* DMA addressing capability */
__u32 dma_dir; /* DMA data direction */ __u32 dma_dir; /* DMA data direction */
__u32 dma_trans_ns; /* time for DMA transmission in ns */ __u32 dma_trans_ns; /* time for DMA transmission in ns */
__u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ __u32 granule; /* - SINGLE_MODE: number of pages mapped/unmapped per operation
__u8 expansion[76]; /* For future use */ * - SG_MODE: number of scatterlist entries (each maps one page)
*/
__u8 map_mode; /* the mode of dma map */
__u8 expansion[75]; /* For future use */
}; };
#endif /* _UAPI_DMA_BENCHMARK_H */ #endif /* _UAPI_DMA_BENCHMARK_H */

View File

@@ -72,6 +72,9 @@ config ARCH_HAS_DMA_PREP_COHERENT
config ARCH_HAS_FORCE_DMA_UNENCRYPTED config ARCH_HAS_FORCE_DMA_UNENCRYPTED
bool bool
config ARCH_HAS_BATCHED_DMA_SYNC
bool
# #
# Select this option if the architecture assumes DMA devices are coherent # Select this option if the architecture assumes DMA devices are coherent
# by default. # by default.

View File

@@ -362,17 +362,11 @@ static void rmem_dma_device_release(struct reserved_mem *rmem,
dev->dma_mem = NULL; dev->dma_mem = NULL;
} }
static const struct reserved_mem_ops rmem_dma_ops = {
.device_init = rmem_dma_device_init,
.device_release = rmem_dma_device_release,
};
static int __init rmem_dma_setup(struct reserved_mem *rmem) static int __init rmem_dma_setup(unsigned long node, struct reserved_mem *rmem)
{ {
unsigned long node = rmem->fdt_node;
if (of_get_flat_dt_prop(node, "reusable", NULL)) if (of_get_flat_dt_prop(node, "reusable", NULL))
return -EINVAL; return -ENODEV;
#ifdef CONFIG_ARM #ifdef CONFIG_ARM
if (!of_get_flat_dt_prop(node, "no-map", NULL)) { if (!of_get_flat_dt_prop(node, "no-map", NULL)) {
@@ -390,7 +384,6 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem)
} }
#endif #endif
rmem->ops = &rmem_dma_ops;
pr_info("Reserved memory: created DMA memory pool at %pa, size %ld MiB\n", pr_info("Reserved memory: created DMA memory pool at %pa, size %ld MiB\n",
&rmem->base, (unsigned long)rmem->size / SZ_1M); &rmem->base, (unsigned long)rmem->size / SZ_1M);
return 0; return 0;
@@ -407,5 +400,11 @@ static int __init dma_init_reserved_memory(void)
core_initcall(dma_init_reserved_memory); core_initcall(dma_init_reserved_memory);
#endif /* CONFIG_DMA_GLOBAL_POOL */ #endif /* CONFIG_DMA_GLOBAL_POOL */
RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup); static const struct reserved_mem_ops rmem_dma_ops = {
.node_init = rmem_dma_setup,
.device_init = rmem_dma_device_init,
.device_release = rmem_dma_device_release,
};
RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", &rmem_dma_ops);
#endif #endif

View File

@@ -42,7 +42,6 @@
#include <linux/memblock.h> #include <linux/memblock.h>
#include <linux/err.h> #include <linux/err.h>
#include <linux/sizes.h> #include <linux/sizes.h>
#include <linux/dma-buf/heaps/cma.h>
#include <linux/dma-map-ops.h> #include <linux/dma-map-ops.h>
#include <linux/cma.h> #include <linux/cma.h>
#include <linux/nospec.h> #include <linux/nospec.h>
@@ -53,7 +52,38 @@
#define CMA_SIZE_MBYTES 0 #define CMA_SIZE_MBYTES 0
#endif #endif
struct cma *dma_contiguous_default_area; static struct cma *dma_contiguous_areas[MAX_CMA_AREAS];
static unsigned int dma_contiguous_areas_num;
static int dma_contiguous_insert_area(struct cma *cma)
{
if (dma_contiguous_areas_num >= ARRAY_SIZE(dma_contiguous_areas))
return -EINVAL;
dma_contiguous_areas[dma_contiguous_areas_num++] = cma;
return 0;
}
/**
* dma_contiguous_get_area_by_idx() - Get contiguous area at given index
* @idx: index of the area we query
*
* Queries for the contiguous area located at index @idx.
*
* Returns:
* A pointer to the requested contiguous area, or NULL otherwise.
*/
struct cma *dma_contiguous_get_area_by_idx(unsigned int idx)
{
if (idx >= dma_contiguous_areas_num)
return NULL;
return dma_contiguous_areas[idx];
}
EXPORT_SYMBOL_GPL(dma_contiguous_get_area_by_idx);
static struct cma *dma_contiguous_default_area;
/* /*
* Default global CMA area size can be defined in kernel's .config. * Default global CMA area size can be defined in kernel's .config.
@@ -91,15 +121,14 @@ static int __init early_cma(char *p)
} }
early_param("cma", early_cma); early_param("cma", early_cma);
/* struct cma *dev_get_cma_area(struct device *dev)
* cma_skip_dt_default_reserved_mem - This is called from the
* reserved_mem framework to detect if the default cma region is being
* set by the "cma=" kernel parameter.
*/
bool __init cma_skip_dt_default_reserved_mem(void)
{ {
return size_cmdline != -1; if (dev && dev->cma_area)
return dev->cma_area;
return dma_contiguous_default_area;
} }
EXPORT_SYMBOL_GPL(dev_get_cma_area);
#ifdef CONFIG_DMA_NUMA_CMA #ifdef CONFIG_DMA_NUMA_CMA
@@ -264,9 +293,24 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
if (ret) if (ret)
return; return;
ret = dma_heap_cma_register_heap(dma_contiguous_default_area); /*
* We need to insert the new area in our list to avoid
* any inconsistencies between having the default area
* listed in the DT or not.
*
* The DT case is handled by rmem_cma_setup() and will
* always insert all its areas in our list. However, if
* it didn't run (because OF_RESERVED_MEM isn't set, or
* there's no DT region specified), then we don't have a
* default area yet, and no area in our list.
*
* This block creates the default area in such a case,
* but we also need to insert it in our list to avoid
* having a default area but an empty list.
*/
ret = dma_contiguous_insert_area(dma_contiguous_default_area);
if (ret) if (ret)
pr_warn("Couldn't register default CMA heap."); pr_warn("Couldn't queue default CMA region for heap creation.");
} }
} }
@@ -470,47 +514,89 @@ static void rmem_cma_device_release(struct reserved_mem *rmem,
dev->cma_area = NULL; dev->cma_area = NULL;
} }
static const struct reserved_mem_ops rmem_cma_ops = { static int __init __rmem_cma_verify_node(unsigned long node)
.device_init = rmem_cma_device_init,
.device_release = rmem_cma_device_release,
};
static int __init rmem_cma_setup(struct reserved_mem *rmem)
{ {
unsigned long node = rmem->fdt_node;
bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
struct cma *cma;
int err;
if (!of_get_flat_dt_prop(node, "reusable", NULL) || if (!of_get_flat_dt_prop(node, "reusable", NULL) ||
of_get_flat_dt_prop(node, "no-map", NULL)) of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL; return -ENODEV;
if (size_cmdline != -1 &&
of_get_flat_dt_prop(node, "linux,cma-default", NULL)) {
pr_err("Skipping dt linux,cma-default node in favor for \"cma=\" kernel param.\n");
return -EBUSY;
}
return 0;
}
static int __init rmem_cma_validate(unsigned long node, phys_addr_t *align)
{
int ret = __rmem_cma_verify_node(node);
if (ret)
return ret;
if (align)
*align = max_t(phys_addr_t, *align, CMA_MIN_ALIGNMENT_BYTES);
return 0;
}
static int __init rmem_cma_fixup(unsigned long node, phys_addr_t base,
phys_addr_t size)
{
int ret = __rmem_cma_verify_node(node);
if (ret)
return ret;
/* Architecture specific contiguous memory fixup. */
dma_contiguous_early_fixup(base, size);
return 0;
}
static int __init rmem_cma_setup(unsigned long node, struct reserved_mem *rmem)
{
bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
struct cma *cma;
int ret;
ret = __rmem_cma_verify_node(node);
if (ret)
return ret;
if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) { if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) {
pr_err("Reserved memory: incorrect alignment of CMA region\n"); pr_err("Reserved memory: incorrect alignment of CMA region\n");
return -EINVAL; return -EINVAL;
} }
err = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma); ret = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma);
if (err) { if (ret) {
pr_err("Reserved memory: unable to setup CMA region\n"); pr_err("Reserved memory: unable to setup CMA region\n");
return err; return ret;
} }
if (default_cma) if (default_cma)
dma_contiguous_default_area = cma; dma_contiguous_default_area = cma;
rmem->ops = &rmem_cma_ops;
rmem->priv = cma; rmem->priv = cma;
pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n", pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n",
&rmem->base, (unsigned long)rmem->size / SZ_1M); &rmem->base, (unsigned long)rmem->size / SZ_1M);
err = dma_heap_cma_register_heap(cma); ret = dma_contiguous_insert_area(cma);
if (err) if (ret)
pr_warn("Couldn't register CMA heap."); pr_warn("Couldn't store CMA reserved area.");
return 0; return 0;
} }
RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup);
static const struct reserved_mem_ops rmem_cma_ops = {
.node_validate = rmem_cma_validate,
.node_fixup = rmem_cma_fixup,
.node_init = rmem_cma_setup,
.device_init = rmem_cma_device_init,
.device_release = rmem_cma_device_release,
};
RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", &rmem_cma_ops);
#endif #endif

View File

@@ -406,6 +406,8 @@ void dma_direct_sync_sg_for_device(struct device *dev,
arch_sync_dma_for_device(paddr, sg->length, arch_sync_dma_for_device(paddr, sg->length,
dir); dir);
} }
if (!dev_is_dma_coherent(dev))
arch_sync_dma_flush();
} }
#endif #endif
@@ -427,9 +429,11 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir); swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
} }
if (!dev_is_dma_coherent(dev)) if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_flush();
arch_sync_dma_for_cpu_all(); arch_sync_dma_for_cpu_all();
} }
}
/* /*
* Unmaps segments, except for ones marked as pci_p2pdma which do not * Unmaps segments, except for ones marked as pci_p2pdma which do not
@@ -440,15 +444,20 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
{ {
struct scatterlist *sg; struct scatterlist *sg;
int i; int i;
bool need_sync = false;
for_each_sg(sgl, sg, nents, i) { for_each_sg(sgl, sg, nents, i) {
if (sg_dma_is_bus_address(sg)) if (sg_dma_is_bus_address(sg)) {
sg_dma_unmark_bus_address(sg); sg_dma_unmark_bus_address(sg);
else } else {
need_sync = true;
dma_direct_unmap_phys(dev, sg->dma_address, dma_direct_unmap_phys(dev, sg->dma_address,
sg_dma_len(sg), dir, attrs); sg_dma_len(sg), dir, attrs, false);
} }
} }
if (need_sync && !dev_is_dma_coherent(dev))
arch_sync_dma_flush();
}
#endif #endif
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
@@ -457,6 +466,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
struct pci_p2pdma_map_state p2pdma_state = {}; struct pci_p2pdma_map_state p2pdma_state = {};
struct scatterlist *sg; struct scatterlist *sg;
int i, ret; int i, ret;
bool need_sync = false;
for_each_sg(sgl, sg, nents, i) { for_each_sg(sgl, sg, nents, i) {
switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) { switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
@@ -468,8 +478,9 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
*/ */
break; break;
case PCI_P2PDMA_MAP_NONE: case PCI_P2PDMA_MAP_NONE:
need_sync = true;
sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg), sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
sg->length, dir, attrs); sg->length, dir, attrs, false);
if (sg->dma_address == DMA_MAPPING_ERROR) { if (sg->dma_address == DMA_MAPPING_ERROR) {
ret = -EIO; ret = -EIO;
goto out_unmap; goto out_unmap;
@@ -488,6 +499,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
sg_dma_len(sg) = sg->length; sg_dma_len(sg) = sg->length;
} }
if (need_sync && !dev_is_dma_coherent(dev))
arch_sync_dma_flush();
return nents; return nents;
out_unmap: out_unmap:

View File

@@ -60,17 +60,22 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
swiotlb_sync_single_for_device(dev, paddr, size, dir); swiotlb_sync_single_for_device(dev, paddr, size, dir);
if (!dev_is_dma_coherent(dev)) if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_device(paddr, size, dir); arch_sync_dma_for_device(paddr, size, dir);
arch_sync_dma_flush();
}
} }
static inline void dma_direct_sync_single_for_cpu(struct device *dev, static inline void dma_direct_sync_single_for_cpu(struct device *dev,
dma_addr_t addr, size_t size, enum dma_data_direction dir) dma_addr_t addr, size_t size, enum dma_data_direction dir,
bool flush)
{ {
phys_addr_t paddr = dma_to_phys(dev, addr); phys_addr_t paddr = dma_to_phys(dev, addr);
if (!dev_is_dma_coherent(dev)) { if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu(paddr, size, dir); arch_sync_dma_for_cpu(paddr, size, dir);
if (flush)
arch_sync_dma_flush();
arch_sync_dma_for_cpu_all(); arch_sync_dma_for_cpu_all();
} }
@@ -79,21 +84,29 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
static inline dma_addr_t dma_direct_map_phys(struct device *dev, static inline dma_addr_t dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir, phys_addr_t phys, size_t size, enum dma_data_direction dir,
unsigned long attrs) unsigned long attrs, bool flush)
{ {
dma_addr_t dma_addr; dma_addr_t dma_addr;
if (is_swiotlb_force_bounce(dev)) { if (is_swiotlb_force_bounce(dev)) {
if (!(attrs & DMA_ATTR_CC_SHARED)) {
if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT)) if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT))
return DMA_MAPPING_ERROR; return DMA_MAPPING_ERROR;
return swiotlb_map(dev, phys, size, dir, attrs); return swiotlb_map(dev, phys, size, dir, attrs);
} }
} else if (attrs & DMA_ATTR_CC_SHARED) {
return DMA_MAPPING_ERROR;
}
if (attrs & DMA_ATTR_MMIO) { if (attrs & DMA_ATTR_MMIO) {
dma_addr = phys; dma_addr = phys;
if (unlikely(!dma_capable(dev, dma_addr, size, false))) if (unlikely(!dma_capable(dev, dma_addr, size, false)))
goto err_overflow; goto err_overflow;
} else if (attrs & DMA_ATTR_CC_SHARED) {
dma_addr = phys_to_dma_unencrypted(dev, phys);
if (unlikely(!dma_capable(dev, dma_addr, size, false)))
goto err_overflow;
} else { } else {
dma_addr = phys_to_dma(dev, phys); dma_addr = phys_to_dma(dev, phys);
if (unlikely(!dma_capable(dev, dma_addr, size, true)) || if (unlikely(!dma_capable(dev, dma_addr, size, true)) ||
@@ -107,8 +120,11 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
} }
if (!dev_is_dma_coherent(dev) && if (!dev_is_dma_coherent(dev) &&
!(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
arch_sync_dma_for_device(phys, size, dir); arch_sync_dma_for_device(phys, size, dir);
if (flush)
arch_sync_dma_flush();
}
return dma_addr; return dma_addr;
err_overflow: err_overflow:
@@ -120,7 +136,8 @@ err_overflow:
} }
static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr, static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir, unsigned long attrs) size_t size, enum dma_data_direction dir, unsigned long attrs,
bool flush)
{ {
phys_addr_t phys; phys_addr_t phys;
@@ -130,7 +147,7 @@ static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
phys = dma_to_phys(dev, addr); phys = dma_to_phys(dev, addr);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
dma_direct_sync_single_for_cpu(dev, addr, size, dir); dma_direct_sync_single_for_cpu(dev, addr, size, dir, flush);
swiotlb_tbl_unmap_single(dev, phys, size, dir, swiotlb_tbl_unmap_single(dev, phys, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC); attrs | DMA_ATTR_SKIP_CPU_SYNC);

View File

@@ -5,6 +5,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cleanup.h>
#include <linux/debugfs.h> #include <linux/debugfs.h>
#include <linux/delay.h> #include <linux/delay.h>
#include <linux/device.h> #include <linux/device.h>
@@ -15,6 +16,7 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/pci.h> #include <linux/pci.h>
#include <linux/platform_device.h> #include <linux/platform_device.h>
#include <linux/scatterlist.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/timekeeping.h> #include <linux/timekeeping.h>
#include <uapi/linux/map_benchmark.h> #include <uapi/linux/map_benchmark.h>
@@ -31,17 +33,219 @@ struct map_benchmark_data {
atomic64_t loops; atomic64_t loops;
}; };
static int map_benchmark_thread(void *data) struct map_benchmark_ops {
void *(*prepare)(struct map_benchmark_data *map);
void (*unprepare)(void *mparam);
void (*initialize_data)(void *mparam);
int (*do_map)(void *mparam);
void (*do_unmap)(void *mparam);
};
struct dma_single_map_param {
struct device *dev;
dma_addr_t addr;
void *xbuf;
u32 npages;
u32 dma_dir;
};
static void *dma_single_map_benchmark_prepare(struct map_benchmark_data *map)
{ {
void *buf; struct dma_single_map_param *params __free(kfree) = kzalloc(sizeof(*params),
dma_addr_t dma_addr; GFP_KERNEL);
struct map_benchmark_data *map = data; if (!params)
int npages = map->bparam.granule; return NULL;
u64 size = npages * PAGE_SIZE;
params->npages = map->bparam.granule;
params->dma_dir = map->bparam.dma_dir;
params->dev = map->dev;
params->xbuf = alloc_pages_exact(params->npages * PAGE_SIZE, GFP_KERNEL);
if (!params->xbuf)
return NULL;
return_ptr(params);
}
static void dma_single_map_benchmark_unprepare(void *mparam)
{
struct dma_single_map_param *params = mparam;
free_pages_exact(params->xbuf, params->npages * PAGE_SIZE);
kfree(params);
}
static void dma_single_map_benchmark_initialize_data(void *mparam)
{
struct dma_single_map_param *params = mparam;
/*
* for a non-coherent device, if we don't stain them in the
* cache, this will give an underestimate of the real-world
* overhead of BIDIRECTIONAL or TO_DEVICE mappings;
* 66 means everything goes well! 66 is lucky.
*/
if (params->dma_dir != DMA_FROM_DEVICE)
memset(params->xbuf, 0x66, params->npages * PAGE_SIZE);
}
static int dma_single_map_benchmark_do_map(void *mparam)
{
struct dma_single_map_param *params = mparam;
params->addr = dma_map_single(params->dev, params->xbuf,
params->npages * PAGE_SIZE, params->dma_dir);
if (unlikely(dma_mapping_error(params->dev, params->addr))) {
pr_err("dma_map_single failed on %s\n", dev_name(params->dev));
return -ENOMEM;
}
return 0;
}
static void dma_single_map_benchmark_do_unmap(void *mparam)
{
struct dma_single_map_param *params = mparam;
dma_unmap_single(params->dev, params->addr,
params->npages * PAGE_SIZE, params->dma_dir);
}
static struct map_benchmark_ops dma_single_map_benchmark_ops = {
.prepare = dma_single_map_benchmark_prepare,
.unprepare = dma_single_map_benchmark_unprepare,
.initialize_data = dma_single_map_benchmark_initialize_data,
.do_map = dma_single_map_benchmark_do_map,
.do_unmap = dma_single_map_benchmark_do_unmap,
};
struct dma_sg_map_param {
struct sg_table sgt;
struct device *dev;
void **buf;
u32 npages;
u32 dma_dir;
};
static void *dma_sg_map_benchmark_prepare(struct map_benchmark_data *map)
{
struct scatterlist *sg;
int i;
struct dma_sg_map_param *params = kzalloc(sizeof(*params), GFP_KERNEL);
if (!params)
return NULL;
/*
* Set the number of scatterlist entries based on the granule.
* In SG mode, 'granule' represents the number of scatterlist entries.
* Each scatterlist entry corresponds to a single page.
*/
params->npages = map->bparam.granule;
params->dma_dir = map->bparam.dma_dir;
params->dev = map->dev;
params->buf = kmalloc_array(params->npages, sizeof(*params->buf),
GFP_KERNEL);
if (!params->buf)
goto out;
if (sg_alloc_table(&params->sgt, params->npages, GFP_KERNEL))
goto free_buf;
for_each_sgtable_sg(&params->sgt, sg, i) {
params->buf[i] = (void *)__get_free_page(GFP_KERNEL);
if (!params->buf[i])
goto free_page;
sg_set_buf(sg, params->buf[i], PAGE_SIZE);
}
return params;
free_page:
while (i-- > 0)
free_page((unsigned long)params->buf[i]);
sg_free_table(&params->sgt);
free_buf:
kfree(params->buf);
out:
kfree(params);
return NULL;
}
static void dma_sg_map_benchmark_unprepare(void *mparam)
{
struct dma_sg_map_param *params = mparam;
int i;
for (i = 0; i < params->npages; i++)
free_page((unsigned long)params->buf[i]);
sg_free_table(&params->sgt);
kfree(params->buf);
kfree(params);
}
static void dma_sg_map_benchmark_initialize_data(void *mparam)
{
struct dma_sg_map_param *params = mparam;
struct scatterlist *sg;
int i = 0;
if (params->dma_dir == DMA_FROM_DEVICE)
return;
for_each_sgtable_sg(&params->sgt, sg, i)
memset(params->buf[i], 0x66, PAGE_SIZE);
}
static int dma_sg_map_benchmark_do_map(void *mparam)
{
struct dma_sg_map_param *params = mparam;
int ret = 0; int ret = 0;
buf = alloc_pages_exact(size, GFP_KERNEL); int sg_mapped = dma_map_sg(params->dev, params->sgt.sgl,
if (!buf) params->npages, params->dma_dir);
if (!sg_mapped) {
pr_err("dma_map_sg failed on %s\n", dev_name(params->dev));
ret = -ENOMEM;
}
return ret;
}
static void dma_sg_map_benchmark_do_unmap(void *mparam)
{
struct dma_sg_map_param *params = mparam;
dma_unmap_sg(params->dev, params->sgt.sgl, params->npages,
params->dma_dir);
}
static struct map_benchmark_ops dma_sg_map_benchmark_ops = {
.prepare = dma_sg_map_benchmark_prepare,
.unprepare = dma_sg_map_benchmark_unprepare,
.initialize_data = dma_sg_map_benchmark_initialize_data,
.do_map = dma_sg_map_benchmark_do_map,
.do_unmap = dma_sg_map_benchmark_do_unmap,
};
static struct map_benchmark_ops *dma_map_benchmark_ops[DMA_MAP_BENCH_MODE_MAX] = {
[DMA_MAP_BENCH_SINGLE_MODE] = &dma_single_map_benchmark_ops,
[DMA_MAP_BENCH_SG_MODE] = &dma_sg_map_benchmark_ops,
};
static int map_benchmark_thread(void *data)
{
struct map_benchmark_data *map = data;
__u8 map_mode = map->bparam.map_mode;
int ret = 0;
struct map_benchmark_ops *mb_ops = dma_map_benchmark_ops[map_mode];
void *mparam = mb_ops->prepare(map);
if (!mparam)
return -ENOMEM; return -ENOMEM;
while (!kthread_should_stop()) { while (!kthread_should_stop()) {
@@ -49,23 +253,12 @@ static int map_benchmark_thread(void *data)
ktime_t map_stime, map_etime, unmap_stime, unmap_etime; ktime_t map_stime, map_etime, unmap_stime, unmap_etime;
ktime_t map_delta, unmap_delta; ktime_t map_delta, unmap_delta;
/* mb_ops->initialize_data(mparam);
* for a non-coherent device, if we don't stain them in the
* cache, this will give an underestimate of the real-world
* overhead of BIDIRECTIONAL or TO_DEVICE mappings;
* 66 means evertything goes well! 66 is lucky.
*/
if (map->dir != DMA_FROM_DEVICE)
memset(buf, 0x66, size);
map_stime = ktime_get(); map_stime = ktime_get();
dma_addr = dma_map_single(map->dev, buf, size, map->dir); ret = mb_ops->do_map(mparam);
if (unlikely(dma_mapping_error(map->dev, dma_addr))) { if (ret)
pr_err("dma_map_single failed on %s\n",
dev_name(map->dev));
ret = -ENOMEM;
goto out; goto out;
}
map_etime = ktime_get(); map_etime = ktime_get();
map_delta = ktime_sub(map_etime, map_stime); map_delta = ktime_sub(map_etime, map_stime);
@@ -73,7 +266,8 @@ static int map_benchmark_thread(void *data)
ndelay(map->bparam.dma_trans_ns); ndelay(map->bparam.dma_trans_ns);
unmap_stime = ktime_get(); unmap_stime = ktime_get();
dma_unmap_single(map->dev, dma_addr, size, map->dir); mb_ops->do_unmap(mparam);
unmap_etime = ktime_get(); unmap_etime = ktime_get();
unmap_delta = ktime_sub(unmap_etime, unmap_stime); unmap_delta = ktime_sub(unmap_etime, unmap_stime);
@@ -108,7 +302,7 @@ static int map_benchmark_thread(void *data)
} }
out: out:
free_pages_exact(buf, size); mb_ops->unprepare(mparam);
return ret; return ret;
} }
@@ -209,6 +403,12 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
switch (cmd) { switch (cmd) {
case DMA_MAP_BENCHMARK: case DMA_MAP_BENCHMARK:
if (map->bparam.map_mode < 0 ||
map->bparam.map_mode >= DMA_MAP_BENCH_MODE_MAX) {
pr_err("invalid map mode\n");
return -EINVAL;
}
if (map->bparam.threads == 0 || if (map->bparam.threads == 0 ||
map->bparam.threads > DMA_MAP_MAX_THREADS) { map->bparam.threads > DMA_MAP_MAX_THREADS) {
pr_err("invalid thread number\n"); pr_err("invalid thread number\n");

View File

@@ -157,6 +157,7 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
{ {
const struct dma_map_ops *ops = get_dma_ops(dev); const struct dma_map_ops *ops = get_dma_ops(dev);
bool is_mmio = attrs & DMA_ATTR_MMIO; bool is_mmio = attrs & DMA_ATTR_MMIO;
bool is_cc_shared = attrs & DMA_ATTR_CC_SHARED;
dma_addr_t addr = DMA_MAPPING_ERROR; dma_addr_t addr = DMA_MAPPING_ERROR;
BUG_ON(!valid_dma_direction(dir)); BUG_ON(!valid_dma_direction(dir));
@@ -168,8 +169,11 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
return DMA_MAPPING_ERROR; return DMA_MAPPING_ERROR;
if (dma_map_direct(dev, ops) || if (dma_map_direct(dev, ops) ||
(!is_mmio && arch_dma_map_phys_direct(dev, phys + size))) (!is_mmio && !is_cc_shared &&
addr = dma_direct_map_phys(dev, phys, size, dir, attrs); arch_dma_map_phys_direct(dev, phys + size)))
addr = dma_direct_map_phys(dev, phys, size, dir, attrs, true);
else if (is_cc_shared)
return DMA_MAPPING_ERROR;
else if (use_dma_iommu(dev)) else if (use_dma_iommu(dev))
addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
else if (ops->map_phys) else if (ops->map_phys)
@@ -206,11 +210,16 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size,
{ {
const struct dma_map_ops *ops = get_dma_ops(dev); const struct dma_map_ops *ops = get_dma_ops(dev);
bool is_mmio = attrs & DMA_ATTR_MMIO; bool is_mmio = attrs & DMA_ATTR_MMIO;
bool is_cc_shared = attrs & DMA_ATTR_CC_SHARED;
BUG_ON(!valid_dma_direction(dir)); BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops) || if (dma_map_direct(dev, ops) ||
(!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size))) (!is_mmio && !is_cc_shared &&
dma_direct_unmap_phys(dev, addr, size, dir, attrs); arch_dma_unmap_phys_direct(dev, addr + size)))
dma_direct_unmap_phys(dev, addr, size, dir, attrs, true);
else if (is_cc_shared)
return;
else if (use_dma_iommu(dev)) else if (use_dma_iommu(dev))
iommu_dma_unmap_phys(dev, addr, size, dir, attrs); iommu_dma_unmap_phys(dev, addr, size, dir, attrs);
else if (ops->unmap_phys) else if (ops->unmap_phys)
@@ -379,7 +388,7 @@ void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
BUG_ON(!valid_dma_direction(dir)); BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops)) if (dma_map_direct(dev, ops))
dma_direct_sync_single_for_cpu(dev, addr, size, dir); dma_direct_sync_single_for_cpu(dev, addr, size, dir, true);
else if (use_dma_iommu(dev)) else if (use_dma_iommu(dev))
iommu_dma_sync_single_for_cpu(dev, addr, size, dir); iommu_dma_sync_single_for_cpu(dev, addr, size, dir);
else if (ops->sync_single_for_cpu) else if (ops->sync_single_for_cpu)

View File

@@ -868,6 +868,9 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
if (orig_addr == INVALID_PHYS_ADDR) if (orig_addr == INVALID_PHYS_ADDR)
return; return;
if (dir == DMA_FROM_DEVICE && !dev_is_dma_coherent(dev))
arch_sync_dma_flush();
/* /*
* It's valid for tlb_offset to be negative. This can happen when the * It's valid for tlb_offset to be negative. This can happen when the
* "offset" returned by swiotlb_align_offset() is non-zero, and the * "offset" returned by swiotlb_align_offset() is non-zero, and the
@@ -1612,8 +1615,10 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
return DMA_MAPPING_ERROR; return DMA_MAPPING_ERROR;
} }
if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
arch_sync_dma_for_device(swiotlb_addr, size, dir); arch_sync_dma_for_device(swiotlb_addr, size, dir);
arch_sync_dma_flush();
}
return dma_addr; return dma_addr;
} }
@@ -1872,26 +1877,25 @@ static void rmem_swiotlb_device_release(struct reserved_mem *rmem,
dev->dma_io_tlb_mem = &io_tlb_default_mem; dev->dma_io_tlb_mem = &io_tlb_default_mem;
} }
static const struct reserved_mem_ops rmem_swiotlb_ops = { static int __init rmem_swiotlb_setup(unsigned long node,
.device_init = rmem_swiotlb_device_init, struct reserved_mem *rmem)
.device_release = rmem_swiotlb_device_release,
};
static int __init rmem_swiotlb_setup(struct reserved_mem *rmem)
{ {
unsigned long node = rmem->fdt_node;
if (of_get_flat_dt_prop(node, "reusable", NULL) || if (of_get_flat_dt_prop(node, "reusable", NULL) ||
of_get_flat_dt_prop(node, "linux,cma-default", NULL) || of_get_flat_dt_prop(node, "linux,cma-default", NULL) ||
of_get_flat_dt_prop(node, "linux,dma-default", NULL) || of_get_flat_dt_prop(node, "linux,dma-default", NULL) ||
of_get_flat_dt_prop(node, "no-map", NULL)) of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL; return -EINVAL;
rmem->ops = &rmem_swiotlb_ops;
pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n", pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n",
&rmem->base, (unsigned long)rmem->size / SZ_1M); &rmem->base, (unsigned long)rmem->size / SZ_1M);
return 0; return 0;
} }
RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup); static const struct reserved_mem_ops rmem_swiotlb_ops = {
.node_init = rmem_swiotlb_setup,
.device_init = rmem_swiotlb_device_init,
.device_release = rmem_swiotlb_device_release,
};
RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", &rmem_swiotlb_ops);
#endif /* CONFIG_DMA_RESTRICTED_POOL */ #endif /* CONFIG_DMA_RESTRICTED_POOL */

View File

@@ -52,6 +52,7 @@ const char *cma_get_name(const struct cma *cma)
{ {
return cma->name; return cma->name;
} }
EXPORT_SYMBOL_GPL(cma_get_name);
static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
unsigned int align_order) unsigned int align_order)
@@ -951,6 +952,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
return page; return page;
} }
EXPORT_SYMBOL_GPL(cma_alloc);
static struct cma_memrange *find_cma_memrange(struct cma *cma, static struct cma_memrange *find_cma_memrange(struct cma *cma,
const struct page *pages, unsigned long count) const struct page *pages, unsigned long count)
@@ -1030,6 +1032,7 @@ bool cma_release(struct cma *cma, const struct page *pages,
return true; return true;
} }
EXPORT_SYMBOL_GPL(cma_release);
bool cma_release_frozen(struct cma *cma, const struct page *pages, bool cma_release_frozen(struct cma *cma, const struct page *pages,
unsigned long count) unsigned long count)

View File

@@ -20,12 +20,19 @@ static char *directions[] = {
"FROM_DEVICE", "FROM_DEVICE",
}; };
static char *mode[] = {
"SINGLE_MODE",
"SG_MODE",
};
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
struct map_benchmark map; struct map_benchmark map;
int fd, opt; int fd, opt;
/* default single thread, run 20 seconds on NUMA_NO_NODE */ /* default single thread, run 20 seconds on NUMA_NO_NODE */
int threads = 1, seconds = 20, node = -1; int threads = 1, seconds = 20, node = -1;
/* default single map mode */
int map_mode = DMA_MAP_BENCH_SINGLE_MODE;
/* default dma mask 32bit, bidirectional DMA */ /* default dma mask 32bit, bidirectional DMA */
int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL; int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL;
/* default granule 1 PAGESIZE */ /* default granule 1 PAGESIZE */
@@ -33,7 +40,7 @@ int main(int argc, char **argv)
int cmd = DMA_MAP_BENCHMARK; int cmd = DMA_MAP_BENCHMARK;
while ((opt = getopt(argc, argv, "t:s:n:b:d:x:g:")) != -1) { while ((opt = getopt(argc, argv, "t:s:n:b:d:x:g:m:")) != -1) {
switch (opt) { switch (opt) {
case 't': case 't':
threads = atoi(optarg); threads = atoi(optarg);
@@ -56,11 +63,20 @@ int main(int argc, char **argv)
case 'g': case 'g':
granule = atoi(optarg); granule = atoi(optarg);
break; break;
case 'm':
map_mode = atoi(optarg);
break;
default: default:
return -1; return -1;
} }
} }
if (map_mode < 0 || map_mode >= DMA_MAP_BENCH_MODE_MAX) {
fprintf(stderr, "invalid map mode, SINGLE_MODE:%d, SG_MODE: %d\n",
DMA_MAP_BENCH_SINGLE_MODE, DMA_MAP_BENCH_SG_MODE);
exit(1);
}
if (threads <= 0 || threads > DMA_MAP_MAX_THREADS) { if (threads <= 0 || threads > DMA_MAP_MAX_THREADS) {
fprintf(stderr, "invalid number of threads, must be in 1-%d\n", fprintf(stderr, "invalid number of threads, must be in 1-%d\n",
DMA_MAP_MAX_THREADS); DMA_MAP_MAX_THREADS);
@@ -110,14 +126,15 @@ int main(int argc, char **argv)
map.dma_dir = dir; map.dma_dir = dir;
map.dma_trans_ns = xdelay; map.dma_trans_ns = xdelay;
map.granule = granule; map.granule = granule;
map.map_mode = map_mode;
if (ioctl(fd, cmd, &map)) { if (ioctl(fd, cmd, &map)) {
perror("ioctl"); perror("ioctl");
exit(1); exit(1);
} }
printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n", printf("dma mapping benchmark(%s): threads:%d seconds:%d node:%d dir:%s granule:%d\n",
threads, seconds, node, directions[dir], granule); mode[map_mode], threads, seconds, node, directions[dir], granule);
printf("average map latency(us):%.1f standard deviation:%.1f\n", printf("average map latency(us):%.1f standard deviation:%.1f\n",
map.avg_map_100ns/10.0, map.map_stddev/10.0); map.avg_map_100ns/10.0, map.map_stddev/10.0);
printf("average unmap latency(us):%.1f standard deviation:%.1f\n", printf("average unmap latency(us):%.1f standard deviation:%.1f\n",