mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 06:44:00 -04:00
This function was a masterclass in bad naming, for various historical reasons. It claimed to be a non-cached user copy. It is literally _neither_ of those things. It's a specialty memory copy routine that uses non-temporal stores for the destination (but not the source), and that does exception handling for both source and destination accesses. Also note that while it works for unaligned targets, any unaligned parts (whether at beginning or end) will not use non-temporal stores, since only words and quadwords can be non-temporal on x86. The exception handling means that it _can_ be used for user space accesses, but not on its own - it needs all the normal "start user space access" logic around it. But typically the user space access would be the source, not the non-temporal destination. That was the original intention of this, where the destination was some fragile persistent memory target that needed non-temporal stores in order to catch machine check exceptions synchronously and deal with them gracefully. Thus that non-descriptive name: one use case was to copy from user space into a non-cached kernel buffer. However, the existing users are a mix of that intended use-case, and a couple of random drivers that just did this as a performance tweak. Some of those random drivers then actively misused the user copying version (with STAC/CLAC and all) to do kernel copies without ever even caring about the exception handling, _just_ for the non-temporal destination. Rename it as a first small step to actually make it halfway sane, and change the prototype to be more normal: it doesn't take a user pointer unless the caller has done the proper conversion, and the argument size is the full size_t (it still won't actually copy more than 4GB in one go, but there's also no reason to silently truncate the size argument in the caller). 
Finally, use this now sanely named function in the NTB code, which mis-used a user copy version (with STAC/CLAC and all) of this interface despite it not actually being a user copy at all. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
245 lines
5.0 KiB
ArmAsm
245 lines
5.0 KiB
ArmAsm
/* SPDX-License-Identifier: GPL-2.0-only */
|
|
/*
|
|
* Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
|
|
*/
|
|
|
|
#include <linux/export.h>
|
|
#include <linux/linkage.h>
|
|
#include <linux/objtool.h>
|
|
#include <asm/asm.h>
|
|
|
|
/*
|
|
* copy_to_nontemporal - Memory copy using non-temporal stores, with exception handling
|
|
*
|
|
* This typically copies from user space (with the caller responsible for
|
|
* enabling user access) into kernel space, but the kernel space accesses
|
|
* can take a machine check exception, so they too need exception handling.
|
|
*
|
|
* Note: only 32-bit and 64-bit stores have non-temporal versions,
|
|
* and we only use aligned versions. Any unaligned parts at the
|
|
* start or end of the copy will be done using normal cached stores.
|
|
*
|
|
* Input:
|
|
* rdi destination
|
|
* rsi source
|
|
* edx count (only the low 32 bits of the size argument are used)
|
|
*
|
|
* Output:
|
|
* rax uncopied bytes or 0 if successful.
|
|
*/
|
|
/*
* copy_to_nontemporal: copy %edx bytes from (%rsi) to (%rdi), using movnti
* for the 4- and 8-byte stores and regular stores for 1- and 2-byte pieces.
* Returns the number of bytes not copied in %eax (0 when nothing faulted).
* Clobbers %rax, %rdx, %rsi, %rdi and %r8-%r11.
* NOTE(review): the count is handled as a 32-bit value (%edx) throughout;
* confirm callers never depend on sizes of 4GB or more.
*/
SYM_FUNC_START(copy_to_nontemporal)
|
|
ANNOTATE_NOENDBR
|
|
/* If destination is not 8-byte aligned, we'll have to align it */
|
|
testb $7,%dil
|
|
jne .Lalign
|
|
|
|
.Lis_aligned: /* destination is 8-byte aligned, or the count ran out while aligning */
|
|
cmp $64,%edx
|
|
jb .Lquadwords /* fewer than 64 bytes left: skip the unrolled loop */
|
|
|
|
.p2align 4,0x90
|
|
.Lunrolled: /* 64 bytes per iteration: two groups of four loads then four movnti stores */
|
|
10: movq (%rsi),%r8
|
|
11: movq 8(%rsi),%r9
|
|
12: movq 16(%rsi),%r10
|
|
13: movq 24(%rsi),%r11
|
|
20: movnti %r8,(%rdi)
|
|
21: movnti %r9,8(%rdi)
|
|
22: movnti %r10,16(%rdi)
|
|
23: movnti %r11,24(%rdi)
|
|
30: movq 32(%rsi),%r8
|
|
31: movq 40(%rsi),%r9
|
|
32: movq 48(%rsi),%r10
|
|
33: movq 56(%rsi),%r11
|
|
40: movnti %r8,32(%rdi)
|
|
41: movnti %r9,40(%rdi)
|
|
42: movnti %r10,48(%rdi)
|
|
43: movnti %r11,56(%rdi)
|
|
|
|
addq $64,%rsi
|
|
addq $64,%rdi
|
|
sub $64,%edx
|
|
cmp $64,%edx
|
|
jae .Lunrolled /* keep going while at least 64 bytes remain */
|
|
|
|
/*
|
|
* First set of user mode loads have been done
|
|
* without any stores, so if they fail, we can
|
|
* just try the non-unrolled loop.
|
|
*/
|
|
_ASM_EXTABLE_UA(10b, .Lquadwords)
|
|
_ASM_EXTABLE_UA(11b, .Lquadwords)
|
|
_ASM_EXTABLE_UA(12b, .Lquadwords)
|
|
_ASM_EXTABLE_UA(13b, .Lquadwords)
|
|
|
|
/*
|
|
* The second set of user mode loads have been
|
|
* done with 32 bytes stored to the destination,
|
|
* so we need to take that into account before
|
|
* falling back to the non-unrolled loop.
|
|
*/
|
|
_ASM_EXTABLE_UA(30b, .Lfixup32)
|
|
_ASM_EXTABLE_UA(31b, .Lfixup32)
|
|
_ASM_EXTABLE_UA(32b, .Lfixup32)
|
|
_ASM_EXTABLE_UA(33b, .Lfixup32)
|
|
|
|
/*
|
|
* An exception on a write means that we're
|
|
* done, but we need to update the count
|
|
* depending on where in the unrolled loop
|
|
* we were.
|
|
*/
|
|
_ASM_EXTABLE_UA(20b, .Ldone0)
|
|
_ASM_EXTABLE_UA(21b, .Ldone8)
|
|
_ASM_EXTABLE_UA(22b, .Ldone16)
|
|
_ASM_EXTABLE_UA(23b, .Ldone24)
|
|
_ASM_EXTABLE_UA(40b, .Ldone32)
|
|
_ASM_EXTABLE_UA(41b, .Ldone40)
|
|
_ASM_EXTABLE_UA(42b, .Ldone48)
|
|
_ASM_EXTABLE_UA(43b, .Ldone56)
|
|
|
|
.Lquadwords: /* non-unrolled loop: one 8-byte load + movnti per pass */
|
|
cmp $8,%edx
|
|
jb .Llong /* fewer than 8 bytes left: handle the tail */
|
|
50: movq (%rsi),%rax
|
|
51: movnti %rax,(%rdi)
|
|
addq $8,%rsi
|
|
addq $8,%rdi
|
|
sub $8,%edx
|
|
jmp .Lquadwords
|
|
|
|
/*
|
|
* If we fail on the last full quadword, we will
|
|
* not try to do any byte-wise cached accesses.
|
|
* We will try to do one more 4-byte uncached
|
|
* one, though.
|
|
*/
|
|
_ASM_EXTABLE_UA(50b, .Llast4)
|
|
_ASM_EXTABLE_UA(51b, .Ldone0)
|
|
|
|
.Llong: /* tail: fewer than 8 bytes remain when entered from .Lquadwords */
|
|
test $4,%dl
|
|
je .Lword
|
|
60: movl (%rsi),%eax
|
|
61: movnti %eax,(%rdi)
|
|
addq $4,%rsi
|
|
addq $4,%rdi
|
|
sub $4,%edx
|
|
.Lword:
|
|
sfence /* no movnti stores are issued after this point on this path */
|
|
test $2,%dl
|
|
je .Lbyte
|
|
70: movw (%rsi),%ax
|
|
71: movw %ax,(%rdi) /* regular (not non-temporal) 2-byte store */
|
|
addq $2,%rsi
|
|
addq $2,%rdi
|
|
sub $2,%edx
|
|
.Lbyte:
|
|
test $1,%dl
|
|
je .Ldone
|
|
80: movb (%rsi),%al
|
|
81: movb %al,(%rdi) /* regular (not non-temporal) 1-byte store */
|
|
dec %edx
|
|
.Ldone: /* return the remaining (uncopied) byte count */
|
|
mov %edx,%eax
|
|
RET
|
|
|
|
/*
|
|
* If we fail on the last four bytes, we won't
|
|
* bother with any fixups. It's dead, Jim. Note
|
|
* that there's no need for 'sfence' for any
|
|
* of this, since the exception will have been
|
|
* serializing.
|
|
*/
|
|
_ASM_EXTABLE_UA(60b, .Ldone)
|
|
_ASM_EXTABLE_UA(61b, .Ldone)
|
|
_ASM_EXTABLE_UA(70b, .Ldone)
|
|
_ASM_EXTABLE_UA(71b, .Ldone)
|
|
_ASM_EXTABLE_UA(80b, .Ldone)
|
|
_ASM_EXTABLE_UA(81b, .Ldone)
|
|
|
|
/*
|
|
* This is the "head needs aligning" case when
|
|
* the destination isn't 8-byte aligned. The
|
|
* 4-byte case can be done uncached, but any
|
|
* smaller alignment is done with regular stores.
|
|
*/
|
|
.Lalign:
|
|
test $1,%dil
|
|
je .Lalign_word
|
|
test %edx,%edx
|
|
je .Ldone /* nothing to copy at all */
|
|
90: movb (%rsi),%al
|
|
91: movb %al,(%rdi)
|
|
inc %rsi
|
|
inc %rdi
|
|
dec %edx
|
|
.Lalign_word: /* destination is now 2-byte aligned */
|
|
test $2,%dil
|
|
je .Lalign_long
|
|
cmp $2,%edx
|
|
jb .Lbyte /* at most one byte left: finish in the tail code */
|
|
92: movw (%rsi),%ax
|
|
93: movw %ax,(%rdi)
|
|
addq $2,%rsi
|
|
addq $2,%rdi
|
|
sub $2,%edx
|
|
.Lalign_long: /* destination is now 4-byte aligned */
|
|
test $4,%dil
|
|
je .Lis_aligned
|
|
cmp $4,%edx
|
|
jb .Lword /* fewer than 4 bytes left: finish in the tail code */
|
|
94: movl (%rsi),%eax
|
|
95: movnti %eax,(%rdi)
|
|
addq $4,%rsi
|
|
addq $4,%rdi
|
|
sub $4,%edx
|
|
jmp .Lis_aligned
|
|
|
|
/*
|
|
* If we fail on the initial alignment accesses,
|
|
* we're all done. Again, no point in trying to
|
|
* do byte-by-byte probing if the 4-byte load
|
|
* fails - we're not doing any uncached accesses
|
|
* any more.
|
|
*/
|
|
_ASM_EXTABLE_UA(90b, .Ldone)
|
|
_ASM_EXTABLE_UA(91b, .Ldone)
|
|
_ASM_EXTABLE_UA(92b, .Ldone)
|
|
_ASM_EXTABLE_UA(93b, .Ldone)
|
|
_ASM_EXTABLE_UA(94b, .Ldone)
|
|
_ASM_EXTABLE_UA(95b, .Ldone)
|
|
|
|
/*
|
|
* Exception table fixups for faults in the middle
* of the unrolled loop. Entering at .LdoneN and
* falling through the labels below subtracts N
* from the count before it is returned. .Ldone0 is
* also the target for faults that need no count
* adjustment.
|
|
*/
|
|
.Ldone56: sub $8,%edx
|
|
.Ldone48: sub $8,%edx
|
|
.Ldone40: sub $8,%edx
|
|
.Ldone32: sub $8,%edx
|
|
.Ldone24: sub $8,%edx
|
|
.Ldone16: sub $8,%edx
|
|
.Ldone8: sub $8,%edx
|
|
.Ldone0:
|
|
mov %edx,%eax
|
|
RET
|
|
|
|
.Lfixup32: /* skip the 32 bytes already stored in this iteration, then retry 8 bytes at a time */
|
|
addq $32,%rsi
|
|
addq $32,%rdi
|
|
sub $32,%edx
|
|
jmp .Lquadwords
|
|
|
|
.Llast4: /* the 8-byte load at 50 faulted: attempt one final 4-byte copy, then return */
|
|
52: movl (%rsi),%eax
|
|
53: movnti %eax,(%rdi)
|
|
sfence
|
|
sub $4,%edx
|
|
mov %edx,%eax
|
|
RET
|
|
_ASM_EXTABLE_UA(52b, .Ldone0)
|
|
_ASM_EXTABLE_UA(53b, .Ldone0)
|
|
|
|
SYM_FUNC_END(copy_to_nontemporal)
|
|
EXPORT_SYMBOL(copy_to_nontemporal)
|