mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 06:44:00 -04:00
xor: pass the entire operation to the low-level ops
Currently the high-level xor code chunks up all operations into small units for only up to 1 + 4 vectors, and passes it to four different methods. This means the FPU/vector context is entered and left a lot for wide stripes, and a lot of expensive indirect calls are performed. Switch to passing the entire gen_xor request to the low-level ops, and provide a macro to dispatch it to the existing helper. This reduces the number of indirect calls and FPU/vector context switches by a factor approaching nr_stripes / 4, and also reduces source and binary code size. Link: https://lkml.kernel.org/r/20260327061704.3707577-27-hch@lst.de Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Eric Biggers <ebiggers@kernel.org> Tested-by: Eric Biggers <ebiggers@kernel.org> Cc: Albert Ou <aou@eecs.berkeley.edu> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Alexandre Ghiti <alex@ghiti.fr> Cc: Andreas Larsson <andreas@gaisler.com> Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com> Cc: Ard Biesheuvel <ardb@kernel.org> Cc: Arnd Bergmann <arnd@arndb.de> Cc: "Borislav Petkov (AMD)" <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chris Mason <clm@fb.com> Cc: Christian Borntraeger <borntraeger@linux.ibm.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: David S. Miller <davem@davemloft.net> Cc: David Sterba <dsterba@suse.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Herbert Xu <herbert@gondor.apana.org.au> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jason A. 
Donenfeld <jason@zx2c4.com> Cc: Johannes Berg <johannes@sipsolutions.net> Cc: Li Nan <linan122@huawei.com> Cc: Madhavan Srinivasan <maddy@linux.ibm.com> Cc: Magnus Lindholm <linmag7@gmail.com> Cc: Matt Turner <mattst88@gmail.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Richard Henderson <richard.henderson@linaro.org> Cc: Richard Weinberger <richard@nod.at> Cc: Russell King <linux@armlinux.org.uk> Cc: Song Liu <song@kernel.org> Cc: Sven Schnelle <svens@linux.ibm.com> Cc: Ted Ts'o <tytso@mit.edu> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: WANG Xuerui <kernel@xen0n.name> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
committed by
Andrew Morton
parent
0f629e7283
commit
80dcf0a783
@@ -2,11 +2,6 @@
|
||||
#ifndef _XOR_H
|
||||
#define _XOR_H
|
||||
|
||||
#define MAX_XOR_BLOCKS 4
|
||||
|
||||
extern void xor_blocks(unsigned int count, unsigned int bytes,
|
||||
void *dest, void **srcs);
|
||||
|
||||
void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes);
|
||||
|
||||
#endif /* _XOR_H */
|
||||
|
||||
@@ -832,18 +832,17 @@ xor_alpha_prefetch_5: \n\
|
||||
.end xor_alpha_prefetch_5 \n\
|
||||
");
|
||||
|
||||
DO_XOR_BLOCKS(alpha, xor_alpha_2, xor_alpha_3, xor_alpha_4, xor_alpha_5);
|
||||
|
||||
struct xor_block_template xor_block_alpha = {
|
||||
.name = "alpha",
|
||||
.do_2 = xor_alpha_2,
|
||||
.do_3 = xor_alpha_3,
|
||||
.do_4 = xor_alpha_4,
|
||||
.do_5 = xor_alpha_5,
|
||||
.name = "alpha",
|
||||
.xor_gen = xor_gen_alpha,
|
||||
};
|
||||
|
||||
DO_XOR_BLOCKS(alpha_prefetch, xor_alpha_prefetch_2, xor_alpha_prefetch_3,
|
||||
xor_alpha_prefetch_4, xor_alpha_prefetch_5);
|
||||
|
||||
struct xor_block_template xor_block_alpha_prefetch = {
|
||||
.name = "alpha prefetch",
|
||||
.do_2 = xor_alpha_prefetch_2,
|
||||
.do_3 = xor_alpha_prefetch_3,
|
||||
.do_4 = xor_alpha_prefetch_4,
|
||||
.do_5 = xor_alpha_prefetch_5,
|
||||
.name = "alpha prefetch",
|
||||
.xor_gen = xor_gen_alpha_prefetch,
|
||||
};
|
||||
|
||||
@@ -5,54 +5,15 @@
|
||||
#include "xor_impl.h"
|
||||
#include "xor_arch.h"
|
||||
|
||||
extern struct xor_block_template const xor_block_neon_inner;
|
||||
|
||||
static void
|
||||
xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2)
|
||||
static void xor_gen_neon(void *dest, void **srcs, unsigned int src_cnt,
|
||||
unsigned int bytes)
|
||||
{
|
||||
kernel_neon_begin();
|
||||
xor_block_neon_inner.do_2(bytes, p1, p2);
|
||||
kernel_neon_end();
|
||||
}
|
||||
|
||||
static void
|
||||
xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3)
|
||||
{
|
||||
kernel_neon_begin();
|
||||
xor_block_neon_inner.do_3(bytes, p1, p2, p3);
|
||||
kernel_neon_end();
|
||||
}
|
||||
|
||||
static void
|
||||
xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3,
|
||||
const unsigned long * __restrict p4)
|
||||
{
|
||||
kernel_neon_begin();
|
||||
xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
|
||||
kernel_neon_end();
|
||||
}
|
||||
|
||||
static void
|
||||
xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3,
|
||||
const unsigned long * __restrict p4,
|
||||
const unsigned long * __restrict p5)
|
||||
{
|
||||
kernel_neon_begin();
|
||||
xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
|
||||
xor_gen_neon_inner(dest, srcs, src_cnt, bytes);
|
||||
kernel_neon_end();
|
||||
}
|
||||
|
||||
struct xor_block_template xor_block_neon = {
|
||||
.name = "neon",
|
||||
.do_2 = xor_neon_2,
|
||||
.do_3 = xor_neon_3,
|
||||
.do_4 = xor_neon_4,
|
||||
.do_5 = xor_neon_5
|
||||
.name = "neon",
|
||||
.xor_gen = xor_gen_neon,
|
||||
};
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
*/
|
||||
|
||||
#include "xor_impl.h"
|
||||
#include "xor_arch.h"
|
||||
|
||||
#ifndef __ARM_NEON__
|
||||
#error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon'
|
||||
@@ -22,10 +23,4 @@
|
||||
#define NO_TEMPLATE
|
||||
#include "../xor-8regs.c"
|
||||
|
||||
struct xor_block_template const xor_block_neon_inner = {
|
||||
.name = "__inner_neon__",
|
||||
.do_2 = xor_8regs_2,
|
||||
.do_3 = xor_8regs_3,
|
||||
.do_4 = xor_8regs_4,
|
||||
.do_5 = xor_8regs_5,
|
||||
};
|
||||
__DO_XOR_BLOCKS(neon_inner, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5);
|
||||
|
||||
@@ -127,10 +127,10 @@ xor_arm4regs_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
} while (--lines);
|
||||
}
|
||||
|
||||
DO_XOR_BLOCKS(arm4regs, xor_arm4regs_2, xor_arm4regs_3, xor_arm4regs_4,
|
||||
xor_arm4regs_5);
|
||||
|
||||
struct xor_block_template xor_block_arm4regs = {
|
||||
.name = "arm4regs",
|
||||
.do_2 = xor_arm4regs_2,
|
||||
.do_3 = xor_arm4regs_3,
|
||||
.do_4 = xor_arm4regs_4,
|
||||
.do_5 = xor_arm4regs_5,
|
||||
.name = "arm4regs",
|
||||
.xor_gen = xor_gen_arm4regs,
|
||||
};
|
||||
|
||||
@@ -7,6 +7,9 @@
|
||||
extern struct xor_block_template xor_block_arm4regs;
|
||||
extern struct xor_block_template xor_block_neon;
|
||||
|
||||
void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt,
|
||||
unsigned int bytes);
|
||||
|
||||
static __always_inline void __init arch_xor_init(void)
|
||||
{
|
||||
xor_register(&xor_block_arm4regs);
|
||||
|
||||
@@ -10,50 +10,16 @@
|
||||
#include "xor-neon.h"
|
||||
|
||||
#define XOR_TEMPLATE(_name) \
|
||||
static void \
|
||||
xor_##_name##_2(unsigned long bytes, unsigned long * __restrict p1, \
|
||||
const unsigned long * __restrict p2) \
|
||||
static void xor_gen_##_name(void *dest, void **srcs, unsigned int src_cnt, \
|
||||
unsigned int bytes) \
|
||||
{ \
|
||||
scoped_ksimd() \
|
||||
__xor_##_name##_2(bytes, p1, p2); \
|
||||
} \
|
||||
\
|
||||
static void \
|
||||
xor_##_name##_3(unsigned long bytes, unsigned long * __restrict p1, \
|
||||
const unsigned long * __restrict p2, \
|
||||
const unsigned long * __restrict p3) \
|
||||
{ \
|
||||
scoped_ksimd() \
|
||||
__xor_##_name##_3(bytes, p1, p2, p3); \
|
||||
} \
|
||||
\
|
||||
static void \
|
||||
xor_##_name##_4(unsigned long bytes, unsigned long * __restrict p1, \
|
||||
const unsigned long * __restrict p2, \
|
||||
const unsigned long * __restrict p3, \
|
||||
const unsigned long * __restrict p4) \
|
||||
{ \
|
||||
scoped_ksimd() \
|
||||
__xor_##_name##_4(bytes, p1, p2, p3, p4); \
|
||||
} \
|
||||
\
|
||||
static void \
|
||||
xor_##_name##_5(unsigned long bytes, unsigned long * __restrict p1, \
|
||||
const unsigned long * __restrict p2, \
|
||||
const unsigned long * __restrict p3, \
|
||||
const unsigned long * __restrict p4, \
|
||||
const unsigned long * __restrict p5) \
|
||||
{ \
|
||||
scoped_ksimd() \
|
||||
__xor_##_name##_5(bytes, p1, p2, p3, p4, p5); \
|
||||
xor_gen_##_name##_inner(dest, srcs, src_cnt, bytes); \
|
||||
} \
|
||||
\
|
||||
struct xor_block_template xor_block_##_name = { \
|
||||
.name = __stringify(_name), \
|
||||
.do_2 = xor_##_name##_2, \
|
||||
.do_3 = xor_##_name##_3, \
|
||||
.do_4 = xor_##_name##_4, \
|
||||
.do_5 = xor_##_name##_5 \
|
||||
.name = __stringify(_name), \
|
||||
.xor_gen = xor_gen_##_name, \
|
||||
};
|
||||
|
||||
XOR_TEMPLATE(neon);
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#include "xor_arch.h"
|
||||
#include "xor-neon.h"
|
||||
|
||||
void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
|
||||
static void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2)
|
||||
{
|
||||
uint64_t *dp1 = (uint64_t *)p1;
|
||||
@@ -37,7 +37,7 @@ void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
|
||||
} while (--lines > 0);
|
||||
}
|
||||
|
||||
void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
static void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3)
|
||||
{
|
||||
@@ -73,7 +73,7 @@ void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
} while (--lines > 0);
|
||||
}
|
||||
|
||||
void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
static void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3,
|
||||
const unsigned long * __restrict p4)
|
||||
@@ -118,7 +118,7 @@ void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
} while (--lines > 0);
|
||||
}
|
||||
|
||||
void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3,
|
||||
const unsigned long * __restrict p4,
|
||||
@@ -172,6 +172,9 @@ void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
} while (--lines > 0);
|
||||
}
|
||||
|
||||
__DO_XOR_BLOCKS(neon_inner, __xor_neon_2, __xor_neon_3, __xor_neon_4,
|
||||
__xor_neon_5);
|
||||
|
||||
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
|
||||
{
|
||||
uint64x2_t res;
|
||||
@@ -182,7 +185,7 @@ static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
|
||||
return res;
|
||||
}
|
||||
|
||||
void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3)
|
||||
{
|
||||
@@ -216,7 +219,7 @@ void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
} while (--lines > 0);
|
||||
}
|
||||
|
||||
void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3,
|
||||
const unsigned long * __restrict p4)
|
||||
@@ -259,7 +262,7 @@ void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
} while (--lines > 0);
|
||||
}
|
||||
|
||||
void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3,
|
||||
const unsigned long * __restrict p4,
|
||||
@@ -304,3 +307,6 @@ void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
dp5 += 8;
|
||||
} while (--lines > 0);
|
||||
}
|
||||
|
||||
__DO_XOR_BLOCKS(eor3_inner, __xor_neon_2, __xor_eor3_3, __xor_eor3_4,
|
||||
__xor_eor3_5);
|
||||
|
||||
@@ -1,30 +1,6 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
|
||||
void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2);
|
||||
void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3);
|
||||
void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3,
|
||||
const unsigned long * __restrict p4);
|
||||
void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3,
|
||||
const unsigned long * __restrict p4,
|
||||
const unsigned long * __restrict p5);
|
||||
|
||||
#define __xor_eor3_2 __xor_neon_2
|
||||
void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3);
|
||||
void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3,
|
||||
const unsigned long * __restrict p4);
|
||||
void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3,
|
||||
const unsigned long * __restrict p4,
|
||||
const unsigned long * __restrict p5);
|
||||
void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt,
|
||||
unsigned int bytes);
|
||||
void xor_gen_eor3_inner(void *dest, void **srcs, unsigned int src_cnt,
|
||||
unsigned int bytes);
|
||||
|
||||
@@ -11,63 +11,23 @@
|
||||
#include "xor_arch.h"
|
||||
#include "xor_simd.h"
|
||||
|
||||
#define MAKE_XOR_GLUE_2(flavor) \
|
||||
static void xor_##flavor##_2(unsigned long bytes, unsigned long * __restrict p1,\
|
||||
const unsigned long * __restrict p2) \
|
||||
#define MAKE_XOR_GLUES(flavor) \
|
||||
DO_XOR_BLOCKS(flavor##_inner, __xor_##flavor##_2, __xor_##flavor##_3, \
|
||||
__xor_##flavor##_4, __xor_##flavor##_5); \
|
||||
\
|
||||
static void xor_gen_##flavor(void *dest, void **srcs, unsigned int src_cnt, \
|
||||
unsigned int bytes) \
|
||||
{ \
|
||||
kernel_fpu_begin(); \
|
||||
__xor_##flavor##_2(bytes, p1, p2); \
|
||||
xor_gen_##flavor##_inner(dest, srcs, src_cnt, bytes); \
|
||||
kernel_fpu_end(); \
|
||||
} \
|
||||
|
||||
#define MAKE_XOR_GLUE_3(flavor) \
|
||||
static void xor_##flavor##_3(unsigned long bytes, unsigned long * __restrict p1,\
|
||||
const unsigned long * __restrict p2, \
|
||||
const unsigned long * __restrict p3) \
|
||||
{ \
|
||||
kernel_fpu_begin(); \
|
||||
__xor_##flavor##_3(bytes, p1, p2, p3); \
|
||||
kernel_fpu_end(); \
|
||||
} \
|
||||
|
||||
#define MAKE_XOR_GLUE_4(flavor) \
|
||||
static void xor_##flavor##_4(unsigned long bytes, unsigned long * __restrict p1,\
|
||||
const unsigned long * __restrict p2, \
|
||||
const unsigned long * __restrict p3, \
|
||||
const unsigned long * __restrict p4) \
|
||||
{ \
|
||||
kernel_fpu_begin(); \
|
||||
__xor_##flavor##_4(bytes, p1, p2, p3, p4); \
|
||||
kernel_fpu_end(); \
|
||||
} \
|
||||
|
||||
#define MAKE_XOR_GLUE_5(flavor) \
|
||||
static void xor_##flavor##_5(unsigned long bytes, unsigned long * __restrict p1,\
|
||||
const unsigned long * __restrict p2, \
|
||||
const unsigned long * __restrict p3, \
|
||||
const unsigned long * __restrict p4, \
|
||||
const unsigned long * __restrict p5) \
|
||||
{ \
|
||||
kernel_fpu_begin(); \
|
||||
__xor_##flavor##_5(bytes, p1, p2, p3, p4, p5); \
|
||||
kernel_fpu_end(); \
|
||||
} \
|
||||
|
||||
#define MAKE_XOR_GLUES(flavor) \
|
||||
MAKE_XOR_GLUE_2(flavor); \
|
||||
MAKE_XOR_GLUE_3(flavor); \
|
||||
MAKE_XOR_GLUE_4(flavor); \
|
||||
MAKE_XOR_GLUE_5(flavor); \
|
||||
\
|
||||
struct xor_block_template xor_block_##flavor = { \
|
||||
.name = __stringify(flavor), \
|
||||
.do_2 = xor_##flavor##_2, \
|
||||
.do_3 = xor_##flavor##_3, \
|
||||
.do_4 = xor_##flavor##_4, \
|
||||
.do_5 = xor_##flavor##_5, \
|
||||
\
|
||||
struct xor_block_template xor_block_##flavor = { \
|
||||
.name = __stringify(flavor), \
|
||||
.xor_gen = xor_gen_##flavor \
|
||||
}
|
||||
|
||||
|
||||
#ifdef CONFIG_CPU_HAS_LSX
|
||||
MAKE_XOR_GLUES(lsx);
|
||||
#endif /* CONFIG_CPU_HAS_LSX */
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
* Sparse (as at v0.5.0) gets very, very confused by this file.
|
||||
* Make it a bit simpler for it.
|
||||
*/
|
||||
#include "xor_impl.h"
|
||||
#if !defined(__CHECKER__)
|
||||
#include <altivec.h>
|
||||
#else
|
||||
@@ -49,9 +50,9 @@ typedef vector signed char unative_t;
|
||||
V1##_3 = vec_xor(V1##_3, V2##_3); \
|
||||
} while (0)
|
||||
|
||||
void __xor_altivec_2(unsigned long bytes,
|
||||
unsigned long * __restrict v1_in,
|
||||
const unsigned long * __restrict v2_in)
|
||||
static void __xor_altivec_2(unsigned long bytes,
|
||||
unsigned long * __restrict v1_in,
|
||||
const unsigned long * __restrict v2_in)
|
||||
{
|
||||
DEFINE(v1);
|
||||
DEFINE(v2);
|
||||
@@ -68,10 +69,10 @@ void __xor_altivec_2(unsigned long bytes,
|
||||
} while (--lines > 0);
|
||||
}
|
||||
|
||||
void __xor_altivec_3(unsigned long bytes,
|
||||
unsigned long * __restrict v1_in,
|
||||
const unsigned long * __restrict v2_in,
|
||||
const unsigned long * __restrict v3_in)
|
||||
static void __xor_altivec_3(unsigned long bytes,
|
||||
unsigned long * __restrict v1_in,
|
||||
const unsigned long * __restrict v2_in,
|
||||
const unsigned long * __restrict v3_in)
|
||||
{
|
||||
DEFINE(v1);
|
||||
DEFINE(v2);
|
||||
@@ -92,11 +93,11 @@ void __xor_altivec_3(unsigned long bytes,
|
||||
} while (--lines > 0);
|
||||
}
|
||||
|
||||
void __xor_altivec_4(unsigned long bytes,
|
||||
unsigned long * __restrict v1_in,
|
||||
const unsigned long * __restrict v2_in,
|
||||
const unsigned long * __restrict v3_in,
|
||||
const unsigned long * __restrict v4_in)
|
||||
static void __xor_altivec_4(unsigned long bytes,
|
||||
unsigned long * __restrict v1_in,
|
||||
const unsigned long * __restrict v2_in,
|
||||
const unsigned long * __restrict v3_in,
|
||||
const unsigned long * __restrict v4_in)
|
||||
{
|
||||
DEFINE(v1);
|
||||
DEFINE(v2);
|
||||
@@ -121,12 +122,12 @@ void __xor_altivec_4(unsigned long bytes,
|
||||
} while (--lines > 0);
|
||||
}
|
||||
|
||||
void __xor_altivec_5(unsigned long bytes,
|
||||
unsigned long * __restrict v1_in,
|
||||
const unsigned long * __restrict v2_in,
|
||||
const unsigned long * __restrict v3_in,
|
||||
const unsigned long * __restrict v4_in,
|
||||
const unsigned long * __restrict v5_in)
|
||||
static void __xor_altivec_5(unsigned long bytes,
|
||||
unsigned long * __restrict v1_in,
|
||||
const unsigned long * __restrict v2_in,
|
||||
const unsigned long * __restrict v3_in,
|
||||
const unsigned long * __restrict v4_in,
|
||||
const unsigned long * __restrict v5_in)
|
||||
{
|
||||
DEFINE(v1);
|
||||
DEFINE(v2);
|
||||
@@ -154,3 +155,6 @@ void __xor_altivec_5(unsigned long bytes,
|
||||
v5 += 4;
|
||||
} while (--lines > 0);
|
||||
}
|
||||
|
||||
__DO_XOR_BLOCKS(altivec_inner, __xor_altivec_2, __xor_altivec_3,
|
||||
__xor_altivec_4, __xor_altivec_5);
|
||||
|
||||
@@ -6,17 +6,5 @@
|
||||
* outside of the enable/disable altivec block.
|
||||
*/
|
||||
|
||||
void __xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2);
|
||||
void __xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3);
|
||||
void __xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3,
|
||||
const unsigned long * __restrict p4);
|
||||
void __xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3,
|
||||
const unsigned long * __restrict p4,
|
||||
const unsigned long * __restrict p5);
|
||||
void xor_gen_altivec_inner(void *dest, void **srcs, unsigned int src_cnt,
|
||||
unsigned int bytes);
|
||||
|
||||
@@ -12,56 +12,17 @@
|
||||
#include "xor_arch.h"
|
||||
#include "xor_vmx.h"
|
||||
|
||||
static void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2)
|
||||
static void xor_gen_altivec(void *dest, void **srcs, unsigned int src_cnt,
|
||||
unsigned int bytes)
|
||||
{
|
||||
preempt_disable();
|
||||
enable_kernel_altivec();
|
||||
__xor_altivec_2(bytes, p1, p2);
|
||||
disable_kernel_altivec();
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
static void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3)
|
||||
{
|
||||
preempt_disable();
|
||||
enable_kernel_altivec();
|
||||
__xor_altivec_3(bytes, p1, p2, p3);
|
||||
disable_kernel_altivec();
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
static void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3,
|
||||
const unsigned long * __restrict p4)
|
||||
{
|
||||
preempt_disable();
|
||||
enable_kernel_altivec();
|
||||
__xor_altivec_4(bytes, p1, p2, p3, p4);
|
||||
disable_kernel_altivec();
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
static void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p2,
|
||||
const unsigned long * __restrict p3,
|
||||
const unsigned long * __restrict p4,
|
||||
const unsigned long * __restrict p5)
|
||||
{
|
||||
preempt_disable();
|
||||
enable_kernel_altivec();
|
||||
__xor_altivec_5(bytes, p1, p2, p3, p4, p5);
|
||||
xor_gen_altivec_inner(dest, srcs, src_cnt, bytes);
|
||||
disable_kernel_altivec();
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
struct xor_block_template xor_block_altivec = {
|
||||
.name = "altivec",
|
||||
.do_2 = xor_altivec_2,
|
||||
.do_3 = xor_altivec_3,
|
||||
.do_4 = xor_altivec_4,
|
||||
.do_5 = xor_altivec_5,
|
||||
.name = "altivec",
|
||||
.xor_gen = xor_gen_altivec,
|
||||
};
|
||||
|
||||
@@ -9,48 +9,17 @@
|
||||
#include "xor_impl.h"
|
||||
#include "xor_arch.h"
|
||||
|
||||
static void xor_vector_2(unsigned long bytes, unsigned long *__restrict p1,
|
||||
const unsigned long *__restrict p2)
|
||||
{
|
||||
kernel_vector_begin();
|
||||
xor_regs_2_(bytes, p1, p2);
|
||||
kernel_vector_end();
|
||||
}
|
||||
DO_XOR_BLOCKS(vector_inner, xor_regs_2_, xor_regs_3_, xor_regs_4_, xor_regs_5_);
|
||||
|
||||
static void xor_vector_3(unsigned long bytes, unsigned long *__restrict p1,
|
||||
const unsigned long *__restrict p2,
|
||||
const unsigned long *__restrict p3)
|
||||
static void xor_gen_vector(void *dest, void **srcs, unsigned int src_cnt,
|
||||
unsigned int bytes)
|
||||
{
|
||||
kernel_vector_begin();
|
||||
xor_regs_3_(bytes, p1, p2, p3);
|
||||
kernel_vector_end();
|
||||
}
|
||||
|
||||
static void xor_vector_4(unsigned long bytes, unsigned long *__restrict p1,
|
||||
const unsigned long *__restrict p2,
|
||||
const unsigned long *__restrict p3,
|
||||
const unsigned long *__restrict p4)
|
||||
{
|
||||
kernel_vector_begin();
|
||||
xor_regs_4_(bytes, p1, p2, p3, p4);
|
||||
kernel_vector_end();
|
||||
}
|
||||
|
||||
static void xor_vector_5(unsigned long bytes, unsigned long *__restrict p1,
|
||||
const unsigned long *__restrict p2,
|
||||
const unsigned long *__restrict p3,
|
||||
const unsigned long *__restrict p4,
|
||||
const unsigned long *__restrict p5)
|
||||
{
|
||||
kernel_vector_begin();
|
||||
xor_regs_5_(bytes, p1, p2, p3, p4, p5);
|
||||
xor_gen_vector_inner(dest, srcs, src_cnt, bytes);
|
||||
kernel_vector_end();
|
||||
}
|
||||
|
||||
struct xor_block_template xor_block_rvv = {
|
||||
.name = "rvv",
|
||||
.do_2 = xor_vector_2,
|
||||
.do_3 = xor_vector_3,
|
||||
.do_4 = xor_vector_4,
|
||||
.do_5 = xor_vector_5
|
||||
.name = "rvv",
|
||||
.xor_gen = xor_gen_vector,
|
||||
};
|
||||
|
||||
@@ -125,10 +125,9 @@ static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
: : "0", "cc", "memory");
|
||||
}
|
||||
|
||||
DO_XOR_BLOCKS(xc, xor_xc_2, xor_xc_3, xor_xc_4, xor_xc_5);
|
||||
|
||||
struct xor_block_template xor_block_xc = {
|
||||
.name = "xc",
|
||||
.do_2 = xor_xc_2,
|
||||
.do_3 = xor_xc_3,
|
||||
.do_4 = xor_xc_4,
|
||||
.do_5 = xor_xc_5,
|
||||
.name = "xc",
|
||||
.xor_gen = xor_gen_xc,
|
||||
};
|
||||
|
||||
@@ -244,10 +244,9 @@ sparc_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
} while (--lines > 0);
|
||||
}
|
||||
|
||||
DO_XOR_BLOCKS(sparc32, sparc_2, sparc_3, sparc_4, sparc_5);
|
||||
|
||||
struct xor_block_template xor_block_SPARC = {
|
||||
.name = "SPARC",
|
||||
.do_2 = sparc_2,
|
||||
.do_3 = sparc_3,
|
||||
.do_4 = sparc_4,
|
||||
.do_5 = sparc_5,
|
||||
.name = "SPARC",
|
||||
.xor_gen = xor_gen_sparc32,
|
||||
};
|
||||
|
||||
@@ -28,12 +28,11 @@ void xor_vis_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
|
||||
/* XXX Ugh, write cheetah versions... -DaveM */
|
||||
|
||||
DO_XOR_BLOCKS(vis, xor_vis_2, xor_vis_3, xor_vis_4, xor_vis_5);
|
||||
|
||||
struct xor_block_template xor_block_VIS = {
|
||||
.name = "VIS",
|
||||
.do_2 = xor_vis_2,
|
||||
.do_3 = xor_vis_3,
|
||||
.do_4 = xor_vis_4,
|
||||
.do_5 = xor_vis_5,
|
||||
.name = "VIS",
|
||||
.xor_gen = xor_gen_vis,
|
||||
};
|
||||
|
||||
void xor_niagara_2(unsigned long bytes, unsigned long * __restrict p1,
|
||||
@@ -51,10 +50,10 @@ void xor_niagara_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
const unsigned long * __restrict p4,
|
||||
const unsigned long * __restrict p5);
|
||||
|
||||
DO_XOR_BLOCKS(niagara, xor_niagara_2, xor_niagara_3, xor_niagara_4,
|
||||
xor_niagara_5);
|
||||
|
||||
struct xor_block_template xor_block_niagara = {
|
||||
.name = "Niagara",
|
||||
.do_2 = xor_niagara_2,
|
||||
.do_3 = xor_niagara_3,
|
||||
.do_4 = xor_niagara_4,
|
||||
.do_5 = xor_niagara_5,
|
||||
.name = "Niagara",
|
||||
.xor_gen = xor_gen_niagara,
|
||||
};
|
||||
|
||||
@@ -29,8 +29,6 @@ static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
|
||||
{
|
||||
unsigned long lines = bytes >> 9;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
while (lines--) {
|
||||
#undef BLOCK
|
||||
#define BLOCK(i, reg) \
|
||||
@@ -47,8 +45,6 @@ do { \
|
||||
p0 = (unsigned long *)((uintptr_t)p0 + 512);
|
||||
p1 = (unsigned long *)((uintptr_t)p1 + 512);
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
|
||||
@@ -57,8 +53,6 @@ static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
|
||||
{
|
||||
unsigned long lines = bytes >> 9;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
while (lines--) {
|
||||
#undef BLOCK
|
||||
#define BLOCK(i, reg) \
|
||||
@@ -78,8 +72,6 @@ do { \
|
||||
p1 = (unsigned long *)((uintptr_t)p1 + 512);
|
||||
p2 = (unsigned long *)((uintptr_t)p2 + 512);
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
|
||||
@@ -89,8 +81,6 @@ static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
|
||||
{
|
||||
unsigned long lines = bytes >> 9;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
while (lines--) {
|
||||
#undef BLOCK
|
||||
#define BLOCK(i, reg) \
|
||||
@@ -113,8 +103,6 @@ do { \
|
||||
p2 = (unsigned long *)((uintptr_t)p2 + 512);
|
||||
p3 = (unsigned long *)((uintptr_t)p3 + 512);
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
|
||||
@@ -125,8 +113,6 @@ static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
|
||||
{
|
||||
unsigned long lines = bytes >> 9;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
while (lines--) {
|
||||
#undef BLOCK
|
||||
#define BLOCK(i, reg) \
|
||||
@@ -152,14 +138,19 @@ do { \
|
||||
p3 = (unsigned long *)((uintptr_t)p3 + 512);
|
||||
p4 = (unsigned long *)((uintptr_t)p4 + 512);
|
||||
}
|
||||
}
|
||||
|
||||
DO_XOR_BLOCKS(avx_inner, xor_avx_2, xor_avx_3, xor_avx_4, xor_avx_5);
|
||||
|
||||
static void xor_gen_avx(void *dest, void **srcs, unsigned int src_cnt,
|
||||
unsigned int bytes)
|
||||
{
|
||||
kernel_fpu_begin();
|
||||
xor_gen_avx_inner(dest, srcs, src_cnt, bytes);
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
struct xor_block_template xor_block_avx = {
|
||||
.name = "avx",
|
||||
.do_2 = xor_avx_2,
|
||||
.do_3 = xor_avx_3,
|
||||
.do_4 = xor_avx_4,
|
||||
.do_5 = xor_avx_5,
|
||||
.name = "avx",
|
||||
.xor_gen = xor_gen_avx,
|
||||
};
|
||||
|
||||
@@ -21,8 +21,6 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 7;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile(
|
||||
#undef BLOCK
|
||||
#define BLOCK(i) \
|
||||
@@ -55,8 +53,6 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
|
||||
"+r" (p1), "+r" (p2)
|
||||
:
|
||||
: "memory");
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -66,8 +62,6 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 7;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile(
|
||||
#undef BLOCK
|
||||
#define BLOCK(i) \
|
||||
@@ -105,8 +99,6 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
"+r" (p1), "+r" (p2), "+r" (p3)
|
||||
:
|
||||
: "memory");
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -117,8 +109,6 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 7;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile(
|
||||
#undef BLOCK
|
||||
#define BLOCK(i) \
|
||||
@@ -161,8 +151,6 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
"+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
|
||||
:
|
||||
: "memory");
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
|
||||
@@ -175,8 +163,6 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 7;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
/* Make sure GCC forgets anything it knows about p4 or p5,
|
||||
such that it won't pass to the asm volatile below a
|
||||
register that is shared with any other variable. That's
|
||||
@@ -237,8 +223,6 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
Clobber them just to be sure nobody does something stupid
|
||||
like assuming they have some legal value. */
|
||||
asm("" : "=r" (p4), "=r" (p5));
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
#undef LD
|
||||
@@ -255,8 +239,6 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 6;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile(
|
||||
" .align 32 ;\n"
|
||||
" 1: ;\n"
|
||||
@@ -293,8 +275,6 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
|
||||
"+r" (p1), "+r" (p2)
|
||||
:
|
||||
: "memory");
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -304,8 +284,6 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 6;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile(
|
||||
" .align 32,0x90 ;\n"
|
||||
" 1: ;\n"
|
||||
@@ -351,8 +329,6 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
"+r" (p1), "+r" (p2), "+r" (p3)
|
||||
:
|
||||
: "memory" );
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -363,8 +339,6 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 6;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile(
|
||||
" .align 32,0x90 ;\n"
|
||||
" 1: ;\n"
|
||||
@@ -419,8 +393,6 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
"+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
|
||||
:
|
||||
: "memory");
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -432,8 +404,6 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 6;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
/* Make sure GCC forgets anything it knows about p4 or p5,
|
||||
such that it won't pass to the asm volatile below a
|
||||
register that is shared with any other variable. That's
|
||||
@@ -510,22 +480,36 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
Clobber them just to be sure nobody does something stupid
|
||||
like assuming they have some legal value. */
|
||||
asm("" : "=r" (p4), "=r" (p5));
|
||||
}
|
||||
|
||||
DO_XOR_BLOCKS(pII_mmx_inner, xor_pII_mmx_2, xor_pII_mmx_3, xor_pII_mmx_4,
|
||||
xor_pII_mmx_5);
|
||||
|
||||
static void xor_gen_pII_mmx(void *dest, void **srcs, unsigned int src_cnt,
|
||||
unsigned int bytes)
|
||||
{
|
||||
kernel_fpu_begin();
|
||||
xor_gen_pII_mmx_inner(dest, srcs, src_cnt, bytes);
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
struct xor_block_template xor_block_pII_mmx = {
|
||||
.name = "pII_mmx",
|
||||
.do_2 = xor_pII_mmx_2,
|
||||
.do_3 = xor_pII_mmx_3,
|
||||
.do_4 = xor_pII_mmx_4,
|
||||
.do_5 = xor_pII_mmx_5,
|
||||
.name = "pII_mmx",
|
||||
.xor_gen = xor_gen_pII_mmx,
|
||||
};
|
||||
|
||||
DO_XOR_BLOCKS(p5_mmx_inner, xor_p5_mmx_2, xor_p5_mmx_3, xor_p5_mmx_4,
|
||||
xor_p5_mmx_5);
|
||||
|
||||
static void xor_gen_p5_mmx(void *dest, void **srcs, unsigned int src_cnt,
|
||||
unsigned int bytes)
|
||||
{
|
||||
kernel_fpu_begin();
|
||||
xor_gen_p5_mmx_inner(dest, srcs, src_cnt, bytes);
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
struct xor_block_template xor_block_p5_mmx = {
|
||||
.name = "p5_mmx",
|
||||
.do_2 = xor_p5_mmx_2,
|
||||
.do_3 = xor_p5_mmx_3,
|
||||
.do_4 = xor_p5_mmx_4,
|
||||
.do_5 = xor_p5_mmx_5,
|
||||
.name = "p5_mmx",
|
||||
.xor_gen = xor_gen_p5_mmx,
|
||||
};
|
||||
|
||||
@@ -51,8 +51,6 @@ xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 8;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile(
|
||||
#undef BLOCK
|
||||
#define BLOCK(i) \
|
||||
@@ -93,8 +91,6 @@ xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
|
||||
[p1] "+r" (p1), [p2] "+r" (p2)
|
||||
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
|
||||
: "memory");
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -103,8 +99,6 @@ xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 8;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile(
|
||||
#undef BLOCK
|
||||
#define BLOCK(i) \
|
||||
@@ -128,8 +122,6 @@ xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
|
||||
[p1] "+r" (p1), [p2] "+r" (p2)
|
||||
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
|
||||
: "memory");
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -139,8 +131,6 @@ xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 8;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile(
|
||||
#undef BLOCK
|
||||
#define BLOCK(i) \
|
||||
@@ -188,8 +178,6 @@ xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
|
||||
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
|
||||
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
|
||||
: "memory");
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -199,8 +187,6 @@ xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 8;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile(
|
||||
#undef BLOCK
|
||||
#define BLOCK(i) \
|
||||
@@ -226,8 +212,6 @@ xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
|
||||
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
|
||||
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
|
||||
: "memory");
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -238,8 +222,6 @@ xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 8;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile(
|
||||
#undef BLOCK
|
||||
#define BLOCK(i) \
|
||||
@@ -294,8 +276,6 @@ xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
|
||||
[p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
|
||||
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
|
||||
: "memory");
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -306,8 +286,6 @@ xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 8;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile(
|
||||
#undef BLOCK
|
||||
#define BLOCK(i) \
|
||||
@@ -335,8 +313,6 @@ xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
|
||||
[p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
|
||||
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
|
||||
: "memory");
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -348,8 +324,6 @@ xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 8;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile(
|
||||
#undef BLOCK
|
||||
#define BLOCK(i) \
|
||||
@@ -411,8 +385,6 @@ xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
[p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
|
||||
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
|
||||
: "memory");
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -424,8 +396,6 @@ xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
|
||||
{
|
||||
unsigned long lines = bytes >> 8;
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile(
|
||||
#undef BLOCK
|
||||
#define BLOCK(i) \
|
||||
@@ -455,22 +425,35 @@ xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
|
||||
[p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
|
||||
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
DO_XOR_BLOCKS(sse_inner, xor_sse_2, xor_sse_3, xor_sse_4, xor_sse_5);
|
||||
|
||||
static void xor_gen_sse(void *dest, void **srcs, unsigned int src_cnt,
|
||||
unsigned int bytes)
|
||||
{
|
||||
kernel_fpu_begin();
|
||||
xor_gen_sse_inner(dest, srcs, src_cnt, bytes);
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
struct xor_block_template xor_block_sse = {
|
||||
.name = "sse",
|
||||
.do_2 = xor_sse_2,
|
||||
.do_3 = xor_sse_3,
|
||||
.do_4 = xor_sse_4,
|
||||
.do_5 = xor_sse_5,
|
||||
.name = "sse",
|
||||
.xor_gen = xor_gen_sse,
|
||||
};
|
||||
|
||||
DO_XOR_BLOCKS(sse_pf64_inner, xor_sse_2_pf64, xor_sse_3_pf64, xor_sse_4_pf64,
|
||||
xor_sse_5_pf64);
|
||||
|
||||
static void xor_gen_sse_pf64(void *dest, void **srcs, unsigned int src_cnt,
|
||||
unsigned int bytes)
|
||||
{
|
||||
kernel_fpu_begin();
|
||||
xor_gen_sse_pf64_inner(dest, srcs, src_cnt, bytes);
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
struct xor_block_template xor_block_sse_pf64 = {
|
||||
.name = "prefetch64-sse",
|
||||
.do_2 = xor_sse_2_pf64,
|
||||
.do_3 = xor_sse_3_pf64,
|
||||
.do_4 = xor_sse_4_pf64,
|
||||
.do_5 = xor_sse_5_pf64,
|
||||
.name = "prefetch64-sse",
|
||||
.xor_gen = xor_gen_sse_pf64,
|
||||
};
|
||||
|
||||
@@ -258,10 +258,10 @@ xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
goto once_more;
|
||||
}
|
||||
|
||||
DO_XOR_BLOCKS(32regs_p, xor_32regs_p_2, xor_32regs_p_3, xor_32regs_p_4,
|
||||
xor_32regs_p_5);
|
||||
|
||||
struct xor_block_template xor_block_32regs_p = {
|
||||
.name = "32regs_prefetch",
|
||||
.do_2 = xor_32regs_p_2,
|
||||
.do_3 = xor_32regs_p_3,
|
||||
.do_4 = xor_32regs_p_4,
|
||||
.do_5 = xor_32regs_p_5,
|
||||
.name = "32regs_prefetch",
|
||||
.xor_gen = xor_gen_32regs_p,
|
||||
};
|
||||
|
||||
@@ -209,10 +209,9 @@ xor_32regs_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
} while (--lines > 0);
|
||||
}
|
||||
|
||||
DO_XOR_BLOCKS(32regs, xor_32regs_2, xor_32regs_3, xor_32regs_4, xor_32regs_5);
|
||||
|
||||
struct xor_block_template xor_block_32regs = {
|
||||
.name = "32regs",
|
||||
.do_2 = xor_32regs_2,
|
||||
.do_3 = xor_32regs_3,
|
||||
.do_4 = xor_32regs_4,
|
||||
.do_5 = xor_32regs_5,
|
||||
.name = "32regs",
|
||||
.xor_gen = xor_gen_32regs,
|
||||
};
|
||||
|
||||
@@ -136,10 +136,11 @@ xor_8regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
goto once_more;
|
||||
}
|
||||
|
||||
|
||||
DO_XOR_BLOCKS(8regs_p, xor_8regs_p_2, xor_8regs_p_3, xor_8regs_p_4,
|
||||
xor_8regs_p_5);
|
||||
|
||||
struct xor_block_template xor_block_8regs_p = {
|
||||
.name = "8regs_prefetch",
|
||||
.do_2 = xor_8regs_p_2,
|
||||
.do_3 = xor_8regs_p_3,
|
||||
.do_4 = xor_8regs_p_4,
|
||||
.do_5 = xor_8regs_p_5,
|
||||
.name = "8regs_prefetch",
|
||||
.xor_gen = xor_gen_8regs_p,
|
||||
};
|
||||
|
||||
@@ -94,11 +94,10 @@ xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1,
|
||||
}
|
||||
|
||||
#ifndef NO_TEMPLATE
|
||||
DO_XOR_BLOCKS(8regs, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5);
|
||||
|
||||
struct xor_block_template xor_block_8regs = {
|
||||
.name = "8regs",
|
||||
.do_2 = xor_8regs_2,
|
||||
.do_3 = xor_8regs_3,
|
||||
.do_4 = xor_8regs_4,
|
||||
.do_5 = xor_8regs_5,
|
||||
.name = "8regs",
|
||||
.xor_gen = xor_gen_8regs,
|
||||
};
|
||||
#endif /* NO_TEMPLATE */
|
||||
|
||||
@@ -13,39 +13,9 @@
|
||||
#include <linux/preempt.h>
|
||||
#include "xor_impl.h"
|
||||
|
||||
/* The xor routines to use. */
|
||||
/* The xor routine to use. */
|
||||
static struct xor_block_template *active_template;
|
||||
|
||||
void
|
||||
xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs)
|
||||
{
|
||||
unsigned long *p1, *p2, *p3, *p4;
|
||||
|
||||
WARN_ON_ONCE(!in_task() || irqs_disabled() || softirq_count());
|
||||
|
||||
p1 = (unsigned long *) srcs[0];
|
||||
if (src_count == 1) {
|
||||
active_template->do_2(bytes, dest, p1);
|
||||
return;
|
||||
}
|
||||
|
||||
p2 = (unsigned long *) srcs[1];
|
||||
if (src_count == 2) {
|
||||
active_template->do_3(bytes, dest, p1, p2);
|
||||
return;
|
||||
}
|
||||
|
||||
p3 = (unsigned long *) srcs[2];
|
||||
if (src_count == 3) {
|
||||
active_template->do_4(bytes, dest, p1, p2, p3);
|
||||
return;
|
||||
}
|
||||
|
||||
p4 = (unsigned long *) srcs[3];
|
||||
active_template->do_5(bytes, dest, p1, p2, p3, p4);
|
||||
}
|
||||
EXPORT_SYMBOL(xor_blocks);
|
||||
|
||||
/**
|
||||
* xor_gen - generate RAID-style XOR information
|
||||
* @dest: destination vector
|
||||
@@ -63,20 +33,11 @@ EXPORT_SYMBOL(xor_blocks);
|
||||
*/
|
||||
void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes)
|
||||
{
|
||||
unsigned int src_off = 0;
|
||||
|
||||
WARN_ON_ONCE(in_interrupt());
|
||||
WARN_ON_ONCE(!in_task() || irqs_disabled() || softirq_count());
|
||||
WARN_ON_ONCE(bytes == 0);
|
||||
WARN_ON_ONCE(bytes & 511);
|
||||
|
||||
while (src_cnt > 0) {
|
||||
unsigned int this_cnt = min(src_cnt, MAX_XOR_BLOCKS);
|
||||
|
||||
xor_blocks(this_cnt, bytes, dest, srcs + src_off);
|
||||
|
||||
src_cnt -= this_cnt;
|
||||
src_off += this_cnt;
|
||||
}
|
||||
active_template->xor_gen(dest, srcs, src_cnt, bytes);
|
||||
}
|
||||
EXPORT_SYMBOL(xor_gen);
|
||||
|
||||
@@ -120,6 +81,7 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
|
||||
int speed;
|
||||
unsigned long reps;
|
||||
ktime_t min, start, t0;
|
||||
void *srcs[1] = { b2 };
|
||||
|
||||
preempt_disable();
|
||||
|
||||
@@ -130,7 +92,7 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
|
||||
cpu_relax();
|
||||
do {
|
||||
mb(); /* prevent loop optimization */
|
||||
tmpl->do_2(BENCH_SIZE, b1, b2);
|
||||
tmpl->xor_gen(b1, srcs, 1, BENCH_SIZE);
|
||||
mb();
|
||||
} while (reps++ < REPS || (t0 = ktime_get()) == start);
|
||||
min = ktime_sub(t0, start);
|
||||
|
||||
@@ -3,27 +3,47 @@
|
||||
#define _XOR_IMPL_H
|
||||
|
||||
#include <linux/init.h>
|
||||
#include <linux/minmax.h>
|
||||
|
||||
struct xor_block_template {
|
||||
struct xor_block_template *next;
|
||||
const char *name;
|
||||
int speed;
|
||||
void (*do_2)(unsigned long, unsigned long * __restrict,
|
||||
const unsigned long * __restrict);
|
||||
void (*do_3)(unsigned long, unsigned long * __restrict,
|
||||
const unsigned long * __restrict,
|
||||
const unsigned long * __restrict);
|
||||
void (*do_4)(unsigned long, unsigned long * __restrict,
|
||||
const unsigned long * __restrict,
|
||||
const unsigned long * __restrict,
|
||||
const unsigned long * __restrict);
|
||||
void (*do_5)(unsigned long, unsigned long * __restrict,
|
||||
const unsigned long * __restrict,
|
||||
const unsigned long * __restrict,
|
||||
const unsigned long * __restrict,
|
||||
const unsigned long * __restrict);
|
||||
void (*xor_gen)(void *dest, void **srcs, unsigned int src_cnt,
|
||||
unsigned int bytes);
|
||||
};
|
||||
|
||||
#define __DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4) \
|
||||
void \
|
||||
xor_gen_##_name(void *dest, void **srcs, unsigned int src_cnt, \
|
||||
unsigned int bytes) \
|
||||
{ \
|
||||
unsigned int src_off = 0; \
|
||||
\
|
||||
while (src_cnt > 0) { \
|
||||
unsigned int this_cnt = min(src_cnt, 4); \
|
||||
\
|
||||
if (this_cnt == 1) \
|
||||
_handle1(bytes, dest, srcs[src_off]); \
|
||||
else if (this_cnt == 2) \
|
||||
_handle2(bytes, dest, srcs[src_off], \
|
||||
srcs[src_off + 1]); \
|
||||
else if (this_cnt == 3) \
|
||||
_handle3(bytes, dest, srcs[src_off], \
|
||||
srcs[src_off + 1], srcs[src_off + 2]); \
|
||||
else \
|
||||
_handle4(bytes, dest, srcs[src_off], \
|
||||
srcs[src_off + 1], srcs[src_off + 2], \
|
||||
srcs[src_off + 3]); \
|
||||
\
|
||||
src_cnt -= this_cnt; \
|
||||
src_off += this_cnt; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4) \
|
||||
static __DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4)
|
||||
|
||||
/* generic implementations */
|
||||
extern struct xor_block_template xor_block_8regs;
|
||||
extern struct xor_block_template xor_block_32regs;
|
||||
|
||||
Reference in New Issue
Block a user