Skip to content

Commit 474fd6e

Browse files
author
Martin Schwidefsky
committed
RAID/s390: add SIMD implementation for raid6 gen/xor
Using vector registers is slightly faster: raid6: vx128x8 gen() 19705 MB/s raid6: vx128x8 xor() 11886 MB/s raid6: using algorithm vx128x8 gen() 19705 MB/s raid6: .... xor() 11886 MB/s, rmw enabled vs the software algorithms: raid6: int64x1 gen() 3018 MB/s raid6: int64x1 xor() 1429 MB/s raid6: int64x2 gen() 4661 MB/s raid6: int64x2 xor() 3143 MB/s raid6: int64x4 gen() 5392 MB/s raid6: int64x4 xor() 3509 MB/s raid6: int64x8 gen() 4441 MB/s raid6: int64x8 xor() 3207 MB/s raid6: using algorithm int64x4 gen() 5392 MB/s raid6: .... xor() 3509 MB/s, rmw enabled Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
1 parent 8f149ea commit 474fd6e

File tree

6 files changed

+265
-0
lines changed

6 files changed

+265
-0
lines changed

arch/s390/include/asm/vx-insn.h

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,15 @@
278278
VLVG \v, \gr, \index, 3
279279
.endm
280280

281+
/* VECTOR LOAD REGISTER */
282+
/*
 * Hand-assembles the instruction from the two vector register operands:
 * VX_NUM resolves each operand to a register number, the first .word
 * combines 0xE700 with the low four bits of both numbers, the second
 * .word is zero, and MRXBOPC is invoked with opcode value 0x56.
 * VX_NUM and MRXBOPC are defined elsewhere in this header; MRXBOPC
 * presumably emits the trailing halfword including the high register
 * bits - confirm against its definition.
 */
.macro VLR v1, v2
283+
VX_NUM v1, \v1
284+
VX_NUM v2, \v2
285+
.word 0xE700 | ((v1&15) << 4) | (v2&15)
286+
.word 0
287+
MRXBOPC 0, 0x56, v1, v2
288+
.endm
289+
281290
/* VECTOR LOAD */
282291
.macro VL v, disp, index="%r0", base
283292
VX_NUM v1, \v
@@ -404,6 +413,16 @@
404413

405414
/* Vector integer instructions */
406415

416+
/* VECTOR AND */
417+
/*
 * Three-register form: the first .word combines 0xE700 with the low four
 * bits of the first two register numbers, the second .word carries the
 * low four bits of the third register number in its top nibble, and
 * MRXBOPC is invoked with opcode value 0x68 (MRXBOPC is defined elsewhere
 * in this header).
 */
.macro VN vr1, vr2, vr3
418+
VX_NUM v1, \vr1
419+
VX_NUM v2, \vr2
420+
VX_NUM v3, \vr3
421+
.word 0xE700 | ((v1&15) << 4) | (v2&15)
422+
.word ((v3&15) << 12)
423+
MRXBOPC 0, 0x68, v1, v2, v3
424+
.endm
425+
407426
/* VECTOR EXCLUSIVE OR */
408427
.macro VX vr1, vr2, vr3
409428
VX_NUM v1, \vr1
@@ -469,6 +488,73 @@
469488
MRXBOPC 0, 0x7D, v1, v2, v3
470489
.endm
471490

491+
/* VECTOR REPLICATE IMMEDIATE */
492+
/*
 * VREPI hand-assembles the instruction: the first .word combines 0xE700
 * with the low four bits of the target register number, the second .word
 * is the immediate, and MRXBOPC is invoked with m3 and opcode value 0x45
 * (VX_NUM and MRXBOPC are defined elsewhere in this header).
 *
 * VREPIB, VREPIH, VREPIF and VREPIG are convenience wrappers that all
 * expand to VREPI and differ only in the m3 value they pass (0, 1, 2, 3).
 */
.macro VREPI vr1, imm2, m3
493+
VX_NUM v1, \vr1
494+
.word 0xE700 | ((v1&15) << 4)
495+
.word \imm2
496+
MRXBOPC \m3, 0x45, v1
497+
.endm
498+
.macro VREPIB vr1, imm2
499+
VREPI \vr1, \imm2, 0
500+
.endm
501+
.macro VREPIH vr1, imm2
502+
VREPI \vr1, \imm2, 1
503+
.endm
504+
.macro VREPIF vr1, imm2
505+
VREPI \vr1, \imm2, 2
506+
.endm
507+
.macro VREPIG vr1, imm2
508+
VREPI \vr1, \imm2, 3
509+
.endm
510+
511+
/* VECTOR ADD */
512+
/*
 * VA hand-assembles the three-register instruction: the first .word
 * combines 0xE700 with the low four bits of the first two register
 * numbers, the second .word carries the low four bits of the third
 * register number in its top nibble, and MRXBOPC is invoked with m4 and
 * opcode value 0xF3 (MRXBOPC is defined elsewhere in this header).
 *
 * VAB, VAH, VAF, VAG and VAQ are wrappers that pass m4 = 0, 1, 2, 3, 4;
 * presumably m4 selects the element size (byte ... quadword) - confirm
 * against the architecture documentation.
 */
.macro VA vr1, vr2, vr3, m4
513+
VX_NUM v1, \vr1
514+
VX_NUM v2, \vr2
515+
VX_NUM v3, \vr3
516+
.word 0xE700 | ((v1&15) << 4) | (v2&15)
517+
.word ((v3&15) << 12)
518+
MRXBOPC \m4, 0xF3, v1, v2, v3
519+
.endm
520+
.macro VAB vr1, vr2, vr3
521+
VA \vr1, \vr2, \vr3, 0
522+
.endm
523+
.macro VAH vr1, vr2, vr3
524+
VA \vr1, \vr2, \vr3, 1
525+
.endm
526+
.macro VAF vr1, vr2, vr3
527+
VA \vr1, \vr2, \vr3, 2
528+
.endm
529+
.macro VAG vr1, vr2, vr3
530+
VA \vr1, \vr2, \vr3, 3
531+
.endm
532+
.macro VAQ vr1, vr2, vr3
533+
VA \vr1, \vr2, \vr3, 4
534+
.endm
535+
536+
/* VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */
537+
/*
 * VESRAV hand-assembles the three-register instruction in the same way
 * as the other three-register macros here: 0xE700 plus the low four bits
 * of the first two register numbers, then the low four bits of the third
 * register number in the top nibble of the second .word, then MRXBOPC
 * with m4 and opcode value 0x7A (MRXBOPC is defined elsewhere in this
 * header).
 */
.macro VESRAV vr1, vr2, vr3, m4
538+
VX_NUM v1, \vr1
539+
VX_NUM v2, \vr2
540+
VX_NUM v3, \vr3
541+
.word 0xE700 | ((v1&15) << 4) | (v2&15)
542+
.word ((v3&15) << 12)
543+
MRXBOPC \m4, 0x7A, v1, v2, v3
544+
.endm
545+
546+
/*
 * Wrappers passing m4 = 0, 1, 2, 3; presumably m4 selects the element
 * size (byte, halfword, word, doubleword) - confirm.
 */
.macro VESRAVB vr1, vr2, vr3
547+
VESRAV \vr1, \vr2, \vr3, 0
548+
.endm
549+
.macro VESRAVH vr1, vr2, vr3
550+
VESRAV \vr1, \vr2, \vr3, 1
551+
.endm
552+
.macro VESRAVF vr1, vr2, vr3
553+
VESRAV \vr1, \vr2, \vr3, 2
554+
.endm
555+
.macro VESRAVG vr1, vr2, vr3
556+
VESRAV \vr1, \vr2, \vr3, 3
557+
.endm
472558

473559
#endif /* __ASSEMBLY__ */
474560
#endif /* __ASM_S390_VX_INSN_H */

include/linux/raid/pq.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ extern const struct raid6_calls raid6_avx2x1;
103103
extern const struct raid6_calls raid6_avx2x2;
104104
extern const struct raid6_calls raid6_avx2x4;
105105
extern const struct raid6_calls raid6_tilegx8;
106+
extern const struct raid6_calls raid6_s390vx8;
106107

107108
struct raid6_recov_calls {
108109
void (*data2)(int, size_t, int, int, void **);

lib/raid6/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ altivec*.c
33
int*.c
44
tables.c
55
neon?.c
6+
s390vx?.c

lib/raid6/Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
77
raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
88
raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
99
raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
10+
raid6_pq-$(CONFIG_S390) += s390vx8.o
1011

1112
hostprogs-y += mktables
1213

@@ -116,6 +117,11 @@ $(obj)/tilegx8.c: UNROLL := 8
116117
$(obj)/tilegx8.c: $(src)/tilegx.uc $(src)/unroll.awk FORCE
117118
$(call if_changed,unroll)
118119

120+
targets += s390vx8.c
121+
$(obj)/s390vx8.c: UNROLL := 8
122+
$(obj)/s390vx8.c: $(src)/s390vx.uc $(src)/unroll.awk FORCE
123+
$(call if_changed,unroll)
124+
119125
quiet_cmd_mktable = TABLE $@
120126
cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
121127

lib/raid6/algos.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ const struct raid6_calls * const raid6_algos[] = {
6868
#endif
6969
#if defined(CONFIG_TILEGX)
7070
&raid6_tilegx8,
71+
#endif
72+
#if defined(CONFIG_S390)
73+
&raid6_s390vx8,
7174
#endif
7275
&raid6_intx1,
7376
&raid6_intx2,

lib/raid6/s390vx.uc

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
/*
2+
* s390vx$#.c
3+
*
4+
* $#-way unrolled RAID6 gen/xor functions for s390
5+
* based on the vector facility
6+
*
7+
* Copyright IBM Corp. 2016
8+
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
9+
*
10+
* This file is postprocessed using unroll.awk.
11+
*/
12+
13+
#include <linux/raid/pq.h>
14+
#include <asm/fpu/api.h>
15+
16+
asm(".include \"asm/vx-insn.h\"\n");
17+
18+
#define NSIZE 16
19+
20+
/*
 * Set up the two constant vector registers used by the syndrome loops:
 * register 24 gets the immediate 7 and register 25 the immediate 0x1d,
 * each via VREPIB (byte-wise replication, going by the macro name).
 * Register 24 is the third operand of the VESRAVB in MASK(), and
 * register 25 is the AND() mask in the syndrome functions; 0x1d is
 * presumably the GF(2^8) reduction constant - confirm.
 */
static inline void LOAD_CONST(void)
21+
{
22+
asm volatile("VREPIB %v24,7");
23+
asm volatile("VREPIB %v25,0x1d");
24+
}
25+
26+
/*
27+
* The SHLBYTE() operation shifts each of the 16 bytes in
28+
* vector register y left by 1 bit and stores the result in
29+
* vector register x.
30+
*/
31+
/*
 * x, y: vector register numbers, passed to the assembler as immediates.
 * Implemented as VAB x,y,y, i.e. register y is added to itself.
 */
static inline void SHLBYTE(int x, int y)
32+
{
33+
asm volatile ("VAB %0,%1,%1" : : "i" (x), "i" (y));
34+
}
35+
36+
/*
37+
* For each of the 16 bytes in the vector register y the MASK()
38+
* operation returns 0xFF if the high bit of the byte is 1,
39+
* or 0x00 if the high bit is 0. The result is stored in vector
40+
* register x.
41+
*/
42+
/*
 * x, y: vector register numbers, passed to the assembler as immediates.
 * Implemented as VESRAVB x,y,24; the third operand is vector register 24,
 * which LOAD_CONST() fills with the value 7, so LOAD_CONST() must have
 * run before this is used.
 */
static inline void MASK(int x, int y)
43+
{
44+
asm volatile ("VESRAVB %0,%1,24" : : "i" (x), "i" (y));
45+
}
46+
47+
/*
 * Emit VN x,y,z (VECTOR AND). x, y and z are vector register numbers
 * passed to the assembler as immediates; x is the first (target) operand.
 */
static inline void AND(int x, int y, int z)
48+
{
49+
asm volatile ("VN %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
50+
}
51+
52+
/*
 * Emit VX x,y,z (VECTOR EXCLUSIVE OR). x, y and z are vector register
 * numbers passed to the assembler as immediates; x is the first (target)
 * operand.
 */
static inline void XOR(int x, int y, int z)
53+
{
54+
asm volatile ("VX %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
55+
}
56+
57+
/*
 * Emit one VLM covering vector registers x through x + n - 1, with ptr
 * as the base address and displacement 0 (VLM is a macro from
 * asm/vx-insn.h).
 *
 * x:   number of the first vector register
 * n:   number of vector registers; also sizes the memory operand (16*n bytes)
 * ptr: address of the data to load
 */
static inline void LOAD_DATA(int x, int n, u8 *ptr)
58+
{
59+
/* dummy type spanning the whole transfer, used for the "m" operand */
typedef struct { u8 _[16*n]; } addrtype;
60+
register addrtype *__ptr asm("1") = (addrtype *) ptr;
61+
62+
/*
 * The base register is referenced as plain operand %1. Writing "%r1"
 * here would be parsed by the compiler as operand 1 with an 'r'
 * modifier, which does not appear to be a documented s390 operand
 * modifier.
 */
asm volatile ("VLM %2,%3,0,%1"
63+
: : "m" (*__ptr), "a" (__ptr), "i" (x), "i" (x + n - 1));
64+
}
65+
66+
/*
 * Emit one VSTM covering vector registers x through x + n - 1, with
 * displacement 0 and base register 1 (VSTM is a macro from
 * asm/vx-insn.h).
 *
 * x:   number of the first vector register
 * n:   number of vector registers; also sizes the memory operand (16*n bytes)
 * ptr: destination address
 */
static inline void STORE_DATA(int x, int n, u8 *ptr)
67+
{
68+
/* dummy type spanning the whole transfer, used for the "=m" operand */
typedef struct { u8 _[16*n]; } addrtype;
69+
/* pinned to register 1 because the template below names base register 1 literally */
register addrtype *__ptr asm("1") = (addrtype *) ptr;
70+
71+
asm volatile ("VSTM %2,%3,0,1"
72+
: "=m" (*__ptr) : "a" (__ptr), "i" (x), "i" (x + n - 1));
73+
}
74+
75+
/*
 * Emit VLR x,y (VECTOR LOAD REGISTER). x and y are vector register
 * numbers passed to the assembler as immediates; x is the first (target)
 * operand.
 */
static inline void COPY_VEC(int x, int y)
76+
{
77+
asm volatile ("VLR %0,%1" : : "i" (x), "i" (y));
78+
}
79+
80+
/*
 * Compute the P (XOR parity) and Q (RS syndrome) buffers from the data
 * buffers.
 *
 * disks: number of entries in ptrs; the last two are P and Q, the rest
 *        are data buffers
 * bytes: number of bytes to process in each buffer
 * ptrs:  array of buffer pointers
 *
 * Register use: vector registers from 0 up accumulate P, registers from
 * 8 up accumulate Q, registers from 16 up are scratch/data. Registers 24
 * and 25 hold the constants from LOAD_CONST(). The unroll placeholders in
 * the register expressions are filled in by unroll.awk.
 */
static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
81+
{
82+
struct kernel_fpu vxstate;
83+
u8 **dptr, *p, *q;
84+
int d, z, z0;
85+
86+
/* vector registers are used from here until kernel_fpu_end() */
kernel_fpu_begin(&vxstate, KERNEL_VXR);
87+
LOAD_CONST();
88+
89+
dptr = (u8 **) ptrs;
90+
z0 = disks - 3; /* Highest data disk */
91+
p = dptr[z0 + 1]; /* XOR parity */
92+
q = dptr[z0 + 2]; /* RS syndrome */
93+
94+
/* each iteration handles one chunk at byte offset d */
for (d = 0; d < bytes; d += $#*NSIZE) {
95+
/* start P and Q with the data of the highest data disk */
LOAD_DATA(0,$#,&dptr[z0][d]);
96+
COPY_VEC(8+$$,0+$$);
97+
for (z = z0 - 1; z >= 0; z--) {
98+
/* scratch = MASK(Q) & constant in register 25, Q = SHLBYTE(Q) ^ scratch */
MASK(16+$$,8+$$);
99+
AND(16+$$,16+$$,25);
100+
SHLBYTE(8+$$,8+$$);
101+
XOR(8+$$,8+$$,16+$$);
102+
/* fold the data of disk z into both P and Q */
LOAD_DATA(16,$#,&dptr[z][d]);
103+
XOR(0+$$,0+$$,16+$$);
104+
XOR(8+$$,8+$$,16+$$);
105+
}
106+
STORE_DATA(0,$#,&p[d]);
107+
STORE_DATA(8,$#,&q[d]);
108+
}
109+
kernel_fpu_end(&vxstate, KERNEL_VXR);
110+
}
111+
112+
/*
 * Update existing P and Q buffers by XORing in the contribution of the
 * data buffers start..stop.
 *
 * disks: number of entries in ptrs; ptrs[disks - 2] is P and
 *        ptrs[disks - 1] is Q
 * start: lowest data buffer index whose data is read
 * stop:  highest data buffer index whose data is read
 * bytes: number of bytes to process in each buffer
 * ptrs:  array of buffer pointers
 *
 * Register use: vector registers from 0 up accumulate the P delta,
 * registers from 8 up accumulate the Q delta, registers from 16 up are
 * scratch/data. Registers 24 and 25 hold the constants from LOAD_CONST().
 * The unroll placeholders in the register expressions are filled in by
 * unroll.awk.
 */
static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop,
113+
size_t bytes, void **ptrs)
114+
{
115+
struct kernel_fpu vxstate;
116+
u8 **dptr, *p, *q;
117+
int d, z, z0;
118+
119+
dptr = (u8 **) ptrs;
120+
z0 = stop; /* P/Q right side optimization */
121+
p = dptr[disks - 2]; /* XOR parity */
122+
q = dptr[disks - 1]; /* RS syndrome */
123+
124+
/* vector registers are used from here until kernel_fpu_end() */
kernel_fpu_begin(&vxstate, KERNEL_VXR);
125+
LOAD_CONST();
126+
127+
/* each iteration handles one chunk at byte offset d */
for (d = 0; d < bytes; d += $#*NSIZE) {
128+
/* P/Q data pages */
129+
LOAD_DATA(0,$#,&dptr[z0][d]);
130+
COPY_VEC(8+$$,0+$$);
131+
for (z = z0 - 1; z >= start; z--) {
132+
/* scratch = MASK(Q) & constant in register 25, Q = SHLBYTE(Q) ^ scratch */
MASK(16+$$,8+$$);
133+
AND(16+$$,16+$$,25);
134+
SHLBYTE(8+$$,8+$$);
135+
XOR(8+$$,8+$$,16+$$);
136+
/* fold the data of disk z into both accumulators */
LOAD_DATA(16,$#,&dptr[z][d]);
137+
XOR(0+$$,0+$$,16+$$);
138+
XOR(8+$$,8+$$,16+$$);
139+
}
140+
/* P/Q left side optimization */
141+
/* disks below start are not read; only the Q update step is repeated */
for (z = start - 1; z >= 0; z--) {
142+
MASK(16+$$,8+$$);
143+
AND(16+$$,16+$$,25);
144+
SHLBYTE(8+$$,8+$$);
145+
XOR(8+$$,8+$$,16+$$);
146+
}
147+
/* read-modify-write: XOR the accumulators into the stored P and Q */
LOAD_DATA(16,$#,&p[d]);
148+
XOR(16+$$,16+$$,0+$$);
149+
STORE_DATA(16,$#,&p[d]);
150+
LOAD_DATA(16,$#,&q[d]);
151+
XOR(16+$$,16+$$,8+$$);
152+
STORE_DATA(16,$#,&q[d]);
153+
}
154+
kernel_fpu_end(&vxstate, KERNEL_VXR);
155+
}
156+
157+
/*
 * Validity hook for the ops table below: returns the value of
 * MACHINE_HAS_VX (defined outside this file; presumably non-zero when
 * the machine provides the vector facility - confirm).
 */
static int raid6_s390vx$#_valid(void)
158+
{
159+
return MACHINE_HAS_VX;
160+
}
161+
162+
/*
 * Ops table exported for this implementation (declared in
 * include/linux/raid/pq.h and listed in raid6_algos[]). The initializers
 * are positional: the gen, xor and valid functions of this file, the
 * algorithm name, and a trailing 1 whose meaning is set by the
 * struct raid6_calls definition (not shown here; presumably a
 * preference flag - confirm).
 */
const struct raid6_calls raid6_s390vx$# = {
163+
raid6_s390vx$#_gen_syndrome,
164+
raid6_s390vx$#_xor_syndrome,
165+
raid6_s390vx$#_valid,
166+
"vx128x$#",
167+
1
168+
};

0 commit comments

Comments
 (0)