arch/x86/include/asm/xor_64.h

   1 #ifndef _ASM_X86_XOR_64_H
   2 #define _ASM_X86_XOR_64_H
   3
   4 /*
   5  * Optimized RAID-5 checksumming functions for MMX and SSE.
   6  *
   7  * This program is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License as published by
   9  * the Free Software Foundation; either version 2, or (at your option)
  10  * any later version.
  11  *
  12  * You should have received a copy of the GNU General Public License
  13  * (for example /usr/src/linux/COPYING); if not, write to the Free
  14  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15  */
  16
  17
  18 /*
  19  * Cache avoiding checksumming functions utilizing KNI instructions
  20  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
  21  */
  22
  23 /*
  24  * Based on
  25  * High-speed RAID5 checksumming functions utilizing SSE instructions.
  26  * Copyright (C) 1998 Ingo Molnar.
  27  */
  28
  29 /*
  30  * x86-64 changes / gcc fixes from Andi Kleen.
  31  * Copyright 2002 Andi Kleen, SuSE Labs.
  32  *
  33  * This hasn't been optimized for the hammer yet, but there are likely
  34  * no advantages to be gotten from x86-64 here anyways.
  35  */
  36
  37 #include <asm/i387.h>
  38
  39 #define OFFS(x)         "16*("#x")"
  40 #define PF_OFFS(x)      "256+16*("#x")"
  41 #define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
  42 #define LD(x, y)        "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
  43 #define ST(x, y)        "       movaps %%xmm"#y",   "OFFS(x)"(%[p1])    ;\n"
  44 #define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
  45 #define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
  46 #define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
  47 #define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
  48 #define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
  49 #define XO1(x, y)       "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
  50 #define XO2(x, y)       "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
  51 #define XO3(x, y)       "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
  52 #define XO4(x, y)       "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
  53 #define XO5(x, y)       "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"
  54
  55
  56 static void
  57 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
  58 {
  59         unsigned int lines = bytes >> 8;
  60
  61         kernel_fpu_begin();
  62
  63         asm volatile(
  64 #undef BLOCK
  65 #define BLOCK(i) \
  66                 LD(i, 0)                                \
  67                         LD(i + 1, 1)                    \
  68                 PF1(i)                                  \
  69                                 PF1(i + 2)              \
  70                                 LD(i + 2, 2)            \
  71                                         LD(i + 3, 3)    \
  72                 PF0(i + 4)                              \
  73                                 PF0(i + 6)              \
  74                 XO1(i, 0)                               \
  75                         XO1(i + 1, 1)                   \
  76                                 XO1(i + 2, 2)           \
  77                                         XO1(i + 3, 3)   \
  78                 ST(i, 0)                                \
  79                         ST(i + 1, 1)                    \
  80                                 ST(i + 2, 2)            \
  81                                         ST(i + 3, 3)    \
  82
  83
  84                 PF0(0)
  85                                 PF0(2)
  86
  87         " .align 32                     ;\n"
  88         " 1:                            ;\n"
  89
  90                 BLOCK(0)
  91                 BLOCK(4)
  92                 BLOCK(8)
  93                 BLOCK(12)
  94
  95         "       addq %[inc], %[p1]           ;\n"
  96         "       addq %[inc], %[p2]           ;\n"
  97                 "               decl %[cnt] ; jnz 1b"
  98         : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
  99         : [inc] "r" (256UL)
 100         : "memory");
 101
 102         kernel_fpu_end();
 103 }
 104
 105 static void
 106 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 107           unsigned long *p3)
 108 {
 109         unsigned int lines = bytes >> 8;
 110
 111         kernel_fpu_begin();
 112         asm volatile(
 113 #undef BLOCK
 114 #define BLOCK(i) \
 115                 PF1(i)                                  \
 116                                 PF1(i + 2)              \
 117                 LD(i, 0)                                        \
 118                         LD(i + 1, 1)                    \
 119                                 LD(i + 2, 2)            \
 120                                         LD(i + 3, 3)    \
 121                 PF2(i)                                  \
 122                                 PF2(i + 2)              \
 123                 PF0(i + 4)                              \
 124                                 PF0(i + 6)              \
 125                 XO1(i, 0)                               \
 126                         XO1(i + 1, 1)                   \
 127                                 XO1(i + 2, 2)           \
 128                                         XO1(i + 3, 3)   \
 129                 XO2(i, 0)                               \
 130                         XO2(i + 1, 1)                   \
 131                                 XO2(i + 2, 2)           \
 132                                         XO2(i + 3, 3)   \
 133                 ST(i, 0)                                \
 134                         ST(i + 1, 1)                    \
 135                                 ST(i + 2, 2)            \
 136                                         ST(i + 3, 3)    \
 137
 138
 139                 PF0(0)
 140                                 PF0(2)
 141
 142         " .align 32                     ;\n"
 143         " 1:                            ;\n"
 144
 145                 BLOCK(0)
 146                 BLOCK(4)
 147                 BLOCK(8)
 148                 BLOCK(12)
 149
 150         "       addq %[inc], %[p1]           ;\n"
 151         "       addq %[inc], %[p2]          ;\n"
 152         "       addq %[inc], %[p3]           ;\n"
 153                 "               decl %[cnt] ; jnz 1b"
 154         : [cnt] "+r" (lines),
 155           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
 156         : [inc] "r" (256UL)
 157         : "memory");
 158         kernel_fpu_end();
 159 }
 160
 161 static void
 162 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 163           unsigned long *p3, unsigned long *p4)
 164 {
 165         unsigned int lines = bytes >> 8;
 166
 167         kernel_fpu_begin();
 168
 169         asm volatile(
 170 #undef BLOCK
 171 #define BLOCK(i) \
 172                 PF1(i)                                  \
 173                                 PF1(i + 2)              \
 174                 LD(i, 0)                                \
 175                         LD(i + 1, 1)                    \
 176                                 LD(i + 2, 2)            \
 177                                         LD(i + 3, 3)    \
 178                 PF2(i)                                  \
 179                                 PF2(i + 2)              \
 180                 XO1(i, 0)                               \
 181                         XO1(i + 1, 1)                   \
 182                                 XO1(i + 2, 2)           \
 183                                         XO1(i + 3, 3)   \
 184                 PF3(i)                                  \
 185                                 PF3(i + 2)              \
 186                 PF0(i + 4)                              \
 187                                 PF0(i + 6)              \
 188                 XO2(i, 0)                               \
 189                         XO2(i + 1, 1)                   \
 190                                 XO2(i + 2, 2)           \
 191                                         XO2(i + 3, 3)   \
 192                 XO3(i, 0)                               \
 193                         XO3(i + 1, 1)                   \
 194                                 XO3(i + 2, 2)           \
 195                                         XO3(i + 3, 3)   \
 196                 ST(i, 0)                                \
 197                         ST(i + 1, 1)                    \
 198                                 ST(i + 2, 2)            \
 199                                         ST(i + 3, 3)    \
 200
 201
 202                 PF0(0)
 203                                 PF0(2)
 204
 205         " .align 32                     ;\n"
 206         " 1:                            ;\n"
 207
 208                 BLOCK(0)
 209                 BLOCK(4)
 210                 BLOCK(8)
 211                 BLOCK(12)
 212
 213         "       addq %[inc], %[p1]           ;\n"
 214         "       addq %[inc], %[p2]           ;\n"
 215         "       addq %[inc], %[p3]           ;\n"
 216         "       addq %[inc], %[p4]           ;\n"
 217         "       decl %[cnt] ; jnz 1b"
 218         : [cnt] "+c" (lines),
 219           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
 220         : [inc] "r" (256UL)
 221         : "memory" );
 222
 223         kernel_fpu_end();
 224 }
 225
 226 static void
 227 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 228           unsigned long *p3, unsigned long *p4, unsigned long *p5)
 229 {
 230         unsigned int lines = bytes >> 8;
 231
 232         kernel_fpu_begin();
 233
 234         asm volatile(
 235 #undef BLOCK
 236 #define BLOCK(i) \
 237                 PF1(i)                                  \
 238                                 PF1(i + 2)              \
 239                 LD(i, 0)                                \
 240                         LD(i + 1, 1)                    \
 241                                 LD(i + 2, 2)            \
 242                                         LD(i + 3, 3)    \
 243                 PF2(i)                                  \
 244                                 PF2(i + 2)              \
 245                 XO1(i, 0)                               \
 246                         XO1(i + 1, 1)                   \
 247                                 XO1(i + 2, 2)           \
 248                                         XO1(i + 3, 3)   \
 249                 PF3(i)                                  \
 250                                 PF3(i + 2)              \
 251                 XO2(i, 0)                               \
 252                         XO2(i + 1, 1)                   \
 253                                 XO2(i + 2, 2)           \
 254                                         XO2(i + 3, 3)   \
 255                 PF4(i)                                  \
 256                                 PF4(i + 2)              \
 257                 PF0(i + 4)                              \
 258                                 PF0(i + 6)              \
 259                 XO3(i, 0)                               \
 260                         XO3(i + 1, 1)                   \
 261                                 XO3(i + 2, 2)           \
 262                                         XO3(i + 3, 3)   \
 263                 XO4(i, 0)                               \
 264                         XO4(i + 1, 1)                   \
 265                                 XO4(i + 2, 2)           \
 266                                         XO4(i + 3, 3)   \
 267                 ST(i, 0)                                \
 268                         ST(i + 1, 1)                    \
 269                                 ST(i + 2, 2)            \
 270                                         ST(i + 3, 3)    \
 271
 272
 273                 PF0(0)
 274                                 PF0(2)
 275
 276         " .align 32                     ;\n"
 277         " 1:                            ;\n"
 278
 279                 BLOCK(0)
 280                 BLOCK(4)
 281                 BLOCK(8)
 282                 BLOCK(12)
 283
 284         "       addq %[inc], %[p1]           ;\n"
 285         "       addq %[inc], %[p2]           ;\n"
 286         "       addq %[inc], %[p3]           ;\n"
 287         "       addq %[inc], %[p4]           ;\n"
 288         "       addq %[inc], %[p5]           ;\n"
 289         "       decl %[cnt] ; jnz 1b"
 290         : [cnt] "+c" (lines),
 291           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
 292           [p5] "+r" (p5)
 293         : [inc] "r" (256UL)
 294         : "memory");
 295
 296         kernel_fpu_end();
 297 }
 298
 299 static struct xor_block_template xor_block_sse = {
 300         .name = "generic_sse",
 301         .do_2 = xor_sse_2,
 302         .do_3 = xor_sse_3,
 303         .do_4 = xor_sse_4,
 304         .do_5 = xor_sse_5,
 305 };
 306
 307
 308 /* Also try the AVX routines */
 309 #include <asm/xor_avx.h>
 310
 311 #undef XOR_TRY_TEMPLATES
 312 #define XOR_TRY_TEMPLATES                       \
 313 do {                                            \
 314         AVX_XOR_SPEED;                          \
 315         xor_speed(&xor_block_sse);              \
 316 } while (0)
 317
 318 /* We force the use of the SSE xor block because it can write around L2.
 319    We may also be able to load into the L1 only depending on how the cpu
 320    deals with a load to a line that is being prefetched.  */
 321 #define XOR_SELECT_TEMPLATE(FASTEST) \
 322         AVX_SELECT(&xor_block_sse)
 323
 324 #endif /* _ASM_X86_XOR_64_H */