arch/x86/lib/memmove_64.S

   1 /*
   2  * Normally compiler builtins are used, but sometimes the compiler calls out
   3  * of line code. Based on asm-i386/string.h.
   4  *
   5  * This assembly file is re-written from memmove_64.c file.
   6  *      - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
   7  */
   8 #include <linux/linkage.h>
   9 #include <asm/cpufeatures.h>
  10 #include <asm/alternative-asm.h>
  11 #include <asm/export.h>
  12
  13 #undef memmove
  14
  15 /*
  16  * Implement memmove(). This can handle overlap between src and dst.
  17  *
  18  * Input:
  19  * rdi: dest
  20  * rsi: src
  21  * rdx: count
  22  *
  23  * Output:
  24  * rax: dest
  25  */
  26 .weak memmove
  27
  28 ENTRY(memmove)
  29 ENTRY(__memmove)
  30
  31         /* Handle more 32 bytes in loop */
  32         mov %rdi, %rax
  33         cmp $0x20, %rdx
  34         jb      1f
  35
  36         /* Decide forward/backward copy mode */
  37         cmp %rdi, %rsi
  38         jge .Lmemmove_begin_forward
  39         mov %rsi, %r8
  40         add %rdx, %r8
  41         cmp %rdi, %r8
  42         jg 2f
  43
  44 .Lmemmove_begin_forward:
  45         ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
  46
  47         /*
  48          * movsq instruction have many startup latency
  49          * so we handle small size by general register.
  50          */
  51         cmp  $680, %rdx
  52         jb      3f
  53         /*
  54          * movsq instruction is only good for aligned case.
  55          */
  56
  57         cmpb %dil, %sil
  58         je 4f
  59 3:
  60         sub $0x20, %rdx
  61         /*
  62          * We gobble 32 bytes forward in each loop.
  63          */
  64 5:
  65         sub $0x20, %rdx
  66         movq 0*8(%rsi), %r11
  67         movq 1*8(%rsi), %r10
  68         movq 2*8(%rsi), %r9
  69         movq 3*8(%rsi), %r8
  70         leaq 4*8(%rsi), %rsi
  71
  72         movq %r11, 0*8(%rdi)
  73         movq %r10, 1*8(%rdi)
  74         movq %r9, 2*8(%rdi)
  75         movq %r8, 3*8(%rdi)
  76         leaq 4*8(%rdi), %rdi
  77         jae 5b
  78         addq $0x20, %rdx
  79         jmp 1f
  80         /*
  81          * Handle data forward by movsq.
  82          */
  83         .p2align 4
  84 4:
  85         movq %rdx, %rcx
  86         movq -8(%rsi, %rdx), %r11
  87         lea -8(%rdi, %rdx), %r10
  88         shrq $3, %rcx
  89         rep movsq
  90         movq %r11, (%r10)
  91         jmp 13f
  92 .Lmemmove_end_forward:
  93
  94         /*
  95          * Handle data backward by movsq.
  96          */
  97         .p2align 4
  98 7:
  99         movq %rdx, %rcx
 100         movq (%rsi), %r11
 101         movq %rdi, %r10
 102         leaq -8(%rsi, %rdx), %rsi
 103         leaq -8(%rdi, %rdx), %rdi
 104         shrq $3, %rcx
 105         std
 106         rep movsq
 107         cld
 108         movq %r11, (%r10)
 109         jmp 13f
 110
 111         /*
 112          * Start to prepare for backward copy.
 113          */
 114         .p2align 4
 115 2:
 116         cmp $680, %rdx
 117         jb 6f
 118         cmp %dil, %sil
 119         je 7b
 120 6:
 121         /*
 122          * Calculate copy position to tail.
 123          */
 124         addq %rdx, %rsi
 125         addq %rdx, %rdi
 126         subq $0x20, %rdx
 127         /*
 128          * We gobble 32 bytes backward in each loop.
 129          */
 130 8:
 131         subq $0x20, %rdx
 132         movq -1*8(%rsi), %r11
 133         movq -2*8(%rsi), %r10
 134         movq -3*8(%rsi), %r9
 135         movq -4*8(%rsi), %r8
 136         leaq -4*8(%rsi), %rsi
 137
 138         movq %r11, -1*8(%rdi)
 139         movq %r10, -2*8(%rdi)
 140         movq %r9, -3*8(%rdi)
 141         movq %r8, -4*8(%rdi)
 142         leaq -4*8(%rdi), %rdi
 143         jae 8b
 144         /*
 145          * Calculate copy position to head.
 146          */
 147         addq $0x20, %rdx
 148         subq %rdx, %rsi
 149         subq %rdx, %rdi
 150 1:
 151         cmpq $16, %rdx
 152         jb 9f
 153         /*
 154          * Move data from 16 bytes to 31 bytes.
 155          */
 156         movq 0*8(%rsi), %r11
 157         movq 1*8(%rsi), %r10
 158         movq -2*8(%rsi, %rdx), %r9
 159         movq -1*8(%rsi, %rdx), %r8
 160         movq %r11, 0*8(%rdi)
 161         movq %r10, 1*8(%rdi)
 162         movq %r9, -2*8(%rdi, %rdx)
 163         movq %r8, -1*8(%rdi, %rdx)
 164         jmp 13f
 165         .p2align 4
 166 9:
 167         cmpq $8, %rdx
 168         jb 10f
 169         /*
 170          * Move data from 8 bytes to 15 bytes.
 171          */
 172         movq 0*8(%rsi), %r11
 173         movq -1*8(%rsi, %rdx), %r10
 174         movq %r11, 0*8(%rdi)
 175         movq %r10, -1*8(%rdi, %rdx)
 176         jmp 13f
 177 10:
 178         cmpq $4, %rdx
 179         jb 11f
 180         /*
 181          * Move data from 4 bytes to 7 bytes.
 182          */
 183         movl (%rsi), %r11d
 184         movl -4(%rsi, %rdx), %r10d
 185         movl %r11d, (%rdi)
 186         movl %r10d, -4(%rdi, %rdx)
 187         jmp 13f
 188 11:
 189         cmp $2, %rdx
 190         jb 12f
 191         /*
 192          * Move data from 2 bytes to 3 bytes.
 193          */
 194         movw (%rsi), %r11w
 195         movw -2(%rsi, %rdx), %r10w
 196         movw %r11w, (%rdi)
 197         movw %r10w, -2(%rdi, %rdx)
 198         jmp 13f
 199 12:
 200         cmp $1, %rdx
 201         jb 13f
 202         /*
 203          * Move data for 1 byte.
 204          */
 205         movb (%rsi), %r11b
 206         movb %r11b, (%rdi)
 207 13:
 208         retq
 209 ENDPROC(__memmove)
 210 ENDPROC(memmove)
 211 EXPORT_SYMBOL(__memmove)
 212 EXPORT_SYMBOL(memmove)