Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[cascardo/linux.git] / arch / powerpc / lib / memcpy_64.S
1 /*
2  * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version
7  * 2 of the License, or (at your option) any later version.
8  */
9 #include <asm/processor.h>
10 #include <asm/ppc_asm.h>
11 #include <asm/export.h>
12
/*
 * memcpy entry point.
 *
 * As used by the code in this file: r3 = destination, r4 = source,
 * r5 = byte count.  Every return in this file leaves the original
 * destination pointer in r3.
 *
 * The feature-section macros are defined outside this file and bracket
 * code that is patched at runtime.  Judging by the _IFCLR suffix, the
 * first arm is kept when CPU_FTR_VMX_COPY is clear and the else arm is
 * used otherwise -- confirm against the macro definitions.  The else arm
 * is a branch to memcpy_power7, and is empty when SELFTEST is defined.
 */
13         .align  7
14 _GLOBAL_TOC(memcpy)
15 BEGIN_FTR_SECTION
16 #ifdef __LITTLE_ENDIAN__
           /* cr7 = count compared with 0; consumed by "beqlr cr7" in the byte loop below */
17         cmpdi   cr7,r5,0
18 #else
           /* Slot is addressed off r1; r1 itself is never adjusted in this file. */
19         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* save destination pointer for return value */
20 #endif
21 FTR_SECTION_ELSE
22 #ifndef SELFTEST
23         b       memcpy_power7
24 #endif
25 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
26 #ifdef __LITTLE_ENDIAN__
27         /* dumb little-endian memcpy that will get replaced at runtime */
           /*
            * Byte-at-a-time copy.  r9 (not r3) is the destination cursor, so r3
            * still holds the destination pointer when we return.  Both cursors
            * are biased by -1 because lbzu/stbu add 1 to the base register on
            * every access before using it.
            */
28         addi r9,r3,-1
29         addi r4,r4,-1
           /* count == 0 (cr7 was set by the cmpdi at entry): return before loading CTR */
30         beqlr cr7
           /* CTR = number of bytes to copy */
31         mtctr r5
32 1:      lbzu r10,1(r4)
33         stbu r10,1(r9)
34         bdnz 1b
35         blr
36 #else
           /*
            * Big-endian path.
            *
            * cr7 receives the low four bits of the length; the bits worth
            * 8, 4, 2 and 1 bytes are tested below as cr7*4+0 .. cr7*4+3.
            * cr1 holds the unsigned comparison of the length with 16.
            * The andi. also sets cr0, which the bne in the feature section uses.
            */
37         PPC_MTOCRF(0x01,r5)
38         cmpldi  cr1,r5,16
39         neg     r6,r3           # LS 3 bits = # bytes to 8-byte dest bdry
40         andi.   r6,r6,7
           /* cache-touch hint on the start of the source */
41         dcbt    0,r4
           /* fewer than 16 bytes: straight-line copy */
42         blt     cr1,.Lshort_copy
43 /* Below we want to nop out the bne if we're on a CPU that has the
44    CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
45    cleared.
46    At the time of writing the only CPU that has this combination of bits
47    set is Power6. */
48 BEGIN_FTR_SECTION
49         nop
50 FTR_SECTION_ELSE
51         bne     .Ldst_unaligned
52 ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
53                     CPU_FTR_UNALIGNED_LD_STD)
54 .Ldst_aligned:
           /*
            * Reached with the destination 8-byte aligned, or unconditionally
            * when the bne above has been patched to a nop.
            * r3 is biased by -16 to match the 8/16 displacements of the
            * std/stdu instructions in the loop.
            */
55         addi    r3,r3,-16
56 BEGIN_FTR_SECTION
           /* r0 = source offset within a doubleword; non-zero takes the shift-and-merge path */
57         andi.   r0,r4,7
58         bne     .Lsrc_unaligned
59 END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
           /*
            * r7 = length / 16 = iterations of the 16-byte loop (loaded into CTR).
            * r9 = first source doubleword; r4 is then biased by -8 so that
            * ldu ...,16(r4) steps through the source.
            * r5 = length & 7 (trailing bytes); the andi. leaves cr0.eq set when
            * there are none, and that result is still live at the beq after
            * the loop because nothing in between writes cr0.
            */
60         srdi    r7,r5,4
61         ld      r9,0(r4)
62         addi    r4,r4,-8
63         mtctr   r7
64         andi.   r5,r5,7
           /* 8-bit of the length clear: even number of doublewords, enter the loop at 2 */
65         bf      cr7*4+0,2f
           /* odd doubleword: shift both cursors by 8 and carry the loaded value in r8 */
66         addi    r3,r3,8
67         addi    r4,r4,8
68         mr      r8,r9
           /* cr1 "lt" means fewer than 16 bytes: only this one doubleword, skip the loop */
69         blt     cr1,3f
           /* two loads and two stores per iteration; each store trails its load */
70 1:      ld      r9,8(r4)
71         std     r8,8(r3)
72 2:      ldu     r8,16(r4)
73         stdu    r9,16(r3)
74         bdnz    1b
           /* store the last doubleword loaded */
75 3:      std     r8,8(r3)
           /* no trailing bytes (cr0 from the andi. above): go to the return at the next 3: */
76         beq     3f
           /* undo the destination bias so r3 points at the first tail byte */
77         addi    r3,r3,16
78 .Ldo_tail:
           /*
            * Copy the last 0-7 bytes as word / halfword / byte, selected by
            * the cr7 bits.  Loads use displacement 8 because r4 is still
            * biased by -8 relative to the next unread source byte.
            */
79         bf      cr7*4+1,1f
80         lwz     r9,8(r4)
81         addi    r4,r4,4
82         stw     r9,0(r3)
83         addi    r3,r3,4
84 1:      bf      cr7*4+2,2f
85         lhz     r9,8(r4)
86         addi    r4,r4,2
87         sth     r9,0(r3)
88         addi    r3,r3,2
89 2:      bf      cr7*4+3,3f
90         lbz     r9,8(r4)
91         stb     r9,0(r3)
92 3:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
93         blr
94
/*
 * Source is not 8-byte aligned: read aligned source doublewords and build
 * each destination doubleword as (s[i] << r10) | (s[i+1] >> r11).
 *
 * On entry (from .Ldst_aligned): r0 = source & 7 (non-zero), r3 is biased
 * by -16, r5 = length, cr7 = low four bits of the length.
 *
 * Set up here:
 *   r4  = source rounded down to an 8-byte boundary
 *   r10 = 8 * r0 (left-shift count in bits), r11 = 64 - r10 (right-shift count)
 *   r6  = length / 8, compared with 3 into cr6
 *   CTR = (length - 16) / 16
 *   r5  = (length & 7) + r0; cr0 (from the andi.) is "eq" when length & 7 == 0
 *         and stays live until the "beq 4f" after label 5
 */
95 .Lsrc_unaligned:
96         srdi    r6,r5,3
97         addi    r5,r5,-16
98         subf    r4,r0,r4
99         srdi    r7,r5,4
100         sldi    r10,r0,3
101         cmpdi   cr6,r6,3
102         andi.   r5,r5,7
103         mtctr   r7
104         subfic  r11,r10,64
105         add     r5,r5,r0
106
            /* 8-bit of the length set (odd number of doublewords): use the prologue at 0: */
107         bt      cr7*4+0,0f
108
109         ld      r9,0(r4)        # 3+2n loads, 2+2n stores
110         ld      r0,8(r4)
111         sld     r6,r9,r10
112         ldu     r9,16(r4)
113         srd     r7,r0,r11
114         sld     r8,r0,r10
115         or      r7,r7,r6
            /* fewer than 3 doublewords: skip the loop and store from label 4 */
116         blt     cr6,4f
117         ld      r0,8(r4)
118         # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
119         b       2f
120
121 0:      ld      r0,0(r4)        # 4+2n loads, 3+2n stores
122         ldu     r9,8(r4)
123         sld     r8,r0,r10
            /* shift the destination bias so the shared stores at 3:/4:/5: land correctly */
124         addi    r3,r3,-8
            /* fewer than 3 doublewords: only the store at label 5 is needed */
125         blt     cr6,5f
126         ld      r0,8(r4)
127         srd     r12,r9,r11
128         sld     r6,r9,r10
129         ldu     r9,16(r4)
130         or      r12,r8,r12
131         srd     r7,r0,r11
132         sld     r8,r0,r10
133         addi    r3,r3,16
            /* exactly 3 doublewords: no loop iterations, drain from label 3 */
134         beq     cr6,3f
135
136         # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
            /* main loop: two merged doublewords stored per iteration */
137 1:      or      r7,r7,r6
138         ld      r0,8(r4)
139         std     r12,8(r3)
140 2:      srd     r12,r9,r11
141         sld     r6,r9,r10
142         ldu     r9,16(r4)
143         or      r12,r8,r12
144         stdu    r7,16(r3)
145         srd     r7,r0,r11
146         sld     r8,r0,r10
147         bdnz    1b
148
            /* drain: store the doublewords still held in registers */
149 3:      std     r12,8(r3)
150         or      r7,r7,r6
151 4:      std     r7,16(r3)
152 5:      srd     r12,r9,r11
153         or      r12,r8,r12
154         std     r12,24(r3)
            /*
             * No trailing bytes (cr0 from the andi. near the top): done.  The
             * nearest following "4:" is the return sequence at the end of
             * .Lshort_copy.
             */
155         beq     4f
            /*
             * Trailing bytes.  r9 is left-justified so the unread bytes of the
             * last source doubleword sit at the top.  If (length & 7) + r0 > 8
             * they do not all fit there, so one more source doubleword is
             * loaded and merged in.
             */
156         cmpwi   cr1,r5,8
157         addi    r3,r3,32
158         sld     r9,r9,r10
159         ble     cr1,6f
160         ld      r0,8(r4)
161         srd     r7,r0,r11
162         or      r9,r7,r9
163 6:
            /*
             * Store 4 / 2 / 1 bytes as selected by cr7.  Each rotldi brings the
             * next bytes from the top of r9 down into the low end, where
             * stw/sth/stb take their data.
             */
164         bf      cr7*4+1,1f
165         rotldi  r9,r9,32
166         stw     r9,0(r3)
167         addi    r3,r3,4
168 1:      bf      cr7*4+2,2f
169         rotldi  r9,r9,16
170         sth     r9,0(r3)
171         addi    r3,r3,2
172 2:      bf      cr7*4+3,3f
173         rotldi  r9,r9,8
174         stb     r9,0(r3)
175 3:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
176         blr
177
/*
 * Destination is not 8-byte aligned.  Copy r6 bytes (1..7, computed at
 * entry as -dest & 7) as byte / halfword / word so the destination
 * reaches an 8-byte boundary, then rejoin .Ldst_aligned.
 *
 * r7 is the running offset into both buffers.  r5 becomes the remaining
 * length, and cr1 is recomputed against 16 because .Ldst_aligned tests it.
 */
178 .Ldst_unaligned:
179         PPC_MTOCRF(0x01,r6)             # put #bytes to 8B bdry into cr7
180         subf    r5,r6,r5
181         li      r7,0
182         cmpldi  cr1,r5,16
183         bf      cr7*4+3,1f
184         lbz     r0,0(r4)
185         stb     r0,0(r3)
186         addi    r7,r7,1
187 1:      bf      cr7*4+2,2f
188         lhzx    r0,r7,r4
189         sthx    r0,r7,r3
190         addi    r7,r7,2
191 2:      bf      cr7*4+1,3f
192         lwzx    r0,r7,r4
193         stwx    r0,r7,r3
            /* reload cr7 with the low bits of the remaining length, then advance both pointers by r6 */
194 3:      PPC_MTOCRF(0x01,r5)
195         add     r4,r6,r4
196         add     r3,r6,r3
197         b       .Ldst_aligned
198
/*
 * Length below 16 (branched to on cr1 "lt" at the top of the big-endian
 * path).  cr7 holds the low four bits of the length; copy 8 (as two
 * words), 4, 2 and 1 bytes for each bit that is set, advancing r3/r4.
 */
199 .Lshort_copy:
200         bf      cr7*4+0,1f
201         lwz     r0,0(r4)
202         lwz     r9,4(r4)
203         addi    r4,r4,8
204         stw     r0,0(r3)
205         stw     r9,4(r3)
206         addi    r3,r3,8
207 1:      bf      cr7*4+1,2f
208         lwz     r0,0(r4)
209         addi    r4,r4,4
210         stw     r0,0(r3)
211         addi    r3,r3,4
212 2:      bf      cr7*4+2,3f
213         lhz     r0,0(r4)
214         addi    r4,r4,2
215         sth     r0,0(r3)
216         addi    r3,r3,2
217 3:      bf      cr7*4+3,4f
218         lbz     r0,0(r4)
219         stb     r0,0(r3)
            /* Label 4 is also the target of the "beq 4f" in .Lsrc_unaligned. */
220 4:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
221         blr
222 #endif
223 EXPORT_SYMBOL(memcpy)