-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathasm_vecAdd_sse.s
67 lines (50 loc) · 1015 Bytes
/
asm_vecAdd_sse.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
// +build sse
// +build amd64
#include "textflag.h"
// func addAsm(a, b []float32)
//
// addAsm adds b to a element-wise and in place: a[i] += b[i] for every
// i < min(len(a), len(b)). b is only read; nothing is returned.
// Elements of a beyond len(b) (if any) are left untouched.
//
// Argument frame (two slice headers, 48 bytes, no results):
//	a_base+0(FP)   a_len+8(FP)   a_cap+16(FP)
//	b_base+24(FP)  b_len+32(FP)  b_cap+40(FP)
//
// Register roles:
//	SI     pointer to the next unprocessed element of a (source and destination)
//	DI     pointer to the next unprocessed element of b
//	AX     number of elements still to process (biased by -16 while in the wide loop)
//	CX     len(b), used only to clamp AX
//	X0-X7  scratch for the packed/scalar adds
TEXT ·addAsm(SB), NOSPLIT, $0-48
	MOVQ	a_base+0(FP), SI
	MOVQ	b_base+24(FP), DI
	MOVQ	a_len+8(FP), AX
	MOVQ	b_len+32(FP), CX

	// Clamp the element count to the shorter slice so that b is never
	// read past its length when len(b) < len(a).
	CMPQ	CX, AX
	CMOVQLT	CX, AX			// if len(b) < len(a) { AX = len(b) }

	// Bias the counter by one full wide iteration (16 elements). While AX
	// stays >= 0 there are at least 16 elements left for the wide loop.
	SUBQ	$16, AX
	JL	remainder

loop:
	// 16 float32 per iteration: 4 XMM registers pairs x 4 lanes each.
	// MOVUPS is the unaligned move, so no particular alignment of the
	// slice data is required.
	MOVUPS	(SI), X0		// a[0:4]
	MOVUPS	(DI), X1		// b[0:4]
	ADDPS	X0, X1			// X1 = b[0:4] + a[0:4]
	MOVUPS	X1, (SI)

	MOVUPS	16(SI), X2		// a[4:8]
	MOVUPS	16(DI), X3		// b[4:8]
	ADDPS	X2, X3
	MOVUPS	X3, 16(SI)

	MOVUPS	32(SI), X4		// a[8:12]
	MOVUPS	32(DI), X5		// b[8:12]
	ADDPS	X4, X5
	MOVUPS	X5, 32(SI)

	MOVUPS	48(SI), X6		// a[12:16]
	MOVUPS	48(DI), X7		// b[12:16]
	ADDPS	X6, X7
	MOVUPS	X7, 48(SI)

	// Advance both pointers by 16 elements * 4 bytes = 64 bytes.
	ADDQ	$64, SI
	ADDQ	$64, DI

	// 16 fewer elements remain; keep going while a full group is left.
	SUBQ	$16, AX
	JGE	loop

remainder:
	// Undo the -16 bias: AX is now the true leftover count, 0..15.
	ADDQ	$16, AX
	JE	done

remainderloop:
	// Scalar tail: one float32 per iteration.
	MOVSS	(SI), X0
	MOVSS	(DI), X1
	ADDSS	X0, X1			// X1 = b[i] + a[i]
	MOVSS	X1, (SI)

	// Step both pointers to the next element (4 bytes).
	ADDQ	$4, SI
	ADDQ	$4, DI
	DECQ	AX
	JNE	remainderloop

done:
	RET