Skip to content

Commit b7d3a2b

Browse files
committed
[ARM] Mark i64 and f64 shuffles as Custom for MVE
This way they get lowered through ARMISD::BUILD_VECTOR, which can produce more efficient D-register moves. It also helps D115653 avoid getting stuck in a loop.
1 parent 3dff4f5 commit b7d3a2b

File tree

8 files changed

+248
-92
lines changed

8 files changed

+248
-92
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
392392
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
393393
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
394394
setOperationAction(ISD::VSELECT, VT, Legal);
395+
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
395396
}
396397
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
397398

llvm/test/CodeGen/Thumb2/mve-shuffle.ll

Lines changed: 185 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1474,6 +1474,189 @@ entry:
14741474
ret <2 x double> %out
14751475
}
14761476

1477+
define arm_aapcs_vfpcc <4 x double> @shuffle4_f64(<2 x double> %src1, <2 x double> %src2) {
1478+
; CHECK-LABEL: shuffle4_f64:
1479+
; CHECK: @ %bb.0: @ %entry
1480+
; CHECK-NEXT: vmov.f32 s8, s6
1481+
; CHECK-NEXT: vmov.f32 s6, s0
1482+
; CHECK-NEXT: vmov.f32 s9, s7
1483+
; CHECK-NEXT: vmov.f32 s7, s1
1484+
; CHECK-NEXT: vmov.f32 s10, s2
1485+
; CHECK-NEXT: vmov.f32 s11, s3
1486+
; CHECK-NEXT: vmov q0, q2
1487+
; CHECK-NEXT: bx lr
1488+
entry:
1489+
%out = shufflevector <2 x double> %src1, <2 x double> %src2, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
1490+
ret <4 x double> %out
1491+
}
1492+
define arm_aapcs_vfpcc <4 x double> @shuffle5_f64(<2 x double> %src1, <2 x double> %src2) {
1493+
; CHECK-LABEL: shuffle5_f64:
1494+
; CHECK: @ %bb.0: @ %entry
1495+
; CHECK-NEXT: vmov.f32 s8, s6
1496+
; CHECK-NEXT: vmov.f32 s10, s4
1497+
; CHECK-NEXT: vmov.f32 s4, s2
1498+
; CHECK-NEXT: vmov.f32 s6, s0
1499+
; CHECK-NEXT: vmov.f32 s9, s7
1500+
; CHECK-NEXT: vmov.f32 s11, s5
1501+
; CHECK-NEXT: vmov.f32 s5, s3
1502+
; CHECK-NEXT: vmov.f32 s7, s1
1503+
; CHECK-NEXT: vmov q0, q2
1504+
; CHECK-NEXT: bx lr
1505+
entry:
1506+
%out = shufflevector <2 x double> %src1, <2 x double> %src2, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1507+
ret <4 x double> %out
1508+
}
1509+
define arm_aapcs_vfpcc <2 x double> @shuffle6_f64(<2 x double> %src1, <2 x double> %src2) {
1510+
; CHECK-LABEL: shuffle6_f64:
1511+
; CHECK: @ %bb.0: @ %entry
1512+
; CHECK-NEXT: vmov.f32 s2, s6
1513+
; CHECK-NEXT: vmov.f32 s3, s7
1514+
; CHECK-NEXT: bx lr
1515+
entry:
1516+
%out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 0, i32 3>
1517+
ret <2 x double> %out
1518+
}
1519+
define arm_aapcs_vfpcc <2 x double> @shuffle7_f64(<2 x double> %src1, <2 x double> %src2) {
1520+
; CHECK-LABEL: shuffle7_f64:
1521+
; CHECK: @ %bb.0: @ %entry
1522+
; CHECK-NEXT: vmov.f32 s0, s6
1523+
; CHECK-NEXT: vmov.f32 s1, s7
1524+
; CHECK-NEXT: bx lr
1525+
entry:
1526+
%out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 3, i32 1>
1527+
ret <2 x double> %out
1528+
}
1529+
define arm_aapcs_vfpcc <2 x double> @shuffle8_f64(<2 x double> %src1, <2 x double> %src2) {
1530+
; CHECK-LABEL: shuffle8_f64:
1531+
; CHECK: @ %bb.0: @ %entry
1532+
; CHECK-NEXT: vmov.f32 s6, s2
1533+
; CHECK-NEXT: vmov.f32 s7, s3
1534+
; CHECK-NEXT: vmov q0, q1
1535+
; CHECK-NEXT: bx lr
1536+
entry:
1537+
%out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 2, i32 1>
1538+
ret <2 x double> %out
1539+
}
1540+
define arm_aapcs_vfpcc <8 x double> @shuffle9_f64(<4 x double> %src1, <4 x double> %src2) {
1541+
; CHECK-LABEL: shuffle9_f64:
1542+
; CHECK: @ %bb.0: @ %entry
1543+
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
1544+
; CHECK-NEXT: vpush {d8, d9, d10, d11}
1545+
; CHECK-NEXT: vmov q5, q2
1546+
; CHECK-NEXT: vmov.f32 s16, s0
1547+
; CHECK-NEXT: vmov.f32 s18, s20
1548+
; CHECK-NEXT: vmov.f32 s20, s2
1549+
; CHECK-NEXT: vmov.f32 s10, s12
1550+
; CHECK-NEXT: vmov.f32 s19, s21
1551+
; CHECK-NEXT: vmov.f32 s8, s4
1552+
; CHECK-NEXT: vmov.f32 s17, s1
1553+
; CHECK-NEXT: vmov.f32 s21, s3
1554+
; CHECK-NEXT: vmov q0, q4
1555+
; CHECK-NEXT: vmov.f32 s12, s6
1556+
; CHECK-NEXT: vmov.f32 s11, s13
1557+
; CHECK-NEXT: vmov.f32 s9, s5
1558+
; CHECK-NEXT: vmov.f32 s13, s7
1559+
; CHECK-NEXT: vmov q1, q5
1560+
; CHECK-NEXT: vpop {d8, d9, d10, d11}
1561+
; CHECK-NEXT: bx lr
1562+
entry:
1563+
%out = shufflevector <4 x double> %src1, <4 x double> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1564+
ret <8 x double> %out
1565+
}
1566+
1567+
1568+
1569+
1570+
define arm_aapcs_vfpcc <4 x i64> @shuffle4_i64(<2 x i64> %src1, <2 x i64> %src2) {
1571+
; CHECK-LABEL: shuffle4_i64:
1572+
; CHECK: @ %bb.0: @ %entry
1573+
; CHECK-NEXT: vmov.f32 s8, s6
1574+
; CHECK-NEXT: vmov.f32 s6, s0
1575+
; CHECK-NEXT: vmov.f32 s9, s7
1576+
; CHECK-NEXT: vmov.f32 s7, s1
1577+
; CHECK-NEXT: vmov.f32 s10, s2
1578+
; CHECK-NEXT: vmov.f32 s11, s3
1579+
; CHECK-NEXT: vmov q0, q2
1580+
; CHECK-NEXT: bx lr
1581+
entry:
1582+
%out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
1583+
ret <4 x i64> %out
1584+
}
1585+
define arm_aapcs_vfpcc <4 x i64> @shuffle5_i64(<2 x i64> %src1, <2 x i64> %src2) {
1586+
; CHECK-LABEL: shuffle5_i64:
1587+
; CHECK: @ %bb.0: @ %entry
1588+
; CHECK-NEXT: vmov.f32 s8, s6
1589+
; CHECK-NEXT: vmov.f32 s10, s4
1590+
; CHECK-NEXT: vmov.f32 s4, s2
1591+
; CHECK-NEXT: vmov.f32 s6, s0
1592+
; CHECK-NEXT: vmov.f32 s9, s7
1593+
; CHECK-NEXT: vmov.f32 s11, s5
1594+
; CHECK-NEXT: vmov.f32 s5, s3
1595+
; CHECK-NEXT: vmov.f32 s7, s1
1596+
; CHECK-NEXT: vmov q0, q2
1597+
; CHECK-NEXT: bx lr
1598+
entry:
1599+
%out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1600+
ret <4 x i64> %out
1601+
}
1602+
define arm_aapcs_vfpcc <2 x i64> @shuffle6_i64(<2 x i64> %src1, <2 x i64> %src2) {
1603+
; CHECK-LABEL: shuffle6_i64:
1604+
; CHECK: @ %bb.0: @ %entry
1605+
; CHECK-NEXT: vmov.f32 s2, s6
1606+
; CHECK-NEXT: vmov.f32 s3, s7
1607+
; CHECK-NEXT: bx lr
1608+
entry:
1609+
%out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 0, i32 3>
1610+
ret <2 x i64> %out
1611+
}
1612+
define arm_aapcs_vfpcc <2 x i64> @shuffle7_i64(<2 x i64> %src1, <2 x i64> %src2) {
1613+
; CHECK-LABEL: shuffle7_i64:
1614+
; CHECK: @ %bb.0: @ %entry
1615+
; CHECK-NEXT: vmov.f32 s0, s6
1616+
; CHECK-NEXT: vmov.f32 s1, s7
1617+
; CHECK-NEXT: bx lr
1618+
entry:
1619+
%out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 3, i32 1>
1620+
ret <2 x i64> %out
1621+
}
1622+
define arm_aapcs_vfpcc <2 x i64> @shuffle8_i64(<2 x i64> %src1, <2 x i64> %src2) {
1623+
; CHECK-LABEL: shuffle8_i64:
1624+
; CHECK: @ %bb.0: @ %entry
1625+
; CHECK-NEXT: vmov.f32 s6, s2
1626+
; CHECK-NEXT: vmov.f32 s7, s3
1627+
; CHECK-NEXT: vmov q0, q1
1628+
; CHECK-NEXT: bx lr
1629+
entry:
1630+
%out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 2, i32 1>
1631+
ret <2 x i64> %out
1632+
}
1633+
define arm_aapcs_vfpcc <8 x i64> @shuffle9_i64(<4 x i64> %src1, <4 x i64> %src2) {
1634+
; CHECK-LABEL: shuffle9_i64:
1635+
; CHECK: @ %bb.0: @ %entry
1636+
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
1637+
; CHECK-NEXT: vpush {d8, d9, d10, d11}
1638+
; CHECK-NEXT: vmov q5, q2
1639+
; CHECK-NEXT: vmov.f32 s16, s0
1640+
; CHECK-NEXT: vmov.f32 s18, s20
1641+
; CHECK-NEXT: vmov.f32 s20, s2
1642+
; CHECK-NEXT: vmov.f32 s10, s12
1643+
; CHECK-NEXT: vmov.f32 s19, s21
1644+
; CHECK-NEXT: vmov.f32 s8, s4
1645+
; CHECK-NEXT: vmov.f32 s17, s1
1646+
; CHECK-NEXT: vmov.f32 s21, s3
1647+
; CHECK-NEXT: vmov q0, q4
1648+
; CHECK-NEXT: vmov.f32 s12, s6
1649+
; CHECK-NEXT: vmov.f32 s11, s13
1650+
; CHECK-NEXT: vmov.f32 s9, s5
1651+
; CHECK-NEXT: vmov.f32 s13, s7
1652+
; CHECK-NEXT: vmov q1, q5
1653+
; CHECK-NEXT: vpop {d8, d9, d10, d11}
1654+
; CHECK-NEXT: bx lr
1655+
entry:
1656+
%out = shufflevector <4 x i64> %src1, <4 x i64> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1657+
ret <8 x i64> %out
1658+
}
1659+
14771660

14781661
define arm_aapcs_vfpcc <4 x i32> @insert_i32(i32 %a) {
14791662
; CHECK-LABEL: insert_i32:
@@ -1548,7 +1731,7 @@ define arm_aapcs_vfpcc i64 @scalar_to_vector_i32(<8 x i16> %v) {
15481731
; CHECK: @ %bb.0: @ %entry
15491732
; CHECK-NEXT: .pad #8
15501733
; CHECK-NEXT: sub sp, #8
1551-
; CHECK-NEXT: adr r2, .LCPI76_0
1734+
; CHECK-NEXT: adr r2, .LCPI88_0
15521735
; CHECK-NEXT: vmov.u16 r0, q0[0]
15531736
; CHECK-NEXT: vldrw.u32 q0, [r2]
15541737
; CHECK-NEXT: mov r1, sp
@@ -1558,7 +1741,7 @@ define arm_aapcs_vfpcc i64 @scalar_to_vector_i32(<8 x i16> %v) {
15581741
; CHECK-NEXT: bx lr
15591742
; CHECK-NEXT: .p2align 4
15601743
; CHECK-NEXT: @ %bb.1:
1561-
; CHECK-NEXT: .LCPI76_0:
1744+
; CHECK-NEXT: .LCPI88_0:
15621745
; CHECK-NEXT: .zero 4
15631746
; CHECK-NEXT: .long 7 @ 0x7
15641747
; CHECK-NEXT: .long 1 @ 0x1

llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,8 @@ define arm_aapcs_vfpcc void @vmovn64_b2(<2 x i64> %src1, <2 x i64> %src2, <2 x i
149149
; CHECK-LABEL: vmovn64_b2:
150150
; CHECK: @ %bb.0: @ %entry
151151
; CHECK-NEXT: vmov.f32 s4, s6
152-
; CHECK-NEXT: vmov.f32 s5, s7
153152
; CHECK-NEXT: vmov.f32 s6, s0
153+
; CHECK-NEXT: vmov.f32 s5, s7
154154
; CHECK-NEXT: vmov.f32 s7, s1
155155
; CHECK-NEXT: vstrw.32 q1, [r0]
156156
; CHECK-NEXT: bx lr
@@ -164,8 +164,8 @@ define arm_aapcs_vfpcc void @vmovn64_b3(<2 x i64> %src1, <2 x i64> %src2, <2 x i
164164
; CHECK-LABEL: vmovn64_b3:
165165
; CHECK: @ %bb.0: @ %entry
166166
; CHECK-NEXT: vmov.f32 s0, s2
167-
; CHECK-NEXT: vmov.f32 s1, s3
168167
; CHECK-NEXT: vmov.f32 s2, s4
168+
; CHECK-NEXT: vmov.f32 s1, s3
169169
; CHECK-NEXT: vmov.f32 s3, s5
170170
; CHECK-NEXT: vstrw.32 q0, [r0]
171171
; CHECK-NEXT: bx lr

llvm/test/CodeGen/Thumb2/mve-vst2-post.ll

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -72,17 +72,14 @@ entry:
7272
define <4 x i64> *@vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) {
7373
; CHECK-LABEL: vst2_v2i64:
7474
; CHECK: @ %bb.0: @ %entry
75-
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
76-
; CHECK-NEXT: vldrw.u32 q0, [r0]
77-
; CHECK-NEXT: add.w r0, r1, #32
78-
; CHECK-NEXT: vmov.f32 s8, s2
79-
; CHECK-NEXT: vmov.f32 s9, s3
80-
; CHECK-NEXT: vmov.f32 s2, s4
81-
; CHECK-NEXT: vmov.f32 s3, s5
82-
; CHECK-NEXT: vmov.f32 s10, s6
83-
; CHECK-NEXT: vstrb.8 q0, [r1], #16
84-
; CHECK-NEXT: vmov.f32 s11, s7
85-
; CHECK-NEXT: vstrw.32 q2, [r1]
75+
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
76+
; CHECK-NEXT: vldrw.u32 q1, [r0]
77+
; CHECK-NEXT: vmov.f64 d5, d0
78+
; CHECK-NEXT: vmov.f64 d0, d3
79+
; CHECK-NEXT: vmov.f64 d4, d2
80+
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
81+
; CHECK-NEXT: vstrw.32 q2, [r1], #32
82+
; CHECK-NEXT: mov r0, r1
8683
; CHECK-NEXT: bx lr
8784
entry:
8885
%s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0

llvm/test/CodeGen/Thumb2/mve-vst2.ll

Lines changed: 22 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -327,14 +327,11 @@ define void @vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) {
327327
; CHECK: @ %bb.0: @ %entry
328328
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
329329
; CHECK-NEXT: vldrw.u32 q1, [r0]
330-
; CHECK-NEXT: vmov.f32 s10, s0
331-
; CHECK-NEXT: vmov.f32 s11, s1
332-
; CHECK-NEXT: vmov.f32 s8, s4
333-
; CHECK-NEXT: vmov.f32 s9, s5
334-
; CHECK-NEXT: vmov.f32 s0, s6
335-
; CHECK-NEXT: vstrb.8 q2, [r1], #16
336-
; CHECK-NEXT: vmov.f32 s1, s7
337-
; CHECK-NEXT: vstrw.32 q0, [r1]
330+
; CHECK-NEXT: vmov.f64 d4, d3
331+
; CHECK-NEXT: vmov.f64 d5, d1
332+
; CHECK-NEXT: vmov.f64 d3, d0
333+
; CHECK-NEXT: vstrw.32 q2, [r1, #16]
334+
; CHECK-NEXT: vstrw.32 q1, [r1]
338335
; CHECK-NEXT: bx lr
339336
entry:
340337
%s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
@@ -349,29 +346,23 @@ entry:
349346
define void @vst2_v4i64(<4 x i64> *%src, <8 x i64> *%dst) {
350347
; CHECK-LABEL: vst2_v4i64:
351348
; CHECK: @ %bb.0: @ %entry
352-
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
353-
; CHECK-NEXT: vpush {d8, d9, d10, d11}
354-
; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
355-
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
356-
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
357-
; CHECK-NEXT: vldrw.u32 q1, [r0]
358-
; CHECK-NEXT: vmov.f32 s12, s2
359-
; CHECK-NEXT: vmov.f32 s13, s3
360-
; CHECK-NEXT: vmov.f32 s20, s6
361-
; CHECK-NEXT: vmov.f32 s21, s7
362-
; CHECK-NEXT: vmov.f32 s2, s16
363-
; CHECK-NEXT: vmov.f32 s3, s17
364-
; CHECK-NEXT: vmov.f32 s6, s8
365-
; CHECK-NEXT: vstrw.32 q0, [r1, #32]
366-
; CHECK-NEXT: vmov.f32 s7, s9
367-
; CHECK-NEXT: vmov.f32 s14, s18
368-
; CHECK-NEXT: vstrb.8 q1, [r1], #48
369-
; CHECK-NEXT: vmov.f32 s15, s19
370-
; CHECK-NEXT: vmov.f32 s22, s10
371-
; CHECK-NEXT: vstrw.32 q3, [r1]
372-
; CHECK-NEXT: vmov.f32 s23, s11
373-
; CHECK-NEXT: vstrw.32 q5, [r1, #-32]
374-
; CHECK-NEXT: vpop {d8, d9, d10, d11}
349+
; CHECK-NEXT: .vsave {d8, d9}
350+
; CHECK-NEXT: vpush {d8, d9}
351+
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
352+
; CHECK-NEXT: vldrw.u32 q2, [r0]
353+
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
354+
; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
355+
; CHECK-NEXT: vmov.f64 d8, d4
356+
; CHECK-NEXT: vmov.f64 d9, d0
357+
; CHECK-NEXT: vmov.f64 d0, d5
358+
; CHECK-NEXT: vstrw.32 q4, [r1]
359+
; CHECK-NEXT: vmov.f64 d5, d2
360+
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
361+
; CHECK-NEXT: vmov.f64 d4, d6
362+
; CHECK-NEXT: vmov.f64 d2, d7
363+
; CHECK-NEXT: vstrw.32 q2, [r1, #32]
364+
; CHECK-NEXT: vstrw.32 q1, [r1, #48]
365+
; CHECK-NEXT: vpop {d8, d9}
375366
; CHECK-NEXT: bx lr
376367
entry:
377368
%s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0

llvm/test/CodeGen/Thumb2/mve-vst3.ll

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -916,20 +916,16 @@ entry:
916916
define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) {
917917
; CHECK-LABEL: vst3_v2i64:
918918
; CHECK: @ %bb.0: @ %entry
919-
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
919+
; CHECK-NEXT: vldrw.u32 q0, [r0]
920+
; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
920921
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
921-
; CHECK-NEXT: vldrw.u32 q1, [r0]
922-
; CHECK-NEXT: vmov.f32 s14, s2
923-
; CHECK-NEXT: vmov.f32 s15, s3
924-
; CHECK-NEXT: vmov.f32 s2, s6
925-
; CHECK-NEXT: vmov.f32 s3, s7
926-
; CHECK-NEXT: vmov.f32 s6, s8
927-
; CHECK-NEXT: vmov.f32 s7, s9
928-
; CHECK-NEXT: vstrb.8 q1, [r1], #32
929-
; CHECK-NEXT: vmov.f32 s12, s10
930-
; CHECK-NEXT: vmov.f32 s13, s11
931-
; CHECK-NEXT: vstrw.32 q0, [r1, #-16]
932-
; CHECK-NEXT: vstrw.32 q3, [r1]
922+
; CHECK-NEXT: vmov.f64 d6, d2
923+
; CHECK-NEXT: vmov.f64 d7, d1
924+
; CHECK-NEXT: vmov.f64 d1, d4
925+
; CHECK-NEXT: vstrw.32 q3, [r1, #16]
926+
; CHECK-NEXT: vmov.f64 d2, d5
927+
; CHECK-NEXT: vstrw.32 q0, [r1]
928+
; CHECK-NEXT: vstrw.32 q1, [r1, #32]
933929
; CHECK-NEXT: bx lr
934930
entry:
935931
%s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0

llvm/test/CodeGen/Thumb2/mve-vst4-post.ll

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -105,25 +105,19 @@ define <8 x i64> *@vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) {
105105
; CHECK-NEXT: .vsave {d8, d9}
106106
; CHECK-NEXT: vpush {d8, d9}
107107
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
108-
; CHECK-NEXT: vldrw.u32 q4, [r0]
109-
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
110-
; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
111-
; CHECK-NEXT: vmov.f32 s8, s16
112-
; CHECK-NEXT: vmov.f32 s9, s17
113-
; CHECK-NEXT: vmov.f32 s10, s0
114-
; CHECK-NEXT: vmov.f32 s11, s1
115-
; CHECK-NEXT: vmov.f32 s0, s18
116-
; CHECK-NEXT: vmov.f32 s1, s19
117-
; CHECK-NEXT: vmov.f32 s18, s4
108+
; CHECK-NEXT: vldrw.u32 q3, [r0]
109+
; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
110+
; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
111+
; CHECK-NEXT: vmov.f64 d2, d6
112+
; CHECK-NEXT: vmov.f64 d3, d0
113+
; CHECK-NEXT: vmov.f64 d0, d7
114+
; CHECK-NEXT: vmov.f64 d7, d4
118115
; CHECK-NEXT: vstrw.32 q0, [r1, #32]
119-
; CHECK-NEXT: vmov.f32 s19, s5
120-
; CHECK-NEXT: vmov.f32 s16, s12
121-
; CHECK-NEXT: vmov.f32 s17, s13
122-
; CHECK-NEXT: vmov.f32 s4, s14
123-
; CHECK-NEXT: vstrw.32 q4, [r1, #16]
124-
; CHECK-NEXT: vmov.f32 s5, s15
125-
; CHECK-NEXT: vstrw.32 q1, [r1, #48]
126-
; CHECK-NEXT: vstrw.32 q2, [r1], #64
116+
; CHECK-NEXT: vmov.f64 d6, d8
117+
; CHECK-NEXT: vmov.f64 d4, d9
118+
; CHECK-NEXT: vstrw.32 q3, [r1, #16]
119+
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
120+
; CHECK-NEXT: vstrw.32 q1, [r1], #64
127121
; CHECK-NEXT: mov r0, r1
128122
; CHECK-NEXT: vpop {d8, d9}
129123
; CHECK-NEXT: bx lr

0 commit comments

Comments
 (0)