
Commit 0cb1019

Authored and committed by Jim Grosbach
Legalize vector truncates by parts rather than just splitting.
Rather than just splitting the input type and hoping for the best, apply a bit
more cleverness. Just splitting the types until the source is legal often
leads to an illegal result type, which is then widened, and a scalarization
step is introduced, which leads to truly horrible code generation. With the
loop vectorizer, these sorts of operations are much more common, so it's worth
the extra effort to do them well.

Add a legalization hook for the operands of a TRUNCATE node, which will be
encountered after the result type has been legalized but while the operand
type is still illegal. If simple splitting of both types ends up with the
result type of each half still being legal, just do that (v16i16 -> v16i8 on
ARM, for example). If, however, that would result in an illegal result type
(v8i32 -> v8i8 on ARM, for example), we can get more clever with power-of-two
vectors. Specifically, split the input type, but also widen the result element
size, then concatenate the halves and truncate again. For example, on ARM, to
perform a "%res = v8i8 trunc v8i32 %in" we transform to:
  %inlo = v4i32 extract_subvector %in, 0
  %inhi = v4i32 extract_subvector %in, 4
  %lo16 = v4i16 trunc v4i32 %inlo
  %hi16 = v4i16 trunc v4i32 %inhi
  %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16
  %res = v8i8 trunc v8i16 %in16

This allows instruction selection to generate three VMOVN instructions instead
of a sequence of moves, stores, and loads.

Update the ARMTargetTransformInfo to take this improved legalization into
account.

Consider the simplified IR:

define <16 x i8> @test1(<16 x i32>* %ap) {
  %a = load <16 x i32>* %ap
  %tmp = trunc <16 x i32> %a to <16 x i8>
  ret <16 x i8> %tmp
}

define <8 x i8> @test2(<8 x i32>* %ap) {
  %a = load <8 x i32>* %ap
  %tmp = trunc <8 x i32> %a to <8 x i8>
  ret <8 x i8> %tmp
}

Previously, we would generate the truly hideous:

    .syntax unified
    .section __TEXT,__text,regular,pure_instructions
    .globl _test1
    .align 2
_test1:                                 @ @test1
@ BB#0:
    push {r7}
    mov r7, sp
    sub sp, sp, #20
    bic sp, sp, #7
    add r1, r0, #48
    add r2, r0, #32
    vld1.64 {d24, d25}, [r0:128]
    vld1.64 {d16, d17}, [r1:128]
    vld1.64 {d18, d19}, [r2:128]
    add r1, r0, #16
    vmovn.i32 d22, q8
    vld1.64 {d16, d17}, [r1:128]
    vmovn.i32 d20, q9
    vmovn.i32 d18, q12
    vmov.u16 r0, d22[3]
    strb r0, [sp, #15]
    vmov.u16 r0, d22[2]
    strb r0, [sp, #14]
    vmov.u16 r0, d22[1]
    strb r0, [sp, #13]
    vmov.u16 r0, d22[0]
    vmovn.i32 d16, q8
    strb r0, [sp, #12]
    vmov.u16 r0, d20[3]
    strb r0, [sp, #11]
    vmov.u16 r0, d20[2]
    strb r0, [sp, #10]
    vmov.u16 r0, d20[1]
    strb r0, [sp, #9]
    vmov.u16 r0, d20[0]
    strb r0, [sp, #8]
    vmov.u16 r0, d18[3]
    strb r0, [sp, #3]
    vmov.u16 r0, d18[2]
    strb r0, [sp, #2]
    vmov.u16 r0, d18[1]
    strb r0, [sp, #1]
    vmov.u16 r0, d18[0]
    strb r0, [sp]
    vmov.u16 r0, d16[3]
    strb r0, [sp, #7]
    vmov.u16 r0, d16[2]
    strb r0, [sp, #6]
    vmov.u16 r0, d16[1]
    strb r0, [sp, #5]
    vmov.u16 r0, d16[0]
    strb r0, [sp, #4]
    vldmia sp, {d16, d17}
    vmov r0, r1, d16
    vmov r2, r3, d17
    mov sp, r7
    pop {r7}
    bx lr

    .globl _test2
    .align 2
_test2:                                 @ @test2
@ BB#0:
    push {r7}
    mov r7, sp
    sub sp, sp, #12
    bic sp, sp, #7
    vld1.64 {d16, d17}, [r0:128]
    add r0, r0, #16
    vld1.64 {d20, d21}, [r0:128]
    vmovn.i32 d18, q8
    vmov.u16 r0, d18[3]
    vmovn.i32 d16, q10
    strb r0, [sp, #3]
    vmov.u16 r0, d18[2]
    strb r0, [sp, #2]
    vmov.u16 r0, d18[1]
    strb r0, [sp, #1]
    vmov.u16 r0, d18[0]
    strb r0, [sp]
    vmov.u16 r0, d16[3]
    strb r0, [sp, #7]
    vmov.u16 r0, d16[2]
    strb r0, [sp, #6]
    vmov.u16 r0, d16[1]
    strb r0, [sp, #5]
    vmov.u16 r0, d16[0]
    strb r0, [sp, #4]
    ldm sp, {r0, r1}
    mov sp, r7
    pop {r7}
    bx lr

Now, however, we generate the much more straightforward:

    .syntax unified
    .section __TEXT,__text,regular,pure_instructions
    .globl _test1
    .align 2
_test1:                                 @ @test1
@ BB#0:
    add r1, r0, #48
    add r2, r0, #32
    vld1.64 {d20, d21}, [r0:128]
    vld1.64 {d16, d17}, [r1:128]
    add r1, r0, #16
    vld1.64 {d18, d19}, [r2:128]
    vld1.64 {d22, d23}, [r1:128]
    vmovn.i32 d17, q8
    vmovn.i32 d16, q9
    vmovn.i32 d18, q10
    vmovn.i32 d19, q11
    vmovn.i16 d17, q8
    vmovn.i16 d16, q9
    vmov r0, r1, d16
    vmov r2, r3, d17
    bx lr

    .globl _test2
    .align 2
_test2:                                 @ @test2
@ BB#0:
    vld1.64 {d16, d17}, [r0:128]
    add r0, r0, #16
    vld1.64 {d18, d19}, [r0:128]
    vmovn.i32 d16, q8
    vmovn.i32 d17, q9
    vmovn.i16 d16, q8
    vmov r0, r1, d16
    bx lr

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179989 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 5eabdf2 commit 0cb1019
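For readers who want the shape of the transform without the SelectionDAG machinery, here is a minimal standalone C++ sketch of the same idea on plain arrays of fixed-width lanes. The function and type names are hypothetical and only model the split / narrow / concat / final-truncate steps from the commit message; they are not LLVM APIs.

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical model of the by-parts truncate for v8i32 -> v8i8: a "vector"
// is just a std::vector of fixed-width lanes. Split the input in half,
// truncate each half to the intermediate 16-bit element type, concatenate,
// then truncate once more to the final 8-bit element type.
static std::vector<uint8_t> truncByParts(const std::vector<uint32_t> &In) {
  assert(In.size() % 2 == 0 && "Splitting vector, but not in half!");
  size_t Half = In.size() / 2;

  // %inlo / %inhi (extract_subvector), each truncated to 16-bit lanes.
  std::vector<uint16_t> Lo16, Hi16;
  for (size_t i = 0; i < Half; ++i)
    Lo16.push_back(static_cast<uint16_t>(In[i]));
  for (size_t i = Half; i < In.size(); ++i)
    Hi16.push_back(static_cast<uint16_t>(In[i]));

  // %in16: concat_vectors of the two truncated halves.
  std::vector<uint16_t> In16(Lo16);
  In16.insert(In16.end(), Hi16.begin(), Hi16.end());

  // %res: final trunc from 16-bit lanes down to 8-bit lanes.
  std::vector<uint8_t> Res;
  for (uint16_t V : In16)
    Res.push_back(static_cast<uint8_t>(V));
  return Res;
}

int main() {
  std::vector<uint32_t> In = {0x101, 0x202, 0x303, 0x404,
                              0x505, 0x606, 0x707, 0x808};
  std::vector<uint8_t> Res = truncByParts(In);
  assert(Res.size() == 8 && Res[0] == 0x01 && Res[7] == 0x08);
  return 0;
}

Each of the three narrowing stages in this sketch corresponds to one VMOVN step in the improved ARM output above: two vmovn.i32 for the halves and one vmovn.i16 for the final truncate.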

File tree

5 files changed: +81, -38 lines

lib/CodeGen/SelectionDAG/LegalizeTypes.h

Lines changed: 1 addition & 0 deletions
@@ -581,6 +581,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
+  SDValue SplitVecOp_TRUNCATE(SDNode *N);
   SDValue SplitVecOp_VSETCC(SDNode *N);
   SDValue SplitVecOp_FP_ROUND(SDNode *N);

lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

Lines changed: 61 additions & 1 deletion
@@ -1046,6 +1046,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
   case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
   case ISD::CONCAT_VECTORS:    Res = SplitVecOp_CONCAT_VECTORS(N); break;
+  case ISD::TRUNCATE:          Res = SplitVecOp_TRUNCATE(N); break;
   case ISD::FP_ROUND:          Res = SplitVecOp_FP_ROUND(N); break;
   case ISD::STORE:
     Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
@@ -1062,7 +1063,6 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
   case ISD::FTRUNC:
-  case ISD::TRUNCATE:
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND:
@@ -1293,6 +1293,66 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
                      &Elts[0], Elts.size());
 }
 
+SDValue DAGTypeLegalizer::SplitVecOp_TRUNCATE(SDNode *N) {
+  // The result type is legal, but the input type is illegal. If splitting
+  // ends up with the result type of each half still being legal, just
+  // do that. If, however, that would result in an illegal result type,
+  // we can try to get more clever with power-two vectors. Specifically,
+  // split the input type, but also widen the result element size, then
+  // concatenate the halves and truncate again. For example, consider a target
+  // where v8i8 is legal and v8i32 is not (ARM, which doesn't have 256-bit
+  // vectors). To perform a "%res = v8i8 trunc v8i32 %in" we do:
+  //   %inlo = v4i32 extract_subvector %in, 0
+  //   %inhi = v4i32 extract_subvector %in, 4
+  //   %lo16 = v4i16 trunc v4i32 %inlo
+  //   %hi16 = v4i16 trunc v4i32 %inhi
+  //   %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16
+  //   %res = v8i8 trunc v8i16 %in16
+  //
+  // Without this transform, the original truncate would end up being
+  // scalarized, which is pretty much always a last resort.
+  SDValue InVec = N->getOperand(0);
+  EVT InVT = InVec->getValueType(0);
+  EVT OutVT = N->getValueType(0);
+  unsigned NumElements = OutVT.getVectorNumElements();
+  // Widening should have already made sure this is a power-two vector
+  // if we're trying to split it at all. assert() that's true, just in case.
+  assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+
+  unsigned InElementSize = InVT.getVectorElementType().getSizeInBits();
+  unsigned OutElementSize = OutVT.getVectorElementType().getSizeInBits();
+
+  // If the input elements are only 1/2 the width of the result elements,
+  // just use the normal splitting. Our trick only work if there's room
+  // to split more than once.
+  if (InElementSize <= OutElementSize * 2)
+    return SplitVecOp_UnaryOp(N);
+  DebugLoc DL = N->getDebugLoc();
+
+  // Extract the halves of the input via extract_subvector.
+  EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
+                                 InVT.getVectorElementType(), NumElements/2);
+  SDValue InLoVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
+                                DAG.getIntPtrConstant(0));
+  SDValue InHiVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
+                                DAG.getIntPtrConstant(NumElements/2));
+  // Truncate them to 1/2 the element size.
+  EVT HalfElementVT = EVT::getIntegerVT(*DAG.getContext(), InElementSize/2);
+  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT,
+                                NumElements/2);
+  SDValue HalfLo = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, InLoVec);
+  SDValue HalfHi = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, InHiVec);
+  // Concatenate them to get the full intermediate truncation result.
+  EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements);
+  SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo,
+                                 HalfHi);
+  // Now finish up by truncating all the way down to the original result
+  // type. This should normally be something that ends up being legal directly,
+  // but in theory if a target has very wide vectors and an annoyingly
+  // restricted set of legal types, this split can chain to build things up.
+  return DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec);
+}
+
 SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
   assert(N->getValueType(0).isVector() &&
          N->getOperand(0).getValueType().isVector() &&
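A note on the early-out in the new function: the by-parts path is only taken when the input elements are more than twice as wide as the result elements; otherwise plain splitting already yields halves with a legal result type. A tiny illustrative check of that decision (hypothetical helper, not LLVM code), using the type widths from the commit message's ARM examples:

#include <cassert>

// Illustrative helper (not LLVM code): the by-parts path in
// SplitVecOp_TRUNCATE is only taken when the input elements are more than
// twice as wide as the result elements; otherwise plain splitting already
// produces halves with a legal result type.
static bool needsTruncateByParts(unsigned InElementSize,
                                 unsigned OutElementSize) {
  return InElementSize > OutElementSize * 2;
}

int main() {
  assert(!needsTruncateByParts(16, 8)); // v16i16 -> v16i8: simple split is enough
  assert(needsTruncateByParts(32, 8));  // v8i32 -> v8i8: split, narrow, concat, trunc
  return 0;
}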

lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 3 additions & 3 deletions
@@ -223,9 +223,9 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
 
-    // Operations that we legalize using load/stores to the stack.
-    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4*1 + 16*2 + 2*1 },
-    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2*1 + 8*2 + 1 },
+    // Operations that we legalize using splitting.
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
+    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
 
     // Vector float <-> i32 conversions.
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
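To make the table change concrete: the removed entries priced the old stack-based lowering via the arithmetic expressions, while the new entries simply count the vmovn instructions in the split-based sequence shown in the commit message. A small sanity check of that arithmetic, using only the values visible in this diff:

#include <cassert>

int main() {
  // Old stack-based cost formulas from the removed table entries; they match
  // the old "cost of 38" and "cost of 19" values in the updated tests.
  assert(4*1 + 16*2 + 2*1 == 38); // v16i32 -> v16i8
  assert(2*1 + 8*2 + 1 == 19);    // v8i32  -> v8i8
  // New costs count the vmovn sequence in the improved code generation:
  // 4 x vmovn.i32 + 2 x vmovn.i16 for v16i32 -> v16i8, and 2 + 1 for
  // v8i32 -> v8i8.
  assert(4 + 2 == 6);
  assert(2 + 1 == 3);
  return 0;
}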

test/Analysis/CostModel/ARM/cast.ll

Lines changed: 2 additions & 2 deletions
@@ -175,9 +175,9 @@ define i32 @casts() {
   %rext_5 = zext <4 x i16> undef to <4 x i64>
 
   ; Vector cast cost of instructions lowering the cast to the stack.
-  ; CHECK: cost of 19 {{.*}} trunc
+  ; CHECK: cost of 3 {{.*}} trunc
   %r74 = trunc <8 x i32> undef to <8 x i8>
-  ; CHECK: cost of 38 {{.*}} trunc
+  ; CHECK: cost of 6 {{.*}} trunc
   %r75 = trunc <16 x i32> undef to <16 x i8>
 
   ; Floating point truncation costs.

test/CodeGen/ARM/vcvt-cost.ll

Lines changed: 14 additions & 32 deletions
@@ -32,29 +32,22 @@ define void @func_cvt1(%TA0_5* %loadaddr, %TA1_5* %storeaddr) {
   store %TA1_5 %r, %TA1_5* %storeaddr
   ret void
 }
-;; We currently estimate the cost of this instruction as expensive. If lowering
-;; is improved the cost needs to change.
+
 %T0_51 = type <8 x i32>
 %T1_51 = type <8 x i8>
 ; CHECK: func_cvt51:
 define void @func_cvt51(%T0_51* %loadaddr, %T1_51* %storeaddr) {
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i16
   %v0 = load %T0_51* %loadaddr
 ; COST: func_cvt51
-; COST: cost of 19 {{.*}} trunc
+; COST: cost of 3 {{.*}} trunc
   %r = trunc %T0_51 %v0 to %T1_51
   store %T1_51 %r, %T1_51* %storeaddr
   ret void
 }
-;; We currently estimate the cost of this instruction as expensive. If lowering
-;; is improved the cost needs to change.
+
 %TT0_5 = type <16 x i8>
 %TT1_5 = type <16 x i32>
 ; CHECK: func_cvt52:
@@ -87,31 +80,20 @@ define void @func_cvt12(%TTA0_5* %loadaddr, %TTA1_5* %storeaddr) {
   store %TTA1_5 %r, %TTA1_5* %storeaddr
   ret void
 }
-;; We currently estimate the cost of this instruction as expensive. If lowering
-;; is improved the cost needs to change.
+
 %TT0_51 = type <16 x i32>
 %TT1_51 = type <16 x i8>
 ; CHECK: func_cvt512:
 define void @func_cvt512(%TT0_51* %loadaddr, %TT1_51* %storeaddr) {
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i16
+; CHECK: vmovn.i16
   %v0 = load %TT0_51* %loadaddr
 ; COST: func_cvt512
-; COST: cost of 38 {{.*}} trunc
+; COST: cost of 6 {{.*}} trunc
   %r = trunc %TT0_51 %v0 to %TT1_51
   store %TT1_51 %r, %TT1_51* %storeaddr
   ret void
