Skip to content

Commit 414ff81

Browse files
committed
RegisterCoalescer: Add implicit-def of super register when coalescing SUBREG_TO_REG
Currently, coalescing with SUBREG_TO_REG introduces an invisible load-bearing undef. There is liveness for the super register that is not represented in the MIR. This is part 1 of a fix for regressions that appeared after b7836d8. The allocator started recognizing undef-def subregister MOVs as copies. Since there was no representation for the dependency on the high bits, different undef segments of the super register ended up disconnected and downstream users ended up observing different undefs than they did previously. This does not yet fix the regression. The isCopyInstr handling needs to start handling implicit-defs on any instruction. I wanted to include an end-to-end IR test since the actual failure only appeared with an interaction between the coalescer and the allocator. It's a bit bigger than I'd like, but I'm having a bit of trouble reducing it to something which definitely shows a diff that's meaningful. The same problem likely exists everywhere trying to do anything with SUBREG_TO_REG. I don't understand how this managed to be broken for so long. This needs to be applied to the release branch. https://reviews.llvm.org/D156345
1 parent f906fd5 commit 414ff81

5 files changed

+623
-10
lines changed

llvm/lib/CodeGen/RegisterCoalescer.cpp

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,11 @@ namespace {
305305
/// number if it is not zero. If DstReg is a physical register and the
306306
/// existing subregister number of the def / use being updated is not zero,
307307
/// make sure to set it to the correct physical subregister.
308-
void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx);
308+
///
309+
/// If \p IsSubregToReg, we are coalescing a DstReg = SUBREG_TO_REG
310+
/// SrcReg. This introduces an implicit-def of DstReg on coalesced users.
311+
void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx,
312+
bool IsSubregToReg);
309313

310314
/// If the given machine operand reads only undefined lanes add an undef
311315
/// flag.
@@ -1323,8 +1327,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
13231327
if (DstReg.isPhysical()) {
13241328
Register NewDstReg = DstReg;
13251329

1326-
unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(),
1327-
DefMI->getOperand(0).getSubReg());
1330+
unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(), DefSubIdx);
13281331
if (NewDstIdx)
13291332
NewDstReg = TRI->getSubReg(DstReg, NewDstIdx);
13301333

@@ -1467,7 +1470,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
14671470
MRI->setRegClass(DstReg, NewRC);
14681471

14691472
// Update machine operands and add flags.
1470-
updateRegDefsUses(DstReg, DstReg, DstIdx);
1473+
updateRegDefsUses(DstReg, DstReg, DstIdx, false);
14711474
NewMI.getOperand(0).setSubReg(NewIdx);
14721475
// updateRegDefUses can add an "undef" flag to the definition, since
14731476
// it will replace DstReg with DstReg.DstIdx. If NewIdx is 0, make
@@ -1782,7 +1785,7 @@ void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
17821785
}
17831786

17841787
void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
1785-
unsigned SubIdx) {
1788+
unsigned SubIdx, bool IsSubregToReg) {
17861789
bool DstIsPhys = DstReg.isPhysical();
17871790
LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg);
17881791

@@ -1822,16 +1825,22 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
18221825
if (DstInt && !Reads && SubIdx && !UseMI->isDebugInstr())
18231826
Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));
18241827

1828+
bool FullDef = true;
1829+
18251830
// Replace SrcReg with DstReg in all UseMI operands.
18261831
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
18271832
MachineOperand &MO = UseMI->getOperand(Ops[i]);
18281833

18291834
// Adjust <undef> flags in case of sub-register joins. We don't want to
18301835
// turn a full def into a read-modify-write sub-register def and vice
18311836
// versa.
1832-
if (SubIdx && MO.isDef())
1837+
if (SubIdx && MO.isDef()) {
18331838
MO.setIsUndef(!Reads);
18341839

1840+
if (!Reads)
1841+
FullDef = false;
1842+
}
1843+
18351844
// A subreg use of a partially undef (super) register may be a complete
18361845
// undef use now and then has to be marked that way.
18371846
if (MO.isUse() && !DstIsPhys) {
@@ -1863,6 +1872,25 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
18631872
MO.substVirtReg(DstReg, SubIdx, *TRI);
18641873
}
18651874

1875+
if (IsSubregToReg && !FullDef) {
1876+
// If the coalesced instruction doesn't fully define the register, we need
1877+
// to preserve the original super register liveness for SUBREG_TO_REG.
1878+
//
1879+
// We pretended SUBREG_TO_REG was a regular copy for coalescing purposes,
1880+
// but it introduces liveness for other subregisters. Downstream users may
1881+
// have been relying on those bits, so we need to ensure their liveness is
1882+
// captured with a def of other lanes.
1883+
1884+
// FIXME: Need to add new subrange if tracking subranges. We could also
1885+
// skip adding this if we knew the other lanes are dead, and only for
1886+
// other lanes.
1887+
1888+
assert(!MRI->shouldTrackSubRegLiveness(DstReg) &&
1889+
"this should update subranges");
1890+
MachineInstrBuilder MIB(*MF, UseMI);
1891+
MIB.addReg(DstReg, RegState::ImplicitDefine);
1892+
}
1893+
18661894
LLVM_DEBUG({
18671895
dbgs() << "\t\tupdated: ";
18681896
if (!UseMI->isDebugInstr())
@@ -2062,6 +2090,8 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
20622090
});
20632091
}
20642092

2093+
const bool IsSubregToReg = CopyMI->isSubregToReg();
2094+
20652095
ShrinkMask = LaneBitmask::getNone();
20662096
ShrinkMainRange = false;
20672097

@@ -2129,9 +2159,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
21292159

21302160
// Rewrite all SrcReg operands to DstReg.
21312161
// Also update DstReg operands to include DstIdx if it is set.
2132-
if (CP.getDstIdx())
2133-
updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx());
2134-
updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx());
2162+
if (CP.getDstIdx()) {
2163+
assert(!IsSubregToReg && "can this happen?");
2164+
updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx(), false);
2165+
}
2166+
updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx(),
2167+
IsSubregToReg);
21352168

21362169
// Shrink subregister ranges if necessary.
21372170
if (ShrinkMask.any()) {

llvm/test/CodeGen/X86/bswap.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ define i64 @finally_useful_bswap() {
226226
; CHECK64: # %bb.0:
227227
; CHECK64-NEXT: movzwl var16(%rip), %ecx
228228
; CHECK64-NEXT: movzbl %cl, %eax
229-
; CHECK64-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
229+
; CHECK64-NEXT: # kill: def $ecx killed $ecx def $rcx killed $rcx
230230
; CHECK64-NEXT: shrl $8, %ecx
231231
; CHECK64-NEXT: shlq $8, %rax
232232
; CHECK64-NEXT: orq %rcx, %rax
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc -mtriple=x86_64-grtev4-linux-gnu < %s | FileCheck %s
3+
4+
%struct.wibble = type { %struct.wombat }
5+
%struct.wombat = type { %struct.ham, [3 x i8] }
6+
%struct.ham = type { %struct.zot }
7+
%struct.zot = type { %struct.blam }
8+
%struct.blam = type { %struct.ham.0 }
9+
%struct.ham.0 = type { %struct.bar }
10+
%struct.bar = type { %struct.bar.1 }
11+
%struct.bar.1 = type { %struct.baz, i8 }
12+
%struct.baz = type { %struct.snork }
13+
%struct.snork = type <{ %struct.spam, i8, [3 x i8] }>
14+
%struct.spam = type { %struct.snork.2, %struct.snork.2 }
15+
%struct.snork.2 = type { i32 }
16+
%struct.snork.3 = type { %struct.baz, i8, [3 x i8] }
17+
18+
define void @foo(ptr %arg, ptr %arg1, i40 %arg2, ptr %arg3, i32 %arg4) #0 {
19+
; CHECK-LABEL: foo:
20+
; CHECK: # %bb.0: # %bb
21+
; CHECK-NEXT: pushq %rbp
22+
; CHECK-NEXT: .cfi_def_cfa_offset 16
23+
; CHECK-NEXT: .cfi_offset %rbp, -16
24+
; CHECK-NEXT: movq %rsp, %rbp
25+
; CHECK-NEXT: .cfi_def_cfa_register %rbp
26+
; CHECK-NEXT: pushq %r15
27+
; CHECK-NEXT: pushq %r14
28+
; CHECK-NEXT: pushq %r13
29+
; CHECK-NEXT: pushq %r12
30+
; CHECK-NEXT: pushq %rbx
31+
; CHECK-NEXT: subq $24, %rsp
32+
; CHECK-NEXT: .cfi_offset %rbx, -56
33+
; CHECK-NEXT: .cfi_offset %r12, -48
34+
; CHECK-NEXT: .cfi_offset %r13, -40
35+
; CHECK-NEXT: .cfi_offset %r14, -32
36+
; CHECK-NEXT: .cfi_offset %r15, -24
37+
; CHECK-NEXT: movl %r8d, %r14d
38+
; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
39+
; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
40+
; CHECK-NEXT: movq %rsi, %r13
41+
; CHECK-NEXT: movq %rdi, %r15
42+
; CHECK-NEXT: incl %r14d
43+
; CHECK-NEXT: xorl %ebx, %ebx
44+
; CHECK-NEXT: # implicit-def: $r12
45+
; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
46+
; CHECK-NEXT: jmp .LBB0_3
47+
; CHECK-NEXT: .p2align 4, 0x90
48+
; CHECK-NEXT: .LBB0_1: # %bb17
49+
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
50+
; CHECK-NEXT: movq %r15, %r13
51+
; CHECK-NEXT: xorl %r15d, %r15d
52+
; CHECK-NEXT: testq %rbx, %rbx
53+
; CHECK-NEXT: sete %r15b
54+
; CHECK-NEXT: xorl %edi, %edi
55+
; CHECK-NEXT: callq _Znwm@PLT
56+
; CHECK-NEXT: shlq $4, %r15
57+
; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
58+
; CHECK-NEXT: movq %r12, %rcx
59+
; CHECK-NEXT: shrq $32, %rcx
60+
; CHECK-NEXT: movb %cl, 12(%rax)
61+
; CHECK-NEXT: movl %r12d, 8(%rax)
62+
; CHECK-NEXT: movq %r15, %rbx
63+
; CHECK-NEXT: movq %r13, %r15
64+
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
65+
; CHECK-NEXT: decl %r14d
66+
; CHECK-NEXT: je .LBB0_8
67+
; CHECK-NEXT: .LBB0_3: # %bb7
68+
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
69+
; CHECK-NEXT: callq widget@PLT
70+
; CHECK-NEXT: cmpb $-5, (%r13)
71+
; CHECK-NEXT: jae .LBB0_5
72+
; CHECK-NEXT: # %bb.4: # in Loop: Header=BB0_3 Depth=1
73+
; CHECK-NEXT: movl %r12d, %r12d
74+
; CHECK-NEXT: cmpq %r15, %rbx
75+
; CHECK-NEXT: jbe .LBB0_1
76+
; CHECK-NEXT: jmp .LBB0_7
77+
; CHECK-NEXT: .p2align 4, 0x90
78+
; CHECK-NEXT: .LBB0_5: # %bb12
79+
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
80+
; CHECK-NEXT: movq 0, %rax
81+
; CHECK-NEXT: movq 8, %rax
82+
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
83+
; CHECK-NEXT: cmpq %r15, %rbx
84+
; CHECK-NEXT: jbe .LBB0_1
85+
; CHECK-NEXT: .LBB0_7: # in Loop: Header=BB0_3 Depth=1
86+
; CHECK-NEXT: xorl %eax, %eax
87+
; CHECK-NEXT: xorl %ebx, %ebx
88+
; CHECK-NEXT: decl %r14d
89+
; CHECK-NEXT: jne .LBB0_3
90+
; CHECK-NEXT: .LBB0_8: # %bb21
91+
; CHECK-NEXT: cmpb $0, 12(%rax)
92+
; CHECK-NEXT: jne .LBB0_10
93+
; CHECK-NEXT: # %bb.9: # %bb26
94+
; CHECK-NEXT: addq $24, %rsp
95+
; CHECK-NEXT: popq %rbx
96+
; CHECK-NEXT: popq %r12
97+
; CHECK-NEXT: popq %r13
98+
; CHECK-NEXT: popq %r14
99+
; CHECK-NEXT: popq %r15
100+
; CHECK-NEXT: popq %rbp
101+
; CHECK-NEXT: .cfi_def_cfa %rsp, 8
102+
; CHECK-NEXT: retq
103+
; CHECK-NEXT: .LBB0_10: # %bb25
104+
; CHECK-NEXT: .cfi_def_cfa %rbp, 16
105+
; CHECK-NEXT: movq %r15, %rdi
106+
; CHECK-NEXT: callq pluto@PLT
107+
bb:
108+
br label %bb7
109+
110+
bb5: ; preds = %bb17, %bb14
111+
%phi = phi ptr [ %call19, %bb17 ], [ null, %bb14 ]
112+
%phi6 = phi ptr [ %getelementptr, %bb17 ], [ null, %bb14 ]
113+
%add = add i32 %phi9, 1
114+
%icmp = icmp eq i32 %phi9, %arg4
115+
br i1 %icmp, label %bb21, label %bb7
116+
117+
bb7: ; preds = %bb5, %bb
118+
%phi8 = phi ptr [ null, %bb ], [ %phi6, %bb5 ]
119+
%phi9 = phi i32 [ 0, %bb ], [ %add, %bb5 ]
120+
%phi10 = phi i40 [ undef, %bb ], [ %phi15, %bb5 ]
121+
%call = call ptr @widget()
122+
%load = load i8, ptr %arg1, align 8
123+
%icmp11 = icmp ult i8 %load, -5
124+
%and = and i40 %phi10, 4294967295
125+
br i1 %icmp11, label %bb14, label %bb12
126+
127+
bb12: ; preds = %bb7
128+
%load13 = load volatile { i64, i64 }, ptr null, align 4294967296
129+
br label %bb14
130+
131+
bb14: ; preds = %bb12, %bb7
132+
%phi15 = phi i40 [ %and, %bb7 ], [ %arg2, %bb12 ]
133+
%icmp16 = icmp ugt ptr %phi8, %arg
134+
br i1 %icmp16, label %bb5, label %bb17
135+
136+
bb17: ; preds = %bb14
137+
%icmp18 = icmp eq ptr %phi8, null
138+
%zext = zext i1 %icmp18 to i64
139+
%call19 = call ptr @_Znwm(i64 0)
140+
%getelementptr = getelementptr %struct.wibble, ptr %arg3, i64 %zext
141+
%getelementptr20 = getelementptr i8, ptr %call19, i64 8
142+
store i40 %phi15, ptr %getelementptr20, align 4
143+
br label %bb5
144+
145+
bb21: ; preds = %bb5
146+
%getelementptr22 = getelementptr %struct.snork.3, ptr %phi, i64 0, i32 1
147+
%load23 = load i8, ptr %getelementptr22, align 4
148+
%icmp24 = icmp eq i8 %load23, 0
149+
br i1 %icmp24, label %bb26, label %bb25
150+
151+
bb25: ; preds = %bb21
152+
call void @pluto(ptr %arg)
153+
unreachable
154+
155+
bb26: ; preds = %bb21
156+
ret void
157+
}
158+
159+
define void @eggs(ptr %arg, ptr %arg1) {
160+
; CHECK-LABEL: eggs:
161+
; CHECK: # %bb.0: # %bb
162+
; CHECK-NEXT: pushq %rax
163+
; CHECK-NEXT: .cfi_def_cfa_offset 16
164+
; CHECK-NEXT: movq %rdi, %rax
165+
; CHECK-NEXT: movq %rsi, %rdi
166+
; CHECK-NEXT: movq %rax, %rsi
167+
; CHECK-NEXT: xorl %edx, %edx
168+
; CHECK-NEXT: xorl %ecx, %ecx
169+
; CHECK-NEXT: xorl %r8d, %r8d
170+
; CHECK-NEXT: callq foo@PLT
171+
; CHECK-NEXT: popq %rax
172+
; CHECK-NEXT: .cfi_def_cfa_offset 8
173+
; CHECK-NEXT: retq
174+
bb:
175+
call void @foo(ptr %arg1, ptr %arg, i40 0, ptr null, i32 0)
176+
ret void
177+
}
178+
179+
declare ptr @widget()
180+
181+
declare void @pluto(ptr)
182+
183+
declare ptr @_Znwm(i64)
184+
185+
attributes #0 = { noinline "frame-pointer"="all" }
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
2+
# RUN: llc -mtriple=x86_64-- -run-pass=register-coalescer -enable-subreg-liveness -verify-coalescing -o - %s | FileCheck %s
3+
4+
5+
# FIXME: Need to handle subrange updates when coalescing with subreg_to_reg
6+
# This will fail if x86 enables subregister liveness.
7+
---
8+
name: requires_new_subrange_coalesce_subreg_to_reg
9+
tracksRegLiveness: true
10+
body: |
11+
; CHECK-LABEL: name: requires_new_subrange_coalesce_subreg_to_reg
12+
; CHECK: bb.0:
13+
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
14+
; CHECK-NEXT: liveins: $eax
15+
; CHECK-NEXT: {{ $}}
16+
; CHECK-NEXT: undef %a.sub_32bit:gr64_with_sub_8bit = COPY $eax
17+
; CHECK-NEXT: %b:gr32 = IMPLICIT_DEF
18+
; CHECK-NEXT: %c:gr64 = INSERT_SUBREG %a, %b, %subreg.sub_32bit
19+
; CHECK-NEXT: JCC_1 %bb.2, 4, implicit undef $eflags
20+
; CHECK-NEXT: {{ $}}
21+
; CHECK-NEXT: bb.1:
22+
; CHECK-NEXT: successors: %bb.2(0x80000000)
23+
; CHECK-NEXT: {{ $}}
24+
; CHECK-NEXT: undef %a.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags
25+
; CHECK-NEXT: %c.sub_32bit:gr64 = COPY %a
26+
; CHECK-NEXT: {{ $}}
27+
; CHECK-NEXT: bb.2:
28+
; CHECK-NEXT: %c.sub_32bit:gr64 = SUBREG_TO_REG %a, %b, %subreg.sub_32bit
29+
; CHECK-NEXT: RET 0, implicit %c
30+
bb.0:
31+
liveins: $eax
32+
%init_eax:gr32 = COPY $eax
33+
%a:gr64 = SUBREG_TO_REG 0, %init_eax, %subreg.sub_32bit
34+
%b:gr32 = IMPLICIT_DEF
35+
%c:gr64 = INSERT_SUBREG %a, %b, %subreg.sub_32bit
36+
JCC_1 %bb.2, 4, implicit undef $eflags
37+
38+
bb.1:
39+
%imm0:gr32 = MOV32r0 implicit-def dead $eflags
40+
%a = SUBREG_TO_REG 0, %imm0, %subreg.sub_32bit
41+
%c.sub_32bit = COPY %a
42+
43+
bb.2:
44+
%c.sub_32bit = SUBREG_TO_REG %a, %b, %subreg.sub_32bit
45+
RET 0, implicit %c
46+
47+
...

0 commit comments

Comments
 (0)