Skip to content

Commit 8039886

Browse files
authored
AMDGPU: Handle folding frame indexes into s_add_i32 (#101694)
This only implements the lowering; it does not yet enable producing direct frame index references in s_add_i32.
1 parent f3bf46f commit 8039886

File tree

7 files changed

+841
-247
lines changed

7 files changed

+841
-247
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2432,7 +2432,94 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24322432
MI->eraseFromParent();
24332433
return true;
24342434
}
2435+
case AMDGPU::S_ADD_I32: {
2436+
// TODO: Handle s_or_b32, s_and_b32.
2437+
unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2438+
MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
24352439

2440+
assert(FrameReg || MFI->isBottomOfStack());
2441+
2442+
MachineOperand &DstOp = MI->getOperand(0);
2443+
const DebugLoc &DL = MI->getDebugLoc();
2444+
Register MaterializedReg = FrameReg;
2445+
2446+
// Defend against live scc, which should never happen in practice.
2447+
bool DeadSCC = MI->getOperand(3).isDead();
2448+
2449+
Register TmpReg;
2450+
2451+
if (FrameReg && !ST.enableFlatScratch()) {
2452+
// FIXME: In the common case where the add does not also read its result
2453+
// (i.e. this isn't a reg += fi), it's not finding the dest reg as
2454+
// available.
2455+
TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, MI,
2456+
false, 0);
2457+
BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2458+
.addDef(TmpReg, RegState::Renamable)
2459+
.addReg(FrameReg)
2460+
.addImm(ST.getWavefrontSizeLog2())
2461+
.setOperandDead(3); // Set SCC dead
2462+
MaterializedReg = TmpReg;
2463+
}
2464+
2465+
int64_t Offset = FrameInfo.getObjectOffset(Index);
2466+
2467+
// For the non-immediate case, we could fall through to the default
2468+
// handling, but we do an in-place update of the result register here to
2469+
// avoid scavenging another register.
2470+
if (OtherOp.isImm()) {
2471+
OtherOp.setImm(OtherOp.getImm() + Offset);
2472+
Offset = 0;
2473+
2474+
if (MaterializedReg)
2475+
FIOp.ChangeToRegister(MaterializedReg, false);
2476+
else
2477+
FIOp.ChangeToImmediate(0);
2478+
} else if (MaterializedReg) {
2479+
// If we can't fold the other operand, do another increment.
2480+
Register DstReg = DstOp.getReg();
2481+
2482+
if (!TmpReg && MaterializedReg == FrameReg) {
2483+
TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2484+
MI, false, 0);
2485+
DstReg = TmpReg;
2486+
}
2487+
2488+
auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
2489+
.addDef(DstReg, RegState::Renamable)
2490+
.addReg(MaterializedReg, RegState::Kill)
2491+
.add(OtherOp);
2492+
if (DeadSCC)
2493+
AddI32.setOperandDead(3);
2494+
2495+
MaterializedReg = DstReg;
2496+
2497+
OtherOp.ChangeToRegister(MaterializedReg, false);
2498+
OtherOp.setIsKill(true);
2499+
OtherOp.setIsRenamable(true);
2500+
FIOp.ChangeToImmediate(Offset);
2501+
} else {
2502+
// If we don't have any other offset to apply, we can just directly
2503+
// interpret the frame index as the offset.
2504+
FIOp.ChangeToImmediate(Offset);
2505+
}
2506+
2507+
if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2508+
assert(Offset == 0);
2509+
MI->removeOperand(3);
2510+
MI->removeOperand(OtherOpIdx);
2511+
MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2512+
} else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) {
2513+
assert(Offset == 0);
2514+
MI->removeOperand(3);
2515+
MI->removeOperand(FIOperandNum);
2516+
MI->setDesc(
2517+
TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2518+
}
2519+
2520+
assert(!FIOp.isFI());
2521+
return true;
2522+
}
24362523
default: {
24372524
// Other access to frame index
24382525
const DebugLoc &DL = MI->getDebugLoc();

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 12 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
1515
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1616
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
1717
; GFX9-NEXT: s_and_b32 s0, s0, 15
18-
; GFX9-NEXT: s_add_i32 s1, s1, 0
1918
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
2019
; GFX9-NEXT: scratch_store_dword off, v0, s1
2120
; GFX9-NEXT: s_waitcnt vmcnt(0)
22-
; GFX9-NEXT: s_add_i32 s0, s0, 0
2321
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
2422
; GFX9-NEXT: s_waitcnt vmcnt(0)
2523
; GFX9-NEXT: s_endpgm
@@ -36,8 +34,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
3634
; GFX10-NEXT: s_and_b32 s1, s0, 15
3735
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
3836
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
39-
; GFX10-NEXT: s_add_i32 s0, s0, 0
40-
; GFX10-NEXT: s_add_i32 s1, s1, 0
4137
; GFX10-NEXT: scratch_store_dword off, v0, s0
4238
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4339
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
@@ -51,11 +47,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
5147
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
5248
; GFX940-NEXT: s_lshl_b32 s1, s0, 2
5349
; GFX940-NEXT: s_and_b32 s0, s0, 15
54-
; GFX940-NEXT: s_add_i32 s1, s1, 0
5550
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
5651
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
5752
; GFX940-NEXT: s_waitcnt vmcnt(0)
58-
; GFX940-NEXT: s_add_i32 s0, s0, 0
5953
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
6054
; GFX940-NEXT: s_waitcnt vmcnt(0)
6155
; GFX940-NEXT: s_endpgm
@@ -68,8 +62,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
6862
; GFX11-NEXT: s_and_b32 s1, s0, 15
6963
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
7064
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
71-
; GFX11-NEXT: s_add_i32 s0, s0, 0
72-
; GFX11-NEXT: s_add_i32 s1, s1, 0
7365
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
7466
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
7567
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
@@ -84,8 +76,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
8476
; GFX12-NEXT: s_and_b32 s1, s0, 15
8577
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
8678
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
87-
; GFX12-NEXT: s_add_co_i32 s0, s0, 0
88-
; GFX12-NEXT: s_add_co_i32 s1, s1, 0
8979
; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
9080
; GFX12-NEXT: s_wait_storecnt 0x0
9181
; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
@@ -1042,13 +1032,13 @@ define void @store_load_large_imm_offset_foo() {
10421032
; GFX9-LABEL: store_load_large_imm_offset_foo:
10431033
; GFX9: ; %bb.0: ; %bb
10441034
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1045-
; GFX9-NEXT: v_mov_b32_e32 v0, 13
10461035
; GFX9-NEXT: s_movk_i32 s0, 0x3e80
1047-
; GFX9-NEXT: s_add_i32 s1, s32, 4
1036+
; GFX9-NEXT: v_mov_b32_e32 v0, 13
1037+
; GFX9-NEXT: s_add_i32 s1, s32, s0
10481038
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
10491039
; GFX9-NEXT: s_waitcnt vmcnt(0)
10501040
; GFX9-NEXT: v_mov_b32_e32 v0, 15
1051-
; GFX9-NEXT: s_add_i32 s0, s0, s1
1041+
; GFX9-NEXT: s_add_i32 s0, s1, 4
10521042
; GFX9-NEXT: scratch_store_dword off, v0, s0
10531043
; GFX9-NEXT: s_waitcnt vmcnt(0)
10541044
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -1059,10 +1049,10 @@ define void @store_load_large_imm_offset_foo() {
10591049
; GFX10: ; %bb.0: ; %bb
10601050
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10611051
; GFX10-NEXT: v_mov_b32_e32 v0, 13
1062-
; GFX10-NEXT: v_mov_b32_e32 v1, 15
10631052
; GFX10-NEXT: s_movk_i32 s0, 0x3e80
1064-
; GFX10-NEXT: s_add_i32 s1, s32, 4
1065-
; GFX10-NEXT: s_add_i32 s0, s0, s1
1053+
; GFX10-NEXT: v_mov_b32_e32 v1, 15
1054+
; GFX10-NEXT: s_add_i32 s1, s32, s0
1055+
; GFX10-NEXT: s_add_i32 s0, s1, 4
10661056
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
10671057
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
10681058
; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -1074,13 +1064,13 @@ define void @store_load_large_imm_offset_foo() {
10741064
; GFX940-LABEL: store_load_large_imm_offset_foo:
10751065
; GFX940: ; %bb.0: ; %bb
10761066
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1077-
; GFX940-NEXT: v_mov_b32_e32 v0, 13
10781067
; GFX940-NEXT: s_movk_i32 s0, 0x3e80
1079-
; GFX940-NEXT: s_add_i32 s1, s32, 4
1068+
; GFX940-NEXT: v_mov_b32_e32 v0, 13
1069+
; GFX940-NEXT: s_add_i32 s1, s32, s0
10801070
; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
10811071
; GFX940-NEXT: s_waitcnt vmcnt(0)
10821072
; GFX940-NEXT: v_mov_b32_e32 v0, 15
1083-
; GFX940-NEXT: s_add_i32 s0, s0, s1
1073+
; GFX940-NEXT: s_add_i32 s0, s1, 4
10841074
; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
10851075
; GFX940-NEXT: s_waitcnt vmcnt(0)
10861076
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -1092,9 +1082,9 @@ define void @store_load_large_imm_offset_foo() {
10921082
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10931083
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
10941084
; GFX11-NEXT: s_movk_i32 s0, 0x3e80
1095-
; GFX11-NEXT: s_add_i32 s1, s32, 4
1096-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1097-
; GFX11-NEXT: s_add_i32 s0, s0, s1
1085+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1086+
; GFX11-NEXT: s_add_i32 s1, s32, s0
1087+
; GFX11-NEXT: s_add_i32 s0, s1, 4
10981088
; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
10991089
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
11001090
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc

0 commit comments

Comments
 (0)