
Commit fb6bc12

[AMDGPU] SI Load Store Optimizer: When merging with offset, use V_ADD_{I|U}32_e64
- Change the inserted add (V_ADD_{I|U}32_e32) to the _e64 version (V_ADD_{I|U}32_e64) so that the add uses a vreg for the carry; this prevents the inserted v_add from killing VCC. The _e64 version doesn't accept a literal in its encoding, so a mov instruction is also introduced to get the immediate into a register.
- Change the pass name to "SI Load Store Optimizer"; this removes the '/', which complicates scripts.

Differential Revision: https://reviews.llvm.org/D42124

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323153 91177308-0d34-0410-b5e6-96231b3b80d8
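Why the _e64 form avoids killing VCC: the _e32 (VOP2) add writes its carry-out implicitly to VCC, whereas the _e64 (VOP3) encoding names the carry-out as an explicit operand, which can be a fresh virtual register; VOP3 also has no literal slot, hence the extra mov. The patch obtains the _e64 opcode through TII->getAddNoCarry(). The snippet below is only a minimal sketch of what such a helper does, assuming the gfx9 carry-less add and a dead carry register on older targets; it is not the upstream SIInstrInfo implementation, and the helper name, the register class for the unused carry, and the HasAddNoCarry parameter are illustrative.

// Minimal sketch of a getAddNoCarry-style helper (assumed behavior, not the
// verbatim SIInstrInfo code; the usual in-tree includes such as SIInstrInfo.h,
// llvm/CodeGen/MachineInstrBuilder.h and llvm/CodeGen/MachineRegisterInfo.h
// are assumed).
MachineInstrBuilder buildAddNoCarry(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, unsigned DestReg,
                                    const SIInstrInfo &TII,
                                    bool HasAddNoCarry) {
  if (HasAddNoCarry)
    // gfx9+ has a carry-less VALU add, so VCC is never involved.
    return BuildMI(MBB, I, DL, TII.get(AMDGPU::V_ADD_U32_e64), DestReg);

  // Pre-gfx9 the add still produces a carry, but the _e64 (VOP3) encoding lets
  // the carry-out register be named explicitly. Sending it to a dead virtual
  // register leaves any live value in VCC intact.
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  unsigned UnusedCarry =
      MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); // class assumed
  return BuildMI(MBB, I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DestReg)
      .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}

The caller then appends the two source operands, first the SGPR holding the materialized offset and then the original base address register, which is exactly what the two merge functions in the diff below do.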
1 parent c127e0c commit fb6bc12

File tree: 3 files changed (+100, -36 lines)

lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Lines changed: 18 additions & 14 deletions
@@ -137,7 +137,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {

   bool runOnMachineFunction(MachineFunction &MF) override;

-  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }
+  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
@@ -150,10 +150,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
 } // end anonymous namespace.

 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
-                      "SI Load / Store Optimizer", false, false)
+                      "SI Load Store Optimizer", false, false)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
-                    "SI Load / Store Optimizer", false, false)
+                    "SI Load Store Optimizer", false, false)

 char SILoadStoreOptimizer::ID = 0;

@@ -496,13 +496,15 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
   unsigned BaseReg = AddrReg->getReg();
   unsigned BaseRegFlags = 0;
   if (CI.BaseOff) {
+    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+      .addImm(CI.BaseOff);
+
     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     BaseRegFlags = RegState::Kill;

-    unsigned AddOpc = STM->hasAddNoCarry() ?
-      AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
-    BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
-      .addImm(CI.BaseOff)
+    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
+      .addReg(ImmReg)
       .addReg(AddrReg->getReg());
   }

@@ -556,7 +558,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(

   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
   // sure we preserve the subregister index and any register flags set on them.
-  const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+  const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
   const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
   const MachineOperand *Data1
     = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
@@ -579,17 +581,19 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   const MCInstrDesc &Write2Desc = TII->get(Opc);
   DebugLoc DL = CI.I->getDebugLoc();

-  unsigned BaseReg = Addr->getReg();
+  unsigned BaseReg = AddrReg->getReg();
   unsigned BaseRegFlags = 0;
   if (CI.BaseOff) {
+    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+      .addImm(CI.BaseOff);
+
     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     BaseRegFlags = RegState::Kill;

-    unsigned AddOpc = STM->hasAddNoCarry() ?
-      AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
-    BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
-      .addImm(CI.BaseOff)
-      .addReg(Addr->getReg());
+    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
+      .addReg(ImmReg)
+      .addReg(AddrReg->getReg());
   }

   MachineInstrBuilder Write2 =

test/CodeGen/AMDGPU/ds-combine-large-stride.ll

Lines changed: 22 additions & 22 deletions
@@ -5,9 +5,9 @@
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
@@ -50,9 +50,9 @@ bb:
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
@@ -132,8 +132,8 @@ bb:
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
@@ -170,7 +170,7 @@ bb:
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]

 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50
@@ -211,8 +211,8 @@ bb:
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
@@ -249,9 +249,9 @@ bb:
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
@@ -285,9 +285,9 @@ bb:
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
@@ -349,8 +349,8 @@ bb:
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4004, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8004, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 4, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4004, [[BASE]]
@@ -380,7 +380,7 @@ bb:
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]

 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
@@ -412,8 +412,8 @@ bb:
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
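A note on the check updates above: only the add's second source operand changes, from an inline literal such as 0x320 to an {{s[0-9]+}} match, because the base offset is now materialized into an SGPR by s_mov_b32 before the add. The VI patterns still match v_add_u32_e32 with vcc; presumably the _e64 add inserted by the pass can be shrunk back to the _e32 form later in the pipeline once the carry register is allocated to vcc, which is why the _e64 opcodes are only checked in the MIR test that follows.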
New MIR test

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+# RUN: llc -march=amdgcn -mcpu=gfx803 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN,VI %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+# If there's a base offset, check that SILoadStoreOptimizer creates
+# V_ADD_{I|U}32_e64 for that offset; _e64 uses a vreg for the carry (rather than
+# %vcc, which is used in _e32); this ensures that %vcc is not inadvertently
+# clobbered.
+
+# GCN-LABEL: name: kernel
+
+# VI: V_ADD_I32_e64 %6, %0,
+# VI-NEXT: DS_WRITE2_B32 killed %7, %0, %3, 0, 8,
+# VI: V_ADD_I32_e64 %10, %3,
+# VI-NEXT: DS_READ2_B32 killed %11, 0, 8,
+
+# GFX9: V_ADD_U32_e64 %6, %0,
+# GFX9-NEXT: DS_WRITE2_B32_gfx9 killed %7, %0, %3, 0, 8,
+# GFX9: V_ADD_U32_e64 %9, %3,
+# GFX9-NEXT: DS_READ2_B32_gfx9 killed %10, 0, 8,
+
+--- |
+  @0 = internal unnamed_addr addrspace(3) global [256 x float] undef, align 4
+
+  define amdgpu_kernel void @kernel() {
+  bb.0:
+    br label %bb2
+
+  bb1:
+    ret void
+
+  bb2:
+    %tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0
+    %tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8
+    %tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16
+    %tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24
+    br label %bb1
+  }
+---
+name: kernel
+body: |
+  bb.0:
+    %0:vgpr_32 = IMPLICIT_DEF
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_ENDPGM
+
+  bb.2:
+    %1:sreg_64_xexec = V_CMP_NE_U32_e64 %0, 0, implicit %exec
+    %2:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %1, implicit %exec
+    V_CMP_NE_U32_e32 1, %2, implicit-def %vcc, implicit %exec
+    DS_WRITE_B32 %0, %0, 1024, 0, implicit %m0, implicit %exec :: (store 4 into %ir.tmp)
+    %3:vgpr_32 = V_MOV_B32_e32 0, implicit %exec
+    DS_WRITE_B32 %0, %3, 1056, 0, implicit %m0, implicit %exec :: (store 4 into %ir.tmp1)
+    %4:vgpr_32 = DS_READ_B32 %3, 1088, 0, implicit %m0, implicit %exec :: (load 4 from %ir.tmp2)
+    %5:vgpr_32 = DS_READ_B32 %3, 1120, 0, implicit %m0, implicit %exec :: (load 4 from %ir.tmp3)
+    %vcc = S_AND_B64 %exec, %vcc, implicit-def %scc
+    S_CBRANCH_VCCNZ %bb.1, implicit %vcc
+    S_BRANCH %bb.1
+...
