Skip to content

Commit a98a0dc

Browse files
authored
[AArch64] Add streaming-mode stack hazard optimization remarks (#101695)
Emit an optimization remark when objects in the stack frame may cause hazards in a streaming-mode function. The analysis requires either the `aarch64-stack-hazard-size` or `aarch64-stack-hazard-remark-size` flag to be set by the user, with the former flag taking precedence.
1 parent 40c2aaf commit a98a0dc

File tree

6 files changed

+364
-11
lines changed

6 files changed

+364
-11
lines changed

llvm/include/llvm/CodeGen/TargetFrameLowering.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include "llvm/ADT/BitVector.h"
1717
#include "llvm/CodeGen/MachineBasicBlock.h"
18+
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
1819
#include "llvm/Support/TypeSize.h"
1920
#include <vector>
2021

@@ -473,6 +474,11 @@ class TargetFrameLowering {
473474
/// Return the frame base information to be encoded in the DWARF subprogram
474475
/// debug info.
475476
virtual DwarfFrameBase getDwarfFrameBase(const MachineFunction &MF) const;
477+
478+
/// This method is called at the end of prolog/epilog code insertion, so
479+
/// targets can emit remarks based on the final frame layout.
480+
virtual void emitRemarks(const MachineFunction &MF,
481+
MachineOptimizationRemarkEmitter *ORE) const {};
476482
};
477483

478484
} // End llvm namespace

llvm/lib/CodeGen/PrologEpilogInserter.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,9 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) {
341341
<< ore::NV("Function", MF.getFunction().getName()) << "'";
342342
});
343343

344+
// Emit any remarks implemented for the target, based on final frame layout.
345+
TFI->emitRemarks(MF, ORE);
346+
344347
delete RS;
345348
SaveBlocks.clear();
346349
RestoreBlocks.clear();

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 196 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@
240240
#include "llvm/Support/CommandLine.h"
241241
#include "llvm/Support/Debug.h"
242242
#include "llvm/Support/ErrorHandling.h"
243+
#include "llvm/Support/FormatVariadic.h"
243244
#include "llvm/Support/MathExtras.h"
244245
#include "llvm/Support/raw_ostream.h"
245246
#include "llvm/Target/TargetMachine.h"
@@ -275,6 +276,10 @@ cl::opt<bool> EnableHomogeneousPrologEpilog(
275276
// Stack hazard padding size. 0 = disabled.
276277
static cl::opt<unsigned> StackHazardSize("aarch64-stack-hazard-size",
277278
cl::init(0), cl::Hidden);
279+
// Stack hazard size for analysis remarks. It is only consulted by
// emitRemarks() when StackHazardSize is 0; a non-zero StackHazardSize takes
// precedence. If both are 0, no stack hazard remarks are emitted.
static cl::opt<unsigned>
StackHazardRemarkSize("aarch64-stack-hazard-remark-size", cl::init(0),
cl::Hidden);
278283
// Whether to insert padding into non-streaming functions (for testing).
279284
static cl::opt<bool>
280285
StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming",
@@ -2616,9 +2621,16 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
26162621
const auto &MFI = MF.getFrameInfo();
26172622

26182623
int64_t ObjectOffset = MFI.getObjectOffset(FI);
2624+
StackOffset SVEStackSize = getSVEStackSize(MF);
2625+
2626+
// For VLA-area objects, just emit an offset at the end of the stack frame.
2627+
// Whilst not quite correct, these objects do live at the end of the frame and
2628+
// so it is more useful for analysis for the offset to reflect this.
2629+
if (MFI.isVariableSizedObjectIndex(FI)) {
2630+
return StackOffset::getFixed(-((int64_t)MFI.getStackSize())) - SVEStackSize;
2631+
}
26192632

26202633
// This is correct in the absence of any SVE stack objects.
2621-
StackOffset SVEStackSize = getSVEStackSize(MF);
26222634
if (!SVEStackSize)
26232635
return StackOffset::getFixed(ObjectOffset - getOffsetOfLocalArea());
26242636

@@ -3529,13 +3541,9 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
35293541
return true;
35303542
}
35313543

3532-
// Return the FrameID for a Load/Store instruction by looking at the MMO.
3533-
static std::optional<int> getLdStFrameID(const MachineInstr &MI,
3534-
const MachineFrameInfo &MFI) {
3535-
if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
3536-
return std::nullopt;
3537-
3538-
MachineMemOperand *MMO = *MI.memoperands_begin();
3544+
// Return the FrameID for a MMO.
3545+
static std::optional<int> getMMOFrameID(MachineMemOperand *MMO,
3546+
const MachineFrameInfo &MFI) {
35393547
auto *PSV =
35403548
dyn_cast_or_null<FixedStackPseudoSourceValue>(MMO->getPseudoValue());
35413549
if (PSV)
@@ -3553,6 +3561,15 @@ static std::optional<int> getLdStFrameID(const MachineInstr &MI,
35533561
return std::nullopt;
35543562
}
35553563

3564+
// Return the FrameID for a Load/Store instruction by looking at the first MMO.
3565+
static std::optional<int> getLdStFrameID(const MachineInstr &MI,
3566+
const MachineFrameInfo &MFI) {
3567+
if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
3568+
return std::nullopt;
3569+
3570+
return getMMOFrameID(*MI.memoperands_begin(), MFI);
3571+
}
3572+
35563573
// Check if a Hazard slot is needed for the current function, and if so create
35573574
// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
35583575
// which can be used to determine if any hazard padding is needed.
@@ -5030,3 +5047,174 @@ void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
50305047
MI->eraseFromParent();
50315048
}
50325049
}
5050+
5051+
struct StackAccess {
5052+
enum AccessType {
5053+
NotAccessed = 0, // Stack object not accessed by load/store instructions.
5054+
GPR = 1 << 0, // A general purpose register.
5055+
PPR = 1 << 1, // A predicate register.
5056+
FPR = 1 << 2, // A floating point/Neon/SVE register.
5057+
};
5058+
5059+
int Idx;
5060+
StackOffset Offset;
5061+
int64_t Size;
5062+
unsigned AccessTypes;
5063+
5064+
StackAccess() : Idx(0), Offset(), Size(0), AccessTypes(NotAccessed) {}
5065+
5066+
bool operator<(const StackAccess &Rhs) const {
5067+
return std::make_tuple(start(), Idx) <
5068+
std::make_tuple(Rhs.start(), Rhs.Idx);
5069+
}
5070+
5071+
bool isCPU() const {
5072+
// Predicate register load and store instructions execute on the CPU.
5073+
return AccessTypes & (AccessType::GPR | AccessType::PPR);
5074+
}
5075+
bool isSME() const { return AccessTypes & AccessType::FPR; }
5076+
bool isMixed() const { return isCPU() && isSME(); }
5077+
5078+
int64_t start() const { return Offset.getFixed() + Offset.getScalable(); }
5079+
int64_t end() const { return start() + Size; }
5080+
5081+
std::string getTypeString() const {
5082+
switch (AccessTypes) {
5083+
case AccessType::FPR:
5084+
return "FPR";
5085+
case AccessType::PPR:
5086+
return "PPR";
5087+
case AccessType::GPR:
5088+
return "GPR";
5089+
case AccessType::NotAccessed:
5090+
return "NA";
5091+
default:
5092+
return "Mixed";
5093+
}
5094+
}
5095+
5096+
void print(raw_ostream &OS) const {
5097+
OS << getTypeString() << " stack object at [SP"
5098+
<< (Offset.getFixed() < 0 ? "" : "+") << Offset.getFixed();
5099+
if (Offset.getScalable())
5100+
OS << (Offset.getScalable() < 0 ? "" : "+") << Offset.getScalable()
5101+
<< " * vscale";
5102+
OS << "]";
5103+
}
5104+
};
5105+
5106+
// Stream insertion for StackAccess: forwards to StackAccess::print() and
// returns the stream so that insertions can be chained.
static inline raw_ostream &operator<<(raw_ostream &OS, const StackAccess &SA) {
  SA.print(OS);
  return OS;
}
5110+
5111+
void AArch64FrameLowering::emitRemarks(
5112+
const MachineFunction &MF, MachineOptimizationRemarkEmitter *ORE) const {
5113+
5114+
SMEAttrs Attrs(MF.getFunction());
5115+
if (Attrs.hasNonStreamingInterfaceAndBody())
5116+
return;
5117+
5118+
const uint64_t HazardSize =
5119+
(StackHazardSize) ? StackHazardSize : StackHazardRemarkSize;
5120+
5121+
if (HazardSize == 0)
5122+
return;
5123+
5124+
const MachineFrameInfo &MFI = MF.getFrameInfo();
5125+
// Bail if function has no stack objects.
5126+
if (!MFI.hasStackObjects())
5127+
return;
5128+
5129+
std::vector<StackAccess> StackAccesses(MFI.getNumObjects());
5130+
5131+
size_t NumFPLdSt = 0;
5132+
size_t NumNonFPLdSt = 0;
5133+
5134+
// Collect stack accesses via Load/Store instructions.
5135+
for (const MachineBasicBlock &MBB : MF) {
5136+
for (const MachineInstr &MI : MBB) {
5137+
if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
5138+
continue;
5139+
for (MachineMemOperand *MMO : MI.memoperands()) {
5140+
std::optional<int> FI = getMMOFrameID(MMO, MFI);
5141+
if (FI && !MFI.isDeadObjectIndex(*FI)) {
5142+
int FrameIdx = *FI;
5143+
5144+
size_t ArrIdx = FrameIdx + MFI.getNumFixedObjects();
5145+
if (StackAccesses[ArrIdx].AccessTypes == StackAccess::NotAccessed) {
5146+
StackAccesses[ArrIdx].Idx = FrameIdx;
5147+
StackAccesses[ArrIdx].Offset =
5148+
getFrameIndexReferenceFromSP(MF, FrameIdx);
5149+
StackAccesses[ArrIdx].Size = MFI.getObjectSize(FrameIdx);
5150+
}
5151+
5152+
unsigned RegTy = StackAccess::AccessType::GPR;
5153+
if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) {
5154+
if (AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()))
5155+
RegTy = StackAccess::PPR;
5156+
else
5157+
RegTy = StackAccess::FPR;
5158+
} else if (AArch64InstrInfo::isFpOrNEON(MI)) {
5159+
RegTy = StackAccess::FPR;
5160+
}
5161+
5162+
StackAccesses[ArrIdx].AccessTypes |= RegTy;
5163+
5164+
if (RegTy == StackAccess::FPR)
5165+
++NumFPLdSt;
5166+
else
5167+
++NumNonFPLdSt;
5168+
}
5169+
}
5170+
}
5171+
}
5172+
5173+
if (NumFPLdSt == 0 || NumNonFPLdSt == 0)
5174+
return;
5175+
5176+
llvm::sort(StackAccesses);
5177+
StackAccesses.erase(llvm::remove_if(StackAccesses,
5178+
[](const StackAccess &S) {
5179+
return S.AccessTypes ==
5180+
StackAccess::NotAccessed;
5181+
}),
5182+
StackAccesses.end());
5183+
5184+
SmallVector<const StackAccess *> MixedObjects;
5185+
SmallVector<std::pair<const StackAccess *, const StackAccess *>> HazardPairs;
5186+
5187+
if (StackAccesses.front().isMixed())
5188+
MixedObjects.push_back(&StackAccesses.front());
5189+
5190+
for (auto It = StackAccesses.begin(), End = std::prev(StackAccesses.end());
5191+
It != End; ++It) {
5192+
const auto &First = *It;
5193+
const auto &Second = *(It + 1);
5194+
5195+
if (Second.isMixed())
5196+
MixedObjects.push_back(&Second);
5197+
5198+
if ((First.isSME() && Second.isCPU()) ||
5199+
(First.isCPU() && Second.isSME())) {
5200+
uint64_t Distance = static_cast<uint64_t>(Second.start() - First.end());
5201+
if (Distance < HazardSize)
5202+
HazardPairs.emplace_back(&First, &Second);
5203+
}
5204+
}
5205+
5206+
auto EmitRemark = [&](llvm::StringRef Str) {
5207+
ORE->emit([&]() {
5208+
auto R = MachineOptimizationRemarkAnalysis(
5209+
"sme", "StackHazard", MF.getFunction().getSubprogram(), &MF.front());
5210+
return R << formatv("stack hazard in '{0}': ", MF.getName()).str() << Str;
5211+
});
5212+
};
5213+
5214+
for (const auto &P : HazardPairs)
5215+
EmitRemark(formatv("{0} is too close to {1}", *P.first, *P.second).str());
5216+
5217+
for (const auto *Obj : MixedObjects)
5218+
EmitRemark(
5219+
formatv("{0} accessed by both GP and FP instructions", *Obj).str());
5220+
}

llvm/lib/Target/AArch64/AArch64FrameLowering.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@
1313
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
1414
#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
1515

16-
#include "llvm/Support/TypeSize.h"
16+
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
1717
#include "llvm/CodeGen/TargetFrameLowering.h"
18+
#include "llvm/Support/TypeSize.h"
1819

1920
namespace llvm {
2021

@@ -178,6 +179,9 @@ class AArch64FrameLowering : public TargetFrameLowering {
178179
inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI,
179180
int64_t NegProbeSize,
180181
Register TargetReg) const;
182+
183+
/// Emit analysis remarks for stack objects that may cause hazards, based on
/// the final frame layout. Nothing is emitted for functions with a
/// non-streaming interface and body, or when no hazard size has been set.
void emitRemarks(const MachineFunction &MF,
MachineOptimizationRemarkEmitter *ORE) const override;
181185
};
182186

183187
} // End llvm namespace

0 commit comments

Comments
 (0)