freebsd-dev/contrib/llvm/patches/patch-07-llvm-r227752-boot2-shrink.diff
Dimitry Andric ef6fa9e26d Upgrade our copy of clang and llvm to 3.6.1 release.
This release contains the following cherry-picked revisions from
upstream trunk:

  226124 226151 226164 226165 226166 226407 226408 226409 226652
  226905 226983 227084 227087 227089 227208 227209 227210 227211
  227212 227213 227214 227269 227430 227482 227503 227519 227574
  227822 227986 227987 227988 227989 227990 228037 228038 228039
  228040 228188 228189 228190 228273 228372 228373 228374 228403
  228765 228848 228918 229223 229225 229226 229227 229228 229230
  229234 229235 229236 229238 229239 229413 229507 229680 229750
  229751 229752 229911 230146 230147 230235 230253 230255 230469
  230500 230564 230603 230657 230742 230748 230956 231219 231237
  231245 231259 231280 231451 231563 231601 231658 231659 231662
  231984 231986 232046 232085 232142 232176 232179 232189 232382
  232386 232389 232425 232438 232443 232675 232786 232797 232943
  232957 233075 233080 233351 233353 233409 233410 233508 233584
  233819 233904 234629 234636 234891 234975 234977 235524 235641
  235662 235931 236099 236306 236307

Please note that from 3.5.0 onwards, clang and llvm require C++11
support to build; see UPDATING for more information.
2015-05-25 13:43:03 +00:00

Pull in r227752 from upstream llvm trunk (by Michael Kuperstein):

  [X86] Convert esp-relative movs of function arguments to pushes, step 2

  This moves the transformation introduced in r223757 into a separate MI pass.
  This allows it to cover many more cases (not only cases where there must be a
  reserved call frame), and perform rudimentary call folding. It still doesn't
  have a heuristic, so it is enabled only for optsize/minsize, with stack
  alignment <= 8, where it ought to be a fairly clear win.

  (Re-commit of r227728)

  Differential Revision: http://reviews.llvm.org/D6789

This helps to get sys/boot/i386/boot2 below the required size again,
when optimizing with -Oz.

Introduced here: http://svnweb.freebsd.org/changeset/base/278112
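
For reference, here is a minimal sketch of an input that exercises the new
pass (the file name example.ll and the @callee/@caller names are hypothetical,
not part of the patch; the pattern is modelled on the movtopush.ll tests added
below). Compiling it for 32-bit x86, e.g. with
"llc -mtriple=i686-windows example.ll -o -" as the new test does, should now
emit a push sequence rather than esp-relative movs:

  ; example.ll - hypothetical illustration of the mov-to-push transformation
  declare void @callee(i32, i32, i32, i32)

  define void @caller() minsize {
  entry:
    ; With the new pass, the four outgoing arguments are expected to become
    ;   pushl $4 / pushl $3 / pushl $2 / pushl $1
    ; rather than "subl $16, %esp" followed by esp-relative movl stores.
    call void @callee(i32 1, i32 2, i32 3, i32 4)
    ret void
  }

This mirrors the test1c case in the new test file; without optsize/minsize
(and with stack alignment <= 8) the pass leaves the code alone.
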
Index: include/llvm/Target/TargetFrameLowering.h
===================================================================
--- include/llvm/Target/TargetFrameLowering.h
+++ include/llvm/Target/TargetFrameLowering.h
@@ -193,6 +193,11 @@ class TargetFrameLowering {
return hasReservedCallFrame(MF) || hasFP(MF);
}
+ // needsFrameIndexResolution - Do we need to perform FI resolution for
+ // this function. Normally, this is required only when the function
+ // has any stack objects. However, targets may want to override this.
+ virtual bool needsFrameIndexResolution(const MachineFunction &MF) const;
+
/// getFrameIndexOffset - Returns the displacement from the frame register to
/// the stack frame of the specified index.
virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
Index: lib/CodeGen/PrologEpilogInserter.cpp
===================================================================
--- lib/CodeGen/PrologEpilogInserter.cpp
+++ lib/CodeGen/PrologEpilogInserter.cpp
@@ -703,7 +703,8 @@ void PEI::insertPrologEpilogCode(MachineFunction &
/// register references and actual offsets.
///
void PEI::replaceFrameIndices(MachineFunction &Fn) {
- if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?
+ const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+ if (!TFI.needsFrameIndexResolution(Fn)) return;
// Store SPAdj at exit of a basic block.
SmallVector<int, 8> SPState;
@@ -769,13 +770,6 @@ void PEI::replaceFrameIndices(MachineBasicBlock *B
continue;
}
- // If we are looking at a call sequence, we need to keep track of
- // the SP adjustment made by each instruction in the sequence.
- // This includes both the frame setup/destroy pseudos (handled above),
- // as well as other instructions that have side effects w.r.t the SP.
- if (InsideCallSequence)
- SPAdj += TII.getSPAdjust(I);
-
MachineInstr *MI = I;
bool DoIncr = true;
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
@@ -854,6 +848,16 @@ void PEI::replaceFrameIndices(MachineBasicBlock *B
break;
}
+ // If we are looking at a call sequence, we need to keep track of
+ // the SP adjustment made by each instruction in the sequence.
+ // This includes both the frame setup/destroy pseudos (handled above),
+ // as well as other instructions that have side effects w.r.t the SP.
+ // Note that this must come after eliminateFrameIndex, because
+ // if I itself referred to a frame index, we shouldn't count its own
+ // adjustment.
+ if (MI && InsideCallSequence)
+ SPAdj += TII.getSPAdjust(MI);
+
if (DoIncr && I != BB->end()) ++I;
// Update register states.
Index: lib/CodeGen/TargetFrameLoweringImpl.cpp
===================================================================
--- lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -42,3 +42,8 @@ int TargetFrameLowering::getFrameIndexReference(co
FrameReg = RI->getFrameRegister(MF);
return getFrameIndexOffset(MF, FI);
}
+
+bool TargetFrameLowering::needsFrameIndexResolution(
+ const MachineFunction &MF) const {
+ return MF.getFrameInfo()->hasStackObjects();
+}
Index: lib/Target/X86/CMakeLists.txt
===================================================================
--- lib/Target/X86/CMakeLists.txt
+++ lib/Target/X86/CMakeLists.txt
@@ -14,6 +14,7 @@ add_public_tablegen_target(X86CommonTableGen)
set(sources
X86AsmPrinter.cpp
+ X86CallFrameOptimization.cpp
X86FastISel.cpp
X86FloatingPoint.cpp
X86FrameLowering.cpp
Index: lib/Target/X86/X86.h
===================================================================
--- lib/Target/X86/X86.h
+++ lib/Target/X86/X86.h
@@ -67,6 +67,11 @@ FunctionPass *createX86PadShortFunctions();
/// to eliminate execution delays in some Atom processors.
FunctionPass *createX86FixupLEAs();
+/// createX86CallFrameOptimization - Return a pass that optimizes
+/// the code-size of x86 call sequences. This is done by replacing
+/// esp-relative movs with pushes.
+FunctionPass *createX86CallFrameOptimization();
+
} // End llvm namespace
#endif
Index: lib/Target/X86/X86CallFrameOptimization.cpp
===================================================================
--- lib/Target/X86/X86CallFrameOptimization.cpp
+++ lib/Target/X86/X86CallFrameOptimization.cpp
@@ -0,0 +1,400 @@
+//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that optimizes call sequences on x86.
+// Currently, it converts movs of function parameters onto the stack into
+// pushes. This is beneficial for two main reasons:
+// 1) The push instruction encoding is much smaller than an esp-relative mov
+// 2) It is possible to push memory arguments directly. So, if the
+// transformation is performed pre-reg-alloc, it can help relieve
+// register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-cf-opt"
+
+cl::opt<bool> NoX86CFOpt("no-x86-call-frame-opt",
+ cl::desc("Avoid optimizing x86 call frames for size"),
+ cl::init(false), cl::Hidden);
+
+namespace {
+class X86CallFrameOptimization : public MachineFunctionPass {
+public:
+ X86CallFrameOptimization() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ bool shouldPerformTransformation(MachineFunction &MF);
+
+ bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I);
+
+ MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
+ unsigned Reg);
+
+ const char *getPassName() const override {
+ return "X86 Optimize Call Frame";
+ }
+
+ const TargetInstrInfo *TII;
+ const TargetFrameLowering *TFL;
+ const MachineRegisterInfo *MRI;
+ static char ID;
+};
+
+char X86CallFrameOptimization::ID = 0;
+}
+
+FunctionPass *llvm::createX86CallFrameOptimization() {
+ return new X86CallFrameOptimization();
+}
+
+// This checks whether the transformation is legal and profitable
+bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) {
+ if (NoX86CFOpt.getValue())
+ return false;
+
+ // We currently only support call sequences where *all* parameters
+ // are passed on the stack.
+ // No point in running this in 64-bit mode, since some arguments are
+ // passed in-register in all common calling conventions, so the pattern
+ // we're looking for will never match.
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
+ if (STI.is64Bit())
+ return false;
+
+ // You would expect straight-line code between call-frame setup and
+ // call-frame destroy. You would be wrong. There are circumstances (e.g.
+ // CMOV_GR8 expansion of a select that feeds a function call!) where we can
+ // end up with the setup and the destroy in different basic blocks.
+ // This is bad, and breaks SP adjustment.
+ // So, check that all of the frames in the function are closed inside
+ // the same block, and, for good measure, that there are no nested frames.
+ int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+ int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+ for (MachineBasicBlock &BB : MF) {
+ bool InsideFrameSequence = false;
+ for (MachineInstr &MI : BB) {
+ if (MI.getOpcode() == FrameSetupOpcode) {
+ if (InsideFrameSequence)
+ return false;
+ InsideFrameSequence = true;
+ }
+ else if (MI.getOpcode() == FrameDestroyOpcode) {
+ if (!InsideFrameSequence)
+ return false;
+ InsideFrameSequence = false;
+ }
+ }
+
+ if (InsideFrameSequence)
+ return false;
+ }
+
+ // Now that we know the transformation is legal, check if it is
+ // profitable.
+ // TODO: Add a heuristic that actually looks at the function,
+ // and enable this for more cases.
+
+ // This transformation is always a win when we expected to have
+ // a reserved call frame. Under other circumstances, it may be either
+ // a win or a loss, and requires a heuristic.
+ // For now, enable it only for the relatively clear win cases.
+ bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
+ if (CannotReserveFrame)
+ return true;
+
+ // For now, don't even try to evaluate the profitability when
+ // not optimizing for size.
+ AttributeSet FnAttrs = MF.getFunction()->getAttributes();
+ bool OptForSize =
+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::OptimizeForSize) ||
+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+
+ if (!OptForSize)
+ return false;
+
+ // Stack re-alignment can make this unprofitable even in terms of size.
+ // As mentioned above, a better heuristic is needed. For now, don't do this
+ // when the required alignment is above 8. (4 would be the safe choice, but
+ // some experimentation showed 8 is generally good).
+ if (TFL->getStackAlignment() > 8)
+ return false;
+
+ return true;
+}
+
+bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getSubtarget().getInstrInfo();
+ TFL = MF.getSubtarget().getFrameLowering();
+ MRI = &MF.getRegInfo();
+
+ if (!shouldPerformTransformation(MF))
+ return false;
+
+ int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+
+ bool Changed = false;
+
+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+ if (I->getOpcode() == FrameSetupOpcode)
+ Changed |= adjustCallSequence(MF, *BB, I);
+
+ return Changed;
+}
+
+bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
+
+ // Check that this particular call sequence is amenable to the
+ // transformation.
+ const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
+ MF.getSubtarget().getRegisterInfo());
+ unsigned StackPtr = RegInfo.getStackRegister();
+ int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+
+ // We expect to enter this at the beginning of a call sequence
+ assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
+ MachineBasicBlock::iterator FrameSetup = I++;
+
+
+ // For globals in PIC mode, we can have some LEAs here.
+ // Ignore them, they don't bother us.
+ // TODO: Extend this to something that covers more cases.
+ while (I->getOpcode() == X86::LEA32r)
+ ++I;
+
+ // We expect a copy instruction here.
+ // TODO: The copy instruction is a lowering artifact.
+ // We should also support a copy-less version, where the stack
+ // pointer is used directly.
+ if (!I->isCopy() || !I->getOperand(0).isReg())
+ return false;
+ MachineBasicBlock::iterator SPCopy = I++;
+ StackPtr = SPCopy->getOperand(0).getReg();
+
+ // Scan the call setup sequence for the pattern we're looking for.
+ // We only handle a simple case - a sequence of MOV32mi or MOV32mr
+ // instructions, that push a sequence of 32-bit values onto the stack, with
+ // no gaps between them.
+ SmallVector<MachineInstr*, 4> MovVector(4, nullptr);
+ unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
+ if (MaxAdjust > 4)
+ MovVector.resize(MaxAdjust, nullptr);
+
+ do {
+ int Opcode = I->getOpcode();
+ if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
+ break;
+
+ // We only want movs of the form:
+ // movl imm/r32, k(%esp)
+ // If we run into something else, bail.
+ // Note that AddrBaseReg may, counter to its name, not be a register,
+ // but rather a frame index.
+ // TODO: Support the fi case. This should probably work now that we
+ // have the infrastructure to track the stack pointer within a call
+ // sequence.
+ if (!I->getOperand(X86::AddrBaseReg).isReg() ||
+ (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
+ !I->getOperand(X86::AddrScaleAmt).isImm() ||
+ (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
+ (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
+ (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
+ !I->getOperand(X86::AddrDisp).isImm())
+ return false;
+
+ int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
+ assert(StackDisp >= 0 && "Negative stack displacement when passing parameters");
+
+ // We really don't want to consider the unaligned case.
+ if (StackDisp % 4)
+ return false;
+ StackDisp /= 4;
+
+ assert((size_t)StackDisp < MovVector.size() &&
+ "Function call has more parameters than the stack is adjusted for.");
+
+ // If the same stack slot is being filled twice, something's fishy.
+ if (MovVector[StackDisp] != nullptr)
+ return false;
+ MovVector[StackDisp] = I;
+
+ ++I;
+ } while (I != MBB.end());
+
+ // We now expect the end of the sequence - a call and a stack adjust.
+ if (I == MBB.end())
+ return false;
+
+ // For PCrel calls, we expect an additional COPY of the basereg.
+ // If we find one, skip it.
+ if (I->isCopy()) {
+ if (I->getOperand(1).getReg() ==
+ MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
+ ++I;
+ else
+ return false;
+ }
+
+ if (!I->isCall())
+ return false;
+ MachineBasicBlock::iterator Call = I;
+ if ((++I)->getOpcode() != FrameDestroyOpcode)
+ return false;
+
+ // Now, go through the vector, and see that we don't have any gaps,
+ // but only a series of 32-bit MOVs.
+
+ int64_t ExpectedDist = 0;
+ auto MMI = MovVector.begin(), MME = MovVector.end();
+ for (; MMI != MME; ++MMI, ExpectedDist += 4)
+ if (*MMI == nullptr)
+ break;
+
+ // If the call had no parameters, do nothing
+ if (!ExpectedDist)
+ return false;
+
+ // We are either at the last parameter, or a gap.
+ // Make sure it's not a gap
+ for (; MMI != MME; ++MMI)
+ if (*MMI != nullptr)
+ return false;
+
+ // Ok, we can in fact do the transformation for this call.
+ // Do not remove the FrameSetup instruction, but adjust the parameters.
+ // PEI will end up finalizing the handling of this.
+ FrameSetup->getOperand(1).setImm(ExpectedDist);
+
+ DebugLoc DL = I->getDebugLoc();
+ // Now, iterate through the vector in reverse order, and replace the movs
+ // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
+ // replace uses.
+ for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
+ MachineBasicBlock::iterator MOV = *MovVector[Idx];
+ MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+ if (MOV->getOpcode() == X86::MOV32mi) {
+ unsigned PushOpcode = X86::PUSHi32;
+ // If the operand is a small (8-bit) immediate, we can use a
+ // PUSH instruction with a shorter encoding.
+ // Note that isImm() may fail even though this is a MOVmi, because
+ // the operand can also be a symbol.
+ if (PushOp.isImm()) {
+ int64_t Val = PushOp.getImm();
+ if (isInt<8>(Val))
+ PushOpcode = X86::PUSH32i8;
+ }
+ BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
+ } else {
+ unsigned int Reg = PushOp.getReg();
+
+ // If PUSHrmm is not slow on this target, try to fold the source of the
+ // push into the instruction.
+ const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
+ bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();
+
+ // Check that this is legal to fold. Right now, we're extremely
+ // conservative about that.
+ MachineInstr *DefMov = nullptr;
+ if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
+ MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm));
+
+ unsigned NumOps = DefMov->getDesc().getNumOperands();
+ for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
+ Push->addOperand(DefMov->getOperand(i));
+
+ DefMov->eraseFromParent();
+ } else {
+ BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr();
+ }
+ }
+
+ MBB.erase(MOV);
+ }
+
+ // The stack-pointer copy is no longer used in the call sequences.
+ // There should not be any other users, but we can't commit to that, so:
+ if (MRI->use_empty(SPCopy->getOperand(0).getReg()))
+ SPCopy->eraseFromParent();
+
+ // Once we've done this, we need to make sure PEI doesn't assume a reserved
+ // frame.
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ FuncInfo->setHasPushSequences(true);
+
+ return true;
+}
+
+MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
+ MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
+ // Do an extremely restricted form of load folding.
+ // ISel will often create patterns like:
+ // movl 4(%edi), %eax
+ // movl 8(%edi), %ecx
+ // movl 12(%edi), %edx
+ // movl %edx, 8(%esp)
+ // movl %ecx, 4(%esp)
+ // movl %eax, (%esp)
+ // call
+ // Get rid of those with prejudice.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return nullptr;
+
+ // Make sure this is the only use of Reg.
+ if (!MRI->hasOneNonDBGUse(Reg))
+ return nullptr;
+
+ MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg);
+
+ // Make sure the def is a MOV from memory.
+ // If the def is in another block, give up.
+ if (DefMI->getOpcode() != X86::MOV32rm ||
+ DefMI->getParent() != FrameSetup->getParent())
+ return nullptr;
+
+ // Be careful with movs that load from a stack slot, since it may get
+ // resolved incorrectly.
+ // TODO: Again, we already have the infrastructure, so this should work.
+ if (!DefMI->getOperand(1).isReg())
+ return nullptr;
+
+ // Now, make sure everything else up until the ADJCALLSTACK is a sequence
+ // of MOVs. To be less conservative would require duplicating a lot of the
+ // logic from PeepholeOptimizer.
+ // FIXME: A possibly better approach would be to teach the PeepholeOptimizer
+ // to be smarter about folding into pushes.
+ for (auto I = DefMI; I != FrameSetup; ++I)
+ if (I->getOpcode() != X86::MOV32rm)
+ return nullptr;
+
+ return DefMI;
+}
Index: lib/Target/X86/X86FastISel.cpp
===================================================================
--- lib/Target/X86/X86FastISel.cpp
+++ lib/Target/X86/X86FastISel.cpp
@@ -2735,7 +2735,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
- .addImm(NumBytes);
+ .addImm(NumBytes).addImm(0);
// Walk the register/memloc assignments, inserting copies/loads.
const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
Index: lib/Target/X86/X86FrameLowering.cpp
===================================================================
--- lib/Target/X86/X86FrameLowering.cpp
+++ lib/Target/X86/X86FrameLowering.cpp
@@ -38,9 +38,36 @@ using namespace llvm;
extern cl::opt<bool> ForceStackAlign;
bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
- return !MF.getFrameInfo()->hasVarSizedObjects();
+ return !MF.getFrameInfo()->hasVarSizedObjects() &&
+ !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
}
+/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
+/// call frame pseudos can be simplified. Having a FP, as in the default
+/// implementation, is not sufficient here since we can't always use it.
+/// Use a more nuanced condition.
+bool
+X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+ const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>
+ (MF.getSubtarget().getRegisterInfo());
+ return hasReservedCallFrame(MF) ||
+ (hasFP(MF) && !TRI->needsStackRealignment(MF))
+ || TRI->hasBasePointer(MF);
+}
+
+// needsFrameIndexResolution - Do we need to perform FI resolution for
+// this function. Normally, this is required only when the function
+// has any stack objects. However, FI resolution actually has another job,
+// not apparent from the title - it resolves callframesetup/destroy
+// that were not simplified earlier.
+// So, this is required for x86 functions that have push sequences even
+// when there are no stack objects.
+bool
+X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
+ return MF.getFrameInfo()->hasStackObjects() ||
+ MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+}
+
/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register. This is true if the function has variable sized allocas
/// or if frame pointer elimination is disabled.
@@ -93,16 +120,6 @@ static unsigned getANDriOpcode(bool IsLP64, int64_
return X86::AND32ri;
}
-static unsigned getPUSHiOpcode(bool IsLP64, MachineOperand MO) {
- // We don't support LP64 for now.
- assert(!IsLP64);
-
- if (MO.isImm() && isInt<8>(MO.getImm()))
- return X86::PUSH32i8;
-
- return X86::PUSHi32;;
-}
-
static unsigned getLEArOpcode(unsigned IsLP64) {
return IsLP64 ? X86::LEA64r : X86::LEA32r;
}
@@ -1882,100 +1899,6 @@ void X86FrameLowering::adjustForHiPEPrologue(Machi
#endif
}
-bool X86FrameLowering::
-convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, uint64_t Amount) const {
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
- unsigned StackPtr = RegInfo.getStackRegister();
-
- // Scan the call setup sequence for the pattern we're looking for.
- // We only handle a simple case now - a sequence of MOV32mi or MOV32mr
- // instructions, that push a sequence of 32-bit values onto the stack, with
- // no gaps.
- std::map<int64_t, MachineBasicBlock::iterator> MovMap;
- do {
- int Opcode = I->getOpcode();
- if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
- break;
-
- // We only want movs of the form:
- // movl imm/r32, k(%ecx)
- // If we run into something else, bail
- // Note that AddrBaseReg may, counterintuitively, not be a register...
- if (!I->getOperand(X86::AddrBaseReg).isReg() ||
- (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
- !I->getOperand(X86::AddrScaleAmt).isImm() ||
- (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
- (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
- (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
- !I->getOperand(X86::AddrDisp).isImm())
- return false;
-
- int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
-
- // We don't want to consider the unaligned case.
- if (StackDisp % 4)
- return false;
-
- // If the same stack slot is being filled twice, something's fishy.
- if (!MovMap.insert(std::pair<int64_t, MachineInstr*>(StackDisp, I)).second)
- return false;
-
- ++I;
- } while (I != MBB.end());
-
- // We now expect the end of the sequence - a call and a stack adjust.
- if (I == MBB.end())
- return false;
- if (!I->isCall())
- return false;
- MachineBasicBlock::iterator Call = I;
- if ((++I)->getOpcode() != TII.getCallFrameDestroyOpcode())
- return false;
-
- // Now, go through the map, and see that we don't have any gaps,
- // but only a series of 32-bit MOVs.
- // Since std::map provides ordered iteration, the original order
- // of the MOVs doesn't matter.
- int64_t ExpectedDist = 0;
- for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME;
- ++MMI, ExpectedDist += 4)
- if (MMI->first != ExpectedDist)
- return false;
-
- // Ok, everything looks fine. Do the transformation.
- DebugLoc DL = I->getDebugLoc();
-
- // It's possible the original stack adjustment amount was larger than
- // that done by the pushes. If so, we still need a SUB.
- Amount -= ExpectedDist;
- if (Amount) {
- MachineInstr* Sub = BuildMI(MBB, Call, DL,
- TII.get(getSUBriOpcode(false, Amount)), StackPtr)
- .addReg(StackPtr).addImm(Amount);
- Sub->getOperand(3).setIsDead();
- }
-
- // Now, iterate through the map in reverse order, and replace the movs
- // with pushes. MOVmi/MOVmr doesn't have any defs, so need to replace uses.
- for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) {
- MachineBasicBlock::iterator MOV = MMI->second;
- MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
-
- // Replace MOVmr with PUSH32r, and MOVmi with PUSHi of appropriate size
- int PushOpcode = X86::PUSH32r;
- if (MOV->getOpcode() == X86::MOV32mi)
- PushOpcode = getPUSHiOpcode(false, PushOp);
-
- BuildMI(MBB, Call, DL, TII.get(PushOpcode)).addOperand(PushOp);
- MBB.erase(MOV);
- }
-
- return true;
-}
-
void X86FrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
@@ -1990,7 +1913,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF,
bool IsLP64 = STI.isTarget64BitLP64();
DebugLoc DL = I->getDebugLoc();
uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
- uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
+ uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
I = MBB.erase(I);
if (!reserveCallFrame) {
@@ -2010,24 +1933,18 @@ eliminateCallFramePseudoInstr(MachineFunction &MF,
Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
MachineInstr *New = nullptr;
- if (Opcode == TII.getCallFrameSetupOpcode()) {
- // Try to convert movs to the stack into pushes.
- // We currently only look for a pattern that appears in 32-bit
- // calling conventions.
- if (!IsLP64 && convertArgMovsToPushes(MF, MBB, I, Amount))
- return;
- New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
- StackPtr)
- .addReg(StackPtr)
- .addImm(Amount);
- } else {
- assert(Opcode == TII.getCallFrameDestroyOpcode());
+ // Factor out the amount that gets handled inside the sequence
+ // (Pushes of argument for frame setup, callee pops for frame destroy)
+ Amount -= InternalAmt;
- // Factor out the amount the callee already popped.
- Amount -= CalleeAmt;
+ if (Amount) {
+ if (Opcode == TII.getCallFrameSetupOpcode()) {
+ New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr)
+ .addReg(StackPtr).addImm(Amount);
+ } else {
+ assert(Opcode == TII.getCallFrameDestroyOpcode());
- if (Amount) {
unsigned Opc = getADDriOpcode(IsLP64, Amount);
New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
.addReg(StackPtr).addImm(Amount);
@@ -2045,13 +1962,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF,
return;
}
- if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) {
+ if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) {
// If we are performing frame pointer elimination and if the callee pops
// something off the stack pointer, add it back. We do this until we have
// more advanced stack pointer tracking ability.
- unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt);
+ unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt);
MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
- .addReg(StackPtr).addImm(CalleeAmt);
+ .addReg(StackPtr).addImm(InternalAmt);
// The EFLAGS implicit def is dead.
New->getOperand(3).setIsDead();
Index: lib/Target/X86/X86FrameLowering.h
===================================================================
--- lib/Target/X86/X86FrameLowering.h
+++ lib/Target/X86/X86FrameLowering.h
@@ -66,6 +66,8 @@ class X86FrameLowering : public TargetFrameLowerin
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+ bool needsFrameIndexResolution(const MachineFunction &MF) const override;
int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
Index: lib/Target/X86/X86InstrCompiler.td
===================================================================
--- lib/Target/X86/X86InstrCompiler.td
+++ lib/Target/X86/X86InstrCompiler.td
@@ -43,9 +43,9 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
let Defs = [ESP, EFLAGS], Uses = [ESP] in {
-def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKDOWN",
- [(X86callseq_start timm:$amt)]>,
+ []>,
Requires<[NotLP64]>;
def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP",
@@ -52,7 +52,10 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
Requires<[NotLP64]>;
}
+def : Pat<(X86callseq_start timm:$amt1),
+ (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+
// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
// a stack adjustment and the codegen must know that they may modify the stack
// pointer before prolog-epilog rewriting occurs.
@@ -59,9 +62,9 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
let Defs = [RSP, EFLAGS], Uses = [RSP] in {
-def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKDOWN",
- [(X86callseq_start timm:$amt)]>,
+ []>,
Requires<[IsLP64]>;
def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP",
@@ -68,9 +71,10 @@ def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
Requires<[IsLP64]>;
}
+def : Pat<(X86callseq_start timm:$amt1),
+ (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
-
// x86-64 va_start lowering magic.
let usesCustomInserter = 1, Defs = [EFLAGS] in {
def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -1692,6 +1692,58 @@ X86InstrInfo::isCoalescableExtInstr(const MachineI
return false;
}
+int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+
+ if (MI->getOpcode() == getCallFrameSetupOpcode() ||
+ MI->getOpcode() == getCallFrameDestroyOpcode()) {
+ unsigned StackAlign = TFI->getStackAlignment();
+ int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign *
+ StackAlign;
+
+ SPAdj -= MI->getOperand(1).getImm();
+
+ if (MI->getOpcode() == getCallFrameSetupOpcode())
+ return SPAdj;
+ else
+ return -SPAdj;
+ }
+
+ // To know whether a call adjusts the stack, we need information
+ // that is bound to the following ADJCALLSTACKUP pseudo.
+ // Look for the next ADJCALLSTACKUP that follows the call.
+ if (MI->isCall()) {
+ const MachineBasicBlock* MBB = MI->getParent();
+ auto I = ++MachineBasicBlock::const_iterator(MI);
+ for (auto E = MBB->end(); I != E; ++I) {
+ if (I->getOpcode() == getCallFrameDestroyOpcode() ||
+ I->isCall())
+ break;
+ }
+
+ // If we could not find a frame destroy opcode, then it has already
+ // been simplified, so we don't care.
+ if (I->getOpcode() != getCallFrameDestroyOpcode())
+ return 0;
+
+ return -(I->getOperand(1).getImm());
+ }
+
+ // Currently handle only PUSHes we can reasonably expect to see
+ // in call sequences
+ switch (MI->getOpcode()) {
+ default:
+ return 0;
+ case X86::PUSH32i8:
+ case X86::PUSH32r:
+ case X86::PUSH32rmm:
+ case X86::PUSH32rmr:
+ case X86::PUSHi32:
+ return 4;
+ }
+}
+
/// isFrameOperand - Return true and the FrameIndex if the specified
/// operand and follow operands form a reference to the stack frame.
bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -175,6 +175,11 @@ class X86InstrInfo final : public X86GenInstrInfo
///
const X86RegisterInfo &getRegisterInfo() const { return RI; }
+ /// getSPAdjust - This returns the stack pointer adjustment made by
+ /// this instruction. For x86, we need to handle more complex call
+ /// sequences involving PUSHes.
+ int getSPAdjust(const MachineInstr *MI) const override;
+
/// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
/// extension instruction. That is, it's like a copy where it's legal for the
/// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
Index: lib/Target/X86/X86MachineFunctionInfo.h
===================================================================
--- lib/Target/X86/X86MachineFunctionInfo.h
+++ lib/Target/X86/X86MachineFunctionInfo.h
@@ -77,6 +77,9 @@ class X86MachineFunctionInfo : public MachineFunct
unsigned ArgumentStackSize;
/// NumLocalDynamics - Number of local-dynamic TLS accesses.
unsigned NumLocalDynamics;
+ /// HasPushSequences - Keeps track of whether this function uses sequences
+ /// of pushes to pass function parameters.
+ bool HasPushSequences;
private:
/// ForwardedMustTailRegParms - A list of virtual and physical registers
@@ -97,7 +100,8 @@ class X86MachineFunctionInfo : public MachineFunct
VarArgsGPOffset(0),
VarArgsFPOffset(0),
ArgumentStackSize(0),
- NumLocalDynamics(0) {}
+ NumLocalDynamics(0),
+ HasPushSequences(false) {}
explicit X86MachineFunctionInfo(MachineFunction &MF)
: ForceFramePointer(false),
@@ -113,11 +117,15 @@ class X86MachineFunctionInfo : public MachineFunct
VarArgsGPOffset(0),
VarArgsFPOffset(0),
ArgumentStackSize(0),
- NumLocalDynamics(0) {}
+ NumLocalDynamics(0),
+ HasPushSequences(false) {}
bool getForceFramePointer() const { return ForceFramePointer;}
void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+ bool getHasPushSequences() const { return HasPushSequences; }
+ void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; }
+
bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; }
void setRestoreBasePointer(const MachineFunction *MF);
int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
Index: lib/Target/X86/X86RegisterInfo.cpp
===================================================================
--- lib/Target/X86/X86RegisterInfo.cpp
+++ lib/Target/X86/X86RegisterInfo.cpp
@@ -468,8 +468,6 @@ void
X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
- assert(SPAdj == 0 && "Unexpected");
-
MachineInstr &MI = *II;
MachineFunction &MF = *MI.getParent()->getParent();
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
@@ -506,6 +504,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicB
} else
FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex);
+ if (BasePtr == StackPtr)
+ FIOffset += SPAdj;
+
// The frame index format for stackmaps and patchpoints is different from the
// X86 format. It only has a FI and an offset.
if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -154,6 +154,7 @@ class X86PassConfig : public TargetPassConfig {
void addIRPasses() override;
bool addInstSelector() override;
bool addILPOpts() override;
+ void addPreRegAlloc() override;
void addPostRegAlloc() override;
void addPreEmitPass() override;
};
@@ -187,6 +188,10 @@ bool X86PassConfig::addILPOpts() {
return true;
}
+void X86PassConfig::addPreRegAlloc() {
+ addPass(createX86CallFrameOptimization());
+}
+
void X86PassConfig::addPostRegAlloc() {
addPass(createX86FloatingPointStackifierPass());
}
Index: test/CodeGen/X86/inalloca-invoke.ll
===================================================================
--- test/CodeGen/X86/inalloca-invoke.ll
+++ test/CodeGen/X86/inalloca-invoke.ll
@@ -31,7 +31,7 @@ blah:
to label %invoke.cont unwind label %lpad
; Uses end as sret param.
-; CHECK: movl %[[end]], (%esp)
+; CHECK: pushl %[[end]]
; CHECK: calll _plus
invoke.cont:
Index: test/CodeGen/X86/movtopush.ll
===================================================================
--- test/CodeGen/X86/movtopush.ll
+++ test/CodeGen/X86/movtopush.ll
@@ -1,10 +1,12 @@
; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64
; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED
+
declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)
declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d)
; Here, we should have a reserved frame, so we don't expect pushes
-; NORMAL-LABEL: test1
+; NORMAL-LABEL: test1:
; NORMAL: subl $16, %esp
; NORMAL-NEXT: movl $4, 12(%esp)
; NORMAL-NEXT: movl $3, 8(%esp)
@@ -11,6 +13,7 @@ declare void @inreg(i32 %a, i32 inreg %b, i32 %c,
; NORMAL-NEXT: movl $2, 4(%esp)
; NORMAL-NEXT: movl $1, (%esp)
; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
define void @test1() {
entry:
call void @good(i32 1, i32 2, i32 3, i32 4)
@@ -17,8 +20,10 @@ entry:
ret void
}
-; Here, we expect a sequence of 4 immediate pushes
-; NORMAL-LABEL: test2
+; We're optimizing for code size, so we should get pushes for x86,
+; even though there is a reserved call frame.
+; Make sure we don't touch x86-64
+; NORMAL-LABEL: test1b:
; NORMAL-NOT: subl {{.*}} %esp
; NORMAL: pushl $4
; NORMAL-NEXT: pushl $3
@@ -25,6 +30,42 @@ entry:
; NORMAL-NEXT: pushl $2
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+; X64-LABEL: test1b:
+; X64: movl $1, %ecx
+; X64-NEXT: movl $2, %edx
+; X64-NEXT: movl $3, %r8d
+; X64-NEXT: movl $4, %r9d
+; X64-NEXT: callq good
+define void @test1b() optsize {
+entry:
+ call void @good(i32 1, i32 2, i32 3, i32 4)
+ ret void
+}
+
+; Same as above, but for minsize
+; NORMAL-LABEL: test1c:
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test1c() minsize {
+entry:
+ call void @good(i32 1, i32 2, i32 3, i32 4)
+ ret void
+}
+
+; If we have a reserved frame, we should have pushes
+; NORMAL-LABEL: test2:
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl $1
+; NORMAL-NEXT: call
define void @test2(i32 %k) {
entry:
%a = alloca i32, i32 %k
@@ -34,7 +75,7 @@ entry:
; Again, we expect a sequence of 4 immediate pushes
; Checks that we generate the right pushes for >8bit immediates
-; NORMAL-LABEL: test2b
+; NORMAL-LABEL: test2b:
; NORMAL-NOT: subl {{.*}} %esp
; NORMAL: pushl $4096
; NORMAL-NEXT: pushl $3072
@@ -41,15 +82,15 @@ entry:
; NORMAL-NEXT: pushl $2048
; NORMAL-NEXT: pushl $1024
; NORMAL-NEXT: call
-define void @test2b(i32 %k) {
+; NORMAL-NEXT: addl $16, %esp
+define void @test2b() optsize {
entry:
- %a = alloca i32, i32 %k
call void @good(i32 1024, i32 2048, i32 3072, i32 4096)
ret void
}
; The first push should push a register
-; NORMAL-LABEL: test3
+; NORMAL-LABEL: test3:
; NORMAL-NOT: subl {{.*}} %esp
; NORMAL: pushl $4
; NORMAL-NEXT: pushl $3
@@ -56,15 +97,15 @@ entry:
; NORMAL-NEXT: pushl $2
; NORMAL-NEXT: pushl %e{{..}}
; NORMAL-NEXT: call
-define void @test3(i32 %k) {
+; NORMAL-NEXT: addl $16, %esp
+define void @test3(i32 %k) optsize {
entry:
- %a = alloca i32, i32 %k
call void @good(i32 %k, i32 2, i32 3, i32 4)
ret void
}
; We don't support weird calling conventions
-; NORMAL-LABEL: test4
+; NORMAL-LABEL: test4:
; NORMAL: subl $12, %esp
; NORMAL-NEXT: movl $4, 8(%esp)
; NORMAL-NEXT: movl $3, 4(%esp)
@@ -71,16 +112,16 @@ entry:
; NORMAL-NEXT: movl $1, (%esp)
; NORMAL-NEXT: movl $2, %eax
; NORMAL-NEXT: call
-define void @test4(i32 %k) {
+; NORMAL-NEXT: addl $12, %esp
+define void @test4() optsize {
entry:
- %a = alloca i32, i32 %k
call void @inreg(i32 1, i32 2, i32 3, i32 4)
ret void
}
-; Check that additional alignment is added when the pushes
-; don't add up to the required alignment.
-; ALIGNED-LABEL: test5
+; When there is no reserved call frame, check that additional alignment
+; is added when the pushes don't add up to the required alignment.
+; ALIGNED-LABEL: test5:
; ALIGNED: subl $16, %esp
; ALIGNED-NEXT: pushl $4
; ALIGNED-NEXT: pushl $3
@@ -97,7 +138,7 @@ entry:
; Check that pushing the addresses of globals (Or generally, things that
; aren't exactly immediates) isn't broken.
; Fixes PR21878.
-; NORMAL-LABEL: test6
+; NORMAL-LABEL: test6:
; NORMAL: pushl $_ext
; NORMAL-NEXT: call
declare void @f(i8*)
@@ -110,3 +151,108 @@ bb:
alloca i32
ret void
}
+
+; Check that we fold simple cases into the push
+; NORMAL-LABEL: test7:
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: movl 4(%esp), [[EAX:%e..]]
+; NORMAL-NEXT: pushl $4
+; NORMAL-NEXT: pushl ([[EAX]])
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test7(i32* %ptr) optsize {
+entry:
+ %val = load i32* %ptr
+ call void @good(i32 1, i32 2, i32 %val, i32 4)
+ ret void
+}
+
+; But we don't want to fold stack-relative loads into the push,
+; because the offset will be wrong
+; NORMAL-LABEL: test8:
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: movl 4(%esp), [[EAX:%e..]]
+; NORMAL-NEXT: pushl $4
+; NORMAL-NEXT: pushl [[EAX]]
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test8(i32* %ptr) optsize {
+entry:
+ %val = ptrtoint i32* %ptr to i32
+ call void @good(i32 1, i32 2, i32 %val, i32 4)
+ ret void
+}
+
+; If one function is using push instructions, and the other isn't
+; (because it has frame-index references), then we must resolve
+; these references correctly.
+; NORMAL-LABEL: test9:
+; NORMAL-NOT: leal (%esp),
+; NORMAL: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+; NORMAL-NEXT: subl $16, %esp
+; NORMAL-NEXT: leal 16(%esp), [[EAX:%e..]]
+; NORMAL-NEXT: movl [[EAX]], 12(%esp)
+; NORMAL-NEXT: movl $7, 8(%esp)
+; NORMAL-NEXT: movl $6, 4(%esp)
+; NORMAL-NEXT: movl $5, (%esp)
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+define void @test9() optsize {
+entry:
+ %p = alloca i32, align 4
+ call void @good(i32 1, i32 2, i32 3, i32 4)
+ %0 = ptrtoint i32* %p to i32
+ call void @good(i32 5, i32 6, i32 7, i32 %0)
+ ret void
+}
+
+; We can end up with an indirect call which gets reloaded on the spot.
+; Make sure we reference the correct stack slot - we spill into (%esp)
+; and reload from 16(%esp) due to the pushes.
+; NORMAL-LABEL: test10:
+; NORMAL: movl $_good, [[ALLOC:.*]]
+; NORMAL-NEXT: movl [[ALLOC]], [[EAX:%e..]]
+; NORMAL-NEXT: movl [[EAX]], (%esp) # 4-byte Spill
+; NORMAL: nop
+; NORMAL: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl $1
+; NORMAL-NEXT: calll *16(%esp)
+; NORMAL-NEXT: addl $16, %esp
+define void @test10() optsize {
+ %stack_fptr = alloca void (i32, i32, i32, i32)*
+ store void (i32, i32, i32, i32)* @good, void (i32, i32, i32, i32)** %stack_fptr
+ %good_ptr = load volatile void (i32, i32, i32, i32)** %stack_fptr
+ call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di}"()
+ call void (i32, i32, i32, i32)* %good_ptr(i32 1, i32 2, i32 3, i32 4)
+ ret void
+}
+
+; We can't fold the load from the global into the push because of
+; interference from the store
+; NORMAL-LABEL: test11:
+; NORMAL: movl _the_global, [[EAX:%e..]]
+; NORMAL-NEXT: movl $42, _the_global
+; NORMAL-NEXT: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl [[EAX]]
+; NORMAL-NEXT: call
+; NORMAL-NEXT: addl $16, %esp
+@the_global = external global i32
+define void @test11() optsize {
+ %myload = load i32* @the_global
+ store i32 42, i32* @the_global
+ call void @good(i32 %myload, i32 2, i32 3, i32 4)
+ ret void
+}