ef6fa9e26d
This release contains the following cherry-picked revisions from upstream trunk:

  226124 226151 226164 226165 226166 226407 226408 226409 226652 226905
  226983 227084 227087 227089 227208 227209 227210 227211 227212 227213
  227214 227269 227430 227482 227503 227519 227574 227822 227986 227987
  227988 227989 227990 228037 228038 228039 228040 228188 228189 228190
  228273 228372 228373 228374 228403 228765 228848 228918 229223 229225
  229226 229227 229228 229230 229234 229235 229236 229238 229239 229413
  229507 229680 229750 229751 229752 229911 230146 230147 230235 230253
  230255 230469 230500 230564 230603 230657 230742 230748 230956 231219
  231237 231245 231259 231280 231451 231563 231601 231658 231659 231662
  231984 231986 232046 232085 232142 232176 232179 232189 232382 232386
  232389 232425 232438 232443 232675 232786 232797 232943 232957 233075
  233080 233351 233353 233409 233410 233508 233584 233819 233904 234629
  234636 234891 234975 234977 235524 235641 235662 235931 236099 236306
  236307

Please note that from 3.5.0 onwards, clang and llvm require C++11 support
to build; see UPDATING for more information.
Pull in r230348 from upstream llvm trunk (by Tim Northover):

  ARM: treat [N x i32] and [N x i64] as AAPCS composite types

  The logic is almost there already, with our special homogeneous
  aggregate handling. Tweaking it like this allows front-ends to emit
  AAPCS compliant code without ever having to count registers or add
  discarded padding arguments.

  Only arrays of i32 and i64 are needed to model AAPCS rules, but I
  decided to apply the logic to all integer arrays for more consistency.

This fixes a possible "Unexpected member type for HA" error when
compiling lib/msun/bsdsrc/b_tgamma.c for armv6.

Reported by:	Jakub Palider <jpa@semihalf.com>

Introduced here: https://svnweb.freebsd.org/changeset/base/280400
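As a minimal illustration of what this enables (a hypothetical sketch
patterned on the tests added in the diff below, not taken from the commit
itself): a front-end can now hand the backend an integer array argument
directly and rely on AAPCS-compliant placement, instead of counting core
registers and inserting discarded padding arguments by hand.

  ; Illustrative IR only; the function name and values are made up.
  ; Under AAPCS, [2 x i32] is a composite type: after the leading i32
  ; takes r0, the two array members are allocated as a contiguous
  ; block in r1 and r2, with no front-end padding required.
  define i32 @sum_pair(i32, [2 x i32] %arg) nounwind {
    %lo = extractvalue [2 x i32] %arg, 0
    %hi = extractvalue [2 x i32] %arg, 1
    %sum = add i32 %lo, %hi
    ret i32 %sum
  }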
Index: include/llvm/CodeGen/CallingConvLower.h
===================================================================
--- include/llvm/CodeGen/CallingConvLower.h
+++ include/llvm/CodeGen/CallingConvLower.h
@@ -122,8 +122,8 @@ class CCValAssign {
   // There is no need to differentiate between a pending CCValAssign and other
   // kinds, as they are stored in a different list.
   static CCValAssign getPending(unsigned ValNo, MVT ValVT, MVT LocVT,
-                                LocInfo HTP) {
-    return getReg(ValNo, ValVT, 0, LocVT, HTP);
+                                LocInfo HTP, unsigned ExtraInfo = 0) {
+    return getReg(ValNo, ValVT, ExtraInfo, LocVT, HTP);
   }
 
   void convertToReg(unsigned RegNo) {
@@ -146,6 +146,7 @@ class CCValAssign {
 
   unsigned getLocReg() const { assert(isRegLoc()); return Loc; }
   unsigned getLocMemOffset() const { assert(isMemLoc()); return Loc; }
+  unsigned getExtraInfo() const { return Loc; }
   MVT getLocVT() const { return LocVT; }
 
   LocInfo getLocInfo() const { return HTP; }
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7429,11 +7429,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLo
       }
       if (Args[i].isNest)
         Flags.setNest();
-      if (NeedsRegBlock) {
+      if (NeedsRegBlock)
         Flags.setInConsecutiveRegs();
-        if (Value == NumValues - 1)
-          Flags.setInConsecutiveRegsLast();
-      }
       Flags.setOrigAlign(OriginalAlignment);
 
       MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT);
@@ -7482,6 +7479,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLo
         CLI.Outs.push_back(MyFlags);
         CLI.OutVals.push_back(Parts[j]);
       }
+
+      if (NeedsRegBlock && Value == NumValues - 1)
+        CLI.Outs[CLI.Outs.size() - 1].Flags.setInConsecutiveRegsLast();
     }
   }
 
@@ -7697,11 +7697,8 @@ void SelectionDAGISel::LowerArguments(const Functi
       }
       if (F.getAttributes().hasAttribute(Idx, Attribute::Nest))
        Flags.setNest();
-      if (NeedsRegBlock) {
+      if (NeedsRegBlock)
        Flags.setInConsecutiveRegs();
-        if (Value == NumValues - 1)
-          Flags.setInConsecutiveRegsLast();
-      }
       Flags.setOrigAlign(OriginalAlignment);
 
       MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
@@ -7716,6 +7713,8 @@ void SelectionDAGISel::LowerArguments(const Functi
           MyFlags.Flags.setOrigAlign(1);
         Ins.push_back(MyFlags);
       }
+      if (NeedsRegBlock && Value == NumValues - 1)
+        Ins[Ins.size() - 1].Flags.setInConsecutiveRegsLast();
       PartBase += VT.getStoreSize();
     }
   }
Index: lib/Target/ARM/ARMCallingConv.h
===================================================================
--- lib/Target/ARM/ARMCallingConv.h
+++ lib/Target/ARM/ARMCallingConv.h
@@ -160,6 +160,8 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &V
                                  State);
 }
 
+static const uint16_t RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
 static const uint16_t SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3,
                                      ARM::S4, ARM::S5, ARM::S6, ARM::S7,
                                      ARM::S8, ARM::S9, ARM::S10, ARM::S11,
@@ -168,81 +170,114 @@ static const uint16_t DRegList[] = { ARM::D0, ARM:
                                      ARM::D4, ARM::D5, ARM::D6, ARM::D7 };
 static const uint16_t QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 };
 
+
 // Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA
 // has InConsecutiveRegs set, and that the last member also has
 // InConsecutiveRegsLast set. We must process all members of the HA before
 // we can allocate it, as we need to know the total number of registers that
 // will be needed in order to (attempt to) allocate a contiguous block.
-static bool CC_ARM_AAPCS_Custom_HA(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
-                                   CCValAssign::LocInfo &LocInfo,
-                                   ISD::ArgFlagsTy &ArgFlags, CCState &State) {
-  SmallVectorImpl<CCValAssign> &PendingHAMembers = State.getPendingLocs();
+static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
+                                          MVT &LocVT,
+                                          CCValAssign::LocInfo &LocInfo,
+                                          ISD::ArgFlagsTy &ArgFlags,
+                                          CCState &State) {
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
 
   // AAPCS HFAs must have 1-4 elements, all of the same type
-  assert(PendingHAMembers.size() < 4);
-  if (PendingHAMembers.size() > 0)
-    assert(PendingHAMembers[0].getLocVT() == LocVT);
+  if (PendingMembers.size() > 0)
+    assert(PendingMembers[0].getLocVT() == LocVT);
 
   // Add the argument to the list to be allocated once we know the size of the
-  // HA
-  PendingHAMembers.push_back(
-      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+  // aggregate. Store the type's required alignmnent as extra info for later: in
+  // the [N x i64] case all trace has been removed by the time we actually get
+  // to do allocation.
+  PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo,
+                                                   ArgFlags.getOrigAlign()));
 
-  if (ArgFlags.isInConsecutiveRegsLast()) {
-    assert(PendingHAMembers.size() > 0 && PendingHAMembers.size() <= 4 &&
-           "Homogeneous aggregates must have between 1 and 4 members");
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
 
-    // Try to allocate a contiguous block of registers, each of the correct
-    // size to hold one member.
-    ArrayRef<uint16_t> RegList;
-    switch (LocVT.SimpleTy) {
-    case MVT::f32:
-      RegList = SRegList;
-      break;
-    case MVT::f64:
-      RegList = DRegList;
-      break;
-    case MVT::v2f64:
-      RegList = QRegList;
-      break;
-    default:
-      llvm_unreachable("Unexpected member type for HA");
-      break;
-    }
+  // Try to allocate a contiguous block of registers, each of the correct
+  // size to hold one member.
+  unsigned Align = std::min(PendingMembers[0].getExtraInfo(), 8U);
 
-    unsigned RegResult =
-        State.AllocateRegBlock(RegList, PendingHAMembers.size());
+  ArrayRef<uint16_t> RegList;
+  switch (LocVT.SimpleTy) {
+  case MVT::i32: {
+    RegList = RRegList;
+    unsigned RegIdx = State.getFirstUnallocated(RegList.data(), RegList.size());
 
-    if (RegResult) {
-      for (SmallVectorImpl<CCValAssign>::iterator It = PendingHAMembers.begin();
-           It != PendingHAMembers.end(); ++It) {
-        It->convertToReg(RegResult);
-        State.addLoc(*It);
-        ++RegResult;
-      }
-      PendingHAMembers.clear();
-      return true;
-    }
+    // First consume all registers that would give an unaligned object. Whether
+    // we go on stack or in regs, no-one will be using them in future.
+    unsigned RegAlign = RoundUpToAlignment(Align, 4) / 4;
+    while (RegIdx % RegAlign != 0 && RegIdx < RegList.size())
+      State.AllocateReg(RegList[RegIdx++]);
 
-    // Register allocation failed, fall back to the stack
+    break;
+  }
+  case MVT::f32:
+    RegList = SRegList;
+    break;
+  case MVT::f64:
+    RegList = DRegList;
+    break;
+  case MVT::v2f64:
+    RegList = QRegList;
+    break;
+  default:
+    llvm_unreachable("Unexpected member type for block aggregate");
+    break;
+  }
 
-    // Mark all VFP regs as unavailable (AAPCS rule C.2.vfp)
-    for (unsigned regNo = 0; regNo < 16; ++regNo)
-      State.AllocateReg(SRegList[regNo]);
+  unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
+  if (RegResult) {
+    for (SmallVectorImpl<CCValAssign>::iterator It = PendingMembers.begin();
+         It != PendingMembers.end(); ++It) {
+      It->convertToReg(RegResult);
+      State.addLoc(*It);
+      ++RegResult;
+    }
+    PendingMembers.clear();
+    return true;
+  }
 
-    unsigned Size = LocVT.getSizeInBits() / 8;
-    unsigned Align = std::min(Size, 8U);
+  // Register allocation failed, we'll be needing the stack
+  unsigned Size = LocVT.getSizeInBits() / 8;
+  if (LocVT == MVT::i32 && State.getNextStackOffset() == 0) {
+    // If nothing else has used the stack until this point, a non-HFA aggregate
+    // can be split between regs and stack.
+    unsigned RegIdx = State.getFirstUnallocated(RegList.data(), RegList.size());
+    for (auto &It : PendingMembers) {
+      if (RegIdx >= RegList.size())
+        It.convertToMem(State.AllocateStack(Size, Size));
+      else
+        It.convertToReg(State.AllocateReg(RegList[RegIdx++]));
 
-    for (auto It : PendingHAMembers) {
-      It.convertToMem(State.AllocateStack(Size, Align));
       State.addLoc(It);
     }
+    PendingMembers.clear();
+    return true;
+  } else if (LocVT != MVT::i32)
+    RegList = SRegList;
 
-    // All pending members have now been allocated
-    PendingHAMembers.clear();
+  // Mark all regs as unavailable (AAPCS rule C.2.vfp for VFP, C.6 for core)
+  for (auto Reg : RegList)
+    State.AllocateReg(Reg);
+
+  for (auto &It : PendingMembers) {
+    It.convertToMem(State.AllocateStack(Size, Align));
+    State.addLoc(It);
+
+    // After the first item has been allocated, the rest are packed as tightly
+    // as possible. (E.g. an incoming i64 would have starting Align of 8, but
+    // we'll be allocating a bunch of i32 slots).
+    Align = Size;
   }
 
-  // This will be allocated by the last member of the HA
+  // All pending members have now been allocated
+  PendingMembers.clear();
+
+  // This will be allocated by the last member of the aggregate
   return true;
 }
 
Index: lib/Target/ARM/ARMCallingConv.td
===================================================================
--- lib/Target/ARM/ARMCallingConv.td
+++ lib/Target/ARM/ARMCallingConv.td
@@ -175,7 +175,7 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
   CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
 
   // HFAs are passed in a contiguous block of registers, or on the stack
-  CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_HA">>,
+  CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_Aggregate">>,
 
   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -11285,7 +11285,9 @@ static bool isHomogeneousAggregate(Type *Ty, HABas
   return (Members > 0 && Members <= 4);
 }
 
-/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate.
+/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
+/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
+/// passing according to AAPCS rules.
 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
   if (getEffectiveCallingConv(CallConv, isVarArg) !=
@@ -11294,7 +11296,9 @@ bool ARMTargetLowering::functionArgumentNeedsConse
 
   HABaseType Base = HA_UNKNOWN;
   uint64_t Members = 0;
-  bool result = isHomogeneousAggregate(Ty, Base, Members);
-  DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump());
-  return result;
+  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
+  DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
+
+  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
+  return IsHA || IsIntArray;
 }
Index: test/CodeGen/ARM/aggregate-padding.ll
===================================================================
--- test/CodeGen/ARM/aggregate-padding.ll
+++ test/CodeGen/ARM/aggregate-padding.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mtriple=armv7-linux-gnueabihf %s -o - | FileCheck %s
+
+; [2 x i64] should be contiguous when split (e.g. we shouldn't try to align all
+; i32 components to 64 bits). Also makes sure i64 based types are properly
+; aligned on the stack.
+define i64 @test_i64_contiguous_on_stack([8 x double], float, i32 %in, [2 x i64] %arg) nounwind {
+; CHECK-LABEL: test_i64_contiguous_on_stack:
+; CHECK-DAG: ldr [[LO0:r[0-9]+]], [sp, #8]
+; CHECK-DAG: ldr [[HI0:r[0-9]+]], [sp, #12]
+; CHECK-DAG: ldr [[LO1:r[0-9]+]], [sp, #16]
+; CHECK-DAG: ldr [[HI1:r[0-9]+]], [sp, #20]
+; CHECK: adds r0, [[LO0]], [[LO1]]
+; CHECK: adc r1, [[HI0]], [[HI1]]
+
+  %val1 = extractvalue [2 x i64] %arg, 0
+  %val2 = extractvalue [2 x i64] %arg, 1
+  %sum = add i64 %val1, %val2
+  ret i64 %sum
+}
+
+; [2 x i64] should try to use looks for 4 regs, not 8 (which might happen if the
+; i64 -> i32, i32 split wasn't handled correctly).
+define i64 @test_2xi64_uses_4_regs([8 x double], float, [2 x i64] %arg) nounwind {
+; CHECK-LABEL: test_2xi64_uses_4_regs:
+; CHECK-DAG: mov r0, r2
+; CHECK-DAG: mov r1, r3
+
+  %val = extractvalue [2 x i64] %arg, 1
+  ret i64 %val
+}
+
+; An aggregate should be able to split between registers and stack if there is
+; nothing else on the stack.
+define i32 @test_aggregates_split([8 x double], i32, [4 x i32] %arg) nounwind {
+; CHECK-LABEL: test_aggregates_split:
+; CHECK: ldr [[VAL3:r[0-9]+]], [sp]
+; CHECK: add r0, r1, [[VAL3]]
+
+  %val0 = extractvalue [4 x i32] %arg, 0
+  %val3 = extractvalue [4 x i32] %arg, 3
+  %sum = add i32 %val0, %val3
+  ret i32 %sum
+}
+
+; If an aggregate has to be moved entirely onto the stack, nothing should be
+; able to use r0-r3 any more. Also checks that [2 x i64] properly aligned when
+; it uses regs.
+define i32 @test_no_int_backfilling([8 x double], float, i32, [2 x i64], i32 %arg) nounwind {
+; CHECK-LABEL: test_no_int_backfilling:
+; CHECK: ldr r0, [sp, #24]
+  ret i32 %arg
+}
+
+; Even if the argument was successfully allocated as reg block, there should be
+; no backfillig to r1.
+define i32 @test_no_int_backfilling_regsonly(i32, [1 x i64], i32 %arg) {
+; CHECK-LABEL: test_no_int_backfilling_regsonly:
+; CHECK: ldr r0, [sp]
+  ret i32 %arg
+}
+
+; If an aggregate has to be moved entirely onto the stack, nothing should be
+; able to use r0-r3 any more.
+define float @test_no_float_backfilling([7 x double], [4 x i32], i32, [4 x double], float %arg) nounwind {
+; CHECK-LABEL: test_no_float_backfilling:
+; CHECK: vldr s0, [sp, #40]
+  ret float %arg
+}
+
+; They're a bit pointless, but types like [N x i8] should work as well.
+define i8 @test_i8_in_regs(i32, [3 x i8] %arg) {
+; CHECK-LABEL: test_i8_in_regs:
+; CHECK: add r0, r1, r3
+  %val0 = extractvalue [3 x i8] %arg, 0
+  %val2 = extractvalue [3 x i8] %arg, 2
+  %sum = add i8 %val0, %val2
+  ret i8 %sum
+}
+
+define i16 @test_i16_split(i32, i32, [3 x i16] %arg) {
+; CHECK-LABEL: test_i16_split:
+; CHECK: ldrh [[VAL2:r[0-9]+]], [sp]
+; CHECK: add r0, r2, [[VAL2]]
+  %val0 = extractvalue [3 x i16] %arg, 0
+  %val2 = extractvalue [3 x i16] %arg, 2
+  %sum = add i16 %val0, %val2
+  ret i16 %sum
+}
+
+; Beware: on the stack each i16 still gets a 32-bit slot, the array is not
+; packed.
+define i16 @test_i16_forced_stack([8 x double], double, i32, i32, [3 x i16] %arg) {
+; CHECK-LABEL: test_i16_forced_stack:
+; CHECK-DAG: ldrh [[VAL0:r[0-9]+]], [sp, #8]
+; CHECK-DAG: ldrh [[VAL2:r[0-9]+]], [sp, #16]
+; CHECK: add r0, [[VAL0]], [[VAL2]]
+  %val0 = extractvalue [3 x i16] %arg, 0
+  %val2 = extractvalue [3 x i16] %arg, 2
+  %sum = add i16 %val0, %val2
+  ret i16 %sum
+}
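As a quick check of the new behaviour, the test's own RUN line can be invoked
by hand, assuming an llc built from this tree:

  llc -mtriple=armv7-linux-gnueabihf test/CodeGen/ARM/aggregate-padding.ll -o -

In the output for test_2xi64_uses_4_regs, for instance, the second [2 x i64]
element should arrive in r2/r3 and be moved into r0/r1 (the "mov r0, r2" and
"mov r1, r3" the CHECK-DAG lines expect), rather than the aggregate being
forced onto the stack.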